Annotation of sql/sqlite/regexp.C, revision 1.2

1.1       moko        1: /*
                      2: ** 2012-11-13
                      3: **
                      4: ** The author disclaims copyright to this source code.  In place of
                      5: ** a legal notice, here is a blessing:
                      6: **
                      7: **    May you do good and not evil.
                      8: **    May you find forgiveness for yourself and forgive others.
                      9: **    May you share freely, never taking more than you give.
                     10: **
                     11: ******************************************************************************
                     12: **
                     13: ** The code in this file implements a compact but reasonably
                     14: ** efficient regular-expression matcher for posix extended regular
                     15: ** expressions against UTF8 text.
                     16: **
                     17: ** This file is an SQLite extension.  It registers a single function
                     18: ** named "regexp(A,B)" where A is the regular expression and B is the
                     19: ** string to be matched.  By registering this function, SQLite will also
                     20: ** then implement the "B regexp A" operator.  Note that with the function
                     21: ** the regular expression comes first, but with the operator it comes
                     22: ** second.
                     23: **
                     24: **  The following regular expression syntax is supported:
                     25: **
                     26: **     X*      zero or more occurrences of X
                     27: **     X+      one or more occurrences of X
                     28: **     X?      zero or one occurrences of X
                     29: **     X{p,q}  between p and q occurrences of X
                     30: **     (X)     match X
                     31: **     X|Y     X or Y
                     32: **     ^X      X occurring at the beginning of the string
                     33: **     X$      X occurring at the end of the string
                     34: **     .       Match any single character
                     35: **     \c      Character c where c is one of \{}()[]|*+?.
                     36: **     \c      C-language escapes for c in afnrtv.  ex: \t or \n
                     37: **     \uXXXX  Where XXXX is exactly 4 hex digits, unicode value XXXX
                     38: **     \xXX    Where XX is exactly 2 hex digits, unicode value XX
                     39: **     [abc]   Any single character from the set abc
                     40: **     [^abc]  Any single character not in the set abc
                     41: **     [a-z]   Any single character in the range a-z
                     42: **     [^a-z]  Any single character not in the range a-z
                     43: **     \b      Word boundary
                     44: **     \w      Word character.  [A-Za-z0-9_]
                     45: **     \W      Non-word character
                     46: **     \d      Digit
                     47: **     \D      Non-digit
                     48: **     \s      Whitespace character
                     49: **     \S      Non-whitespace character
                     50: **
                     51: ** A nondeterministic finite automaton (NFA) is used for matching, so the
                     52: ** performance is bounded by O(N*M) where N is the size of the regular
                     53: ** expression and M is the size of the input string.  The matcher never
                     54: ** exhibits exponential behavior.  Note that the X{p,q} operator expands
                     55: ** to p copies of X following by q-p copies of X? and that the size of the
                     56: ** regular expression in the O(N*M) performance bound is computed after
                     57: ** this expansion.
                     58: */
                     59: #include <string.h>
                     60: #include <stdlib.h>
                     61: 
                     62: /*
                     63: ** The following #defines change the names of some functions implemented in
                     64: ** this file to prevent name collisions with C-library functions of the
                     65: ** same name.
                     66: */
                     67: #define re_match   pa_re_match
                     68: #define re_compile pa_re_compile
                     69: #define re_free    pa_re_free
                     70: 
                     71: /* The end-of-input character */
                     72: #define RE_EOF            0    /* End of input */
                     73: 
                     74: /* The NFA is implemented as sequence of opcodes taken from the following
                     75: ** set.  Each opcode has a single integer argument.
                     76: */
                     77: #define RE_OP_MATCH       1    /* Match the one character in the argument */
                     78: #define RE_OP_ANY         2    /* Match any one character.  (Implements ".") */
                     79: #define RE_OP_ANYSTAR     3    /* Special optimized version of .* */
                     80: #define RE_OP_FORK        4    /* Continue to both next and opcode at iArg */
                     81: #define RE_OP_GOTO        5    /* Jump to opcode at iArg */
                     82: #define RE_OP_ACCEPT      6    /* Halt and indicate a successful match */
                     83: #define RE_OP_CC_INC      7    /* Beginning of a [...] character class */
                     84: #define RE_OP_CC_EXC      8    /* Beginning of a [^...] character class */
                     85: #define RE_OP_CC_VALUE    9    /* Single value in a character class */
                     86: #define RE_OP_CC_RANGE   10    /* Range of values in a character class */
                     87: #define RE_OP_WORD       11    /* Perl word character [A-Za-z0-9_] */
                     88: #define RE_OP_NOTWORD    12    /* Not a perl word character */
                     89: #define RE_OP_DIGIT      13    /* digit:  [0-9] */
                     90: #define RE_OP_NOTDIGIT   14    /* Not a digit */
                     91: #define RE_OP_SPACE      15    /* space:  [ \t\n\r\v\f] */
                     92: #define RE_OP_NOTSPACE   16    /* Not a digit */
                     93: #define RE_OP_BOUNDARY   17    /* Boundary between word and non-word */
                     94: 
                     95: /* Each opcode is a "state" in the NFA */
                     96: typedef unsigned short ReStateNumber;
                     97: 
                     98: /* Because this is an NFA and not a DFA, multiple states can be active at
                     99: ** once.  An instance of the following object records all active states in
                    100: ** the NFA.  The implementation is optimized for the common case where the
                    101: ** number of actives states is small.
                    102: */
                    103: typedef struct ReStateSet {
                    104:   unsigned nState;            /* Number of current states */
                    105:   ReStateNumber *aState;      /* Current states */
                    106: } ReStateSet;
                    107: 
                    108: /* An input string read one character at a time.
                    109: */
                    110: typedef struct ReInput ReInput;
                    111: struct ReInput {
                    112:   const unsigned char *z;  /* All text */
                    113:   int i;                   /* Next byte to read */
                    114:   int mx;                  /* EOF when i>=mx */
                    115: };
                    116: 
                    117: /* A compiled NFA (or an NFA that is in the process of being compiled) is
                    118: ** an instance of the following object.
                    119: */
                    120: typedef struct ReCompiled ReCompiled;
                    121: struct ReCompiled {
                    122:   ReInput sIn;                /* Regular expression text */
                    123:   const char *zErr;           /* Error message to return */
                    124:   char *aOp;                  /* Operators for the virtual machine */
                    125:   int *aArg;                  /* Arguments to each operator */
                    126:   unsigned (*xNextChar)(ReInput*);  /* Next character function */
                    127:   unsigned char zInit[12];    /* Initial text to match */
                    128:   int nInit;                  /* Number of characters in zInit */
                    129:   unsigned nState;            /* Number of entries in aOp[] and aArg[] */
                    130:   unsigned nAlloc;            /* Slots allocated for aOp[] and aArg[] */
                    131: };
                    132: 
                    133: /* Add a state to the given state set if it is not already there */
                    134: static void re_add_state(ReStateSet *pSet, int newState){
                    135:   unsigned i;
                    136:   for(i=0; i<pSet->nState; i++) if( pSet->aState[i]==newState ) return;
                    137:   pSet->aState[pSet->nState++] = (ReStateNumber)newState;
                    138: }
                    139: 
                    140: /* Extract the next unicode character from *pzIn and return it.  Advance
                    141: ** *pzIn to the first byte past the end of the character returned.  To
                    142: ** be clear:  this routine converts utf8 to unicode.  This routine is 
                    143: ** optimized for the common case where the next character is a single byte.
                    144: */
                    145: static unsigned re_next_char(ReInput *p){
                    146:   unsigned c;
                    147:   if( p->i>=p->mx ) return 0;
                    148:   c = p->z[p->i++];
                    149:   if( c>=0x80 ){
                    150:     if( (c&0xe0)==0xc0 && p->i<p->mx && (p->z[p->i]&0xc0)==0x80 ){
                    151:       c = (c&0x1f)<<6 | (p->z[p->i++]&0x3f);
                    152:       if( c<0x80 ) c = 0xfffd;
                    153:     }else if( (c&0xf0)==0xe0 && p->i+1<p->mx && (p->z[p->i]&0xc0)==0x80
                    154:            && (p->z[p->i+1]&0xc0)==0x80 ){
                    155:       c = (c&0x0f)<<12 | ((p->z[p->i]&0x3f)<<6) | (p->z[p->i+1]&0x3f);
                    156:       p->i += 2;
                    157:       if( c<=0x7ff || (c>=0xd800 && c<=0xdfff) ) c = 0xfffd;
                    158:     }else if( (c&0xf8)==0xf0 && p->i+3<p->mx && (p->z[p->i]&0xc0)==0x80
                    159:            && (p->z[p->i+1]&0xc0)==0x80 && (p->z[p->i+2]&0xc0)==0x80 ){
                    160:       c = (c&0x07)<<18 | ((p->z[p->i]&0x3f)<<12) | ((p->z[p->i+1]&0x3f)<<6)
                    161:                        | (p->z[p->i+2]&0x3f);
                    162:       p->i += 3;
                    163:       if( c<=0xffff || c>0x10ffff ) c = 0xfffd;
                    164:     }else{
                    165:       c = 0xfffd;
                    166:     }
                    167:   }
                    168:   return c;
                    169: }
                    170: static unsigned re_next_char_nocase(ReInput *p){
                    171:   unsigned c = re_next_char(p);
                    172:   if( c>='A' && c<='Z' ) c += 'a' - 'A';
                    173:   return c;
                    174: }
                    175: 
                    176: /* Return true if c is a perl "word" character:  [A-Za-z0-9_] */
                    177: static int re_word_char(int c){
                    178:   return (c>='0' && c<='9') || (c>='a' && c<='z')
                    179:       || (c>='A' && c<='Z') || c=='_';
                    180: }
                    181: 
                    182: /* Return true if c is a "digit" character:  [0-9] */
                    183: static int re_digit_char(int c){
                    184:   return (c>='0' && c<='9');
                    185: }
                    186: 
                    187: /* Return true if c is a perl "space" character:  [ \t\r\n\v\f] */
                    188: static int re_space_char(int c){
                    189:   return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
                    190: }
                    191: 
                    192: /* Run a compiled regular expression on the zero-terminated input
                    193: ** string zIn[].  Return true on a match and false if there is no match.
                    194: */
                    195: int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){
                    196:   ReStateSet aStateSet[2], *pThis, *pNext;
                    197:   ReStateNumber aSpace[100];
                    198:   ReStateNumber *pToFree;
                    199:   unsigned int i = 0;
                    200:   unsigned int iSwap = 0;
                    201:   int c = RE_EOF+1;
                    202:   int cPrev = 0;
                    203:   int rc = 0;
                    204:   ReInput in;
                    205: 
                    206:   in.z = zIn;
                    207:   in.i = 0;
                    208:   in.mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn);
                    209: 
                    210:   /* Look for the initial prefix match, if there is one. */
                    211:   if( pRe->nInit ){
                    212:     unsigned char x = pRe->zInit[0];
                    213:     while( in.i+pRe->nInit<=in.mx 
                    214:      && (zIn[in.i]!=x ||
                    215:          strncmp((const char*)zIn+in.i, (const char*)pRe->zInit, pRe->nInit)!=0)
                    216:     ){
                    217:       in.i++;
                    218:     }
                    219:     if( in.i+pRe->nInit>in.mx ) return 0;
                    220:   }
                    221: 
                    222:   if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){
                    223:     pToFree = 0;
                    224:     aStateSet[0].aState = aSpace;
                    225:   }else{
                    226:     pToFree = (ReStateNumber *)malloc( sizeof(ReStateNumber)*2*pRe->nState );
                    227:     if( pToFree==0 ) return -1;
                    228:     aStateSet[0].aState = pToFree;
                    229:   }
                    230:   aStateSet[1].aState = &aStateSet[0].aState[pRe->nState];
                    231:   pNext = &aStateSet[1];
                    232:   pNext->nState = 0;
                    233:   re_add_state(pNext, 0);
                    234:   while( c!=RE_EOF && pNext->nState>0 ){
                    235:     cPrev = c;
                    236:     c = pRe->xNextChar(&in);
                    237:     pThis = pNext;
                    238:     pNext = &aStateSet[iSwap];
                    239:     iSwap = 1 - iSwap;
                    240:     pNext->nState = 0;
                    241:     for(i=0; i<pThis->nState; i++){
                    242:       int x = pThis->aState[i];
                    243:       switch( pRe->aOp[x] ){
                    244:         case RE_OP_MATCH: {
                    245:           if( pRe->aArg[x]==c ) re_add_state(pNext, x+1);
                    246:           break;
                    247:         }
                    248:         case RE_OP_ANY: {
                    249:           re_add_state(pNext, x+1);
                    250:           break;
                    251:         }
                    252:         case RE_OP_WORD: {
                    253:           if( re_word_char(c) ) re_add_state(pNext, x+1);
                    254:           break;
                    255:         }
                    256:         case RE_OP_NOTWORD: {
                    257:           if( !re_word_char(c) ) re_add_state(pNext, x+1);
                    258:           break;
                    259:         }
                    260:         case RE_OP_DIGIT: {
                    261:           if( re_digit_char(c) ) re_add_state(pNext, x+1);
                    262:           break;
                    263:         }
                    264:         case RE_OP_NOTDIGIT: {
                    265:           if( !re_digit_char(c) ) re_add_state(pNext, x+1);
                    266:           break;
                    267:         }
                    268:         case RE_OP_SPACE: {
                    269:           if( re_space_char(c) ) re_add_state(pNext, x+1);
                    270:           break;
                    271:         }
                    272:         case RE_OP_NOTSPACE: {
                    273:           if( !re_space_char(c) ) re_add_state(pNext, x+1);
                    274:           break;
                    275:         }
                    276:         case RE_OP_BOUNDARY: {
                    277:           if( re_word_char(c)!=re_word_char(cPrev) ) re_add_state(pThis, x+1);
                    278:           break;
                    279:         }
                    280:         case RE_OP_ANYSTAR: {
                    281:           re_add_state(pNext, x);
                    282:           re_add_state(pThis, x+1);
                    283:           break;
                    284:         }
                    285:         case RE_OP_FORK: {
                    286:           re_add_state(pThis, x+pRe->aArg[x]);
                    287:           re_add_state(pThis, x+1);
                    288:           break;
                    289:         }
                    290:         case RE_OP_GOTO: {
                    291:           re_add_state(pThis, x+pRe->aArg[x]);
                    292:           break;
                    293:         }
                    294:         case RE_OP_ACCEPT: {
                    295:           rc = 1;
                    296:           goto re_match_end;
                    297:         }
                    298:         case RE_OP_CC_INC:
                    299:         case RE_OP_CC_EXC: {
1.2     ! moko      300:           if(!c)
        !           301:             break;
1.1       moko      302:           int j = 1;
                    303:           int n = pRe->aArg[x];
                    304:           int hit = 0;
                    305:           for(j=1; j>0 && j<n; j++){
                    306:             if( pRe->aOp[x+j]==RE_OP_CC_VALUE ){
                    307:               if( pRe->aArg[x+j]==c ){
                    308:                 hit = 1;
                    309:                 j = -1;
                    310:               }
                    311:             }else{
                    312:               if( pRe->aArg[x+j]<=c && pRe->aArg[x+j+1]>=c ){
                    313:                 hit = 1;
                    314:                 j = -1;
                    315:               }else{
                    316:                 j++;
                    317:               }
                    318:             }
                    319:           }
                    320:           if( pRe->aOp[x]==RE_OP_CC_EXC ) hit = !hit;
                    321:           if( hit ) re_add_state(pNext, x+n);
1.2     ! moko      322:           break;
1.1       moko      323:         }
                    324:       }
                    325:     }
                    326:   }
                    327:   for(i=0; i<pNext->nState; i++){
                    328:     if( pRe->aOp[pNext->aState[i]]==RE_OP_ACCEPT ){ rc = 1; break; }
                    329:   }
                    330: re_match_end:
                    331:   free(pToFree);
                    332:   return rc;
                    333: }
                    334: 
                    335: /* Resize the opcode and argument arrays for an RE under construction.
                    336: */
                    337: static int re_resize(ReCompiled *p, int N){
                    338:   char *aOp;
                    339:   int *aArg;
                    340:   aOp = (char *)realloc(p->aOp, N*sizeof(p->aOp[0]));
                    341:   if( aOp==0 ) return 1;
                    342:   p->aOp = aOp;
                    343:   aArg = (int *)realloc(p->aArg, N*sizeof(p->aArg[0]));
                    344:   if( aArg==0 ) return 1;
                    345:   p->aArg = aArg;
                    346:   p->nAlloc = N;
                    347:   return 0;
                    348: }
                    349: 
                    350: /* Insert a new opcode and argument into an RE under construction.  The
                    351: ** insertion point is just prior to existing opcode iBefore.
                    352: */
                    353: static int re_insert(ReCompiled *p, int iBefore, int op, int arg){
                    354:   int i;
                    355:   if( p->nAlloc<=p->nState && re_resize(p, p->nAlloc*2) ) return 0;
                    356:   for(i=p->nState; i>iBefore; i--){
                    357:     p->aOp[i] = p->aOp[i-1];
                    358:     p->aArg[i] = p->aArg[i-1];
                    359:   }
                    360:   p->nState++;
                    361:   p->aOp[iBefore] = (char)op;
                    362:   p->aArg[iBefore] = arg;
                    363:   return iBefore;
                    364: }
                    365: 
                    366: /* Append a new opcode and argument to the end of the RE under construction.
                    367: */
                    368: static int re_append(ReCompiled *p, int op, int arg){
                    369:   return re_insert(p, p->nState, op, arg);
                    370: }
                    371: 
                    372: /* Make a copy of N opcodes starting at iStart onto the end of the RE
                    373: ** under construction.
                    374: */
                    375: static void re_copy(ReCompiled *p, int iStart, int N){
                    376:   if( p->nState+N>=p->nAlloc && re_resize(p, p->nAlloc*2+N) ) return;
                    377:   memcpy(&p->aOp[p->nState], &p->aOp[iStart], N*sizeof(p->aOp[0]));
                    378:   memcpy(&p->aArg[p->nState], &p->aArg[iStart], N*sizeof(p->aArg[0]));
                    379:   p->nState += N;
                    380: }
                    381: 
                    382: /* Return true if c is a hexadecimal digit character:  [0-9a-fA-F]
                    383: ** If c is a hex digit, also set *pV = (*pV)*16 + valueof(c).  If
                    384: ** c is not a hex digit *pV is unchanged.
                    385: */
                    386: static int re_hex(int c, int *pV){
                    387:   if( c>='0' && c<='9' ){
                    388:     c -= '0';
                    389:   }else if( c>='a' && c<='f' ){
                    390:     c -= 'a' - 10;
                    391:   }else if( c>='A' && c<='F' ){
                    392:     c -= 'A' - 10;
                    393:   }else{
                    394:     return 0;
                    395:   }
                    396:   *pV = (*pV)*16 + (c & 0xff);
                    397:   return 1;
                    398: }
                    399: 
                    400: /* A backslash character has been seen, read the next character and
                    401: ** return its interpretation.
                    402: */
                    403: static unsigned re_esc_char(ReCompiled *p){
                    404:   static const char zEsc[] = "afnrtv\\()*.+?[$^{|}]";
                    405:   static const char zTrans[] = "\a\f\n\r\t\v";
                    406:   int i, v = 0;
                    407:   char c;
                    408:   if( p->sIn.i>=p->sIn.mx ) return 0;
                    409:   c = p->sIn.z[p->sIn.i];
                    410:   if( c=='u' && p->sIn.i+4<p->sIn.mx ){
                    411:     const unsigned char *zIn = p->sIn.z + p->sIn.i;
                    412:     if( re_hex(zIn[1],&v)
                    413:      && re_hex(zIn[2],&v)
                    414:      && re_hex(zIn[3],&v)
                    415:      && re_hex(zIn[4],&v)
                    416:     ){
                    417:       p->sIn.i += 5;
                    418:       return v;
                    419:     }
                    420:   }
                    421:   if( c=='x' && p->sIn.i+2<p->sIn.mx ){
                    422:     const unsigned char *zIn = p->sIn.z + p->sIn.i;
                    423:     if( re_hex(zIn[1],&v)
                    424:      && re_hex(zIn[2],&v)
                    425:     ){
                    426:       p->sIn.i += 3;
                    427:       return v;
                    428:     }
                    429:   }
                    430:   for(i=0; zEsc[i] && zEsc[i]!=c; i++){}
                    431:   if( zEsc[i] ){
                    432:     if( i<6 ) c = zTrans[i];
                    433:     p->sIn.i++;
                    434:   }else{
                    435:     p->zErr = "unknown \\ escape";
                    436:   }
                    437:   return c;
                    438: }
                    439: 
                    440: /* Forward declaration */
                    441: static const char *re_subcompile_string(ReCompiled*);
                    442: 
                    443: /* Peek at the next byte of input */
                    444: static unsigned char rePeek(ReCompiled *p){
                    445:   return p->sIn.i<p->sIn.mx ? p->sIn.z[p->sIn.i] : 0;
                    446: }
                    447: 
                    448: /* Compile RE text into a sequence of opcodes.  Continue up to the
                    449: ** first unmatched ")" character, then return.  If an error is found,
                    450: ** return a pointer to the error message string.
                    451: */
                    452: static const char *re_subcompile_re(ReCompiled *p){
                    453:   const char *zErr;
                    454:   int iStart, iEnd, iGoto;
                    455:   iStart = p->nState;
                    456:   zErr = re_subcompile_string(p);
                    457:   if( zErr ) return zErr;
                    458:   while( rePeek(p)=='|' ){
                    459:     iEnd = p->nState;
                    460:     re_insert(p, iStart, RE_OP_FORK, iEnd + 2 - iStart);
                    461:     iGoto = re_append(p, RE_OP_GOTO, 0);
                    462:     p->sIn.i++;
                    463:     zErr = re_subcompile_string(p);
                    464:     if( zErr ) return zErr;
                    465:     p->aArg[iGoto] = p->nState - iGoto;
                    466:   }
                    467:   return 0;
                    468: }
                    469: 
                    470: /* Compile an element of regular expression text (anything that can be
                    471: ** an operand to the "|" operator).  Return NULL on success or a pointer
                    472: ** to the error message if there is a problem.
                    473: */
                    474: static const char *re_subcompile_string(ReCompiled *p){
                    475:   int iPrev = -1;
                    476:   int iStart;
                    477:   unsigned c;
                    478:   const char *zErr;
                    479:   while( (c = p->xNextChar(&p->sIn))!=0 ){
                    480:     iStart = p->nState;
                    481:     switch( c ){
                    482:       case '|':
                    483:       case '$': 
                    484:       case ')': {
                    485:         p->sIn.i--;
                    486:         return 0;
                    487:       }
                    488:       case '(': {
                    489:         zErr = re_subcompile_re(p);
                    490:         if( zErr ) return zErr;
                    491:         if( rePeek(p)!=')' ) return "unmatched '('";
                    492:         p->sIn.i++;
                    493:         break;
                    494:       }
                    495:       case '.': {
                    496:         if( rePeek(p)=='*' ){
                    497:           re_append(p, RE_OP_ANYSTAR, 0);
                    498:           p->sIn.i++;
                    499:         }else{ 
                    500:           re_append(p, RE_OP_ANY, 0);
                    501:         }
                    502:         break;
                    503:       }
                    504:       case '*': {
                    505:         if( iPrev<0 ) return "'*' without operand";
                    506:         re_insert(p, iPrev, RE_OP_GOTO, p->nState - iPrev + 1);
                    507:         re_append(p, RE_OP_FORK, iPrev - p->nState + 1);
                    508:         break;
                    509:       }
                    510:       case '+': {
                    511:         if( iPrev<0 ) return "'+' without operand";
                    512:         re_append(p, RE_OP_FORK, iPrev - p->nState);
                    513:         break;
                    514:       }
                    515:       case '?': {
                    516:         if( iPrev<0 ) return "'?' without operand";
                    517:         re_insert(p, iPrev, RE_OP_FORK, p->nState - iPrev+1);
                    518:         break;
                    519:       }
                    520:       case '{': {
                    521:         int m = 0, n = 0;
                    522:         int sz, j;
                    523:         if( iPrev<0 ) return "'{m,n}' without operand";
                    524:         while( (c=rePeek(p))>='0' && c<='9' ){ m = m*10 + c - '0'; p->sIn.i++; }
                    525:         n = m;
                    526:         if( c==',' ){
                    527:           p->sIn.i++;
                    528:           n = 0;
                    529:           while( (c=rePeek(p))>='0' && c<='9' ){ n = n*10 + c-'0'; p->sIn.i++; }
                    530:         }
                    531:         if( c!='}' ) return "unmatched '{'";
                    532:         if( n>0 && n<m ) return "n less than m in '{m,n}'";
                    533:         p->sIn.i++;
                    534:         sz = p->nState - iPrev;
                    535:         if( m==0 ){
                    536:           if( n==0 ) return "both m and n are zero in '{m,n}'";
                    537:           re_insert(p, iPrev, RE_OP_FORK, sz+1);
                    538:           n--;
                    539:         }else{
                    540:           for(j=1; j<m; j++) re_copy(p, iPrev, sz);
                    541:         }
                    542:         for(j=m; j<n; j++){
                    543:           re_append(p, RE_OP_FORK, sz+1);
                    544:           re_copy(p, iPrev, sz);
                    545:         }
                    546:         if( n==0 && m>0 ){
                    547:           re_append(p, RE_OP_FORK, -sz);
                    548:         }
                    549:         break;
                    550:       }
                    551:       case '[': {
                    552:         int iFirst = p->nState;
                    553:         if( rePeek(p)=='^' ){
                    554:           re_append(p, RE_OP_CC_EXC, 0);
                    555:           p->sIn.i++;
                    556:         }else{
                    557:           re_append(p, RE_OP_CC_INC, 0);
                    558:         }
                    559:         while( (c = p->xNextChar(&p->sIn))!=0 ){
                    560:           if( c=='[' && rePeek(p)==':' ){
                    561:             return "POSIX character classes not supported";
                    562:           }
                    563:           if( c=='\\' ) c = re_esc_char(p);
                    564:           if( rePeek(p)=='-' ){
                    565:             re_append(p, RE_OP_CC_RANGE, c);
                    566:             p->sIn.i++;
                    567:             c = p->xNextChar(&p->sIn);
                    568:             if( c=='\\' ) c = re_esc_char(p);
                    569:             re_append(p, RE_OP_CC_RANGE, c);
                    570:           }else{
                    571:             re_append(p, RE_OP_CC_VALUE, c);
                    572:           }
                    573:           if( rePeek(p)==']' ){ p->sIn.i++; break; }
                    574:         }
                    575:         if( c==0 ) return "unclosed '['";
                    576:         p->aArg[iFirst] = p->nState - iFirst;
                    577:         break;
                    578:       }
                    579:       case '\\': {
                    580:         int specialOp = 0;
                    581:         switch( rePeek(p) ){
                    582:           case 'b': specialOp = RE_OP_BOUNDARY;   break;
                    583:           case 'd': specialOp = RE_OP_DIGIT;      break;
                    584:           case 'D': specialOp = RE_OP_NOTDIGIT;   break;
                    585:           case 's': specialOp = RE_OP_SPACE;      break;
                    586:           case 'S': specialOp = RE_OP_NOTSPACE;   break;
                    587:           case 'w': specialOp = RE_OP_WORD;       break;
                    588:           case 'W': specialOp = RE_OP_NOTWORD;    break;
                    589:         }
                    590:         if( specialOp ){
                    591:           p->sIn.i++;
                    592:           re_append(p, specialOp, 0);
                    593:         }else{
                    594:           c = re_esc_char(p);
                    595:           re_append(p, RE_OP_MATCH, c);
                    596:         }
                    597:         break;
                    598:       }
                    599:       default: {
                    600:         re_append(p, RE_OP_MATCH, c);
                    601:         break;
                    602:       }
                    603:     }
                    604:     iPrev = iStart;
                    605:   }
                    606:   return 0;
                    607: }
                    608: 
                    609: /* Free and reclaim all the memory used by a previously compiled
                    610: ** regular expression.  Applications should invoke this routine once
                    611: ** for every call to re_compile() to avoid memory leaks.
                    612: */
                    613: void re_free(ReCompiled *pRe){
                    614:   if( pRe ){
                    615:     free(pRe->aOp);
                    616:     free(pRe->aArg);
                    617:     free(pRe);
                    618:   }
                    619: }
                    620: 
                    621: /*
                    622: ** Compile a textual regular expression in zIn[] into a compiled regular
                    623: ** expression suitable for us by re_match() and return a pointer to the
                    624: ** compiled regular expression in *ppRe.  Return NULL on success or an
                    625: ** error message if something goes wrong.
                    626: */
                    627: const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){
                    628:   ReCompiled *pRe;
                    629:   const char *zErr;
                    630:   int i, j;
                    631: 
                    632:   *ppRe = 0;
                    633:   pRe = (ReCompiled *)malloc( sizeof(*pRe) );
                    634:   if( pRe==0 ){
                    635:     return "out of memory";
                    636:   }
                    637:   memset(pRe, 0, sizeof(*pRe));
                    638:   pRe->xNextChar = noCase ? re_next_char_nocase : re_next_char;
                    639:   if( re_resize(pRe, 30) ){
                    640:     re_free(pRe);
                    641:     return "out of memory";
                    642:   }
                    643:   if( zIn[0]=='^' ){
                    644:     zIn++;
                    645:   }else{
                    646:     re_append(pRe, RE_OP_ANYSTAR, 0);
                    647:   }
                    648:   pRe->sIn.z = (unsigned char*)zIn;
                    649:   pRe->sIn.i = 0;
                    650:   pRe->sIn.mx = (int)strlen(zIn);
                    651:   zErr = re_subcompile_re(pRe);
                    652:   if( zErr ){
                    653:     re_free(pRe);
                    654:     return zErr;
                    655:   }
                    656:   if( rePeek(pRe)=='$' && pRe->sIn.i+1>=pRe->sIn.mx ){
                    657:     re_append(pRe, RE_OP_MATCH, RE_EOF);
                    658:     re_append(pRe, RE_OP_ACCEPT, 0);
                    659:     *ppRe = pRe;
                    660:   }else if( pRe->sIn.i>=pRe->sIn.mx ){
                    661:     re_append(pRe, RE_OP_ACCEPT, 0);
                    662:     *ppRe = pRe;
                    663:   }else{
                    664:     re_free(pRe);
                    665:     return "unrecognized character";
                    666:   }
                    667: 
                    668:   /* The following is a performance optimization.  If the regex begins with
                    669:   ** ".*" (if the input regex lacks an initial "^") and afterwards there are
                    670:   ** one or more matching characters, enter those matching characters into
                    671:   ** zInit[].  The re_match() routine can then search ahead in the input 
                    672:   ** string looking for the initial match without having to run the whole
                    673:   ** regex engine over the string.  Do not worry able trying to match
                    674:   ** unicode characters beyond plane 0 - those are very rare and this is
                    675:   ** just an optimization. */
                    676:   if( pRe->aOp[0]==RE_OP_ANYSTAR ){
                    677:     for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
                    678:       unsigned x = pRe->aArg[i];
                    679:       if( x<=127 ){
                    680:         pRe->zInit[j++] = (unsigned char)x;
                    681:       }else if( x<=0xfff ){
                    682:         pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6));
                    683:         pRe->zInit[j++] = 0x80 | (x&0x3f);
                    684:       }else if( x<=0xffff ){
                    685:         pRe->zInit[j++] = (unsigned char)(0xd0 | (x>>12));
                    686:         pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
                    687:         pRe->zInit[j++] = 0x80 | (x&0x3f);
                    688:       }else{
                    689:         break;
                    690:       }
                    691:     }
                    692:     if( j>0 && pRe->zInit[j-1]==0 ) j--;
                    693:     pRe->nInit = j;
                    694:   }
                    695:   return pRe->zErr;
                    696: }

E-mail: