Annotation of sql/sqlite/regexp.C, revision 1.1

1.1     ! moko        1: /*
        !             2: ** 2012-11-13
        !             3: **
        !             4: ** The author disclaims copyright to this source code.  In place of
        !             5: ** a legal notice, here is a blessing:
        !             6: **
        !             7: **    May you do good and not evil.
        !             8: **    May you find forgiveness for yourself and forgive others.
        !             9: **    May you share freely, never taking more than you give.
        !            10: **
        !            11: ******************************************************************************
        !            12: **
        !            13: ** The code in this file implements a compact but reasonably
        !            14: ** efficient regular-expression matcher for posix extended regular
        !            15: ** expressions against UTF8 text.
        !            16: **
        !            17: ** This file is an SQLite extension.  It registers a single function
        !            18: ** named "regexp(A,B)" where A is the regular expression and B is the
        !            19: ** string to be matched.  By registering this function, SQLite will also
        !            20: ** then implement the "B regexp A" operator.  Note that with the function
        !            21: ** the regular expression comes first, but with the operator it comes
        !            22: ** second.
        !            23: **
        !            24: **  The following regular expression syntax is supported:
        !            25: **
        !            26: **     X*      zero or more occurrences of X
        !            27: **     X+      one or more occurrences of X
        !            28: **     X?      zero or one occurrences of X
        !            29: **     X{p,q}  between p and q occurrences of X
        !            30: **     (X)     match X
        !            31: **     X|Y     X or Y
        !            32: **     ^X      X occurring at the beginning of the string
        !            33: **     X$      X occurring at the end of the string
        !            34: **     .       Match any single character
        !            35: **     \c      Character c where c is one of \{}()[]|*+?.
        !            36: **     \c      C-language escapes for c in afnrtv.  ex: \t or \n
        !            37: **     \uXXXX  Where XXXX is exactly 4 hex digits, unicode value XXXX
        !            38: **     \xXX    Where XX is exactly 2 hex digits, unicode value XX
        !            39: **     [abc]   Any single character from the set abc
        !            40: **     [^abc]  Any single character not in the set abc
        !            41: **     [a-z]   Any single character in the range a-z
        !            42: **     [^a-z]  Any single character not in the range a-z
        !            43: **     \b      Word boundary
        !            44: **     \w      Word character.  [A-Za-z0-9_]
        !            45: **     \W      Non-word character
        !            46: **     \d      Digit
        !            47: **     \D      Non-digit
        !            48: **     \s      Whitespace character
        !            49: **     \S      Non-whitespace character
        !            50: **
        !            51: ** A nondeterministic finite automaton (NFA) is used for matching, so the
        !            52: ** performance is bounded by O(N*M) where N is the size of the regular
        !            53: ** expression and M is the size of the input string.  The matcher never
        !            54: ** exhibits exponential behavior.  Note that the X{p,q} operator expands
        !            55: ** to p copies of X following by q-p copies of X? and that the size of the
        !            56: ** regular expression in the O(N*M) performance bound is computed after
        !            57: ** this expansion.
        !            58: */
        !            59: #include <string.h>
        !            60: #include <stdlib.h>
        !            61: 
        !            62: /*
        !            63: ** The following #defines change the names of some functions implemented in
        !            64: ** this file to prevent name collisions with C-library functions of the
        !            65: ** same name.
        !            66: */
        !            67: #define re_match   pa_re_match
        !            68: #define re_compile pa_re_compile
        !            69: #define re_free    pa_re_free
        !            70: 
        !            71: /* The end-of-input character */
        !            72: #define RE_EOF            0    /* End of input */
        !            73: 
        !            74: /* The NFA is implemented as sequence of opcodes taken from the following
        !            75: ** set.  Each opcode has a single integer argument.
        !            76: */
        !            77: #define RE_OP_MATCH       1    /* Match the one character in the argument */
        !            78: #define RE_OP_ANY         2    /* Match any one character.  (Implements ".") */
        !            79: #define RE_OP_ANYSTAR     3    /* Special optimized version of .* */
        !            80: #define RE_OP_FORK        4    /* Continue to both next and opcode at iArg */
        !            81: #define RE_OP_GOTO        5    /* Jump to opcode at iArg */
        !            82: #define RE_OP_ACCEPT      6    /* Halt and indicate a successful match */
        !            83: #define RE_OP_CC_INC      7    /* Beginning of a [...] character class */
        !            84: #define RE_OP_CC_EXC      8    /* Beginning of a [^...] character class */
        !            85: #define RE_OP_CC_VALUE    9    /* Single value in a character class */
        !            86: #define RE_OP_CC_RANGE   10    /* Range of values in a character class */
        !            87: #define RE_OP_WORD       11    /* Perl word character [A-Za-z0-9_] */
        !            88: #define RE_OP_NOTWORD    12    /* Not a perl word character */
        !            89: #define RE_OP_DIGIT      13    /* digit:  [0-9] */
        !            90: #define RE_OP_NOTDIGIT   14    /* Not a digit */
        !            91: #define RE_OP_SPACE      15    /* space:  [ \t\n\r\v\f] */
        !            92: #define RE_OP_NOTSPACE   16    /* Not a digit */
        !            93: #define RE_OP_BOUNDARY   17    /* Boundary between word and non-word */
        !            94: 
        !            95: /* Each opcode is a "state" in the NFA */
        !            96: typedef unsigned short ReStateNumber;
        !            97: 
        !            98: /* Because this is an NFA and not a DFA, multiple states can be active at
        !            99: ** once.  An instance of the following object records all active states in
        !           100: ** the NFA.  The implementation is optimized for the common case where the
        !           101: ** number of actives states is small.
        !           102: */
        !           103: typedef struct ReStateSet {
        !           104:   unsigned nState;            /* Number of current states */
        !           105:   ReStateNumber *aState;      /* Current states */
        !           106: } ReStateSet;
        !           107: 
        !           108: /* An input string read one character at a time.
        !           109: */
        !           110: typedef struct ReInput ReInput;
        !           111: struct ReInput {
        !           112:   const unsigned char *z;  /* All text */
        !           113:   int i;                   /* Next byte to read */
        !           114:   int mx;                  /* EOF when i>=mx */
        !           115: };
        !           116: 
        !           117: /* A compiled NFA (or an NFA that is in the process of being compiled) is
        !           118: ** an instance of the following object.
        !           119: */
        !           120: typedef struct ReCompiled ReCompiled;
        !           121: struct ReCompiled {
        !           122:   ReInput sIn;                /* Regular expression text */
        !           123:   const char *zErr;           /* Error message to return */
        !           124:   char *aOp;                  /* Operators for the virtual machine */
        !           125:   int *aArg;                  /* Arguments to each operator */
        !           126:   unsigned (*xNextChar)(ReInput*);  /* Next character function */
        !           127:   unsigned char zInit[12];    /* Initial text to match */
        !           128:   int nInit;                  /* Number of characters in zInit */
        !           129:   unsigned nState;            /* Number of entries in aOp[] and aArg[] */
        !           130:   unsigned nAlloc;            /* Slots allocated for aOp[] and aArg[] */
        !           131: };
        !           132: 
        !           133: /* Add a state to the given state set if it is not already there */
        !           134: static void re_add_state(ReStateSet *pSet, int newState){
        !           135:   unsigned i;
        !           136:   for(i=0; i<pSet->nState; i++) if( pSet->aState[i]==newState ) return;
        !           137:   pSet->aState[pSet->nState++] = (ReStateNumber)newState;
        !           138: }
        !           139: 
        !           140: /* Extract the next unicode character from *pzIn and return it.  Advance
        !           141: ** *pzIn to the first byte past the end of the character returned.  To
        !           142: ** be clear:  this routine converts utf8 to unicode.  This routine is 
        !           143: ** optimized for the common case where the next character is a single byte.
        !           144: */
        !           145: static unsigned re_next_char(ReInput *p){
        !           146:   unsigned c;
        !           147:   if( p->i>=p->mx ) return 0;
        !           148:   c = p->z[p->i++];
        !           149:   if( c>=0x80 ){
        !           150:     if( (c&0xe0)==0xc0 && p->i<p->mx && (p->z[p->i]&0xc0)==0x80 ){
        !           151:       c = (c&0x1f)<<6 | (p->z[p->i++]&0x3f);
        !           152:       if( c<0x80 ) c = 0xfffd;
        !           153:     }else if( (c&0xf0)==0xe0 && p->i+1<p->mx && (p->z[p->i]&0xc0)==0x80
        !           154:            && (p->z[p->i+1]&0xc0)==0x80 ){
        !           155:       c = (c&0x0f)<<12 | ((p->z[p->i]&0x3f)<<6) | (p->z[p->i+1]&0x3f);
        !           156:       p->i += 2;
        !           157:       if( c<=0x7ff || (c>=0xd800 && c<=0xdfff) ) c = 0xfffd;
        !           158:     }else if( (c&0xf8)==0xf0 && p->i+3<p->mx && (p->z[p->i]&0xc0)==0x80
        !           159:            && (p->z[p->i+1]&0xc0)==0x80 && (p->z[p->i+2]&0xc0)==0x80 ){
        !           160:       c = (c&0x07)<<18 | ((p->z[p->i]&0x3f)<<12) | ((p->z[p->i+1]&0x3f)<<6)
        !           161:                        | (p->z[p->i+2]&0x3f);
        !           162:       p->i += 3;
        !           163:       if( c<=0xffff || c>0x10ffff ) c = 0xfffd;
        !           164:     }else{
        !           165:       c = 0xfffd;
        !           166:     }
        !           167:   }
        !           168:   return c;
        !           169: }
        !           170: static unsigned re_next_char_nocase(ReInput *p){
        !           171:   unsigned c = re_next_char(p);
        !           172:   if( c>='A' && c<='Z' ) c += 'a' - 'A';
        !           173:   return c;
        !           174: }
        !           175: 
        !           176: /* Return true if c is a perl "word" character:  [A-Za-z0-9_] */
        !           177: static int re_word_char(int c){
        !           178:   return (c>='0' && c<='9') || (c>='a' && c<='z')
        !           179:       || (c>='A' && c<='Z') || c=='_';
        !           180: }
        !           181: 
        !           182: /* Return true if c is a "digit" character:  [0-9] */
        !           183: static int re_digit_char(int c){
        !           184:   return (c>='0' && c<='9');
        !           185: }
        !           186: 
        !           187: /* Return true if c is a perl "space" character:  [ \t\r\n\v\f] */
        !           188: static int re_space_char(int c){
        !           189:   return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
        !           190: }
        !           191: 
        !           192: /* Run a compiled regular expression on the zero-terminated input
        !           193: ** string zIn[].  Return true on a match and false if there is no match.
        !           194: */
        !           195: int re_match(ReCompiled *pRe, const unsigned char *zIn, int nIn){
        !           196:   ReStateSet aStateSet[2], *pThis, *pNext;
        !           197:   ReStateNumber aSpace[100];
        !           198:   ReStateNumber *pToFree;
        !           199:   unsigned int i = 0;
        !           200:   unsigned int iSwap = 0;
        !           201:   int c = RE_EOF+1;
        !           202:   int cPrev = 0;
        !           203:   int rc = 0;
        !           204:   ReInput in;
        !           205: 
        !           206:   in.z = zIn;
        !           207:   in.i = 0;
        !           208:   in.mx = nIn>=0 ? nIn : (int)strlen((char const*)zIn);
        !           209: 
        !           210:   /* Look for the initial prefix match, if there is one. */
        !           211:   if( pRe->nInit ){
        !           212:     unsigned char x = pRe->zInit[0];
        !           213:     while( in.i+pRe->nInit<=in.mx 
        !           214:      && (zIn[in.i]!=x ||
        !           215:          strncmp((const char*)zIn+in.i, (const char*)pRe->zInit, pRe->nInit)!=0)
        !           216:     ){
        !           217:       in.i++;
        !           218:     }
        !           219:     if( in.i+pRe->nInit>in.mx ) return 0;
        !           220:   }
        !           221: 
        !           222:   if( pRe->nState<=(sizeof(aSpace)/(sizeof(aSpace[0])*2)) ){
        !           223:     pToFree = 0;
        !           224:     aStateSet[0].aState = aSpace;
        !           225:   }else{
        !           226:     pToFree = (ReStateNumber *)malloc( sizeof(ReStateNumber)*2*pRe->nState );
        !           227:     if( pToFree==0 ) return -1;
        !           228:     aStateSet[0].aState = pToFree;
        !           229:   }
        !           230:   aStateSet[1].aState = &aStateSet[0].aState[pRe->nState];
        !           231:   pNext = &aStateSet[1];
        !           232:   pNext->nState = 0;
        !           233:   re_add_state(pNext, 0);
        !           234:   while( c!=RE_EOF && pNext->nState>0 ){
        !           235:     cPrev = c;
        !           236:     c = pRe->xNextChar(&in);
        !           237:     pThis = pNext;
        !           238:     pNext = &aStateSet[iSwap];
        !           239:     iSwap = 1 - iSwap;
        !           240:     pNext->nState = 0;
        !           241:     for(i=0; i<pThis->nState; i++){
        !           242:       int x = pThis->aState[i];
        !           243:       switch( pRe->aOp[x] ){
        !           244:         case RE_OP_MATCH: {
        !           245:           if( pRe->aArg[x]==c ) re_add_state(pNext, x+1);
        !           246:           break;
        !           247:         }
        !           248:         case RE_OP_ANY: {
        !           249:           re_add_state(pNext, x+1);
        !           250:           break;
        !           251:         }
        !           252:         case RE_OP_WORD: {
        !           253:           if( re_word_char(c) ) re_add_state(pNext, x+1);
        !           254:           break;
        !           255:         }
        !           256:         case RE_OP_NOTWORD: {
        !           257:           if( !re_word_char(c) ) re_add_state(pNext, x+1);
        !           258:           break;
        !           259:         }
        !           260:         case RE_OP_DIGIT: {
        !           261:           if( re_digit_char(c) ) re_add_state(pNext, x+1);
        !           262:           break;
        !           263:         }
        !           264:         case RE_OP_NOTDIGIT: {
        !           265:           if( !re_digit_char(c) ) re_add_state(pNext, x+1);
        !           266:           break;
        !           267:         }
        !           268:         case RE_OP_SPACE: {
        !           269:           if( re_space_char(c) ) re_add_state(pNext, x+1);
        !           270:           break;
        !           271:         }
        !           272:         case RE_OP_NOTSPACE: {
        !           273:           if( !re_space_char(c) ) re_add_state(pNext, x+1);
        !           274:           break;
        !           275:         }
        !           276:         case RE_OP_BOUNDARY: {
        !           277:           if( re_word_char(c)!=re_word_char(cPrev) ) re_add_state(pThis, x+1);
        !           278:           break;
        !           279:         }
        !           280:         case RE_OP_ANYSTAR: {
        !           281:           re_add_state(pNext, x);
        !           282:           re_add_state(pThis, x+1);
        !           283:           break;
        !           284:         }
        !           285:         case RE_OP_FORK: {
        !           286:           re_add_state(pThis, x+pRe->aArg[x]);
        !           287:           re_add_state(pThis, x+1);
        !           288:           break;
        !           289:         }
        !           290:         case RE_OP_GOTO: {
        !           291:           re_add_state(pThis, x+pRe->aArg[x]);
        !           292:           break;
        !           293:         }
        !           294:         case RE_OP_ACCEPT: {
        !           295:           rc = 1;
        !           296:           goto re_match_end;
        !           297:         }
        !           298:         case RE_OP_CC_INC:
        !           299:         case RE_OP_CC_EXC: {
        !           300:           int j = 1;
        !           301:           int n = pRe->aArg[x];
        !           302:           int hit = 0;
        !           303:           for(j=1; j>0 && j<n; j++){
        !           304:             if( pRe->aOp[x+j]==RE_OP_CC_VALUE ){
        !           305:               if( pRe->aArg[x+j]==c ){
        !           306:                 hit = 1;
        !           307:                 j = -1;
        !           308:               }
        !           309:             }else{
        !           310:               if( pRe->aArg[x+j]<=c && pRe->aArg[x+j+1]>=c ){
        !           311:                 hit = 1;
        !           312:                 j = -1;
        !           313:               }else{
        !           314:                 j++;
        !           315:               }
        !           316:             }
        !           317:           }
        !           318:           if( pRe->aOp[x]==RE_OP_CC_EXC ) hit = !hit;
        !           319:           if( hit ) re_add_state(pNext, x+n);
        !           320:           break;            
        !           321:         }
        !           322:       }
        !           323:     }
        !           324:   }
        !           325:   for(i=0; i<pNext->nState; i++){
        !           326:     if( pRe->aOp[pNext->aState[i]]==RE_OP_ACCEPT ){ rc = 1; break; }
        !           327:   }
        !           328: re_match_end:
        !           329:   free(pToFree);
        !           330:   return rc;
        !           331: }
        !           332: 
        !           333: /* Resize the opcode and argument arrays for an RE under construction.
        !           334: */
        !           335: static int re_resize(ReCompiled *p, int N){
        !           336:   char *aOp;
        !           337:   int *aArg;
        !           338:   aOp = (char *)realloc(p->aOp, N*sizeof(p->aOp[0]));
        !           339:   if( aOp==0 ) return 1;
        !           340:   p->aOp = aOp;
        !           341:   aArg = (int *)realloc(p->aArg, N*sizeof(p->aArg[0]));
        !           342:   if( aArg==0 ) return 1;
        !           343:   p->aArg = aArg;
        !           344:   p->nAlloc = N;
        !           345:   return 0;
        !           346: }
        !           347: 
        !           348: /* Insert a new opcode and argument into an RE under construction.  The
        !           349: ** insertion point is just prior to existing opcode iBefore.
        !           350: */
        !           351: static int re_insert(ReCompiled *p, int iBefore, int op, int arg){
        !           352:   int i;
        !           353:   if( p->nAlloc<=p->nState && re_resize(p, p->nAlloc*2) ) return 0;
        !           354:   for(i=p->nState; i>iBefore; i--){
        !           355:     p->aOp[i] = p->aOp[i-1];
        !           356:     p->aArg[i] = p->aArg[i-1];
        !           357:   }
        !           358:   p->nState++;
        !           359:   p->aOp[iBefore] = (char)op;
        !           360:   p->aArg[iBefore] = arg;
        !           361:   return iBefore;
        !           362: }
        !           363: 
        !           364: /* Append a new opcode and argument to the end of the RE under construction.
        !           365: */
        !           366: static int re_append(ReCompiled *p, int op, int arg){
        !           367:   return re_insert(p, p->nState, op, arg);
        !           368: }
        !           369: 
        !           370: /* Make a copy of N opcodes starting at iStart onto the end of the RE
        !           371: ** under construction.
        !           372: */
        !           373: static void re_copy(ReCompiled *p, int iStart, int N){
        !           374:   if( p->nState+N>=p->nAlloc && re_resize(p, p->nAlloc*2+N) ) return;
        !           375:   memcpy(&p->aOp[p->nState], &p->aOp[iStart], N*sizeof(p->aOp[0]));
        !           376:   memcpy(&p->aArg[p->nState], &p->aArg[iStart], N*sizeof(p->aArg[0]));
        !           377:   p->nState += N;
        !           378: }
        !           379: 
        !           380: /* Return true if c is a hexadecimal digit character:  [0-9a-fA-F]
        !           381: ** If c is a hex digit, also set *pV = (*pV)*16 + valueof(c).  If
        !           382: ** c is not a hex digit *pV is unchanged.
        !           383: */
        !           384: static int re_hex(int c, int *pV){
        !           385:   if( c>='0' && c<='9' ){
        !           386:     c -= '0';
        !           387:   }else if( c>='a' && c<='f' ){
        !           388:     c -= 'a' - 10;
        !           389:   }else if( c>='A' && c<='F' ){
        !           390:     c -= 'A' - 10;
        !           391:   }else{
        !           392:     return 0;
        !           393:   }
        !           394:   *pV = (*pV)*16 + (c & 0xff);
        !           395:   return 1;
        !           396: }
        !           397: 
        !           398: /* A backslash character has been seen, read the next character and
        !           399: ** return its interpretation.
        !           400: */
        !           401: static unsigned re_esc_char(ReCompiled *p){
        !           402:   static const char zEsc[] = "afnrtv\\()*.+?[$^{|}]";
        !           403:   static const char zTrans[] = "\a\f\n\r\t\v";
        !           404:   int i, v = 0;
        !           405:   char c;
        !           406:   if( p->sIn.i>=p->sIn.mx ) return 0;
        !           407:   c = p->sIn.z[p->sIn.i];
        !           408:   if( c=='u' && p->sIn.i+4<p->sIn.mx ){
        !           409:     const unsigned char *zIn = p->sIn.z + p->sIn.i;
        !           410:     if( re_hex(zIn[1],&v)
        !           411:      && re_hex(zIn[2],&v)
        !           412:      && re_hex(zIn[3],&v)
        !           413:      && re_hex(zIn[4],&v)
        !           414:     ){
        !           415:       p->sIn.i += 5;
        !           416:       return v;
        !           417:     }
        !           418:   }
        !           419:   if( c=='x' && p->sIn.i+2<p->sIn.mx ){
        !           420:     const unsigned char *zIn = p->sIn.z + p->sIn.i;
        !           421:     if( re_hex(zIn[1],&v)
        !           422:      && re_hex(zIn[2],&v)
        !           423:     ){
        !           424:       p->sIn.i += 3;
        !           425:       return v;
        !           426:     }
        !           427:   }
        !           428:   for(i=0; zEsc[i] && zEsc[i]!=c; i++){}
        !           429:   if( zEsc[i] ){
        !           430:     if( i<6 ) c = zTrans[i];
        !           431:     p->sIn.i++;
        !           432:   }else{
        !           433:     p->zErr = "unknown \\ escape";
        !           434:   }
        !           435:   return c;
        !           436: }
        !           437: 
        !           438: /* Forward declaration */
        !           439: static const char *re_subcompile_string(ReCompiled*);
        !           440: 
        !           441: /* Peek at the next byte of input */
        !           442: static unsigned char rePeek(ReCompiled *p){
        !           443:   return p->sIn.i<p->sIn.mx ? p->sIn.z[p->sIn.i] : 0;
        !           444: }
        !           445: 
        !           446: /* Compile RE text into a sequence of opcodes.  Continue up to the
        !           447: ** first unmatched ")" character, then return.  If an error is found,
        !           448: ** return a pointer to the error message string.
        !           449: */
        !           450: static const char *re_subcompile_re(ReCompiled *p){
        !           451:   const char *zErr;
        !           452:   int iStart, iEnd, iGoto;
        !           453:   iStart = p->nState;
        !           454:   zErr = re_subcompile_string(p);
        !           455:   if( zErr ) return zErr;
        !           456:   while( rePeek(p)=='|' ){
        !           457:     iEnd = p->nState;
        !           458:     re_insert(p, iStart, RE_OP_FORK, iEnd + 2 - iStart);
        !           459:     iGoto = re_append(p, RE_OP_GOTO, 0);
        !           460:     p->sIn.i++;
        !           461:     zErr = re_subcompile_string(p);
        !           462:     if( zErr ) return zErr;
        !           463:     p->aArg[iGoto] = p->nState - iGoto;
        !           464:   }
        !           465:   return 0;
        !           466: }
        !           467: 
        !           468: /* Compile an element of regular expression text (anything that can be
        !           469: ** an operand to the "|" operator).  Return NULL on success or a pointer
        !           470: ** to the error message if there is a problem.
        !           471: */
        !           472: static const char *re_subcompile_string(ReCompiled *p){
        !           473:   int iPrev = -1;
        !           474:   int iStart;
        !           475:   unsigned c;
        !           476:   const char *zErr;
        !           477:   while( (c = p->xNextChar(&p->sIn))!=0 ){
        !           478:     iStart = p->nState;
        !           479:     switch( c ){
        !           480:       case '|':
        !           481:       case '$': 
        !           482:       case ')': {
        !           483:         p->sIn.i--;
        !           484:         return 0;
        !           485:       }
        !           486:       case '(': {
        !           487:         zErr = re_subcompile_re(p);
        !           488:         if( zErr ) return zErr;
        !           489:         if( rePeek(p)!=')' ) return "unmatched '('";
        !           490:         p->sIn.i++;
        !           491:         break;
        !           492:       }
        !           493:       case '.': {
        !           494:         if( rePeek(p)=='*' ){
        !           495:           re_append(p, RE_OP_ANYSTAR, 0);
        !           496:           p->sIn.i++;
        !           497:         }else{ 
        !           498:           re_append(p, RE_OP_ANY, 0);
        !           499:         }
        !           500:         break;
        !           501:       }
        !           502:       case '*': {
        !           503:         if( iPrev<0 ) return "'*' without operand";
        !           504:         re_insert(p, iPrev, RE_OP_GOTO, p->nState - iPrev + 1);
        !           505:         re_append(p, RE_OP_FORK, iPrev - p->nState + 1);
        !           506:         break;
        !           507:       }
        !           508:       case '+': {
        !           509:         if( iPrev<0 ) return "'+' without operand";
        !           510:         re_append(p, RE_OP_FORK, iPrev - p->nState);
        !           511:         break;
        !           512:       }
        !           513:       case '?': {
        !           514:         if( iPrev<0 ) return "'?' without operand";
        !           515:         re_insert(p, iPrev, RE_OP_FORK, p->nState - iPrev+1);
        !           516:         break;
        !           517:       }
        !           518:       case '{': {
        !           519:         int m = 0, n = 0;
        !           520:         int sz, j;
        !           521:         if( iPrev<0 ) return "'{m,n}' without operand";
        !           522:         while( (c=rePeek(p))>='0' && c<='9' ){ m = m*10 + c - '0'; p->sIn.i++; }
        !           523:         n = m;
        !           524:         if( c==',' ){
        !           525:           p->sIn.i++;
        !           526:           n = 0;
        !           527:           while( (c=rePeek(p))>='0' && c<='9' ){ n = n*10 + c-'0'; p->sIn.i++; }
        !           528:         }
        !           529:         if( c!='}' ) return "unmatched '{'";
        !           530:         if( n>0 && n<m ) return "n less than m in '{m,n}'";
        !           531:         p->sIn.i++;
        !           532:         sz = p->nState - iPrev;
        !           533:         if( m==0 ){
        !           534:           if( n==0 ) return "both m and n are zero in '{m,n}'";
        !           535:           re_insert(p, iPrev, RE_OP_FORK, sz+1);
        !           536:           n--;
        !           537:         }else{
        !           538:           for(j=1; j<m; j++) re_copy(p, iPrev, sz);
        !           539:         }
        !           540:         for(j=m; j<n; j++){
        !           541:           re_append(p, RE_OP_FORK, sz+1);
        !           542:           re_copy(p, iPrev, sz);
        !           543:         }
        !           544:         if( n==0 && m>0 ){
        !           545:           re_append(p, RE_OP_FORK, -sz);
        !           546:         }
        !           547:         break;
        !           548:       }
        !           549:       case '[': {
        !           550:         int iFirst = p->nState;
        !           551:         if( rePeek(p)=='^' ){
        !           552:           re_append(p, RE_OP_CC_EXC, 0);
        !           553:           p->sIn.i++;
        !           554:         }else{
        !           555:           re_append(p, RE_OP_CC_INC, 0);
        !           556:         }
        !           557:         while( (c = p->xNextChar(&p->sIn))!=0 ){
        !           558:           if( c=='[' && rePeek(p)==':' ){
        !           559:             return "POSIX character classes not supported";
        !           560:           }
        !           561:           if( c=='\\' ) c = re_esc_char(p);
        !           562:           if( rePeek(p)=='-' ){
        !           563:             re_append(p, RE_OP_CC_RANGE, c);
        !           564:             p->sIn.i++;
        !           565:             c = p->xNextChar(&p->sIn);
        !           566:             if( c=='\\' ) c = re_esc_char(p);
        !           567:             re_append(p, RE_OP_CC_RANGE, c);
        !           568:           }else{
        !           569:             re_append(p, RE_OP_CC_VALUE, c);
        !           570:           }
        !           571:           if( rePeek(p)==']' ){ p->sIn.i++; break; }
        !           572:         }
        !           573:         if( c==0 ) return "unclosed '['";
        !           574:         p->aArg[iFirst] = p->nState - iFirst;
        !           575:         break;
        !           576:       }
        !           577:       case '\\': {
        !           578:         int specialOp = 0;
        !           579:         switch( rePeek(p) ){
        !           580:           case 'b': specialOp = RE_OP_BOUNDARY;   break;
        !           581:           case 'd': specialOp = RE_OP_DIGIT;      break;
        !           582:           case 'D': specialOp = RE_OP_NOTDIGIT;   break;
        !           583:           case 's': specialOp = RE_OP_SPACE;      break;
        !           584:           case 'S': specialOp = RE_OP_NOTSPACE;   break;
        !           585:           case 'w': specialOp = RE_OP_WORD;       break;
        !           586:           case 'W': specialOp = RE_OP_NOTWORD;    break;
        !           587:         }
        !           588:         if( specialOp ){
        !           589:           p->sIn.i++;
        !           590:           re_append(p, specialOp, 0);
        !           591:         }else{
        !           592:           c = re_esc_char(p);
        !           593:           re_append(p, RE_OP_MATCH, c);
        !           594:         }
        !           595:         break;
        !           596:       }
        !           597:       default: {
        !           598:         re_append(p, RE_OP_MATCH, c);
        !           599:         break;
        !           600:       }
        !           601:     }
        !           602:     iPrev = iStart;
        !           603:   }
        !           604:   return 0;
        !           605: }
        !           606: 
        !           607: /* Free and reclaim all the memory used by a previously compiled
        !           608: ** regular expression.  Applications should invoke this routine once
        !           609: ** for every call to re_compile() to avoid memory leaks.
        !           610: */
        !           611: void re_free(ReCompiled *pRe){
        !           612:   if( pRe ){
        !           613:     free(pRe->aOp);
        !           614:     free(pRe->aArg);
        !           615:     free(pRe);
        !           616:   }
        !           617: }
        !           618: 
        !           619: /*
        !           620: ** Compile a textual regular expression in zIn[] into a compiled regular
        !           621: ** expression suitable for us by re_match() and return a pointer to the
        !           622: ** compiled regular expression in *ppRe.  Return NULL on success or an
        !           623: ** error message if something goes wrong.
        !           624: */
        !           625: const char *re_compile(ReCompiled **ppRe, const char *zIn, int noCase){
        !           626:   ReCompiled *pRe;
        !           627:   const char *zErr;
        !           628:   int i, j;
        !           629: 
        !           630:   *ppRe = 0;
        !           631:   pRe = (ReCompiled *)malloc( sizeof(*pRe) );
        !           632:   if( pRe==0 ){
        !           633:     return "out of memory";
        !           634:   }
        !           635:   memset(pRe, 0, sizeof(*pRe));
        !           636:   pRe->xNextChar = noCase ? re_next_char_nocase : re_next_char;
        !           637:   if( re_resize(pRe, 30) ){
        !           638:     re_free(pRe);
        !           639:     return "out of memory";
        !           640:   }
        !           641:   if( zIn[0]=='^' ){
        !           642:     zIn++;
        !           643:   }else{
        !           644:     re_append(pRe, RE_OP_ANYSTAR, 0);
        !           645:   }
        !           646:   pRe->sIn.z = (unsigned char*)zIn;
        !           647:   pRe->sIn.i = 0;
        !           648:   pRe->sIn.mx = (int)strlen(zIn);
        !           649:   zErr = re_subcompile_re(pRe);
        !           650:   if( zErr ){
        !           651:     re_free(pRe);
        !           652:     return zErr;
        !           653:   }
        !           654:   if( rePeek(pRe)=='$' && pRe->sIn.i+1>=pRe->sIn.mx ){
        !           655:     re_append(pRe, RE_OP_MATCH, RE_EOF);
        !           656:     re_append(pRe, RE_OP_ACCEPT, 0);
        !           657:     *ppRe = pRe;
        !           658:   }else if( pRe->sIn.i>=pRe->sIn.mx ){
        !           659:     re_append(pRe, RE_OP_ACCEPT, 0);
        !           660:     *ppRe = pRe;
        !           661:   }else{
        !           662:     re_free(pRe);
        !           663:     return "unrecognized character";
        !           664:   }
        !           665: 
        !           666:   /* The following is a performance optimization.  If the regex begins with
        !           667:   ** ".*" (if the input regex lacks an initial "^") and afterwards there are
        !           668:   ** one or more matching characters, enter those matching characters into
        !           669:   ** zInit[].  The re_match() routine can then search ahead in the input 
        !           670:   ** string looking for the initial match without having to run the whole
        !           671:   ** regex engine over the string.  Do not worry able trying to match
        !           672:   ** unicode characters beyond plane 0 - those are very rare and this is
        !           673:   ** just an optimization. */
        !           674:   if( pRe->aOp[0]==RE_OP_ANYSTAR ){
        !           675:     for(j=0, i=1; j<sizeof(pRe->zInit)-2 && pRe->aOp[i]==RE_OP_MATCH; i++){
        !           676:       unsigned x = pRe->aArg[i];
        !           677:       if( x<=127 ){
        !           678:         pRe->zInit[j++] = (unsigned char)x;
        !           679:       }else if( x<=0xfff ){
        !           680:         pRe->zInit[j++] = (unsigned char)(0xc0 | (x>>6));
        !           681:         pRe->zInit[j++] = 0x80 | (x&0x3f);
        !           682:       }else if( x<=0xffff ){
        !           683:         pRe->zInit[j++] = (unsigned char)(0xd0 | (x>>12));
        !           684:         pRe->zInit[j++] = 0x80 | ((x>>6)&0x3f);
        !           685:         pRe->zInit[j++] = 0x80 | (x&0x3f);
        !           686:       }else{
        !           687:         break;
        !           688:       }
        !           689:     }
        !           690:     if( j>0 && pRe->zInit[j-1]==0 ) j--;
        !           691:     pRe->nInit = j;
        !           692:   }
        !           693:   return pRe->zErr;
        !           694: }

E-mail: