Annotation of parser3/src/lib/pcre/study.c, revision 1.1.14.1

1.1       paf         1: /*************************************************
                      2: *      Perl-Compatible Regular Expressions       *
                      3: *************************************************/
                      4: 
                      5: /*
                      6: This is a library of functions to support regular expressions whose syntax
                      7: and semantics are as close as possible to those of the Perl 5 language. See
                      8: the file Tech.Notes for some information on the internals.
                      9: 
                     10: Written by: Philip Hazel <ph10@cam.ac.uk>
                     11: 
                     12:            Copyright (c) 1997-1999 University of Cambridge
                     13: 
                     14: -----------------------------------------------------------------------------
                     15: Permission is granted to anyone to use this software for any purpose on any
                     16: computer system, and to redistribute it freely, subject to the following
                     17: restrictions:
                     18: 
                     19: 1. This software is distributed in the hope that it will be useful,
                     20:    but WITHOUT ANY WARRANTY; without even the implied warranty of
                     21:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
                     22: 
                     23: 2. The origin of this software must not be misrepresented, either by
                     24:    explicit claim or by omission.
                     25: 
                     26: 3. Altered versions must be plainly marked as such, and must not be
                     27:    misrepresented as being the original software.
                     28: 
                     29: 4. If PCRE is embedded in any software that is released under the GNU
                     30:    General Purpose Licence (GPL), then the terms of that licence shall
                     31:    supersede any condition above with which it is incompatible.
                     32: -----------------------------------------------------------------------------
                     33: */
                     34: 
                     35: 
                     36: /* Include the internals header, which itself includes Standard C headers plus
                     37: the external pcre header. */
                     38: 
                     39: #include "internal.h"
                     40: 
                     41: 
                     42: 
                     43: /*************************************************
                     44: *      Set a bit and maybe its alternate case    *
                     45: *************************************************/
                     46: 
                     47: /* Given a character, set its bit in the table, and also the bit for the other
                     48: version of a letter if we are caseless.
                     49: 
                     50: Arguments:
                     51:   start_bits    points to the bit map
                     52:   c             is the character
                     53:   caseless      the caseless flag
                     54:   cd            the block with char table pointers
                     55: 
                     56: Returns:        nothing
                     57: */
                     58: 
                     59: static void
                     60: set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
                     61: {
                     62: start_bits[c/8] |= (1 << (c&7));
                     63: if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
                     64:   start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
                     65: }
                     66: 
                     67: 
                     68: 
                     69: /*************************************************
                     70: *          Create bitmap of starting chars       *
                     71: *************************************************/
                     72: 
                     73: /* This function scans a compiled unanchored expression and attempts to build a
                     74: bitmap of the set of initial characters. If it can't, it returns FALSE. As time
                     75: goes by, we may be able to get more clever at doing this.
                     76: 
                     77: Arguments:
                     78:   code         points to an expression
                     79:   start_bits   points to a 32-byte table, initialized to 0
                     80:   caseless     the current state of the caseless flag
                     81:   cd           the block with char table pointers
                     82: 
                     83: Returns:       TRUE if table built, FALSE otherwise
                     84: */
                     85: 
                     86: static BOOL
                     87: set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
                     88:   compile_data *cd)
                     89: {
                     90: register int c;
                     91: 
                     92: /* This next statement and the later reference to dummy are here in order to
                     93: trick the optimizer of the IBM C compiler for OS/2 into generating correct
                     94: code. Apparently IBM isn't going to fix the problem, and we would rather not
                     95: disable optimization (in this module it actually makes a big difference, and
                     96: the pcre module can use all the optimization it can get). */
                     97: 
                     98: volatile int dummy;
                     99: 
                    100: do
                    101:   {
                    102:   const uschar *tcode = code + 3;
                    103:   BOOL try_next = TRUE;
                    104: 
                    105:   while (try_next)
                    106:     {
                    107:     try_next = FALSE;
                    108: 
                    109:     /* If a branch starts with a bracket or a positive lookahead assertion,
                    110:     recurse to set bits from within them. That's all for this branch. */
                    111: 
                    112:     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
                    113:       {
                    114:       if (!set_start_bits(tcode, start_bits, caseless, cd))
                    115:         return FALSE;
                    116:       }
                    117: 
                    118:     else switch(*tcode)
                    119:       {
                    120:       default:
                    121:       return FALSE;
                    122: 
                    123:       /* Skip over lookbehind and negative lookahead assertions */
                    124: 
                    125:       case OP_ASSERT_NOT:
                    126:       case OP_ASSERTBACK:
                    127:       case OP_ASSERTBACK_NOT:
                    128:       try_next = TRUE;
                    129:       do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
                    130:       tcode += 3;
                    131:       break;
                    132: 
                    133:       /* Skip over an option setting, changing the caseless flag */
                    134: 
                    135:       case OP_OPT:
                    136:       caseless = (tcode[1] & PCRE_CASELESS) != 0;
                    137:       tcode += 2;
                    138:       try_next = TRUE;
                    139:       break;
                    140: 
                    141:       /* BRAZERO does the bracket, but carries on. */
                    142: 
                    143:       case OP_BRAZERO:
                    144:       case OP_BRAMINZERO:
                    145:       if (!set_start_bits(++tcode, start_bits, caseless, cd))
                    146:         return FALSE;
                    147:       dummy = 1;
                    148:       do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
                    149:       tcode += 3;
                    150:       try_next = TRUE;
                    151:       break;
                    152: 
                    153:       /* Single-char * or ? sets the bit and tries the next item */
                    154: 
                    155:       case OP_STAR:
                    156:       case OP_MINSTAR:
                    157:       case OP_QUERY:
                    158:       case OP_MINQUERY:
                    159:       set_bit(start_bits, tcode[1], caseless, cd);
                    160:       tcode += 2;
                    161:       try_next = TRUE;
                    162:       break;
                    163: 
                    164:       /* Single-char upto sets the bit and tries the next */
                    165: 
                    166:       case OP_UPTO:
                    167:       case OP_MINUPTO:
                    168:       set_bit(start_bits, tcode[3], caseless, cd);
                    169:       tcode += 4;
                    170:       try_next = TRUE;
                    171:       break;
                    172: 
                    173:       /* At least one single char sets the bit and stops */
                    174: 
                    175:       case OP_EXACT:       /* Fall through */
                    176:       tcode++;
                    177: 
                    178:       case OP_CHARS:       /* Fall through */
                    179:       tcode++;
                    180: 
                    181:       case OP_PLUS:
                    182:       case OP_MINPLUS:
                    183:       set_bit(start_bits, tcode[1], caseless, cd);
                    184:       break;
                    185: 
                    186:       /* Single character type sets the bits and stops */
                    187: 
                    188:       case OP_NOT_DIGIT:
                    189:       for (c = 0; c < 32; c++)
                    190:         start_bits[c] |= ~cd->cbits[c+cbit_digit];
                    191:       break;
                    192: 
                    193:       case OP_DIGIT:
                    194:       for (c = 0; c < 32; c++)
                    195:         start_bits[c] |= cd->cbits[c+cbit_digit];
                    196:       break;
                    197: 
                    198:       case OP_NOT_WHITESPACE:
                    199:       for (c = 0; c < 32; c++)
                    200:         start_bits[c] |= ~cd->cbits[c+cbit_space];
                    201:       break;
                    202: 
                    203:       case OP_WHITESPACE:
                    204:       for (c = 0; c < 32; c++)
                    205:         start_bits[c] |= cd->cbits[c+cbit_space];
                    206:       break;
                    207: 
                    208:       case OP_NOT_WORDCHAR:
                    209:       for (c = 0; c < 32; c++)
                    210:         start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);
                    211:       break;
                    212: 
                    213:       case OP_WORDCHAR:
                    214:       for (c = 0; c < 32; c++)
                    215:         start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);
                    216:       break;
                    217: 
                    218:       /* One or more character type fudges the pointer and restarts, knowing
                    219:       it will hit a single character type and stop there. */
                    220: 
                    221:       case OP_TYPEPLUS:
                    222:       case OP_TYPEMINPLUS:
                    223:       tcode++;
                    224:       try_next = TRUE;
                    225:       break;
                    226: 
                    227:       case OP_TYPEEXACT:
                    228:       tcode += 3;
                    229:       try_next = TRUE;
                    230:       break;
                    231: 
                    232:       /* Zero or more repeats of character types set the bits and then
                    233:       try again. */
                    234: 
                    235:       case OP_TYPEUPTO:
                    236:       case OP_TYPEMINUPTO:
                    237:       tcode += 2;               /* Fall through */
                    238: 
                    239:       case OP_TYPESTAR:
                    240:       case OP_TYPEMINSTAR:
                    241:       case OP_TYPEQUERY:
                    242:       case OP_TYPEMINQUERY:
                    243:       switch(tcode[1])
                    244:         {
                    245:         case OP_NOT_DIGIT:
                    246:         for (c = 0; c < 32; c++)
                    247:           start_bits[c] |= ~cd->cbits[c+cbit_digit];
                    248:         break;
                    249: 
                    250:         case OP_DIGIT:
                    251:         for (c = 0; c < 32; c++)
                    252:           start_bits[c] |= cd->cbits[c+cbit_digit];
                    253:         break;
                    254: 
                    255:         case OP_NOT_WHITESPACE:
                    256:         for (c = 0; c < 32; c++)
                    257:           start_bits[c] |= ~cd->cbits[c+cbit_space];
                    258:         break;
                    259: 
                    260:         case OP_WHITESPACE:
                    261:         for (c = 0; c < 32; c++)
                    262:           start_bits[c] |= cd->cbits[c+cbit_space];
                    263:         break;
                    264: 
                    265:         case OP_NOT_WORDCHAR:
                    266:         for (c = 0; c < 32; c++)
                    267:           start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);
                    268:         break;
                    269: 
                    270:         case OP_WORDCHAR:
                    271:         for (c = 0; c < 32; c++)
                    272:           start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);
                    273:         break;
                    274:         }
                    275: 
                    276:       tcode += 2;
                    277:       try_next = TRUE;
                    278:       break;
                    279: 
                    280:       /* Character class: set the bits and either carry on or not,
                    281:       according to the repeat count. */
                    282: 
                    283:       case OP_CLASS:
                    284:         {
                    285:         tcode++;
                    286:         for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
                    287:         tcode += 32;
                    288:         switch (*tcode)
                    289:           {
                    290:           case OP_CRSTAR:
                    291:           case OP_CRMINSTAR:
                    292:           case OP_CRQUERY:
                    293:           case OP_CRMINQUERY:
                    294:           tcode++;
                    295:           try_next = TRUE;
                    296:           break;
                    297: 
                    298:           case OP_CRRANGE:
                    299:           case OP_CRMINRANGE:
                    300:           if (((tcode[1] << 8) + tcode[2]) == 0)
                    301:             {
                    302:             tcode += 5;
                    303:             try_next = TRUE;
                    304:             }
                    305:           break;
                    306:           }
                    307:         }
                    308:       break; /* End of class handling */
                    309: 
                    310:       }      /* End of switch */
                    311:     }        /* End of try_next loop */
                    312: 
                    313:   code += (code[1] << 8) + code[2];   /* Advance to next branch */
                    314:   }
                    315: while (*code == OP_ALT);
                    316: return TRUE;
                    317: }
                    318: 
                    319: 
                    320: 
                    321: /*************************************************
                    322: *          Study a compiled expression           *
                    323: *************************************************/
                    324: 
                    325: /* This function is handed a compiled expression that it must study to produce
                    326: information that will speed up the matching. It returns a pcre_extra block
                    327: which then gets handed back to pcre_exec().
                    328: 
                    329: Arguments:
                    330:   re        points to the compiled expression
                    331:   options   contains option bits
                    332:   errorptr  points to where to place error messages;
                    333:             set NULL unless error
                    334: 
                    335: Returns:    pointer to a pcre_extra block,
                    336:             NULL on error or if no optimization possible
                    337: */
                    338: 
                    339: pcre_extra *
1.1.14.1! paf       340: pcre_study(const pcre *external_re, int options, const char* *errorptr)
1.1       paf       341: {
                    342: uschar start_bits[32];
                    343: real_pcre_extra *extra;
                    344: const real_pcre *re = (const real_pcre *)external_re;
                    345: compile_data compile_block;
                    346: 
                    347: *errorptr = NULL;
                    348: 
                    349: if (re == NULL || re->magic_number != MAGIC_NUMBER)
                    350:   {
                    351:   *errorptr = "argument is not a compiled regular expression";
                    352:   return NULL;
                    353:   }
                    354: 
                    355: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
                    356:   {
                    357:   *errorptr = "unknown or incorrect option bit(s) set";
                    358:   return NULL;
                    359:   }
                    360: 
                    361: /* For an anchored pattern, or an unchored pattern that has a first char, or a
                    362: multiline pattern that matches only at "line starts", no further processing at
                    363: present. */
                    364: 
                    365: if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
                    366:   return NULL;
                    367: 
                    368: /* Set the character tables in the block which is passed around */
                    369: 
                    370: compile_block.lcc = re->tables + lcc_offset;
                    371: compile_block.fcc = re->tables + fcc_offset;
                    372: compile_block.cbits = re->tables + cbits_offset;
                    373: compile_block.ctypes = re->tables + ctypes_offset;
                    374: 
                    375: /* See if we can find a fixed set of initial characters for the pattern. */
                    376: 
                    377: memset(start_bits, 0, 32 * sizeof(uschar));
                    378: if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,
                    379:   &compile_block)) return NULL;
                    380: 
                    381: /* Get an "extra" block and put the information therein. */
                    382: 
                    383: extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
                    384: 
                    385: if (extra == NULL)
                    386:   {
                    387:   *errorptr = "failed to get memory";
                    388:   return NULL;
                    389:   }
                    390: 
                    391: extra->options = PCRE_STUDY_MAPPED;
                    392: memcpy(extra->start_bits, start_bits, sizeof(start_bits));
                    393: 
                    394: return (pcre_extra *)extra;
                    395: }
                    396: 
                    397: /* End of study.c */

E-mail: