Annotation of win32/pcre/pcre_study.c, revision 1.2

1.1       misha       1: /*************************************************
                      2: *      Perl-Compatible Regular Expressions       *
                      3: *************************************************/
                      4: 
                      5: /* PCRE is a library of functions to support regular expressions whose syntax
                      6: and semantics are as close as possible to those of the Perl 5 language.
                      7: 
                      8:                        Written by Philip Hazel
                      9:            Copyright (c) 1997-2008 University of Cambridge
                     10: 
                     11: -----------------------------------------------------------------------------
                     12: Redistribution and use in source and binary forms, with or without
                     13: modification, are permitted provided that the following conditions are met:
                     14: 
                     15:     * Redistributions of source code must retain the above copyright notice,
                     16:       this list of conditions and the following disclaimer.
                     17: 
                     18:     * Redistributions in binary form must reproduce the above copyright
                     19:       notice, this list of conditions and the following disclaimer in the
                     20:       documentation and/or other materials provided with the distribution.
                     21: 
                     22:     * Neither the name of the University of Cambridge nor the names of its
                     23:       contributors may be used to endorse or promote products derived from
                     24:       this software without specific prior written permission.
                     25: 
                     26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
                     27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
                     30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     36: POSSIBILITY OF SUCH DAMAGE.
                     37: -----------------------------------------------------------------------------
                     38: */
                     39: 
                     40: 
                     41: /* This module contains the external function pcre_study(), along with local
                     42: supporting functions. */
                     43: 
                     44: 
                     45: #ifdef HAVE_CONFIG_H
                     46: #include "config.h"
                     47: #endif
                     48: 
                     49: #include "pcre_internal.h"
                     50: 
                     51: 
                     52: /* Returns from set_start_bits() */
                     53: 
                     54: enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
                     55: 
                     56: 
                     57: /*************************************************
                     58: *      Set a bit and maybe its alternate case    *
                     59: *************************************************/
                     60: 
                     61: /* Given a character, set its bit in the table, and also the bit for the other
                     62: version of a letter if we are caseless.
                     63: 
                     64: Arguments:
                     65:   start_bits    points to the bit map
                     66:   c             is the character
                     67:   caseless      the caseless flag
                     68:   cd            the block with char table pointers
                     69: 
                     70: Returns:        nothing
                     71: */
                     72: 
                     73: static void
                     74: set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
                     75: {
                     76: start_bits[c/8] |= (1 << (c&7));
                     77: if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
                     78:   start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
                     79: }
                     80: 
                     81: 
                     82: 
                     83: /*************************************************
                     84: *          Create bitmap of starting bytes       *
                     85: *************************************************/
                     86: 
                     87: /* This function scans a compiled unanchored expression recursively and
                     88: attempts to build a bitmap of the set of possible starting bytes. As time goes
                     89: by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
                     90: useful for parenthesized groups in patterns such as (a*)b where the group
                     91: provides some optional starting bytes but scanning must continue at the outer
                     92: level to find at least one mandatory byte. At the outermost level, this
                     93: function fails unless the result is SSB_DONE.
                     94: 
                     95: Arguments:
                     96:   code         points to an expression
                     97:   start_bits   points to a 32-byte table, initialized to 0
                     98:   caseless     the current state of the caseless flag
                     99:   utf8         TRUE if in UTF-8 mode
                    100:   cd           the block with char table pointers
                    101: 
                    102: Returns:       SSB_FAIL     => Failed to find any starting bytes
                    103:                SSB_DONE     => Found mandatory starting bytes
                    104:                SSB_CONTINUE => Found optional starting bytes
                    105: */
                    106: 
                    107: static int
                    108: set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
                    109:   BOOL utf8, compile_data *cd)
                    110: {
                    111: register int c;
                    112: int yield = SSB_DONE;
                    113: 
                    114: #if 0
                    115: /* ========================================================================= */
                    116: /* The following comment and code was inserted in January 1999. In May 2006,
                    117: when it was observed to cause compiler warnings about unused values, I took it
                    118: out again. If anybody is still using OS/2, they will have to put it back
                    119: manually. */
                    120: 
                    121: /* This next statement and the later reference to dummy are here in order to
                    122: trick the optimizer of the IBM C compiler for OS/2 into generating correct
                    123: code. Apparently IBM isn't going to fix the problem, and we would rather not
                    124: disable optimization (in this module it actually makes a big difference, and
                    125: the pcre module can use all the optimization it can get). */
                    126: 
                    127: volatile int dummy;
                    128: /* ========================================================================= */
                    129: #endif
                    130: 
                    131: do
                    132:   {
                    133:   const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
                    134:   BOOL try_next = TRUE;
                    135: 
                    136:   while (try_next)    /* Loop for items in this branch */
                    137:     {
                    138:     int rc;
                    139:     switch(*tcode)
                    140:       {
                    141:       /* Fail if we reach something we don't understand */
                    142: 
                    143:       default:
                    144:       return SSB_FAIL;
                    145: 
                    146:       /* If we hit a bracket or a positive lookahead assertion, recurse to set
                    147:       bits from within the subpattern. If it can't find anything, we have to
                    148:       give up. If it finds some mandatory character(s), we are done for this
                    149:       branch. Otherwise, carry on scanning after the subpattern. */
                    150: 
                    151:       case OP_BRA:
                    152:       case OP_SBRA:
                    153:       case OP_CBRA:
                    154:       case OP_SCBRA:
                    155:       case OP_ONCE:
                    156:       case OP_ASSERT:
                    157:       rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
                    158:       if (rc == SSB_FAIL) return SSB_FAIL;
                    159:       if (rc == SSB_DONE) try_next = FALSE; else
                    160:         {
                    161:         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
                    162:         tcode += 1 + LINK_SIZE;
                    163:         }
                    164:       break;
                    165: 
                    166:       /* If we hit ALT or KET, it means we haven't found anything mandatory in
                    167:       this branch, though we might have found something optional. For ALT, we
                    168:       continue with the next alternative, but we have to arrange that the final
                    169:       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
                    170:       return SSB_CONTINUE: if this is the top level, that indicates failure,
                    171:       but after a nested subpattern, it causes scanning to continue. */
                    172: 
                    173:       case OP_ALT:
                    174:       yield = SSB_CONTINUE;
                    175:       try_next = FALSE;
                    176:       break;
                    177: 
                    178:       case OP_KET:
                    179:       case OP_KETRMAX:
                    180:       case OP_KETRMIN:
                    181:       return SSB_CONTINUE;
                    182: 
                    183:       /* Skip over callout */
                    184: 
                    185:       case OP_CALLOUT:
                    186:       tcode += 2 + 2*LINK_SIZE;
                    187:       break;
                    188: 
                    189:       /* Skip over lookbehind and negative lookahead assertions */
                    190: 
                    191:       case OP_ASSERT_NOT:
                    192:       case OP_ASSERTBACK:
                    193:       case OP_ASSERTBACK_NOT:
                    194:       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
                    195:       tcode += 1 + LINK_SIZE;
                    196:       break;
                    197: 
                    198:       /* Skip over an option setting, changing the caseless flag */
                    199: 
                    200:       case OP_OPT:
                    201:       caseless = (tcode[1] & PCRE_CASELESS) != 0;
                    202:       tcode += 2;
                    203:       break;
                    204: 
                    205:       /* BRAZERO does the bracket, but carries on. */
                    206: 
                    207:       case OP_BRAZERO:
                    208:       case OP_BRAMINZERO:
                    209:       if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
                    210:         return SSB_FAIL;
                    211: /* =========================================================================
                    212:       See the comment at the head of this function concerning the next line,
                    213:       which was an old fudge for the benefit of OS/2.
                    214:       dummy = 1;
                    215:   ========================================================================= */
                    216:       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
                    217:       tcode += 1 + LINK_SIZE;
                    218:       break;
                    219: 
                    220:       /* SKIPZERO skips the bracket. */
                    221: 
                    222:       case OP_SKIPZERO:
1.2     ! misha     223:       tcode++;
1.1       misha     224:       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
                    225:       tcode += 1 + LINK_SIZE;
                    226:       break;
                    227: 
                    228:       /* Single-char * or ? sets the bit and tries the next item */
                    229: 
                    230:       case OP_STAR:
                    231:       case OP_MINSTAR:
                    232:       case OP_POSSTAR:
                    233:       case OP_QUERY:
                    234:       case OP_MINQUERY:
                    235:       case OP_POSQUERY:
                    236:       set_bit(start_bits, tcode[1], caseless, cd);
                    237:       tcode += 2;
                    238: #ifdef SUPPORT_UTF8
                    239:       if (utf8 && tcode[-1] >= 0xc0)
                    240:         tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
                    241: #endif
                    242:       break;
                    243: 
                    244:       /* Single-char upto sets the bit and tries the next */
                    245: 
                    246:       case OP_UPTO:
                    247:       case OP_MINUPTO:
                    248:       case OP_POSUPTO:
                    249:       set_bit(start_bits, tcode[3], caseless, cd);
                    250:       tcode += 4;
                    251: #ifdef SUPPORT_UTF8
                    252:       if (utf8 && tcode[-1] >= 0xc0)
                    253:         tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
                    254: #endif
                    255:       break;
                    256: 
                    257:       /* At least one single char sets the bit and stops */
                    258: 
                    259:       case OP_EXACT:       /* Fall through */
                    260:       tcode += 2;
                    261: 
                    262:       case OP_CHAR:
                    263:       case OP_CHARNC:
                    264:       case OP_PLUS:
                    265:       case OP_MINPLUS:
                    266:       case OP_POSPLUS:
                    267:       set_bit(start_bits, tcode[1], caseless, cd);
                    268:       try_next = FALSE;
                    269:       break;
                    270: 
                    271:       /* Single character type sets the bits and stops */
                    272: 
                    273:       case OP_NOT_DIGIT:
                    274:       for (c = 0; c < 32; c++)
                    275:         start_bits[c] |= ~cd->cbits[c+cbit_digit];
                    276:       try_next = FALSE;
                    277:       break;
                    278: 
                    279:       case OP_DIGIT:
                    280:       for (c = 0; c < 32; c++)
                    281:         start_bits[c] |= cd->cbits[c+cbit_digit];
                    282:       try_next = FALSE;
                    283:       break;
                    284: 
                    285:       /* The cbit_space table has vertical tab as whitespace; we have to
                    286:       discard it. */
                    287: 
                    288:       case OP_NOT_WHITESPACE:
                    289:       for (c = 0; c < 32; c++)
                    290:         {
                    291:         int d = cd->cbits[c+cbit_space];
                    292:         if (c == 1) d &= ~0x08;
                    293:         start_bits[c] |= ~d;
                    294:         }
                    295:       try_next = FALSE;
                    296:       break;
                    297: 
                    298:       /* The cbit_space table has vertical tab as whitespace; we have to
                    299:       discard it. */
                    300: 
                    301:       case OP_WHITESPACE:
                    302:       for (c = 0; c < 32; c++)
                    303:         {
                    304:         int d = cd->cbits[c+cbit_space];
                    305:         if (c == 1) d &= ~0x08;
                    306:         start_bits[c] |= d;
                    307:         }
                    308:       try_next = FALSE;
                    309:       break;
                    310: 
                    311:       case OP_NOT_WORDCHAR:
                    312:       for (c = 0; c < 32; c++)
                    313:         start_bits[c] |= ~cd->cbits[c+cbit_word];
                    314:       try_next = FALSE;
                    315:       break;
                    316: 
                    317:       case OP_WORDCHAR:
                    318:       for (c = 0; c < 32; c++)
                    319:         start_bits[c] |= cd->cbits[c+cbit_word];
                    320:       try_next = FALSE;
                    321:       break;
                    322: 
                    323:       /* One or more character type fudges the pointer and restarts, knowing
                    324:       it will hit a single character type and stop there. */
                    325: 
                    326:       case OP_TYPEPLUS:
                    327:       case OP_TYPEMINPLUS:
                    328:       tcode++;
                    329:       break;
                    330: 
                    331:       case OP_TYPEEXACT:
                    332:       tcode += 3;
                    333:       break;
                    334: 
                    335:       /* Zero or more repeats of character types set the bits and then
                    336:       try again. */
                    337: 
                    338:       case OP_TYPEUPTO:
                    339:       case OP_TYPEMINUPTO:
                    340:       case OP_TYPEPOSUPTO:
                    341:       tcode += 2;               /* Fall through */
                    342: 
                    343:       case OP_TYPESTAR:
                    344:       case OP_TYPEMINSTAR:
                    345:       case OP_TYPEPOSSTAR:
                    346:       case OP_TYPEQUERY:
                    347:       case OP_TYPEMINQUERY:
                    348:       case OP_TYPEPOSQUERY:
                    349:       switch(tcode[1])
                    350:         {
                    351:         case OP_ANY:
                    352:         case OP_ALLANY:
                    353:         return SSB_FAIL;
                    354: 
                    355:         case OP_NOT_DIGIT:
                    356:         for (c = 0; c < 32; c++)
                    357:           start_bits[c] |= ~cd->cbits[c+cbit_digit];
                    358:         break;
                    359: 
                    360:         case OP_DIGIT:
                    361:         for (c = 0; c < 32; c++)
                    362:           start_bits[c] |= cd->cbits[c+cbit_digit];
                    363:         break;
                    364: 
                    365:         /* The cbit_space table has vertical tab as whitespace; we have to
                    366:         discard it. */
                    367: 
                    368:         case OP_NOT_WHITESPACE:
                    369:         for (c = 0; c < 32; c++)
                    370:           {
                    371:           int d = cd->cbits[c+cbit_space];
                    372:           if (c == 1) d &= ~0x08;
                    373:           start_bits[c] |= ~d;
                    374:           }
                    375:         break;
                    376: 
                    377:         /* The cbit_space table has vertical tab as whitespace; we have to
                    378:         discard it. */
                    379: 
                    380:         case OP_WHITESPACE:
                    381:         for (c = 0; c < 32; c++)
                    382:           {
                    383:           int d = cd->cbits[c+cbit_space];
                    384:           if (c == 1) d &= ~0x08;
                    385:           start_bits[c] |= d;
                    386:           }
                    387:         break;
                    388: 
                    389:         case OP_NOT_WORDCHAR:
                    390:         for (c = 0; c < 32; c++)
                    391:           start_bits[c] |= ~cd->cbits[c+cbit_word];
                    392:         break;
                    393: 
                    394:         case OP_WORDCHAR:
                    395:         for (c = 0; c < 32; c++)
                    396:           start_bits[c] |= cd->cbits[c+cbit_word];
                    397:         break;
                    398:         }
                    399: 
                    400:       tcode += 2;
                    401:       break;
                    402: 
                    403:       /* Character class where all the information is in a bit map: set the
                    404:       bits and either carry on or not, according to the repeat count. If it was
                    405:       a negative class, and we are operating with UTF-8 characters, any byte
                    406:       with a value >= 0xc4 is a potentially valid starter because it starts a
                    407:       character with a value > 255. */
                    408: 
                    409:       case OP_NCLASS:
                    410: #ifdef SUPPORT_UTF8
                    411:       if (utf8)
                    412:         {
                    413:         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
                    414:         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
                    415:         }
                    416: #endif
                    417:       /* Fall through */
                    418: 
                    419:       case OP_CLASS:
                    420:         {
                    421:         tcode++;
                    422: 
                    423:         /* In UTF-8 mode, the bits in a bit map correspond to character
                    424:         values, not to byte values. However, the bit map we are constructing is
                    425:         for byte values. So we have to do a conversion for characters whose
                    426:         value is > 127. In fact, there are only two possible starting bytes for
                    427:         characters in the range 128 - 255. */
                    428: 
                    429: #ifdef SUPPORT_UTF8
                    430:         if (utf8)
                    431:           {
                    432:           for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
                    433:           for (c = 128; c < 256; c++)
                    434:             {
                    435:             if ((tcode[c/8] && (1 << (c&7))) != 0)
                    436:               {
                    437:               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
                    438:               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
                    439:               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
                    440:               }
                    441:             }
                    442:           }
                    443: 
                    444:         /* In non-UTF-8 mode, the two bit maps are completely compatible. */
                    445: 
                    446:         else
                    447: #endif
                    448:           {
                    449:           for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
                    450:           }
                    451: 
                    452:         /* Advance past the bit map, and act on what follows */
                    453: 
                    454:         tcode += 32;
                    455:         switch (*tcode)
                    456:           {
                    457:           case OP_CRSTAR:
                    458:           case OP_CRMINSTAR:
                    459:           case OP_CRQUERY:
                    460:           case OP_CRMINQUERY:
                    461:           tcode++;
                    462:           break;
                    463: 
                    464:           case OP_CRRANGE:
                    465:           case OP_CRMINRANGE:
                    466:           if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
                    467:             else try_next = FALSE;
                    468:           break;
                    469: 
                    470:           default:
                    471:           try_next = FALSE;
                    472:           break;
                    473:           }
                    474:         }
                    475:       break; /* End of bitmap class handling */
                    476: 
                    477:       }      /* End of switch */
                    478:     }        /* End of try_next loop */
                    479: 
                    480:   code += GET(code, 1);   /* Advance to next branch */
                    481:   }
                    482: while (*code == OP_ALT);
                    483: return yield;
                    484: }
                    485: 
                    486: 
                    487: 
                    488: /*************************************************
                    489: *          Study a compiled expression           *
                    490: *************************************************/
                    491: 
                    492: /* This function is handed a compiled expression that it must study to produce
                    493: information that will speed up the matching. It returns a pcre_extra block
                    494: which then gets handed back to pcre_exec().
                    495: 
                    496: Arguments:
                    497:   re        points to the compiled expression
                    498:   options   contains option bits
                    499:   errorptr  points to where to place error messages;
                    500:             set NULL unless error
                    501: 
                    502: Returns:    pointer to a pcre_extra block, with study_data filled in and the
                    503:               appropriate flag set;
                    504:             NULL on error or if no optimization possible
                    505: */
                    506: 
1.2     ! misha     507: PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1.1       misha     508: pcre_study(const pcre *external_re, int options, const char **errorptr)
                    509: {
                    510: uschar start_bits[32];
                    511: pcre_extra *extra;
                    512: pcre_study_data *study;
                    513: const uschar *tables;
                    514: uschar *code;
                    515: compile_data compile_block;
                    516: const real_pcre *re = (const real_pcre *)external_re;
                    517: 
                    518: *errorptr = NULL;
                    519: 
                    520: if (re == NULL || re->magic_number != MAGIC_NUMBER)
                    521:   {
                    522:   *errorptr = "argument is not a compiled regular expression";
                    523:   return NULL;
                    524:   }
                    525: 
                    526: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
                    527:   {
                    528:   *errorptr = "unknown or incorrect option bit(s) set";
                    529:   return NULL;
                    530:   }
                    531: 
                    532: code = (uschar *)re + re->name_table_offset +
                    533:   (re->name_count * re->name_entry_size);
                    534: 
                    535: /* For an anchored pattern, or an unanchored pattern that has a first char, or
                    536: a multiline pattern that matches only at "line starts", no further processing
                    537: at present. */
                    538: 
                    539: if ((re->options & PCRE_ANCHORED) != 0 ||
                    540:     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
                    541:   return NULL;
                    542: 
                    543: /* Set the character tables in the block that is passed around */
                    544: 
                    545: tables = re->tables;
                    546: if (tables == NULL)
                    547:   (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
                    548:   (void *)(&tables));
                    549: 
                    550: compile_block.lcc = tables + lcc_offset;
                    551: compile_block.fcc = tables + fcc_offset;
                    552: compile_block.cbits = tables + cbits_offset;
                    553: compile_block.ctypes = tables + ctypes_offset;
                    554: 
                    555: /* See if we can find a fixed set of initial characters for the pattern. */
                    556: 
                    557: memset(start_bits, 0, 32 * sizeof(uschar));
                    558: if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
                    559:   (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
                    560: 
                    561: /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
                    562: the latter, which is pointed to by the former, which may also get additional
                    563: data set later by the calling program. At the moment, the size of
                    564: pcre_study_data is fixed. We nevertheless save it in a field for returning via
                    565: the pcre_fullinfo() function so that if it becomes variable in the future, we
                    566: don't have to change that code. */
                    567: 
                    568: extra = (pcre_extra *)(pcre_malloc)
                    569:   (sizeof(pcre_extra) + sizeof(pcre_study_data));
                    570: 
                    571: if (extra == NULL)
                    572:   {
                    573:   *errorptr = "failed to get memory";
                    574:   return NULL;
                    575:   }
                    576: 
                    577: study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
                    578: extra->flags = PCRE_EXTRA_STUDY_DATA;
                    579: extra->study_data = study;
                    580: 
                    581: study->size = sizeof(pcre_study_data);
                    582: study->options = PCRE_STUDY_MAPPED;
                    583: memcpy(study->start_bits, start_bits, sizeof(start_bits));
                    584: 
                    585: return extra;
                    586: }
                    587: 
                    588: /* End of pcre_study.c */

E-mail: