Annotation of win32/pcre/pcre_study.c, revision 1.4

1.1       misha       1: /*************************************************
                      2: *      Perl-Compatible Regular Expressions       *
                      3: *************************************************/
                      4: 
                      5: /* PCRE is a library of functions to support regular expressions whose syntax
                      6: and semantics are as close as possible to those of the Perl 5 language.
                      7: 
                      8:                        Written by Philip Hazel
1.3       misha       9:            Copyright (c) 1997-2010 University of Cambridge
1.1       misha      10: 
                     11: -----------------------------------------------------------------------------
                     12: Redistribution and use in source and binary forms, with or without
                     13: modification, are permitted provided that the following conditions are met:
                     14: 
                     15:     * Redistributions of source code must retain the above copyright notice,
                     16:       this list of conditions and the following disclaimer.
                     17: 
                     18:     * Redistributions in binary form must reproduce the above copyright
                     19:       notice, this list of conditions and the following disclaimer in the
                     20:       documentation and/or other materials provided with the distribution.
                     21: 
                     22:     * Neither the name of the University of Cambridge nor the names of its
                     23:       contributors may be used to endorse or promote products derived from
                     24:       this software without specific prior written permission.
                     25: 
                     26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
                     27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
                     30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     36: POSSIBILITY OF SUCH DAMAGE.
                     37: -----------------------------------------------------------------------------
                     38: */
                     39: 
                     40: 
                     41: /* This module contains the external function pcre_study(), along with local
                     42: supporting functions. */
                     43: 
                     44: 
                     45: #ifdef HAVE_CONFIG_H
                     46: #include "config.h"
                     47: #endif
                     48: 
                     49: #include "pcre_internal.h"
                     50: 
1.3       misha      51: #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
1.1       misha      52: 
                     53: /* Returns from set_start_bits() */
                     54: 
                     55: enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
                     56: 
                     57: 
1.3       misha      58: 
                     59: /*************************************************
                     60: *   Find the minimum subject length for a group  *
                     61: *************************************************/
                     62: 
                     63: /* Scan a parenthesized group and compute the minimum length of subject that
                     64: is needed to match it. This is a lower bound; it does not mean there is a
                     65: string of that length that matches. In UTF8 mode, the result is in characters
                     66: rather than bytes.
                     67: 
                     68: Arguments:
                     69:   code       pointer to start of group (the bracket)
                     70:   startcode  pointer to start of the whole pattern
                     71:   options    the compiling options
                     72: 
                     73: Returns:   the minimum length
                     74:            -1 if \C was encountered
                     75:            -2 internal error (missing capturing bracket)
                     76: */
                     77: 
                     78: static int
                     79: find_minlength(const uschar *code, const uschar *startcode, int options)
                     80: {
                     81: int length = -1;
                     82: BOOL utf8 = (options & PCRE_UTF8) != 0;
                     83: BOOL had_recurse = FALSE;
                     84: register int branchlength = 0;
                     85: register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
                     86: 
                     87: if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
                     88: 
                     89: /* Scan along the opcodes for this branch. If we get to the end of the
                     90: branch, check the length against that of the other branches. */
                     91: 
                     92: for (;;)
                     93:   {
                     94:   int d, min;
                     95:   uschar *cs, *ce;
                     96:   register int op = *cc;
                     97: 
                     98:   switch (op)
                     99:     {
                    100:     case OP_COND:
                    101:     case OP_SCOND:
                    102: 
                    103:     /* If there is only one branch in a condition, the implied branch has zero
                    104:     length, so we don't add anything. This covers the DEFINE "condition"
                    105:     automatically. */
                    106: 
                    107:     cs = cc + GET(cc, 1);
                    108:     if (*cs != OP_ALT)
                    109:       {
                    110:       cc = cs + 1 + LINK_SIZE;
                    111:       break;
                    112:       }
                    113: 
                    114:     /* Otherwise we can fall through and treat it the same as any other
                    115:     subpattern. */
                    116: 
                    117:     case OP_CBRA:
                    118:     case OP_SCBRA:
                    119:     case OP_BRA:
                    120:     case OP_SBRA:
                    121:     case OP_ONCE:
                    122:     d = find_minlength(cc, startcode, options);
                    123:     if (d < 0) return d;
                    124:     branchlength += d;
                    125:     do cc += GET(cc, 1); while (*cc == OP_ALT);
                    126:     cc += 1 + LINK_SIZE;
                    127:     break;
                    128: 
                    129:     /* Reached end of a branch; if it's a ket it is the end of a nested
                    130:     call. If it's ALT it is an alternation in a nested call. If it is
                    131:     END it's the end of the outer call. All can be handled by the same code. */
                    132: 
                    133:     case OP_ALT:
                    134:     case OP_KET:
                    135:     case OP_KETRMAX:
                    136:     case OP_KETRMIN:
                    137:     case OP_END:
                    138:     if (length < 0 || (!had_recurse && branchlength < length))
                    139:       length = branchlength;
                    140:     if (*cc != OP_ALT) return length;
                    141:     cc += 1 + LINK_SIZE;
                    142:     branchlength = 0;
                    143:     had_recurse = FALSE;
                    144:     break;
                    145: 
                    146:     /* Skip over assertive subpatterns */
                    147: 
                    148:     case OP_ASSERT:
                    149:     case OP_ASSERT_NOT:
                    150:     case OP_ASSERTBACK:
                    151:     case OP_ASSERTBACK_NOT:
                    152:     do cc += GET(cc, 1); while (*cc == OP_ALT);
                    153:     /* Fall through */
                    154: 
                    155:     /* Skip over things that don't match chars */
                    156: 
                    157:     case OP_REVERSE:
                    158:     case OP_CREF:
                    159:     case OP_NCREF:
                    160:     case OP_RREF:
                    161:     case OP_NRREF:
                    162:     case OP_DEF:
                    163:     case OP_OPT:
                    164:     case OP_CALLOUT:
                    165:     case OP_SOD:
                    166:     case OP_SOM:
                    167:     case OP_EOD:
                    168:     case OP_EODN:
                    169:     case OP_CIRC:
                    170:     case OP_DOLL:
                    171:     case OP_NOT_WORD_BOUNDARY:
                    172:     case OP_WORD_BOUNDARY:
                    173:     cc += _pcre_OP_lengths[*cc];
                    174:     break;
                    175: 
                    176:     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
                    177: 
                    178:     case OP_BRAZERO:
                    179:     case OP_BRAMINZERO:
                    180:     case OP_SKIPZERO:
                    181:     cc += _pcre_OP_lengths[*cc];
                    182:     do cc += GET(cc, 1); while (*cc == OP_ALT);
                    183:     cc += 1 + LINK_SIZE;
                    184:     break;
                    185: 
                    186:     /* Handle literal characters and + repetitions */
                    187: 
                    188:     case OP_CHAR:
                    189:     case OP_CHARNC:
                    190:     case OP_NOT:
                    191:     case OP_PLUS:
                    192:     case OP_MINPLUS:
                    193:     case OP_POSPLUS:
                    194:     case OP_NOTPLUS:
                    195:     case OP_NOTMINPLUS:
                    196:     case OP_NOTPOSPLUS:
                    197:     branchlength++;
                    198:     cc += 2;
                    199: #ifdef SUPPORT_UTF8
                    200:     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
                    201: #endif
                    202:     break;
                    203: 
                    204:     case OP_TYPEPLUS:
                    205:     case OP_TYPEMINPLUS:
                    206:     case OP_TYPEPOSPLUS:
                    207:     branchlength++;
                    208:     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
                    209:     break;
                    210: 
                    211:     /* Handle exact repetitions. The count is already in characters, but we
                    212:     need to skip over a multibyte character in UTF8 mode.  */
                    213: 
                    214:     case OP_EXACT:
                    215:     case OP_NOTEXACT:
                    216:     branchlength += GET2(cc,1);
                    217:     cc += 4;
                    218: #ifdef SUPPORT_UTF8
                    219:     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
                    220: #endif
                    221:     break;
                    222: 
                    223:     case OP_TYPEEXACT:
                    224:     branchlength += GET2(cc,1);
                    225:     cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
                    226:     break;
                    227: 
                    228:     /* Handle single-char non-literal matchers */
                    229: 
                    230:     case OP_PROP:
                    231:     case OP_NOTPROP:
                    232:     cc += 2;
                    233:     /* Fall through */
                    234: 
                    235:     case OP_NOT_DIGIT:
                    236:     case OP_DIGIT:
                    237:     case OP_NOT_WHITESPACE:
                    238:     case OP_WHITESPACE:
                    239:     case OP_NOT_WORDCHAR:
                    240:     case OP_WORDCHAR:
                    241:     case OP_ANY:
                    242:     case OP_ALLANY:
                    243:     case OP_EXTUNI:
                    244:     case OP_HSPACE:
                    245:     case OP_NOT_HSPACE:
                    246:     case OP_VSPACE:
                    247:     case OP_NOT_VSPACE:
                    248:     branchlength++;
                    249:     cc++;
                    250:     break;
                    251: 
                    252:     /* "Any newline" might match two characters */
                    253: 
                    254:     case OP_ANYNL:
                    255:     branchlength += 2;
                    256:     cc++;
                    257:     break;
                    258: 
                    259:     /* The single-byte matcher means we can't proceed in UTF-8 mode */
                    260: 
                    261:     case OP_ANYBYTE:
                    262: #ifdef SUPPORT_UTF8
                    263:     if (utf8) return -1;
                    264: #endif
                    265:     branchlength++;
                    266:     cc++;
                    267:     break;
                    268: 
                    269:     /* For repeated character types, we have to test for \p and \P, which have
                    270:     an extra two bytes of parameters. */
                    271: 
                    272:     case OP_TYPESTAR:
                    273:     case OP_TYPEMINSTAR:
                    274:     case OP_TYPEQUERY:
                    275:     case OP_TYPEMINQUERY:
                    276:     case OP_TYPEPOSSTAR:
                    277:     case OP_TYPEPOSQUERY:
                    278:     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
                    279:     cc += _pcre_OP_lengths[op];
                    280:     break;
                    281: 
                    282:     case OP_TYPEUPTO:
                    283:     case OP_TYPEMINUPTO:
                    284:     case OP_TYPEPOSUPTO:
                    285:     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
                    286:     cc += _pcre_OP_lengths[op];
                    287:     break;
                    288: 
                    289:     /* Check a class for variable quantification */
                    290: 
                    291: #ifdef SUPPORT_UTF8
                    292:     case OP_XCLASS:
                    293:     cc += GET(cc, 1) - 33;
                    294:     /* Fall through */
                    295: #endif
                    296: 
                    297:     case OP_CLASS:
                    298:     case OP_NCLASS:
                    299:     cc += 33;
                    300: 
                    301:     switch (*cc)
                    302:       {
                    303:       case OP_CRPLUS:
                    304:       case OP_CRMINPLUS:
                    305:       branchlength++;
                    306:       /* Fall through */
                    307: 
                    308:       case OP_CRSTAR:
                    309:       case OP_CRMINSTAR:
                    310:       case OP_CRQUERY:
                    311:       case OP_CRMINQUERY:
                    312:       cc++;
                    313:       break;
                    314: 
                    315:       case OP_CRRANGE:
                    316:       case OP_CRMINRANGE:
                    317:       branchlength += GET2(cc,1);
                    318:       cc += 5;
                    319:       break;
                    320: 
                    321:       default:
                    322:       branchlength++;
                    323:       break;
                    324:       }
                    325:     break;
                    326: 
                    327:     /* Backreferences and subroutine calls are treated in the same way: we find
                    328:     the minimum length for the subpattern. A recursion, however, causes an
                    329:     a flag to be set that causes the length of this branch to be ignored. The
                    330:     logic is that a recursion can only make sense if there is another
                    331:     alternation that stops the recursing. That will provide the minimum length
                    332:     (when no recursion happens). A backreference within the group that it is
                    333:     referencing behaves in the same way.
                    334: 
                    335:     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
                    336:     matches an empty string (by default it causes a matching failure), so in
                    337:     that case we must set the minimum length to zero. */
                    338: 
                    339:     case OP_REF:
                    340:     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
                    341:       {
                    342:       ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
                    343:       if (cs == NULL) return -2;
                    344:       do ce += GET(ce, 1); while (*ce == OP_ALT);
                    345:       if (cc > cs && cc < ce)
                    346:         {
                    347:         d = 0;
                    348:         had_recurse = TRUE;
                    349:         }
                    350:       else d = find_minlength(cs, startcode, options);
                    351:       }
                    352:     else d = 0;
                    353:     cc += 3;
                    354: 
                    355:     /* Handle repeated back references */
                    356: 
                    357:     switch (*cc)
                    358:       {
                    359:       case OP_CRSTAR:
                    360:       case OP_CRMINSTAR:
                    361:       case OP_CRQUERY:
                    362:       case OP_CRMINQUERY:
                    363:       min = 0;
                    364:       cc++;
                    365:       break;
                    366: 
                    367:       case OP_CRRANGE:
                    368:       case OP_CRMINRANGE:
                    369:       min = GET2(cc, 1);
                    370:       cc += 5;
                    371:       break;
                    372: 
                    373:       default:
                    374:       min = 1;
                    375:       break;
                    376:       }
                    377: 
                    378:     branchlength += min * d;
                    379:     break;
                    380: 
                    381:     case OP_RECURSE:
                    382:     cs = ce = (uschar *)startcode + GET(cc, 1);
                    383:     if (cs == NULL) return -2;
                    384:     do ce += GET(ce, 1); while (*ce == OP_ALT);
                    385:     if (cc > cs && cc < ce)
                    386:       had_recurse = TRUE;
                    387:     else
                    388:       branchlength += find_minlength(cs, startcode, options);
                    389:     cc += 1 + LINK_SIZE;
                    390:     break;
                    391: 
                    392:     /* Anything else does not or need not match a character. We can get the
                    393:     item's length from the table, but for those that can match zero occurrences
                    394:     of a character, we must take special action for UTF-8 characters. */
                    395: 
                    396:     case OP_UPTO:
                    397:     case OP_NOTUPTO:
                    398:     case OP_MINUPTO:
                    399:     case OP_NOTMINUPTO:
                    400:     case OP_POSUPTO:
                    401:     case OP_STAR:
                    402:     case OP_MINSTAR:
                    403:     case OP_NOTMINSTAR:
                    404:     case OP_POSSTAR:
                    405:     case OP_NOTPOSSTAR:
                    406:     case OP_QUERY:
                    407:     case OP_MINQUERY:
                    408:     case OP_NOTMINQUERY:
                    409:     case OP_POSQUERY:
                    410:     case OP_NOTPOSQUERY:
                    411:     cc += _pcre_OP_lengths[op];
                    412: #ifdef SUPPORT_UTF8
                    413:     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
                    414: #endif
                    415:     break;
                    416: 
                    417:     /* Skip these, but we need to add in the name length. */
                    418: 
                    419:     case OP_MARK:
                    420:     case OP_PRUNE_ARG:
                    421:     case OP_SKIP_ARG:
1.4     ! misha     422:     cc += _pcre_OP_lengths[op] + cc[1];
        !           423:     break;
        !           424: 
1.3       misha     425:     case OP_THEN_ARG:
1.4     ! misha     426:     cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
1.3       misha     427:     break;
                    428: 
                    429:     /* For the record, these are the opcodes that are matched by "default":
                    430:     OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
                    431:     OP_THEN. */
                    432: 
                    433:     default:
                    434:     cc += _pcre_OP_lengths[op];
                    435:     break;
                    436:     }
                    437:   }
                    438: /* Control never gets here */
                    439: }
                    440: 
                    441: 
                    442: 
1.1       misha     443: /*************************************************
                    444: *      Set a bit and maybe its alternate case    *
                    445: *************************************************/
                    446: 
1.3       misha     447: /* Given a character, set its first byte's bit in the table, and also the
                    448: corresponding bit for the other version of a letter if we are caseless. In
                    449: UTF-8 mode, for characters greater than 127, we can only do the caseless thing
                    450: when Unicode property support is available.
1.1       misha     451: 
                    452: Arguments:
                    453:   start_bits    points to the bit map
1.3       misha     454:   p             points to the character
1.1       misha     455:   caseless      the caseless flag
                    456:   cd            the block with char table pointers
1.3       misha     457:   utf8          TRUE for UTF-8 mode
                    458: 
                    459: Returns:        pointer after the character
                    460: */
                    461: 
                    462: static const uschar *
                    463: set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
                    464:   compile_data *cd, BOOL utf8)
                    465: {
                    466: unsigned int c = *p;
                    467: 
                    468: SET_BIT(c);
                    469: 
                    470: #ifdef SUPPORT_UTF8
                    471: if (utf8 && c > 127)
                    472:   {
                    473:   GETCHARINC(c, p);
                    474: #ifdef SUPPORT_UCP
                    475:   if (caseless)
                    476:     {
                    477:     uschar buff[8];
                    478:     c = UCD_OTHERCASE(c);
                    479:     (void)_pcre_ord2utf8(c, buff);
                    480:     SET_BIT(buff[0]);
                    481:     }
                    482: #endif
                    483:   return p;
                    484:   }
                    485: #endif
                    486: 
                    487: /* Not UTF-8 mode, or character is less than 127. */
                    488: 
                    489: if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
                    490: return p + 1;
                    491: }
                    492: 
                    493: 
                    494: 
                    495: /*************************************************
                    496: *     Set bits for a positive character type     *
                    497: *************************************************/
                    498: 
                    499: /* This function sets starting bits for a character type. In UTF-8 mode, we can
                    500: only do a direct setting for bytes less than 128, as otherwise there can be
                    501: confusion with bytes in the middle of UTF-8 characters. In a "traditional"
                    502: environment, the tables will only recognize ASCII characters anyway, but in at
                    503: least one Windows environment, some higher bytes bits were set in the tables.
                    504: So we deal with that case by considering the UTF-8 encoding.
                    505: 
                    506: Arguments:
                    507:   start_bits     the starting bitmap
                    508:   cbit type      the type of character wanted
                    509:   table_limit    32 for non-UTF-8; 16 for UTF-8
                    510:   cd             the block with char table pointers
                    511: 
                    512: Returns:         nothing
                    513: */
                    514: 
                    515: static void
                    516: set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
                    517:   compile_data *cd)
                    518: {
                    519: register int c;
                    520: for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
                    521: if (table_limit == 32) return;
                    522: for (c = 128; c < 256; c++)
                    523:   {
                    524:   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
                    525:     {
                    526:     uschar buff[8];
                    527:     (void)_pcre_ord2utf8(c, buff);
                    528:     SET_BIT(buff[0]);
                    529:     }
                    530:   }
                    531: }
                    532: 
                    533: 
                    534: /*************************************************
                    535: *     Set bits for a negative character type     *
                    536: *************************************************/
                    537: 
                    538: /* This function sets starting bits for a negative character type such as \D.
                    539: In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
                    540: otherwise there can be confusion with bytes in the middle of UTF-8 characters.
                    541: Unlike in the positive case, where we can set appropriate starting bits for
                    542: specific high-valued UTF-8 characters, in this case we have to set the bits for
                    543: all high-valued characters. The lowest is 0xc2, but we overkill by starting at
                    544: 0xc0 (192) for simplicity.
1.1       misha     545: 
1.3       misha     546: Arguments:
                    547:   start_bits     the starting bitmap
                    548:   cbit type      the type of character wanted
                    549:   table_limit    32 for non-UTF-8; 16 for UTF-8
                    550:   cd             the block with char table pointers
                    551: 
                    552: Returns:         nothing
1.1       misha     553: */
                    554: 
                    555: static void
1.3       misha     556: set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
                    557:   compile_data *cd)
1.1       misha     558: {
1.3       misha     559: register int c;
                    560: for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
                    561: if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
1.1       misha     562: }
                    563: 
                    564: 
                    565: 
                    566: /*************************************************
                    567: *          Create bitmap of starting bytes       *
                    568: *************************************************/
                    569: 
                    570: /* This function scans a compiled unanchored expression recursively and
                    571: attempts to build a bitmap of the set of possible starting bytes. As time goes
                    572: by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
                    573: useful for parenthesized groups in patterns such as (a*)b where the group
                    574: provides some optional starting bytes but scanning must continue at the outer
                    575: level to find at least one mandatory byte. At the outermost level, this
                    576: function fails unless the result is SSB_DONE.
                    577: 
                    578: Arguments:
                    579:   code         points to an expression
                    580:   start_bits   points to a 32-byte table, initialized to 0
                    581:   caseless     the current state of the caseless flag
                    582:   utf8         TRUE if in UTF-8 mode
                    583:   cd           the block with char table pointers
                    584: 
                    585: Returns:       SSB_FAIL     => Failed to find any starting bytes
                    586:                SSB_DONE     => Found mandatory starting bytes
                    587:                SSB_CONTINUE => Found optional starting bytes
                    588: */
                    589: 
                    590: static int
                    591: set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
                    592:   BOOL utf8, compile_data *cd)
                    593: {
                    594: register int c;
                    595: int yield = SSB_DONE;
1.3       misha     596: int table_limit = utf8? 16:32;
1.1       misha     597: 
                    598: #if 0
                    599: /* ========================================================================= */
                    600: /* The following comment and code was inserted in January 1999. In May 2006,
                    601: when it was observed to cause compiler warnings about unused values, I took it
                    602: out again. If anybody is still using OS/2, they will have to put it back
                    603: manually. */
                    604: 
                    605: /* This next statement and the later reference to dummy are here in order to
                    606: trick the optimizer of the IBM C compiler for OS/2 into generating correct
                    607: code. Apparently IBM isn't going to fix the problem, and we would rather not
                    608: disable optimization (in this module it actually makes a big difference, and
                    609: the pcre module can use all the optimization it can get). */
                    610: 
                    611: volatile int dummy;
                    612: /* ========================================================================= */
                    613: #endif
                    614: 
                    615: do
                    616:   {
                    617:   const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
                    618:   BOOL try_next = TRUE;
                    619: 
                    620:   while (try_next)    /* Loop for items in this branch */
                    621:     {
                    622:     int rc;
                    623:     switch(*tcode)
                    624:       {
                    625:       /* Fail if we reach something we don't understand */
                    626: 
                    627:       default:
                    628:       return SSB_FAIL;
                    629: 
                    630:       /* If we hit a bracket or a positive lookahead assertion, recurse to set
                    631:       bits from within the subpattern. If it can't find anything, we have to
                    632:       give up. If it finds some mandatory character(s), we are done for this
                    633:       branch. Otherwise, carry on scanning after the subpattern. */
                    634: 
                    635:       case OP_BRA:
                    636:       case OP_SBRA:
                    637:       case OP_CBRA:
                    638:       case OP_SCBRA:
                    639:       case OP_ONCE:
                    640:       case OP_ASSERT:
                    641:       rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
                    642:       if (rc == SSB_FAIL) return SSB_FAIL;
                    643:       if (rc == SSB_DONE) try_next = FALSE; else
                    644:         {
                    645:         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
                    646:         tcode += 1 + LINK_SIZE;
                    647:         }
                    648:       break;
                    649: 
                    650:       /* If we hit ALT or KET, it means we haven't found anything mandatory in
                    651:       this branch, though we might have found something optional. For ALT, we
                    652:       continue with the next alternative, but we have to arrange that the final
                    653:       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
                    654:       return SSB_CONTINUE: if this is the top level, that indicates failure,
                    655:       but after a nested subpattern, it causes scanning to continue. */
                    656: 
                    657:       case OP_ALT:
                    658:       yield = SSB_CONTINUE;
                    659:       try_next = FALSE;
                    660:       break;
                    661: 
                    662:       case OP_KET:
                    663:       case OP_KETRMAX:
                    664:       case OP_KETRMIN:
                    665:       return SSB_CONTINUE;
                    666: 
                    667:       /* Skip over callout */
                    668: 
                    669:       case OP_CALLOUT:
                    670:       tcode += 2 + 2*LINK_SIZE;
                    671:       break;
                    672: 
                    673:       /* Skip over lookbehind and negative lookahead assertions */
                    674: 
                    675:       case OP_ASSERT_NOT:
                    676:       case OP_ASSERTBACK:
                    677:       case OP_ASSERTBACK_NOT:
                    678:       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
                    679:       tcode += 1 + LINK_SIZE;
                    680:       break;
                    681: 
                    682:       /* Skip over an option setting, changing the caseless flag */
                    683: 
                    684:       case OP_OPT:
                    685:       caseless = (tcode[1] & PCRE_CASELESS) != 0;
                    686:       tcode += 2;
                    687:       break;
                    688: 
                    689:       /* BRAZERO does the bracket, but carries on. */
                    690: 
                    691:       case OP_BRAZERO:
                    692:       case OP_BRAMINZERO:
                    693:       if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
                    694:         return SSB_FAIL;
                    695: /* =========================================================================
                    696:       See the comment at the head of this function concerning the next line,
                    697:       which was an old fudge for the benefit of OS/2.
                    698:       dummy = 1;
                    699:   ========================================================================= */
                    700:       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
                    701:       tcode += 1 + LINK_SIZE;
                    702:       break;
                    703: 
                    704:       /* SKIPZERO skips the bracket. */
                    705: 
                    706:       case OP_SKIPZERO:
1.2       misha     707:       tcode++;
1.1       misha     708:       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
                    709:       tcode += 1 + LINK_SIZE;
                    710:       break;
                    711: 
                    712:       /* Single-char * or ? sets the bit and tries the next item */
                    713: 
                    714:       case OP_STAR:
                    715:       case OP_MINSTAR:
                    716:       case OP_POSSTAR:
                    717:       case OP_QUERY:
                    718:       case OP_MINQUERY:
                    719:       case OP_POSQUERY:
1.3       misha     720:       tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
1.1       misha     721:       break;
                    722: 
                    723:       /* Single-char upto sets the bit and tries the next */
                    724: 
                    725:       case OP_UPTO:
                    726:       case OP_MINUPTO:
                    727:       case OP_POSUPTO:
1.3       misha     728:       tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
1.1       misha     729:       break;
                    730: 
                    731:       /* At least one single char sets the bit and stops */
                    732: 
                    733:       case OP_EXACT:       /* Fall through */
                    734:       tcode += 2;
                    735: 
                    736:       case OP_CHAR:
                    737:       case OP_CHARNC:
                    738:       case OP_PLUS:
                    739:       case OP_MINPLUS:
                    740:       case OP_POSPLUS:
1.3       misha     741:       (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
                    742:       try_next = FALSE;
                    743:       break;
                    744: 
                    745:       /* Special spacing and line-terminating items. These recognize specific
                    746:       lists of characters. The difference between VSPACE and ANYNL is that the
                    747:       latter can match the two-character CRLF sequence, but that is not
                    748:       relevant for finding the first character, so their code here is
                    749:       identical. */
                    750: 
                    751:       case OP_HSPACE:
                    752:       SET_BIT(0x09);
                    753:       SET_BIT(0x20);
                    754:       if (utf8)
                    755:         {
                    756:         SET_BIT(0xC2);  /* For U+00A0 */
                    757:         SET_BIT(0xE1);  /* For U+1680, U+180E */
                    758:         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
                    759:         SET_BIT(0xE3);  /* For U+3000 */
                    760:         }
                    761:       else SET_BIT(0xA0);
                    762:       try_next = FALSE;
                    763:       break;
                    764: 
                    765:       case OP_ANYNL:
                    766:       case OP_VSPACE:
                    767:       SET_BIT(0x0A);
                    768:       SET_BIT(0x0B);
                    769:       SET_BIT(0x0C);
                    770:       SET_BIT(0x0D);
                    771:       if (utf8)
                    772:         {
                    773:         SET_BIT(0xC2);  /* For U+0085 */
                    774:         SET_BIT(0xE2);  /* For U+2028, U+2029 */
                    775:         }
                    776:       else SET_BIT(0x85);
1.1       misha     777:       try_next = FALSE;
                    778:       break;
                    779: 
1.3       misha     780:       /* Single character types set the bits and stop. Note that if PCRE_UCP
                    781:       is set, we do not see these op codes because \d etc are converted to
                    782:       properties. Therefore, these apply in the case when only characters less
                    783:       than 256 are recognized to match the types. */
1.1       misha     784: 
                    785:       case OP_NOT_DIGIT:
1.3       misha     786:       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1.1       misha     787:       try_next = FALSE;
                    788:       break;
                    789: 
                    790:       case OP_DIGIT:
1.3       misha     791:       set_type_bits(start_bits, cbit_digit, table_limit, cd);
1.1       misha     792:       try_next = FALSE;
                    793:       break;
                    794: 
                    795:       /* The cbit_space table has vertical tab as whitespace; we have to
1.3       misha     796:       ensure it is set as not whitespace. */
1.1       misha     797: 
                    798:       case OP_NOT_WHITESPACE:
1.3       misha     799:       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
                    800:       start_bits[1] |= 0x08;
1.1       misha     801:       try_next = FALSE;
                    802:       break;
                    803: 
                    804:       /* The cbit_space table has vertical tab as whitespace; we have to
1.3       misha     805:       not set it from the table. */
1.1       misha     806: 
                    807:       case OP_WHITESPACE:
1.3       misha     808:       c = start_bits[1];    /* Save in case it was already set */
                    809:       set_type_bits(start_bits, cbit_space, table_limit, cd);
                    810:       start_bits[1] = (start_bits[1] & ~0x08) | c;
1.1       misha     811:       try_next = FALSE;
                    812:       break;
                    813: 
                    814:       case OP_NOT_WORDCHAR:
1.3       misha     815:       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1.1       misha     816:       try_next = FALSE;
                    817:       break;
                    818: 
                    819:       case OP_WORDCHAR:
1.3       misha     820:       set_type_bits(start_bits, cbit_word, table_limit, cd);
1.1       misha     821:       try_next = FALSE;
                    822:       break;
                    823: 
                    824:       /* One or more character type fudges the pointer and restarts, knowing
                    825:       it will hit a single character type and stop there. */
                    826: 
                    827:       case OP_TYPEPLUS:
                    828:       case OP_TYPEMINPLUS:
1.3       misha     829:       case OP_TYPEPOSPLUS:
1.1       misha     830:       tcode++;
                    831:       break;
                    832: 
                    833:       case OP_TYPEEXACT:
                    834:       tcode += 3;
                    835:       break;
                    836: 
                    837:       /* Zero or more repeats of character types set the bits and then
                    838:       try again. */
                    839: 
                    840:       case OP_TYPEUPTO:
                    841:       case OP_TYPEMINUPTO:
                    842:       case OP_TYPEPOSUPTO:
                    843:       tcode += 2;               /* Fall through */
                    844: 
                    845:       case OP_TYPESTAR:
                    846:       case OP_TYPEMINSTAR:
                    847:       case OP_TYPEPOSSTAR:
                    848:       case OP_TYPEQUERY:
                    849:       case OP_TYPEMINQUERY:
                    850:       case OP_TYPEPOSQUERY:
                    851:       switch(tcode[1])
                    852:         {
1.3       misha     853:         default:
1.1       misha     854:         case OP_ANY:
                    855:         case OP_ALLANY:
                    856:         return SSB_FAIL;
                    857: 
1.3       misha     858:         case OP_HSPACE:
                    859:         SET_BIT(0x09);
                    860:         SET_BIT(0x20);
                    861:         if (utf8)
                    862:           {
                    863:           SET_BIT(0xC2);  /* For U+00A0 */
                    864:           SET_BIT(0xE1);  /* For U+1680, U+180E */
                    865:           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
                    866:           SET_BIT(0xE3);  /* For U+3000 */
                    867:           }
                    868:         else SET_BIT(0xA0);
                    869:         break;
                    870: 
                    871:         case OP_ANYNL:
                    872:         case OP_VSPACE:
                    873:         SET_BIT(0x0A);
                    874:         SET_BIT(0x0B);
                    875:         SET_BIT(0x0C);
                    876:         SET_BIT(0x0D);
                    877:         if (utf8)
                    878:           {
                    879:           SET_BIT(0xC2);  /* For U+0085 */
                    880:           SET_BIT(0xE2);  /* For U+2028, U+2029 */
                    881:           }
                    882:         else SET_BIT(0x85);
                    883:         break;
                    884: 
1.1       misha     885:         case OP_NOT_DIGIT:
1.3       misha     886:         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1.1       misha     887:         break;
                    888: 
                    889:         case OP_DIGIT:
1.3       misha     890:         set_type_bits(start_bits, cbit_digit, table_limit, cd);
1.1       misha     891:         break;
                    892: 
                    893:         /* The cbit_space table has vertical tab as whitespace; we have to
1.3       misha     894:         ensure it gets set as not whitespace. */
1.1       misha     895: 
                    896:         case OP_NOT_WHITESPACE:
1.3       misha     897:         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
                    898:         start_bits[1] |= 0x08;
1.1       misha     899:         break;
                    900: 
                    901:         /* The cbit_space table has vertical tab as whitespace; we have to
1.3       misha     902:         avoid setting it. */
1.1       misha     903: 
                    904:         case OP_WHITESPACE:
1.3       misha     905:         c = start_bits[1];    /* Save in case it was already set */
                    906:         set_type_bits(start_bits, cbit_space, table_limit, cd);
                    907:         start_bits[1] = (start_bits[1] & ~0x08) | c;
1.1       misha     908:         break;
                    909: 
                    910:         case OP_NOT_WORDCHAR:
1.3       misha     911:         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1.1       misha     912:         break;
                    913: 
                    914:         case OP_WORDCHAR:
1.3       misha     915:         set_type_bits(start_bits, cbit_word, table_limit, cd);
1.1       misha     916:         break;
                    917:         }
                    918: 
                    919:       tcode += 2;
                    920:       break;
                    921: 
                    922:       /* Character class where all the information is in a bit map: set the
                    923:       bits and either carry on or not, according to the repeat count. If it was
                    924:       a negative class, and we are operating with UTF-8 characters, any byte
                    925:       with a value >= 0xc4 is a potentially valid starter because it starts a
                    926:       character with a value > 255. */
                    927: 
                    928:       case OP_NCLASS:
                    929: #ifdef SUPPORT_UTF8
                    930:       if (utf8)
                    931:         {
                    932:         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
                    933:         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
                    934:         }
                    935: #endif
                    936:       /* Fall through */
                    937: 
                    938:       case OP_CLASS:
                    939:         {
                    940:         tcode++;
                    941: 
                    942:         /* In UTF-8 mode, the bits in a bit map correspond to character
                    943:         values, not to byte values. However, the bit map we are constructing is
                    944:         for byte values. So we have to do a conversion for characters whose
                    945:         value is > 127. In fact, there are only two possible starting bytes for
                    946:         characters in the range 128 - 255. */
                    947: 
                    948: #ifdef SUPPORT_UTF8
                    949:         if (utf8)
                    950:           {
                    951:           for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
                    952:           for (c = 128; c < 256; c++)
                    953:             {
                    954:             if ((tcode[c/8] && (1 << (c&7))) != 0)
                    955:               {
                    956:               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
                    957:               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
                    958:               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
                    959:               }
                    960:             }
                    961:           }
                    962: 
                    963:         /* In non-UTF-8 mode, the two bit maps are completely compatible. */
                    964: 
                    965:         else
                    966: #endif
                    967:           {
                    968:           for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
                    969:           }
                    970: 
                    971:         /* Advance past the bit map, and act on what follows */
                    972: 
                    973:         tcode += 32;
                    974:         switch (*tcode)
                    975:           {
                    976:           case OP_CRSTAR:
                    977:           case OP_CRMINSTAR:
                    978:           case OP_CRQUERY:
                    979:           case OP_CRMINQUERY:
                    980:           tcode++;
                    981:           break;
                    982: 
                    983:           case OP_CRRANGE:
                    984:           case OP_CRMINRANGE:
                    985:           if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
                    986:             else try_next = FALSE;
                    987:           break;
                    988: 
                    989:           default:
                    990:           try_next = FALSE;
                    991:           break;
                    992:           }
                    993:         }
                    994:       break; /* End of bitmap class handling */
                    995: 
                    996:       }      /* End of switch */
                    997:     }        /* End of try_next loop */
                    998: 
                    999:   code += GET(code, 1);   /* Advance to next branch */
                   1000:   }
                   1001: while (*code == OP_ALT);
                   1002: return yield;
                   1003: }
                   1004: 
                   1005: 
                   1006: 
                   1007: /*************************************************
                   1008: *          Study a compiled expression           *
                   1009: *************************************************/
                   1010: 
                   1011: /* This function is handed a compiled expression that it must study to produce
                   1012: information that will speed up the matching. It returns a pcre_extra block
                   1013: which then gets handed back to pcre_exec().
                   1014: 
                   1015: Arguments:
                   1016:   re        points to the compiled expression
                   1017:   options   contains option bits
                   1018:   errorptr  points to where to place error messages;
                   1019:             set NULL unless error
                   1020: 
                   1021: Returns:    pointer to a pcre_extra block, with study_data filled in and the
1.3       misha    1022:               appropriate flags set;
1.1       misha    1023:             NULL on error or if no optimization possible
                   1024: */
                   1025: 
1.2       misha    1026: PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1.1       misha    1027: pcre_study(const pcre *external_re, int options, const char **errorptr)
                   1028: {
1.3       misha    1029: int min;
                   1030: BOOL bits_set = FALSE;
1.1       misha    1031: uschar start_bits[32];
                   1032: pcre_extra *extra;
                   1033: pcre_study_data *study;
                   1034: const uschar *tables;
                   1035: uschar *code;
                   1036: compile_data compile_block;
                   1037: const real_pcre *re = (const real_pcre *)external_re;
                   1038: 
                   1039: *errorptr = NULL;
                   1040: 
                   1041: if (re == NULL || re->magic_number != MAGIC_NUMBER)
                   1042:   {
                   1043:   *errorptr = "argument is not a compiled regular expression";
                   1044:   return NULL;
                   1045:   }
                   1046: 
                   1047: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
                   1048:   {
                   1049:   *errorptr = "unknown or incorrect option bit(s) set";
                   1050:   return NULL;
                   1051:   }
                   1052: 
                   1053: code = (uschar *)re + re->name_table_offset +
                   1054:   (re->name_count * re->name_entry_size);
                   1055: 
                   1056: /* For an anchored pattern, or an unanchored pattern that has a first char, or
1.3       misha    1057: a multiline pattern that matches only at "line starts", there is no point in
                   1058: seeking a list of starting bytes. */
                   1059: 
                   1060: if ((re->options & PCRE_ANCHORED) == 0 &&
                   1061:     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
                   1062:   {
                   1063:   /* Set the character tables in the block that is passed around */
1.1       misha    1064: 
1.3       misha    1065:   tables = re->tables;
                   1066:   if (tables == NULL)
                   1067:     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
                   1068:     (void *)(&tables));
                   1069: 
                   1070:   compile_block.lcc = tables + lcc_offset;
                   1071:   compile_block.fcc = tables + fcc_offset;
                   1072:   compile_block.cbits = tables + cbits_offset;
                   1073:   compile_block.ctypes = tables + ctypes_offset;
                   1074: 
                   1075:   /* See if we can find a fixed set of initial characters for the pattern. */
                   1076: 
                   1077:   memset(start_bits, 0, 32 * sizeof(uschar));
                   1078:   bits_set = set_start_bits(code, start_bits,
                   1079:     (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
                   1080:     &compile_block) == SSB_DONE;
                   1081:   }
                   1082: 
                   1083: /* Find the minimum length of subject string. */
                   1084: 
                   1085: min = find_minlength(code, code, re->options);
1.1       misha    1086: 
1.3       misha    1087: /* Return NULL if no optimization is possible. */
1.1       misha    1088: 
1.3       misha    1089: if (!bits_set && min < 0) return NULL;
1.1       misha    1090: 
                   1091: /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
                   1092: the latter, which is pointed to by the former, which may also get additional
                   1093: data set later by the calling program. At the moment, the size of
                   1094: pcre_study_data is fixed. We nevertheless save it in a field for returning via
                   1095: the pcre_fullinfo() function so that if it becomes variable in the future, we
                   1096: don't have to change that code. */
                   1097: 
                   1098: extra = (pcre_extra *)(pcre_malloc)
                   1099:   (sizeof(pcre_extra) + sizeof(pcre_study_data));
                   1100: 
                   1101: if (extra == NULL)
                   1102:   {
                   1103:   *errorptr = "failed to get memory";
                   1104:   return NULL;
                   1105:   }
                   1106: 
                   1107: study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
                   1108: extra->flags = PCRE_EXTRA_STUDY_DATA;
                   1109: extra->study_data = study;
                   1110: 
                   1111: study->size = sizeof(pcre_study_data);
1.3       misha    1112: study->flags = 0;
                   1113: 
                   1114: if (bits_set)
                   1115:   {
                   1116:   study->flags |= PCRE_STUDY_MAPPED;
                   1117:   memcpy(study->start_bits, start_bits, sizeof(start_bits));
                   1118:   }
                   1119: 
                   1120: if (min >= 0)
                   1121:   {
                   1122:   study->flags |= PCRE_STUDY_MINLEN;
                   1123:   study->minlength = min;
                   1124:   }
1.1       misha    1125: 
                   1126: return extra;
                   1127: }
                   1128: 
                   1129: /* End of pcre_study.c */

E-mail: