win32/pcre/pcre_study.c - annotate

Return to pcre_study.c CVS log
Up to [parser3project] / win32 / pcre
Annotation of win32/pcre/pcre_study.c, revision 1.3

1.1       misha       1: /*************************************************
                      2: *      Perl-Compatible Regular Expressions       *
                      3: *************************************************/
                      4: 
                      5: /* PCRE is a library of functions to support regular expressions whose syntax
                      6: and semantics are as close as possible to those of the Perl 5 language.
                      7: 
                      8:                        Written by Philip Hazel
1.3     ! misha       9:            Copyright (c) 1997-2010 University of Cambridge
1.1       misha      10: 
                     11: -----------------------------------------------------------------------------
                     12: Redistribution and use in source and binary forms, with or without
                     13: modification, are permitted provided that the following conditions are met:
                     14: 
                     15:     * Redistributions of source code must retain the above copyright notice,
                     16:       this list of conditions and the following disclaimer.
                     17: 
                     18:     * Redistributions in binary form must reproduce the above copyright
                     19:       notice, this list of conditions and the following disclaimer in the
                     20:       documentation and/or other materials provided with the distribution.
                     21: 
                     22:     * Neither the name of the University of Cambridge nor the names of its
                     23:       contributors may be used to endorse or promote products derived from
                     24:       this software without specific prior written permission.
                     25: 
                     26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
                     27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
                     30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     36: POSSIBILITY OF SUCH DAMAGE.
                     37: -----------------------------------------------------------------------------
                     38: */
                     39: 
                     40: 
                     41: /* This module contains the external function pcre_study(), along with local
                     42: supporting functions. */
                     43: 
                     44: 
                     45: #ifdef HAVE_CONFIG_H
                     46: #include "config.h"
                     47: #endif
                     48: 
                     49: #include "pcre_internal.h"
                     50: 
1.3     ! misha      51: #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
1.1       misha      52: 
                     53: /* Returns from set_start_bits() */
                     54: 
                     55: enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
                     56: 
                     57: 
1.3     ! misha      58: 
        !            59: /*************************************************
        !            60: *   Find the minimum subject length for a group  *
        !            61: *************************************************/
        !            62: 
        !            63: /* Scan a parenthesized group and compute the minimum length of subject that
        !            64: is needed to match it. This is a lower bound; it does not mean there is a
        !            65: string of that length that matches. In UTF8 mode, the result is in characters
        !            66: rather than bytes.
        !            67: 
        !            68: Arguments:
        !            69:   code       pointer to start of group (the bracket)
        !            70:   startcode  pointer to start of the whole pattern
        !            71:   options    the compiling options
        !            72: 
        !            73: Returns:   the minimum length
        !            74:            -1 if \C was encountered
        !            75:            -2 internal error (missing capturing bracket)
        !            76: */
        !            77: 
        !            78: static int
        !            79: find_minlength(const uschar *code, const uschar *startcode, int options)
        !            80: {
        !            81: int length = -1;
        !            82: BOOL utf8 = (options & PCRE_UTF8) != 0;
        !            83: BOOL had_recurse = FALSE;
        !            84: register int branchlength = 0;
        !            85: register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
        !            86: 
        !            87: if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
        !            88: 
        !            89: /* Scan along the opcodes for this branch. If we get to the end of the
        !            90: branch, check the length against that of the other branches. */
        !            91: 
        !            92: for (;;)
        !            93:   {
        !            94:   int d, min;
        !            95:   uschar *cs, *ce;
        !            96:   register int op = *cc;
        !            97: 
        !            98:   switch (op)
        !            99:     {
        !           100:     case OP_COND:
        !           101:     case OP_SCOND:
        !           102: 
        !           103:     /* If there is only one branch in a condition, the implied branch has zero
        !           104:     length, so we don't add anything. This covers the DEFINE "condition"
        !           105:     automatically. */
        !           106: 
        !           107:     cs = cc + GET(cc, 1);
        !           108:     if (*cs != OP_ALT)
        !           109:       {
        !           110:       cc = cs + 1 + LINK_SIZE;
        !           111:       break;
        !           112:       }
        !           113: 
        !           114:     /* Otherwise we can fall through and treat it the same as any other
        !           115:     subpattern. */
        !           116: 
        !           117:     case OP_CBRA:
        !           118:     case OP_SCBRA:
        !           119:     case OP_BRA:
        !           120:     case OP_SBRA:
        !           121:     case OP_ONCE:
        !           122:     d = find_minlength(cc, startcode, options);
        !           123:     if (d < 0) return d;
        !           124:     branchlength += d;
        !           125:     do cc += GET(cc, 1); while (*cc == OP_ALT);
        !           126:     cc += 1 + LINK_SIZE;
        !           127:     break;
        !           128: 
        !           129:     /* Reached end of a branch; if it's a ket it is the end of a nested
        !           130:     call. If it's ALT it is an alternation in a nested call. If it is
        !           131:     END it's the end of the outer call. All can be handled by the same code. */
        !           132: 
        !           133:     case OP_ALT:
        !           134:     case OP_KET:
        !           135:     case OP_KETRMAX:
        !           136:     case OP_KETRMIN:
        !           137:     case OP_END:
        !           138:     if (length < 0 || (!had_recurse && branchlength < length))
        !           139:       length = branchlength;
        !           140:     if (*cc != OP_ALT) return length;
        !           141:     cc += 1 + LINK_SIZE;
        !           142:     branchlength = 0;
        !           143:     had_recurse = FALSE;
        !           144:     break;
        !           145: 
        !           146:     /* Skip over assertive subpatterns */
        !           147: 
        !           148:     case OP_ASSERT:
        !           149:     case OP_ASSERT_NOT:
        !           150:     case OP_ASSERTBACK:
        !           151:     case OP_ASSERTBACK_NOT:
        !           152:     do cc += GET(cc, 1); while (*cc == OP_ALT);
        !           153:     /* Fall through */
        !           154: 
        !           155:     /* Skip over things that don't match chars */
        !           156: 
        !           157:     case OP_REVERSE:
        !           158:     case OP_CREF:
        !           159:     case OP_NCREF:
        !           160:     case OP_RREF:
        !           161:     case OP_NRREF:
        !           162:     case OP_DEF:
        !           163:     case OP_OPT:
        !           164:     case OP_CALLOUT:
        !           165:     case OP_SOD:
        !           166:     case OP_SOM:
        !           167:     case OP_EOD:
        !           168:     case OP_EODN:
        !           169:     case OP_CIRC:
        !           170:     case OP_DOLL:
        !           171:     case OP_NOT_WORD_BOUNDARY:
        !           172:     case OP_WORD_BOUNDARY:
        !           173:     cc += _pcre_OP_lengths[*cc];
        !           174:     break;
        !           175: 
        !           176:     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
        !           177: 
        !           178:     case OP_BRAZERO:
        !           179:     case OP_BRAMINZERO:
        !           180:     case OP_SKIPZERO:
        !           181:     cc += _pcre_OP_lengths[*cc];
        !           182:     do cc += GET(cc, 1); while (*cc == OP_ALT);
        !           183:     cc += 1 + LINK_SIZE;
        !           184:     break;
        !           185: 
        !           186:     /* Handle literal characters and + repetitions */
        !           187: 
        !           188:     case OP_CHAR:
        !           189:     case OP_CHARNC:
        !           190:     case OP_NOT:
        !           191:     case OP_PLUS:
        !           192:     case OP_MINPLUS:
        !           193:     case OP_POSPLUS:
        !           194:     case OP_NOTPLUS:
        !           195:     case OP_NOTMINPLUS:
        !           196:     case OP_NOTPOSPLUS:
        !           197:     branchlength++;
        !           198:     cc += 2;
        !           199: #ifdef SUPPORT_UTF8
        !           200:     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
        !           201: #endif
        !           202:     break;
        !           203: 
        !           204:     case OP_TYPEPLUS:
        !           205:     case OP_TYPEMINPLUS:
        !           206:     case OP_TYPEPOSPLUS:
        !           207:     branchlength++;
        !           208:     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
        !           209:     break;
        !           210: 
        !           211:     /* Handle exact repetitions. The count is already in characters, but we
        !           212:     need to skip over a multibyte character in UTF8 mode.  */
        !           213: 
        !           214:     case OP_EXACT:
        !           215:     case OP_NOTEXACT:
        !           216:     branchlength += GET2(cc,1);
        !           217:     cc += 4;
        !           218: #ifdef SUPPORT_UTF8
        !           219:     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
        !           220: #endif
        !           221:     break;
        !           222: 
        !           223:     case OP_TYPEEXACT:
        !           224:     branchlength += GET2(cc,1);
        !           225:     cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
        !           226:     break;
        !           227: 
        !           228:     /* Handle single-char non-literal matchers */
        !           229: 
        !           230:     case OP_PROP:
        !           231:     case OP_NOTPROP:
        !           232:     cc += 2;
        !           233:     /* Fall through */
        !           234: 
        !           235:     case OP_NOT_DIGIT:
        !           236:     case OP_DIGIT:
        !           237:     case OP_NOT_WHITESPACE:
        !           238:     case OP_WHITESPACE:
        !           239:     case OP_NOT_WORDCHAR:
        !           240:     case OP_WORDCHAR:
        !           241:     case OP_ANY:
        !           242:     case OP_ALLANY:
        !           243:     case OP_EXTUNI:
        !           244:     case OP_HSPACE:
        !           245:     case OP_NOT_HSPACE:
        !           246:     case OP_VSPACE:
        !           247:     case OP_NOT_VSPACE:
        !           248:     branchlength++;
        !           249:     cc++;
        !           250:     break;
        !           251: 
        !           252:     /* "Any newline" might match two characters */
        !           253: 
        !           254:     case OP_ANYNL:
        !           255:     branchlength += 2;
        !           256:     cc++;
        !           257:     break;
        !           258: 
        !           259:     /* The single-byte matcher means we can't proceed in UTF-8 mode */
        !           260: 
        !           261:     case OP_ANYBYTE:
        !           262: #ifdef SUPPORT_UTF8
        !           263:     if (utf8) return -1;
        !           264: #endif
        !           265:     branchlength++;
        !           266:     cc++;
        !           267:     break;
        !           268: 
        !           269:     /* For repeated character types, we have to test for \p and \P, which have
        !           270:     an extra two bytes of parameters. */
        !           271: 
        !           272:     case OP_TYPESTAR:
        !           273:     case OP_TYPEMINSTAR:
        !           274:     case OP_TYPEQUERY:
        !           275:     case OP_TYPEMINQUERY:
        !           276:     case OP_TYPEPOSSTAR:
        !           277:     case OP_TYPEPOSQUERY:
        !           278:     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
        !           279:     cc += _pcre_OP_lengths[op];
        !           280:     break;
        !           281: 
        !           282:     case OP_TYPEUPTO:
        !           283:     case OP_TYPEMINUPTO:
        !           284:     case OP_TYPEPOSUPTO:
        !           285:     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
        !           286:     cc += _pcre_OP_lengths[op];
        !           287:     break;
        !           288: 
        !           289:     /* Check a class for variable quantification */
        !           290: 
        !           291: #ifdef SUPPORT_UTF8
        !           292:     case OP_XCLASS:
        !           293:     cc += GET(cc, 1) - 33;
        !           294:     /* Fall through */
        !           295: #endif
        !           296: 
        !           297:     case OP_CLASS:
        !           298:     case OP_NCLASS:
        !           299:     cc += 33;
        !           300: 
        !           301:     switch (*cc)
        !           302:       {
        !           303:       case OP_CRPLUS:
        !           304:       case OP_CRMINPLUS:
        !           305:       branchlength++;
        !           306:       /* Fall through */
        !           307: 
        !           308:       case OP_CRSTAR:
        !           309:       case OP_CRMINSTAR:
        !           310:       case OP_CRQUERY:
        !           311:       case OP_CRMINQUERY:
        !           312:       cc++;
        !           313:       break;
        !           314: 
        !           315:       case OP_CRRANGE:
        !           316:       case OP_CRMINRANGE:
        !           317:       branchlength += GET2(cc,1);
        !           318:       cc += 5;
        !           319:       break;
        !           320: 
        !           321:       default:
        !           322:       branchlength++;
        !           323:       break;
        !           324:       }
        !           325:     break;
        !           326: 
        !           327:     /* Backreferences and subroutine calls are treated in the same way: we find
        !           328:     the minimum length for the subpattern. A recursion, however, causes an
        !           329:     a flag to be set that causes the length of this branch to be ignored. The
        !           330:     logic is that a recursion can only make sense if there is another
        !           331:     alternation that stops the recursing. That will provide the minimum length
        !           332:     (when no recursion happens). A backreference within the group that it is
        !           333:     referencing behaves in the same way.
        !           334: 
        !           335:     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
        !           336:     matches an empty string (by default it causes a matching failure), so in
        !           337:     that case we must set the minimum length to zero. */
        !           338: 
        !           339:     case OP_REF:
        !           340:     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
        !           341:       {
        !           342:       ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
        !           343:       if (cs == NULL) return -2;
        !           344:       do ce += GET(ce, 1); while (*ce == OP_ALT);
        !           345:       if (cc > cs && cc < ce)
        !           346:         {
        !           347:         d = 0;
        !           348:         had_recurse = TRUE;
        !           349:         }
        !           350:       else d = find_minlength(cs, startcode, options);
        !           351:       }
        !           352:     else d = 0;
        !           353:     cc += 3;
        !           354: 
        !           355:     /* Handle repeated back references */
        !           356: 
        !           357:     switch (*cc)
        !           358:       {
        !           359:       case OP_CRSTAR:
        !           360:       case OP_CRMINSTAR:
        !           361:       case OP_CRQUERY:
        !           362:       case OP_CRMINQUERY:
        !           363:       min = 0;
        !           364:       cc++;
        !           365:       break;
        !           366: 
        !           367:       case OP_CRRANGE:
        !           368:       case OP_CRMINRANGE:
        !           369:       min = GET2(cc, 1);
        !           370:       cc += 5;
        !           371:       break;
        !           372: 
        !           373:       default:
        !           374:       min = 1;
        !           375:       break;
        !           376:       }
        !           377: 
        !           378:     branchlength += min * d;
        !           379:     break;
        !           380: 
        !           381:     case OP_RECURSE:
        !           382:     cs = ce = (uschar *)startcode + GET(cc, 1);
        !           383:     if (cs == NULL) return -2;
        !           384:     do ce += GET(ce, 1); while (*ce == OP_ALT);
        !           385:     if (cc > cs && cc < ce)
        !           386:       had_recurse = TRUE;
        !           387:     else
        !           388:       branchlength += find_minlength(cs, startcode, options);
        !           389:     cc += 1 + LINK_SIZE;
        !           390:     break;
        !           391: 
        !           392:     /* Anything else does not or need not match a character. We can get the
        !           393:     item's length from the table, but for those that can match zero occurrences
        !           394:     of a character, we must take special action for UTF-8 characters. */
        !           395: 
        !           396:     case OP_UPTO:
        !           397:     case OP_NOTUPTO:
        !           398:     case OP_MINUPTO:
        !           399:     case OP_NOTMINUPTO:
        !           400:     case OP_POSUPTO:
        !           401:     case OP_STAR:
        !           402:     case OP_MINSTAR:
        !           403:     case OP_NOTMINSTAR:
        !           404:     case OP_POSSTAR:
        !           405:     case OP_NOTPOSSTAR:
        !           406:     case OP_QUERY:
        !           407:     case OP_MINQUERY:
        !           408:     case OP_NOTMINQUERY:
        !           409:     case OP_POSQUERY:
        !           410:     case OP_NOTPOSQUERY:
        !           411:     cc += _pcre_OP_lengths[op];
        !           412: #ifdef SUPPORT_UTF8
        !           413:     if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
        !           414: #endif
        !           415:     break;
        !           416: 
        !           417:     /* Skip these, but we need to add in the name length. */
        !           418: 
        !           419:     case OP_MARK:
        !           420:     case OP_PRUNE_ARG:
        !           421:     case OP_SKIP_ARG:
        !           422:     case OP_THEN_ARG:
        !           423:     cc += _pcre_OP_lengths[op] + cc[1];
        !           424:     break;
        !           425: 
        !           426:     /* For the record, these are the opcodes that are matched by "default":
        !           427:     OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
        !           428:     OP_THEN. */
        !           429: 
        !           430:     default:
        !           431:     cc += _pcre_OP_lengths[op];
        !           432:     break;
        !           433:     }
        !           434:   }
        !           435: /* Control never gets here */
        !           436: }
        !           437: 
        !           438: 
        !           439: 
1.1       misha     440: /*************************************************
                    441: *      Set a bit and maybe its alternate case    *
                    442: *************************************************/
                    443: 
1.3     ! misha     444: /* Given a character, set its first byte's bit in the table, and also the
        !           445: corresponding bit for the other version of a letter if we are caseless. In
        !           446: UTF-8 mode, for characters greater than 127, we can only do the caseless thing
        !           447: when Unicode property support is available.
1.1       misha     448: 
                    449: Arguments:
                    450:   start_bits    points to the bit map
1.3     ! misha     451:   p             points to the character
1.1       misha     452:   caseless      the caseless flag
                    453:   cd            the block with char table pointers
1.3     ! misha     454:   utf8          TRUE for UTF-8 mode
        !           455: 
        !           456: Returns:        pointer after the character
        !           457: */
        !           458: 
        !           459: static const uschar *
        !           460: set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
        !           461:   compile_data *cd, BOOL utf8)
        !           462: {
        !           463: unsigned int c = *p;
        !           464: 
        !           465: SET_BIT(c);
        !           466: 
        !           467: #ifdef SUPPORT_UTF8
        !           468: if (utf8 && c > 127)
        !           469:   {
        !           470:   GETCHARINC(c, p);
        !           471: #ifdef SUPPORT_UCP
        !           472:   if (caseless)
        !           473:     {
        !           474:     uschar buff[8];
        !           475:     c = UCD_OTHERCASE(c);
        !           476:     (void)_pcre_ord2utf8(c, buff);
        !           477:     SET_BIT(buff[0]);
        !           478:     }
        !           479: #endif
        !           480:   return p;
        !           481:   }
        !           482: #endif
        !           483: 
        !           484: /* Not UTF-8 mode, or character is less than 127. */
        !           485: 
        !           486: if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
        !           487: return p + 1;
        !           488: }
        !           489: 
        !           490: 
        !           491: 
        !           492: /*************************************************
        !           493: *     Set bits for a positive character type     *
        !           494: *************************************************/
        !           495: 
        !           496: /* This function sets starting bits for a character type. In UTF-8 mode, we can
        !           497: only do a direct setting for bytes less than 128, as otherwise there can be
        !           498: confusion with bytes in the middle of UTF-8 characters. In a "traditional"
        !           499: environment, the tables will only recognize ASCII characters anyway, but in at
        !           500: least one Windows environment, some higher bytes bits were set in the tables.
        !           501: So we deal with that case by considering the UTF-8 encoding.
        !           502: 
        !           503: Arguments:
        !           504:   start_bits     the starting bitmap
        !           505:   cbit type      the type of character wanted
        !           506:   table_limit    32 for non-UTF-8; 16 for UTF-8
        !           507:   cd             the block with char table pointers
        !           508: 
        !           509: Returns:         nothing
        !           510: */
        !           511: 
        !           512: static void
        !           513: set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
        !           514:   compile_data *cd)
        !           515: {
        !           516: register int c;
        !           517: for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
        !           518: if (table_limit == 32) return;
        !           519: for (c = 128; c < 256; c++)
        !           520:   {
        !           521:   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
        !           522:     {
        !           523:     uschar buff[8];
        !           524:     (void)_pcre_ord2utf8(c, buff);
        !           525:     SET_BIT(buff[0]);
        !           526:     }
        !           527:   }
        !           528: }
        !           529: 
        !           530: 
        !           531: /*************************************************
        !           532: *     Set bits for a negative character type     *
        !           533: *************************************************/
        !           534: 
        !           535: /* This function sets starting bits for a negative character type such as \D.
        !           536: In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
        !           537: otherwise there can be confusion with bytes in the middle of UTF-8 characters.
        !           538: Unlike in the positive case, where we can set appropriate starting bits for
        !           539: specific high-valued UTF-8 characters, in this case we have to set the bits for
        !           540: all high-valued characters. The lowest is 0xc2, but we overkill by starting at
        !           541: 0xc0 (192) for simplicity.
1.1       misha     542: 
1.3     ! misha     543: Arguments:
        !           544:   start_bits     the starting bitmap
        !           545:   cbit type      the type of character wanted
        !           546:   table_limit    32 for non-UTF-8; 16 for UTF-8
        !           547:   cd             the block with char table pointers
        !           548: 
        !           549: Returns:         nothing
1.1       misha     550: */
                    551: 
                    552: static void
1.3     ! misha     553: set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
        !           554:   compile_data *cd)
1.1       misha     555: {
1.3     ! misha     556: register int c;
        !           557: for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
        !           558: if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
1.1       misha     559: }
                    560: 
                    561: 
                    562: 
                    563: /*************************************************
                    564: *          Create bitmap of starting bytes       *
                    565: *************************************************/
                    566: 
                    567: /* This function scans a compiled unanchored expression recursively and
                    568: attempts to build a bitmap of the set of possible starting bytes. As time goes
                    569: by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
                    570: useful for parenthesized groups in patterns such as (a*)b where the group
                    571: provides some optional starting bytes but scanning must continue at the outer
                    572: level to find at least one mandatory byte. At the outermost level, this
                    573: function fails unless the result is SSB_DONE.
                    574: 
                    575: Arguments:
                    576:   code         points to an expression
                    577:   start_bits   points to a 32-byte table, initialized to 0
                    578:   caseless     the current state of the caseless flag
                    579:   utf8         TRUE if in UTF-8 mode
                    580:   cd           the block with char table pointers
                    581: 
                    582: Returns:       SSB_FAIL     => Failed to find any starting bytes
                    583:                SSB_DONE     => Found mandatory starting bytes
                    584:                SSB_CONTINUE => Found optional starting bytes
                    585: */
                    586: 
                    587: static int
                    588: set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
                    589:   BOOL utf8, compile_data *cd)
                    590: {
                    591: register int c;
                    592: int yield = SSB_DONE;
1.3     ! misha     593: int table_limit = utf8? 16:32;
1.1       misha     594: 
                    595: #if 0
                    596: /* ========================================================================= */
                    597: /* The following comment and code was inserted in January 1999. In May 2006,
                    598: when it was observed to cause compiler warnings about unused values, I took it
                    599: out again. If anybody is still using OS/2, they will have to put it back
                    600: manually. */
                    601: 
                    602: /* This next statement and the later reference to dummy are here in order to
                    603: trick the optimizer of the IBM C compiler for OS/2 into generating correct
                    604: code. Apparently IBM isn't going to fix the problem, and we would rather not
                    605: disable optimization (in this module it actually makes a big difference, and
                    606: the pcre module can use all the optimization it can get). */
                    607: 
                    608: volatile int dummy;
                    609: /* ========================================================================= */
                    610: #endif
                    611: 
                    612: do
                    613:   {
                    614:   const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
                    615:   BOOL try_next = TRUE;
                    616: 
                    617:   while (try_next)    /* Loop for items in this branch */
                    618:     {
                    619:     int rc;
                    620:     switch(*tcode)
                    621:       {
                    622:       /* Fail if we reach something we don't understand */
                    623: 
                    624:       default:
                    625:       return SSB_FAIL;
                    626: 
                    627:       /* If we hit a bracket or a positive lookahead assertion, recurse to set
                    628:       bits from within the subpattern. If it can't find anything, we have to
                    629:       give up. If it finds some mandatory character(s), we are done for this
                    630:       branch. Otherwise, carry on scanning after the subpattern. */
                    631: 
                    632:       case OP_BRA:
                    633:       case OP_SBRA:
                    634:       case OP_CBRA:
                    635:       case OP_SCBRA:
                    636:       case OP_ONCE:
                    637:       case OP_ASSERT:
                    638:       rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
                    639:       if (rc == SSB_FAIL) return SSB_FAIL;
                    640:       if (rc == SSB_DONE) try_next = FALSE; else
                    641:         {
                    642:         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
                    643:         tcode += 1 + LINK_SIZE;
                    644:         }
                    645:       break;
                    646: 
                    647:       /* If we hit ALT or KET, it means we haven't found anything mandatory in
                    648:       this branch, though we might have found something optional. For ALT, we
                    649:       continue with the next alternative, but we have to arrange that the final
                    650:       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
                    651:       return SSB_CONTINUE: if this is the top level, that indicates failure,
                    652:       but after a nested subpattern, it causes scanning to continue. */
                    653: 
                    654:       case OP_ALT:
                    655:       yield = SSB_CONTINUE;
                    656:       try_next = FALSE;
                    657:       break;
                    658: 
                    659:       case OP_KET:
                    660:       case OP_KETRMAX:
                    661:       case OP_KETRMIN:
                    662:       return SSB_CONTINUE;
                    663: 
                    664:       /* Skip over callout */
                    665: 
                    666:       case OP_CALLOUT:
                    667:       tcode += 2 + 2*LINK_SIZE;
                    668:       break;
                    669: 
                    670:       /* Skip over lookbehind and negative lookahead assertions */
                    671: 
                    672:       case OP_ASSERT_NOT:
                    673:       case OP_ASSERTBACK:
                    674:       case OP_ASSERTBACK_NOT:
                    675:       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
                    676:       tcode += 1 + LINK_SIZE;
                    677:       break;
                    678: 
                    679:       /* Skip over an option setting, changing the caseless flag */
                    680: 
                    681:       case OP_OPT:
                    682:       caseless = (tcode[1] & PCRE_CASELESS) != 0;
                    683:       tcode += 2;
                    684:       break;
                    685: 
                    686:       /* BRAZERO does the bracket, but carries on. */
                    687: 
                    688:       case OP_BRAZERO:
                    689:       case OP_BRAMINZERO:
                    690:       if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
                    691:         return SSB_FAIL;
                    692: /* =========================================================================
                    693:       See the comment at the head of this function concerning the next line,
                    694:       which was an old fudge for the benefit of OS/2.
                    695:       dummy = 1;
                    696:   ========================================================================= */
                    697:       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
                    698:       tcode += 1 + LINK_SIZE;
                    699:       break;
                    700: 
                    701:       /* SKIPZERO skips the bracket. */
                    702: 
                    703:       case OP_SKIPZERO:
1.2       misha     704:       tcode++;
1.1       misha     705:       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
                    706:       tcode += 1 + LINK_SIZE;
                    707:       break;
                    708: 
                    709:       /* Single-char * or ? sets the bit and tries the next item */
                    710: 
                    711:       case OP_STAR:
                    712:       case OP_MINSTAR:
                    713:       case OP_POSSTAR:
                    714:       case OP_QUERY:
                    715:       case OP_MINQUERY:
                    716:       case OP_POSQUERY:
1.3     ! misha     717:       tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
1.1       misha     718:       break;
                    719: 
                    720:       /* Single-char upto sets the bit and tries the next */
                    721: 
                    722:       case OP_UPTO:
                    723:       case OP_MINUPTO:
                    724:       case OP_POSUPTO:
1.3     ! misha     725:       tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
1.1       misha     726:       break;
                    727: 
                    728:       /* At least one single char sets the bit and stops */
                    729: 
                    730:       case OP_EXACT:       /* Fall through */
                    731:       tcode += 2;
                    732: 
                    733:       case OP_CHAR:
                    734:       case OP_CHARNC:
                    735:       case OP_PLUS:
                    736:       case OP_MINPLUS:
                    737:       case OP_POSPLUS:
1.3     ! misha     738:       (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
        !           739:       try_next = FALSE;
        !           740:       break;
        !           741: 
        !           742:       /* Special spacing and line-terminating items. These recognize specific
        !           743:       lists of characters. The difference between VSPACE and ANYNL is that the
        !           744:       latter can match the two-character CRLF sequence, but that is not
        !           745:       relevant for finding the first character, so their code here is
        !           746:       identical. */
        !           747: 
        !           748:       case OP_HSPACE:
        !           749:       SET_BIT(0x09);
        !           750:       SET_BIT(0x20);
        !           751:       if (utf8)
        !           752:         {
        !           753:         SET_BIT(0xC2);  /* For U+00A0 */
        !           754:         SET_BIT(0xE1);  /* For U+1680, U+180E */
        !           755:         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
        !           756:         SET_BIT(0xE3);  /* For U+3000 */
        !           757:         }
        !           758:       else SET_BIT(0xA0);
        !           759:       try_next = FALSE;
        !           760:       break;
        !           761: 
        !           762:       case OP_ANYNL:
        !           763:       case OP_VSPACE:
        !           764:       SET_BIT(0x0A);
        !           765:       SET_BIT(0x0B);
        !           766:       SET_BIT(0x0C);
        !           767:       SET_BIT(0x0D);
        !           768:       if (utf8)
        !           769:         {
        !           770:         SET_BIT(0xC2);  /* For U+0085 */
        !           771:         SET_BIT(0xE2);  /* For U+2028, U+2029 */
        !           772:         }
        !           773:       else SET_BIT(0x85);
1.1       misha     774:       try_next = FALSE;
                    775:       break;
                    776: 
1.3     ! misha     777:       /* Single character types set the bits and stop. Note that if PCRE_UCP
        !           778:       is set, we do not see these op codes because \d etc are converted to
        !           779:       properties. Therefore, these apply in the case when only characters less
        !           780:       than 256 are recognized to match the types. */
1.1       misha     781: 
                    782:       case OP_NOT_DIGIT:
1.3     ! misha     783:       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1.1       misha     784:       try_next = FALSE;
                    785:       break;
                    786: 
                    787:       case OP_DIGIT:
1.3     ! misha     788:       set_type_bits(start_bits, cbit_digit, table_limit, cd);
1.1       misha     789:       try_next = FALSE;
                    790:       break;
                    791: 
                    792:       /* The cbit_space table has vertical tab as whitespace; we have to
1.3     ! misha     793:       ensure it is set as not whitespace. */
1.1       misha     794: 
                    795:       case OP_NOT_WHITESPACE:
1.3     ! misha     796:       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
        !           797:       start_bits[1] |= 0x08;
1.1       misha     798:       try_next = FALSE;
                    799:       break;
                    800: 
                    801:       /* The cbit_space table has vertical tab as whitespace; we have to
1.3     ! misha     802:       not set it from the table. */
1.1       misha     803: 
                    804:       case OP_WHITESPACE:
1.3     ! misha     805:       c = start_bits[1];    /* Save in case it was already set */
        !           806:       set_type_bits(start_bits, cbit_space, table_limit, cd);
        !           807:       start_bits[1] = (start_bits[1] & ~0x08) | c;
1.1       misha     808:       try_next = FALSE;
                    809:       break;
                    810: 
                    811:       case OP_NOT_WORDCHAR:
1.3     ! misha     812:       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1.1       misha     813:       try_next = FALSE;
                    814:       break;
                    815: 
                    816:       case OP_WORDCHAR:
1.3     ! misha     817:       set_type_bits(start_bits, cbit_word, table_limit, cd);
1.1       misha     818:       try_next = FALSE;
                    819:       break;
                    820: 
                    821:       /* One or more character type fudges the pointer and restarts, knowing
                    822:       it will hit a single character type and stop there. */
                    823: 
                    824:       case OP_TYPEPLUS:
                    825:       case OP_TYPEMINPLUS:
1.3     ! misha     826:       case OP_TYPEPOSPLUS:
1.1       misha     827:       tcode++;
                    828:       break;
                    829: 
                    830:       case OP_TYPEEXACT:
                    831:       tcode += 3;
                    832:       break;
                    833: 
                    834:       /* Zero or more repeats of character types set the bits and then
                    835:       try again. */
                    836: 
                    837:       case OP_TYPEUPTO:
                    838:       case OP_TYPEMINUPTO:
                    839:       case OP_TYPEPOSUPTO:
                    840:       tcode += 2;               /* Fall through */
                    841: 
                    842:       case OP_TYPESTAR:
                    843:       case OP_TYPEMINSTAR:
                    844:       case OP_TYPEPOSSTAR:
                    845:       case OP_TYPEQUERY:
                    846:       case OP_TYPEMINQUERY:
                    847:       case OP_TYPEPOSQUERY:
                    848:       switch(tcode[1])
                    849:         {
1.3     ! misha     850:         default:
1.1       misha     851:         case OP_ANY:
                    852:         case OP_ALLANY:
                    853:         return SSB_FAIL;
                    854: 
1.3     ! misha     855:         case OP_HSPACE:
        !           856:         SET_BIT(0x09);
        !           857:         SET_BIT(0x20);
        !           858:         if (utf8)
        !           859:           {
        !           860:           SET_BIT(0xC2);  /* For U+00A0 */
        !           861:           SET_BIT(0xE1);  /* For U+1680, U+180E */
        !           862:           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
        !           863:           SET_BIT(0xE3);  /* For U+3000 */
        !           864:           }
        !           865:         else SET_BIT(0xA0);
        !           866:         break;
        !           867: 
        !           868:         case OP_ANYNL:
        !           869:         case OP_VSPACE:
        !           870:         SET_BIT(0x0A);
        !           871:         SET_BIT(0x0B);
        !           872:         SET_BIT(0x0C);
        !           873:         SET_BIT(0x0D);
        !           874:         if (utf8)
        !           875:           {
        !           876:           SET_BIT(0xC2);  /* For U+0085 */
        !           877:           SET_BIT(0xE2);  /* For U+2028, U+2029 */
        !           878:           }
        !           879:         else SET_BIT(0x85);
        !           880:         break;
        !           881: 
1.1       misha     882:         case OP_NOT_DIGIT:
1.3     ! misha     883:         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1.1       misha     884:         break;
                    885: 
                    886:         case OP_DIGIT:
1.3     ! misha     887:         set_type_bits(start_bits, cbit_digit, table_limit, cd);
1.1       misha     888:         break;
                    889: 
                    890:         /* The cbit_space table has vertical tab as whitespace; we have to
1.3     ! misha     891:         ensure it gets set as not whitespace. */
1.1       misha     892: 
                    893:         case OP_NOT_WHITESPACE:
1.3     ! misha     894:         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
        !           895:         start_bits[1] |= 0x08;
1.1       misha     896:         break;
                    897: 
                    898:         /* The cbit_space table has vertical tab as whitespace; we have to
1.3     ! misha     899:         avoid setting it. */
1.1       misha     900: 
                    901:         case OP_WHITESPACE:
1.3     ! misha     902:         c = start_bits[1];    /* Save in case it was already set */
        !           903:         set_type_bits(start_bits, cbit_space, table_limit, cd);
        !           904:         start_bits[1] = (start_bits[1] & ~0x08) | c;
1.1       misha     905:         break;
                    906: 
                    907:         case OP_NOT_WORDCHAR:
1.3     ! misha     908:         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1.1       misha     909:         break;
                    910: 
                    911:         case OP_WORDCHAR:
1.3     ! misha     912:         set_type_bits(start_bits, cbit_word, table_limit, cd);
1.1       misha     913:         break;
                    914:         }
                    915: 
                    916:       tcode += 2;
                    917:       break;
                    918: 
                    919:       /* Character class where all the information is in a bit map: set the
                    920:       bits and either carry on or not, according to the repeat count. If it was
                    921:       a negative class, and we are operating with UTF-8 characters, any byte
                    922:       with a value >= 0xc4 is a potentially valid starter because it starts a
                    923:       character with a value > 255. */
                    924: 
                    925:       case OP_NCLASS:
                    926: #ifdef SUPPORT_UTF8
                    927:       if (utf8)
                    928:         {
                    929:         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
                    930:         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
                    931:         }
                    932: #endif
                    933:       /* Fall through */
                    934: 
                    935:       case OP_CLASS:
                    936:         {
                    937:         tcode++;
                    938: 
                    939:         /* In UTF-8 mode, the bits in a bit map correspond to character
                    940:         values, not to byte values. However, the bit map we are constructing is
                    941:         for byte values. So we have to do a conversion for characters whose
                    942:         value is > 127. In fact, there are only two possible starting bytes for
                    943:         characters in the range 128 - 255. */
                    944: 
                    945: #ifdef SUPPORT_UTF8
                    946:         if (utf8)
                    947:           {
                    948:           for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
                    949:           for (c = 128; c < 256; c++)
                    950:             {
                    951:             if ((tcode[c/8] && (1 << (c&7))) != 0)
                    952:               {
                    953:               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
                    954:               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
                    955:               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
                    956:               }
                    957:             }
                    958:           }
                    959: 
                    960:         /* In non-UTF-8 mode, the two bit maps are completely compatible. */
                    961: 
                    962:         else
                    963: #endif
                    964:           {
                    965:           for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
                    966:           }
                    967: 
                    968:         /* Advance past the bit map, and act on what follows */
                    969: 
                    970:         tcode += 32;
                    971:         switch (*tcode)
                    972:           {
                    973:           case OP_CRSTAR:
                    974:           case OP_CRMINSTAR:
                    975:           case OP_CRQUERY:
                    976:           case OP_CRMINQUERY:
                    977:           tcode++;
                    978:           break;
                    979: 
                    980:           case OP_CRRANGE:
                    981:           case OP_CRMINRANGE:
                    982:           if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
                    983:             else try_next = FALSE;
                    984:           break;
                    985: 
                    986:           default:
                    987:           try_next = FALSE;
                    988:           break;
                    989:           }
                    990:         }
                    991:       break; /* End of bitmap class handling */
                    992: 
                    993:       }      /* End of switch */
                    994:     }        /* End of try_next loop */
                    995: 
                    996:   code += GET(code, 1);   /* Advance to next branch */
                    997:   }
                    998: while (*code == OP_ALT);
                    999: return yield;
                   1000: }
                   1001: 
                   1002: 
                   1003: 
                   1004: /*************************************************
                   1005: *          Study a compiled expression           *
                   1006: *************************************************/
                   1007: 
                   1008: /* This function is handed a compiled expression that it must study to produce
                   1009: information that will speed up the matching. It returns a pcre_extra block
                   1010: which then gets handed back to pcre_exec().
                   1011: 
                   1012: Arguments:
                   1013:   re        points to the compiled expression
                   1014:   options   contains option bits
                   1015:   errorptr  points to where to place error messages;
                   1016:             set NULL unless error
                   1017: 
                   1018: Returns:    pointer to a pcre_extra block, with study_data filled in and the
1.3     ! misha    1019:               appropriate flags set;
1.1       misha    1020:             NULL on error or if no optimization possible
                   1021: */
                   1022: 
1.2       misha    1023: PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1.1       misha    1024: pcre_study(const pcre *external_re, int options, const char **errorptr)
                   1025: {
1.3     ! misha    1026: int min;
        !          1027: BOOL bits_set = FALSE;
1.1       misha    1028: uschar start_bits[32];
                   1029: pcre_extra *extra;
                   1030: pcre_study_data *study;
                   1031: const uschar *tables;
                   1032: uschar *code;
                   1033: compile_data compile_block;
                   1034: const real_pcre *re = (const real_pcre *)external_re;
                   1035: 
                   1036: *errorptr = NULL;
                   1037: 
                   1038: if (re == NULL || re->magic_number != MAGIC_NUMBER)
                   1039:   {
                   1040:   *errorptr = "argument is not a compiled regular expression";
                   1041:   return NULL;
                   1042:   }
                   1043: 
                   1044: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
                   1045:   {
                   1046:   *errorptr = "unknown or incorrect option bit(s) set";
                   1047:   return NULL;
                   1048:   }
                   1049: 
                   1050: code = (uschar *)re + re->name_table_offset +
                   1051:   (re->name_count * re->name_entry_size);
                   1052: 
                   1053: /* For an anchored pattern, or an unanchored pattern that has a first char, or
1.3     ! misha    1054: a multiline pattern that matches only at "line starts", there is no point in
        !          1055: seeking a list of starting bytes. */
        !          1056: 
        !          1057: if ((re->options & PCRE_ANCHORED) == 0 &&
        !          1058:     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
        !          1059:   {
        !          1060:   /* Set the character tables in the block that is passed around */
1.1       misha    1061: 
1.3     ! misha    1062:   tables = re->tables;
        !          1063:   if (tables == NULL)
        !          1064:     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
        !          1065:     (void *)(&tables));
        !          1066: 
        !          1067:   compile_block.lcc = tables + lcc_offset;
        !          1068:   compile_block.fcc = tables + fcc_offset;
        !          1069:   compile_block.cbits = tables + cbits_offset;
        !          1070:   compile_block.ctypes = tables + ctypes_offset;
        !          1071: 
        !          1072:   /* See if we can find a fixed set of initial characters for the pattern. */
        !          1073: 
        !          1074:   memset(start_bits, 0, 32 * sizeof(uschar));
        !          1075:   bits_set = set_start_bits(code, start_bits,
        !          1076:     (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
        !          1077:     &compile_block) == SSB_DONE;
        !          1078:   }
        !          1079: 
        !          1080: /* Find the minimum length of subject string. */
        !          1081: 
        !          1082: min = find_minlength(code, code, re->options);
1.1       misha    1083: 
1.3     ! misha    1084: /* Return NULL if no optimization is possible. */
1.1       misha    1085: 
1.3     ! misha    1086: if (!bits_set && min < 0) return NULL;
1.1       misha    1087: 
                   1088: /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
                   1089: the latter, which is pointed to by the former, which may also get additional
                   1090: data set later by the calling program. At the moment, the size of
                   1091: pcre_study_data is fixed. We nevertheless save it in a field for returning via
                   1092: the pcre_fullinfo() function so that if it becomes variable in the future, we
                   1093: don't have to change that code. */
                   1094: 
                   1095: extra = (pcre_extra *)(pcre_malloc)
                   1096:   (sizeof(pcre_extra) + sizeof(pcre_study_data));
                   1097: 
                   1098: if (extra == NULL)
                   1099:   {
                   1100:   *errorptr = "failed to get memory";
                   1101:   return NULL;
                   1102:   }
                   1103: 
                   1104: study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
                   1105: extra->flags = PCRE_EXTRA_STUDY_DATA;
                   1106: extra->study_data = study;
                   1107: 
                   1108: study->size = sizeof(pcre_study_data);
1.3     ! misha    1109: study->flags = 0;
        !          1110: 
        !          1111: if (bits_set)
        !          1112:   {
        !          1113:   study->flags |= PCRE_STUDY_MAPPED;
        !          1114:   memcpy(study->start_bits, start_bits, sizeof(start_bits));
        !          1115:   }
        !          1116: 
        !          1117: if (min >= 0)
        !          1118:   {
        !          1119:   study->flags |= PCRE_STUDY_MINLEN;
        !          1120:   study->minlength = min;
        !          1121:   }
1.1       misha    1122: 
                   1123: return extra;
                   1124: }
                   1125: 
                   1126: /* End of pcre_study.c */
E-mail: