Annotation of parser3/src/pcre/study.c, revision 1.1

1.1     ! paf         1: /*************************************************
        !             2: *      Perl-Compatible Regular Expressions       *
        !             3: *************************************************/
        !             4: 
        !             5: /*
        !             6: This is a library of functions to support regular expressions whose syntax
        !             7: and semantics are as close as possible to those of the Perl 5 language. See
        !             8: the file Tech.Notes for some information on the internals.
        !             9: 
        !            10: Written by: Philip Hazel <ph10@cam.ac.uk>
        !            11: 
        !            12:            Copyright (c) 1997-1999 University of Cambridge
        !            13: 
        !            14: -----------------------------------------------------------------------------
        !            15: Permission is granted to anyone to use this software for any purpose on any
        !            16: computer system, and to redistribute it freely, subject to the following
        !            17: restrictions:
        !            18: 
        !            19: 1. This software is distributed in the hope that it will be useful,
        !            20:    but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            21:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
        !            22: 
        !            23: 2. The origin of this software must not be misrepresented, either by
        !            24:    explicit claim or by omission.
        !            25: 
        !            26: 3. Altered versions must be plainly marked as such, and must not be
        !            27:    misrepresented as being the original software.
        !            28: 
        !            29: 4. If PCRE is embedded in any software that is released under the GNU
        !            30:    General Purpose Licence (GPL), then the terms of that licence shall
        !            31:    supersede any condition above with which it is incompatible.
        !            32: -----------------------------------------------------------------------------
        !            33: */
        !            34: 
        !            35: 
        !            36: /* Include the internals header, which itself includes Standard C headers plus
        !            37: the external pcre header. */
        !            38: 
        !            39: #include "internal.h"
        !            40: 
        !            41: 
        !            42: 
        !            43: /*************************************************
        !            44: *      Set a bit and maybe its alternate case    *
        !            45: *************************************************/
        !            46: 
        !            47: /* Given a character, set its bit in the table, and also the bit for the other
        !            48: version of a letter if we are caseless.
        !            49: 
        !            50: Arguments:
        !            51:   start_bits    points to the bit map
        !            52:   c             is the character
        !            53:   caseless      the caseless flag
        !            54:   cd            the block with char table pointers
        !            55: 
        !            56: Returns:        nothing
        !            57: */
        !            58: 
        !            59: static void
        !            60: set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
        !            61: {
        !            62: start_bits[c/8] |= (1 << (c&7));
        !            63: if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
        !            64:   start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
        !            65: }
        !            66: 
        !            67: 
        !            68: 
        !            69: /*************************************************
        !            70: *          Create bitmap of starting chars       *
        !            71: *************************************************/
        !            72: 
        !            73: /* This function scans a compiled unanchored expression and attempts to build a
        !            74: bitmap of the set of initial characters. If it can't, it returns FALSE. As time
        !            75: goes by, we may be able to get more clever at doing this.
        !            76: 
        !            77: Arguments:
        !            78:   code         points to an expression
        !            79:   start_bits   points to a 32-byte table, initialized to 0
        !            80:   caseless     the current state of the caseless flag
        !            81:   cd           the block with char table pointers
        !            82: 
        !            83: Returns:       TRUE if table built, FALSE otherwise
        !            84: */
        !            85: 
        !            86: static BOOL
        !            87: set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
        !            88:   compile_data *cd)
        !            89: {
        !            90: register int c;
        !            91: 
        !            92: /* This next statement and the later reference to dummy are here in order to
        !            93: trick the optimizer of the IBM C compiler for OS/2 into generating correct
        !            94: code. Apparently IBM isn't going to fix the problem, and we would rather not
        !            95: disable optimization (in this module it actually makes a big difference, and
        !            96: the pcre module can use all the optimization it can get). */
        !            97: 
        !            98: volatile int dummy;
        !            99: 
        !           100: do
        !           101:   {
        !           102:   const uschar *tcode = code + 3;
        !           103:   BOOL try_next = TRUE;
        !           104: 
        !           105:   while (try_next)
        !           106:     {
        !           107:     try_next = FALSE;
        !           108: 
        !           109:     /* If a branch starts with a bracket or a positive lookahead assertion,
        !           110:     recurse to set bits from within them. That's all for this branch. */
        !           111: 
        !           112:     if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
        !           113:       {
        !           114:       if (!set_start_bits(tcode, start_bits, caseless, cd))
        !           115:         return FALSE;
        !           116:       }
        !           117: 
        !           118:     else switch(*tcode)
        !           119:       {
        !           120:       default:
        !           121:       return FALSE;
        !           122: 
        !           123:       /* Skip over lookbehind and negative lookahead assertions */
        !           124: 
        !           125:       case OP_ASSERT_NOT:
        !           126:       case OP_ASSERTBACK:
        !           127:       case OP_ASSERTBACK_NOT:
        !           128:       try_next = TRUE;
        !           129:       do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
        !           130:       tcode += 3;
        !           131:       break;
        !           132: 
        !           133:       /* Skip over an option setting, changing the caseless flag */
        !           134: 
        !           135:       case OP_OPT:
        !           136:       caseless = (tcode[1] & PCRE_CASELESS) != 0;
        !           137:       tcode += 2;
        !           138:       try_next = TRUE;
        !           139:       break;
        !           140: 
        !           141:       /* BRAZERO does the bracket, but carries on. */
        !           142: 
        !           143:       case OP_BRAZERO:
        !           144:       case OP_BRAMINZERO:
        !           145:       if (!set_start_bits(++tcode, start_bits, caseless, cd))
        !           146:         return FALSE;
        !           147:       dummy = 1;
        !           148:       do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
        !           149:       tcode += 3;
        !           150:       try_next = TRUE;
        !           151:       break;
        !           152: 
        !           153:       /* Single-char * or ? sets the bit and tries the next item */
        !           154: 
        !           155:       case OP_STAR:
        !           156:       case OP_MINSTAR:
        !           157:       case OP_QUERY:
        !           158:       case OP_MINQUERY:
        !           159:       set_bit(start_bits, tcode[1], caseless, cd);
        !           160:       tcode += 2;
        !           161:       try_next = TRUE;
        !           162:       break;
        !           163: 
        !           164:       /* Single-char upto sets the bit and tries the next */
        !           165: 
        !           166:       case OP_UPTO:
        !           167:       case OP_MINUPTO:
        !           168:       set_bit(start_bits, tcode[3], caseless, cd);
        !           169:       tcode += 4;
        !           170:       try_next = TRUE;
        !           171:       break;
        !           172: 
        !           173:       /* At least one single char sets the bit and stops */
        !           174: 
        !           175:       case OP_EXACT:       /* Fall through */
        !           176:       tcode++;
        !           177: 
        !           178:       case OP_CHARS:       /* Fall through */
        !           179:       tcode++;
        !           180: 
        !           181:       case OP_PLUS:
        !           182:       case OP_MINPLUS:
        !           183:       set_bit(start_bits, tcode[1], caseless, cd);
        !           184:       break;
        !           185: 
        !           186:       /* Single character type sets the bits and stops */
        !           187: 
        !           188:       case OP_NOT_DIGIT:
        !           189:       for (c = 0; c < 32; c++)
        !           190:         start_bits[c] |= ~cd->cbits[c+cbit_digit];
        !           191:       break;
        !           192: 
        !           193:       case OP_DIGIT:
        !           194:       for (c = 0; c < 32; c++)
        !           195:         start_bits[c] |= cd->cbits[c+cbit_digit];
        !           196:       break;
        !           197: 
        !           198:       case OP_NOT_WHITESPACE:
        !           199:       for (c = 0; c < 32; c++)
        !           200:         start_bits[c] |= ~cd->cbits[c+cbit_space];
        !           201:       break;
        !           202: 
        !           203:       case OP_WHITESPACE:
        !           204:       for (c = 0; c < 32; c++)
        !           205:         start_bits[c] |= cd->cbits[c+cbit_space];
        !           206:       break;
        !           207: 
        !           208:       case OP_NOT_WORDCHAR:
        !           209:       for (c = 0; c < 32; c++)
        !           210:         start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);
        !           211:       break;
        !           212: 
        !           213:       case OP_WORDCHAR:
        !           214:       for (c = 0; c < 32; c++)
        !           215:         start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);
        !           216:       break;
        !           217: 
        !           218:       /* One or more character type fudges the pointer and restarts, knowing
        !           219:       it will hit a single character type and stop there. */
        !           220: 
        !           221:       case OP_TYPEPLUS:
        !           222:       case OP_TYPEMINPLUS:
        !           223:       tcode++;
        !           224:       try_next = TRUE;
        !           225:       break;
        !           226: 
        !           227:       case OP_TYPEEXACT:
        !           228:       tcode += 3;
        !           229:       try_next = TRUE;
        !           230:       break;
        !           231: 
        !           232:       /* Zero or more repeats of character types set the bits and then
        !           233:       try again. */
        !           234: 
        !           235:       case OP_TYPEUPTO:
        !           236:       case OP_TYPEMINUPTO:
        !           237:       tcode += 2;               /* Fall through */
        !           238: 
        !           239:       case OP_TYPESTAR:
        !           240:       case OP_TYPEMINSTAR:
        !           241:       case OP_TYPEQUERY:
        !           242:       case OP_TYPEMINQUERY:
        !           243:       switch(tcode[1])
        !           244:         {
        !           245:         case OP_NOT_DIGIT:
        !           246:         for (c = 0; c < 32; c++)
        !           247:           start_bits[c] |= ~cd->cbits[c+cbit_digit];
        !           248:         break;
        !           249: 
        !           250:         case OP_DIGIT:
        !           251:         for (c = 0; c < 32; c++)
        !           252:           start_bits[c] |= cd->cbits[c+cbit_digit];
        !           253:         break;
        !           254: 
        !           255:         case OP_NOT_WHITESPACE:
        !           256:         for (c = 0; c < 32; c++)
        !           257:           start_bits[c] |= ~cd->cbits[c+cbit_space];
        !           258:         break;
        !           259: 
        !           260:         case OP_WHITESPACE:
        !           261:         for (c = 0; c < 32; c++)
        !           262:           start_bits[c] |= cd->cbits[c+cbit_space];
        !           263:         break;
        !           264: 
        !           265:         case OP_NOT_WORDCHAR:
        !           266:         for (c = 0; c < 32; c++)
        !           267:           start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);
        !           268:         break;
        !           269: 
        !           270:         case OP_WORDCHAR:
        !           271:         for (c = 0; c < 32; c++)
        !           272:           start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);
        !           273:         break;
        !           274:         }
        !           275: 
        !           276:       tcode += 2;
        !           277:       try_next = TRUE;
        !           278:       break;
        !           279: 
        !           280:       /* Character class: set the bits and either carry on or not,
        !           281:       according to the repeat count. */
        !           282: 
        !           283:       case OP_CLASS:
        !           284:         {
        !           285:         tcode++;
        !           286:         for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
        !           287:         tcode += 32;
        !           288:         switch (*tcode)
        !           289:           {
        !           290:           case OP_CRSTAR:
        !           291:           case OP_CRMINSTAR:
        !           292:           case OP_CRQUERY:
        !           293:           case OP_CRMINQUERY:
        !           294:           tcode++;
        !           295:           try_next = TRUE;
        !           296:           break;
        !           297: 
        !           298:           case OP_CRRANGE:
        !           299:           case OP_CRMINRANGE:
        !           300:           if (((tcode[1] << 8) + tcode[2]) == 0)
        !           301:             {
        !           302:             tcode += 5;
        !           303:             try_next = TRUE;
        !           304:             }
        !           305:           break;
        !           306:           }
        !           307:         }
        !           308:       break; /* End of class handling */
        !           309: 
        !           310:       }      /* End of switch */
        !           311:     }        /* End of try_next loop */
        !           312: 
        !           313:   code += (code[1] << 8) + code[2];   /* Advance to next branch */
        !           314:   }
        !           315: while (*code == OP_ALT);
        !           316: return TRUE;
        !           317: }
        !           318: 
        !           319: 
        !           320: 
        !           321: /*************************************************
        !           322: *          Study a compiled expression           *
        !           323: *************************************************/
        !           324: 
        !           325: /* This function is handed a compiled expression that it must study to produce
        !           326: information that will speed up the matching. It returns a pcre_extra block
        !           327: which then gets handed back to pcre_exec().
        !           328: 
        !           329: Arguments:
        !           330:   re        points to the compiled expression
        !           331:   options   contains option bits
        !           332:   errorptr  points to where to place error messages;
        !           333:             set NULL unless error
        !           334: 
        !           335: Returns:    pointer to a pcre_extra block,
        !           336:             NULL on error or if no optimization possible
        !           337: */
        !           338: 
        !           339: pcre_extra *
        !           340: pcre_study(const pcre *external_re, int options, const char **errorptr)
        !           341: {
        !           342: uschar start_bits[32];
        !           343: real_pcre_extra *extra;
        !           344: const real_pcre *re = (const real_pcre *)external_re;
        !           345: compile_data compile_block;
        !           346: 
        !           347: *errorptr = NULL;
        !           348: 
        !           349: if (re == NULL || re->magic_number != MAGIC_NUMBER)
        !           350:   {
        !           351:   *errorptr = "argument is not a compiled regular expression";
        !           352:   return NULL;
        !           353:   }
        !           354: 
        !           355: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
        !           356:   {
        !           357:   *errorptr = "unknown or incorrect option bit(s) set";
        !           358:   return NULL;
        !           359:   }
        !           360: 
        !           361: /* For an anchored pattern, or an unchored pattern that has a first char, or a
        !           362: multiline pattern that matches only at "line starts", no further processing at
        !           363: present. */
        !           364: 
        !           365: if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
        !           366:   return NULL;
        !           367: 
        !           368: /* Set the character tables in the block which is passed around */
        !           369: 
        !           370: compile_block.lcc = re->tables + lcc_offset;
        !           371: compile_block.fcc = re->tables + fcc_offset;
        !           372: compile_block.cbits = re->tables + cbits_offset;
        !           373: compile_block.ctypes = re->tables + ctypes_offset;
        !           374: 
        !           375: /* See if we can find a fixed set of initial characters for the pattern. */
        !           376: 
        !           377: memset(start_bits, 0, 32 * sizeof(uschar));
        !           378: if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,
        !           379:   &compile_block)) return NULL;
        !           380: 
        !           381: /* Get an "extra" block and put the information therein. */
        !           382: 
        !           383: extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
        !           384: 
        !           385: if (extra == NULL)
        !           386:   {
        !           387:   *errorptr = "failed to get memory";
        !           388:   return NULL;
        !           389:   }
        !           390: 
        !           391: extra->options = PCRE_STUDY_MAPPED;
        !           392: memcpy(extra->start_bits, start_bits, sizeof(start_bits));
        !           393: 
        !           394: return (pcre_extra *)extra;
        !           395: }
        !           396: 
        !           397: /* End of study.c */

E-mail: