Annotation of parser3/src/pcre/study.c, revision 1.1
1.1 ! paf 1: /*************************************************
! 2: * Perl-Compatible Regular Expressions *
! 3: *************************************************/
! 4:
! 5: /*
! 6: This is a library of functions to support regular expressions whose syntax
! 7: and semantics are as close as possible to those of the Perl 5 language. See
! 8: the file Tech.Notes for some information on the internals.
! 9:
! 10: Written by: Philip Hazel <ph10@cam.ac.uk>
! 11:
! 12: Copyright (c) 1997-1999 University of Cambridge
! 13:
! 14: -----------------------------------------------------------------------------
! 15: Permission is granted to anyone to use this software for any purpose on any
! 16: computer system, and to redistribute it freely, subject to the following
! 17: restrictions:
! 18:
! 19: 1. This software is distributed in the hope that it will be useful,
! 20: but WITHOUT ANY WARRANTY; without even the implied warranty of
! 21: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
! 22:
! 23: 2. The origin of this software must not be misrepresented, either by
! 24: explicit claim or by omission.
! 25:
! 26: 3. Altered versions must be plainly marked as such, and must not be
! 27: misrepresented as being the original software.
! 28:
! 29: 4. If PCRE is embedded in any software that is released under the GNU
! 30: General Purpose Licence (GPL), then the terms of that licence shall
! 31: supersede any condition above with which it is incompatible.
! 32: -----------------------------------------------------------------------------
! 33: */
! 34:
! 35:
! 36: /* Include the internals header, which itself includes Standard C headers plus
! 37: the external pcre header. */
! 38:
! 39: #include "internal.h"
! 40:
! 41:
! 42:
! 43: /*************************************************
! 44: * Set a bit and maybe its alternate case *
! 45: *************************************************/
! 46:
! 47: /* Given a character, set its bit in the table, and also the bit for the other
! 48: version of a letter if we are caseless.
! 49:
! 50: Arguments:
! 51: start_bits points to the bit map
! 52: c is the character
! 53: caseless the caseless flag
! 54: cd the block with char table pointers
! 55:
! 56: Returns: nothing
! 57: */
! 58:
! 59: static void
! 60: set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
! 61: {
! 62: start_bits[c/8] |= (1 << (c&7));
! 63: if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
! 64: start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
! 65: }
! 66:
! 67:
! 68:
! 69: /*************************************************
! 70: * Create bitmap of starting chars *
! 71: *************************************************/
! 72:
! 73: /* This function scans a compiled unanchored expression and attempts to build a
! 74: bitmap of the set of initial characters. If it can't, it returns FALSE. As time
! 75: goes by, we may be able to get more clever at doing this.
! 76:
! 77: Arguments:
! 78: code points to an expression
! 79: start_bits points to a 32-byte table, initialized to 0
! 80: caseless the current state of the caseless flag
! 81: cd the block with char table pointers
! 82:
! 83: Returns: TRUE if table built, FALSE otherwise
! 84: */
! 85:
! 86: static BOOL
! 87: set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
! 88: compile_data *cd)
! 89: {
! 90: register int c;
! 91:
! 92: /* This next statement and the later reference to dummy are here in order to
! 93: trick the optimizer of the IBM C compiler for OS/2 into generating correct
! 94: code. Apparently IBM isn't going to fix the problem, and we would rather not
! 95: disable optimization (in this module it actually makes a big difference, and
! 96: the pcre module can use all the optimization it can get). */
! 97:
! 98: volatile int dummy;
! 99:
! 100: do
! 101: {
! 102: const uschar *tcode = code + 3;
! 103: BOOL try_next = TRUE;
! 104:
! 105: while (try_next)
! 106: {
! 107: try_next = FALSE;
! 108:
! 109: /* If a branch starts with a bracket or a positive lookahead assertion,
! 110: recurse to set bits from within them. That's all for this branch. */
! 111:
! 112: if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
! 113: {
! 114: if (!set_start_bits(tcode, start_bits, caseless, cd))
! 115: return FALSE;
! 116: }
! 117:
! 118: else switch(*tcode)
! 119: {
! 120: default:
! 121: return FALSE;
! 122:
! 123: /* Skip over lookbehind and negative lookahead assertions */
! 124:
! 125: case OP_ASSERT_NOT:
! 126: case OP_ASSERTBACK:
! 127: case OP_ASSERTBACK_NOT:
! 128: try_next = TRUE;
! 129: do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
! 130: tcode += 3;
! 131: break;
! 132:
! 133: /* Skip over an option setting, changing the caseless flag */
! 134:
! 135: case OP_OPT:
! 136: caseless = (tcode[1] & PCRE_CASELESS) != 0;
! 137: tcode += 2;
! 138: try_next = TRUE;
! 139: break;
! 140:
! 141: /* BRAZERO does the bracket, but carries on. */
! 142:
! 143: case OP_BRAZERO:
! 144: case OP_BRAMINZERO:
! 145: if (!set_start_bits(++tcode, start_bits, caseless, cd))
! 146: return FALSE;
! 147: dummy = 1;
! 148: do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
! 149: tcode += 3;
! 150: try_next = TRUE;
! 151: break;
! 152:
! 153: /* Single-char * or ? sets the bit and tries the next item */
! 154:
! 155: case OP_STAR:
! 156: case OP_MINSTAR:
! 157: case OP_QUERY:
! 158: case OP_MINQUERY:
! 159: set_bit(start_bits, tcode[1], caseless, cd);
! 160: tcode += 2;
! 161: try_next = TRUE;
! 162: break;
! 163:
! 164: /* Single-char upto sets the bit and tries the next */
! 165:
! 166: case OP_UPTO:
! 167: case OP_MINUPTO:
! 168: set_bit(start_bits, tcode[3], caseless, cd);
! 169: tcode += 4;
! 170: try_next = TRUE;
! 171: break;
! 172:
! 173: /* At least one single char sets the bit and stops */
! 174:
! 175: case OP_EXACT: /* Fall through */
! 176: tcode++;
! 177:
! 178: case OP_CHARS: /* Fall through */
! 179: tcode++;
! 180:
! 181: case OP_PLUS:
! 182: case OP_MINPLUS:
! 183: set_bit(start_bits, tcode[1], caseless, cd);
! 184: break;
! 185:
! 186: /* Single character type sets the bits and stops */
! 187:
! 188: case OP_NOT_DIGIT:
! 189: for (c = 0; c < 32; c++)
! 190: start_bits[c] |= ~cd->cbits[c+cbit_digit];
! 191: break;
! 192:
! 193: case OP_DIGIT:
! 194: for (c = 0; c < 32; c++)
! 195: start_bits[c] |= cd->cbits[c+cbit_digit];
! 196: break;
! 197:
! 198: case OP_NOT_WHITESPACE:
! 199: for (c = 0; c < 32; c++)
! 200: start_bits[c] |= ~cd->cbits[c+cbit_space];
! 201: break;
! 202:
! 203: case OP_WHITESPACE:
! 204: for (c = 0; c < 32; c++)
! 205: start_bits[c] |= cd->cbits[c+cbit_space];
! 206: break;
! 207:
! 208: case OP_NOT_WORDCHAR:
! 209: for (c = 0; c < 32; c++)
! 210: start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);
! 211: break;
! 212:
! 213: case OP_WORDCHAR:
! 214: for (c = 0; c < 32; c++)
! 215: start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);
! 216: break;
! 217:
! 218: /* One or more character type fudges the pointer and restarts, knowing
! 219: it will hit a single character type and stop there. */
! 220:
! 221: case OP_TYPEPLUS:
! 222: case OP_TYPEMINPLUS:
! 223: tcode++;
! 224: try_next = TRUE;
! 225: break;
! 226:
! 227: case OP_TYPEEXACT:
! 228: tcode += 3;
! 229: try_next = TRUE;
! 230: break;
! 231:
! 232: /* Zero or more repeats of character types set the bits and then
! 233: try again. */
! 234:
! 235: case OP_TYPEUPTO:
! 236: case OP_TYPEMINUPTO:
! 237: tcode += 2; /* Fall through */
! 238:
! 239: case OP_TYPESTAR:
! 240: case OP_TYPEMINSTAR:
! 241: case OP_TYPEQUERY:
! 242: case OP_TYPEMINQUERY:
! 243: switch(tcode[1])
! 244: {
! 245: case OP_NOT_DIGIT:
! 246: for (c = 0; c < 32; c++)
! 247: start_bits[c] |= ~cd->cbits[c+cbit_digit];
! 248: break;
! 249:
! 250: case OP_DIGIT:
! 251: for (c = 0; c < 32; c++)
! 252: start_bits[c] |= cd->cbits[c+cbit_digit];
! 253: break;
! 254:
! 255: case OP_NOT_WHITESPACE:
! 256: for (c = 0; c < 32; c++)
! 257: start_bits[c] |= ~cd->cbits[c+cbit_space];
! 258: break;
! 259:
! 260: case OP_WHITESPACE:
! 261: for (c = 0; c < 32; c++)
! 262: start_bits[c] |= cd->cbits[c+cbit_space];
! 263: break;
! 264:
! 265: case OP_NOT_WORDCHAR:
! 266: for (c = 0; c < 32; c++)
! 267: start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);
! 268: break;
! 269:
! 270: case OP_WORDCHAR:
! 271: for (c = 0; c < 32; c++)
! 272: start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);
! 273: break;
! 274: }
! 275:
! 276: tcode += 2;
! 277: try_next = TRUE;
! 278: break;
! 279:
! 280: /* Character class: set the bits and either carry on or not,
! 281: according to the repeat count. */
! 282:
! 283: case OP_CLASS:
! 284: {
! 285: tcode++;
! 286: for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
! 287: tcode += 32;
! 288: switch (*tcode)
! 289: {
! 290: case OP_CRSTAR:
! 291: case OP_CRMINSTAR:
! 292: case OP_CRQUERY:
! 293: case OP_CRMINQUERY:
! 294: tcode++;
! 295: try_next = TRUE;
! 296: break;
! 297:
! 298: case OP_CRRANGE:
! 299: case OP_CRMINRANGE:
! 300: if (((tcode[1] << 8) + tcode[2]) == 0)
! 301: {
! 302: tcode += 5;
! 303: try_next = TRUE;
! 304: }
! 305: break;
! 306: }
! 307: }
! 308: break; /* End of class handling */
! 309:
! 310: } /* End of switch */
! 311: } /* End of try_next loop */
! 312:
! 313: code += (code[1] << 8) + code[2]; /* Advance to next branch */
! 314: }
! 315: while (*code == OP_ALT);
! 316: return TRUE;
! 317: }
! 318:
! 319:
! 320:
! 321: /*************************************************
! 322: * Study a compiled expression *
! 323: *************************************************/
! 324:
! 325: /* This function is handed a compiled expression that it must study to produce
! 326: information that will speed up the matching. It returns a pcre_extra block
! 327: which then gets handed back to pcre_exec().
! 328:
! 329: Arguments:
! 330: re points to the compiled expression
! 331: options contains option bits
! 332: errorptr points to where to place error messages;
! 333: set NULL unless error
! 334:
! 335: Returns: pointer to a pcre_extra block,
! 336: NULL on error or if no optimization possible
! 337: */
! 338:
! 339: pcre_extra *
! 340: pcre_study(const pcre *external_re, int options, const char **errorptr)
! 341: {
! 342: uschar start_bits[32];
! 343: real_pcre_extra *extra;
! 344: const real_pcre *re = (const real_pcre *)external_re;
! 345: compile_data compile_block;
! 346:
! 347: *errorptr = NULL;
! 348:
! 349: if (re == NULL || re->magic_number != MAGIC_NUMBER)
! 350: {
! 351: *errorptr = "argument is not a compiled regular expression";
! 352: return NULL;
! 353: }
! 354:
! 355: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
! 356: {
! 357: *errorptr = "unknown or incorrect option bit(s) set";
! 358: return NULL;
! 359: }
! 360:
! 361: /* For an anchored pattern, or an unchored pattern that has a first char, or a
! 362: multiline pattern that matches only at "line starts", no further processing at
! 363: present. */
! 364:
! 365: if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
! 366: return NULL;
! 367:
! 368: /* Set the character tables in the block which is passed around */
! 369:
! 370: compile_block.lcc = re->tables + lcc_offset;
! 371: compile_block.fcc = re->tables + fcc_offset;
! 372: compile_block.cbits = re->tables + cbits_offset;
! 373: compile_block.ctypes = re->tables + ctypes_offset;
! 374:
! 375: /* See if we can find a fixed set of initial characters for the pattern. */
! 376:
! 377: memset(start_bits, 0, 32 * sizeof(uschar));
! 378: if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,
! 379: &compile_block)) return NULL;
! 380:
! 381: /* Get an "extra" block and put the information therein. */
! 382:
! 383: extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
! 384:
! 385: if (extra == NULL)
! 386: {
! 387: *errorptr = "failed to get memory";
! 388: return NULL;
! 389: }
! 390:
! 391: extra->options = PCRE_STUDY_MAPPED;
! 392: memcpy(extra->start_bits, start_bits, sizeof(start_bits));
! 393:
! 394: return (pcre_extra *)extra;
! 395: }
! 396:
! 397: /* End of study.c */
E-mail: