Annotation of win32/pcre/pcre_study.c, revision 1.1
1.1 ! misha 1: /*************************************************
! 2: * Perl-Compatible Regular Expressions *
! 3: *************************************************/
! 4:
! 5: /* PCRE is a library of functions to support regular expressions whose syntax
! 6: and semantics are as close as possible to those of the Perl 5 language.
! 7:
! 8: Written by Philip Hazel
! 9: Copyright (c) 1997-2008 University of Cambridge
! 10:
! 11: -----------------------------------------------------------------------------
! 12: Redistribution and use in source and binary forms, with or without
! 13: modification, are permitted provided that the following conditions are met:
! 14:
! 15: * Redistributions of source code must retain the above copyright notice,
! 16: this list of conditions and the following disclaimer.
! 17:
! 18: * Redistributions in binary form must reproduce the above copyright
! 19: notice, this list of conditions and the following disclaimer in the
! 20: documentation and/or other materials provided with the distribution.
! 21:
! 22: * Neither the name of the University of Cambridge nor the names of its
! 23: contributors may be used to endorse or promote products derived from
! 24: this software without specific prior written permission.
! 25:
! 26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
! 27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
! 28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
! 29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
! 30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! 31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! 32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! 33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! 34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! 35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! 36: POSSIBILITY OF SUCH DAMAGE.
! 37: -----------------------------------------------------------------------------
! 38: */
! 39:
! 40:
! 41: /* This module contains the external function pcre_study(), along with local
! 42: supporting functions. */
! 43:
! 44:
! 45: #ifdef HAVE_CONFIG_H
! 46: #include "config.h"
! 47: #endif
! 48:
! 49: #include "pcre_internal.h"
! 50:
! 51:
! 52: /* Returns from set_start_bits() */
! 53:
! 54: enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
! 55:
! 56:
! 57: /*************************************************
! 58: * Set a bit and maybe its alternate case *
! 59: *************************************************/
! 60:
! 61: /* Given a character, set its bit in the table, and also the bit for the other
! 62: version of a letter if we are caseless.
! 63:
! 64: Arguments:
! 65: start_bits points to the bit map
! 66: c is the character
! 67: caseless the caseless flag
! 68: cd the block with char table pointers
! 69:
! 70: Returns: nothing
! 71: */
! 72:
! 73: static void
! 74: set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
! 75: {
! 76: start_bits[c/8] |= (1 << (c&7));
! 77: if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
! 78: start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
! 79: }
! 80:
! 81:
! 82:
! 83: /*************************************************
! 84: * Create bitmap of starting bytes *
! 85: *************************************************/
! 86:
! 87: /* This function scans a compiled unanchored expression recursively and
! 88: attempts to build a bitmap of the set of possible starting bytes. As time goes
! 89: by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
! 90: useful for parenthesized groups in patterns such as (a*)b where the group
! 91: provides some optional starting bytes but scanning must continue at the outer
! 92: level to find at least one mandatory byte. At the outermost level, this
! 93: function fails unless the result is SSB_DONE.
! 94:
! 95: Arguments:
! 96: code points to an expression
! 97: start_bits points to a 32-byte table, initialized to 0
! 98: caseless the current state of the caseless flag
! 99: utf8 TRUE if in UTF-8 mode
! 100: cd the block with char table pointers
! 101:
! 102: Returns: SSB_FAIL => Failed to find any starting bytes
! 103: SSB_DONE => Found mandatory starting bytes
! 104: SSB_CONTINUE => Found optional starting bytes
! 105: */
! 106:
! 107: static int
! 108: set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
! 109: BOOL utf8, compile_data *cd)
! 110: {
! 111: register int c;
! 112: int yield = SSB_DONE;
! 113:
! 114: #if 0
! 115: /* ========================================================================= */
! 116: /* The following comment and code was inserted in January 1999. In May 2006,
! 117: when it was observed to cause compiler warnings about unused values, I took it
! 118: out again. If anybody is still using OS/2, they will have to put it back
! 119: manually. */
! 120:
! 121: /* This next statement and the later reference to dummy are here in order to
! 122: trick the optimizer of the IBM C compiler for OS/2 into generating correct
! 123: code. Apparently IBM isn't going to fix the problem, and we would rather not
! 124: disable optimization (in this module it actually makes a big difference, and
! 125: the pcre module can use all the optimization it can get). */
! 126:
! 127: volatile int dummy;
! 128: /* ========================================================================= */
! 129: #endif
! 130:
! 131: do
! 132: {
! 133: const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
! 134: BOOL try_next = TRUE;
! 135:
! 136: while (try_next) /* Loop for items in this branch */
! 137: {
! 138: int rc;
! 139: switch(*tcode)
! 140: {
! 141: /* Fail if we reach something we don't understand */
! 142:
! 143: default:
! 144: return SSB_FAIL;
! 145:
! 146: /* If we hit a bracket or a positive lookahead assertion, recurse to set
! 147: bits from within the subpattern. If it can't find anything, we have to
! 148: give up. If it finds some mandatory character(s), we are done for this
! 149: branch. Otherwise, carry on scanning after the subpattern. */
! 150:
! 151: case OP_BRA:
! 152: case OP_SBRA:
! 153: case OP_CBRA:
! 154: case OP_SCBRA:
! 155: case OP_ONCE:
! 156: case OP_ASSERT:
! 157: rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
! 158: if (rc == SSB_FAIL) return SSB_FAIL;
! 159: if (rc == SSB_DONE) try_next = FALSE; else
! 160: {
! 161: do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
! 162: tcode += 1 + LINK_SIZE;
! 163: }
! 164: break;
! 165:
! 166: /* If we hit ALT or KET, it means we haven't found anything mandatory in
! 167: this branch, though we might have found something optional. For ALT, we
! 168: continue with the next alternative, but we have to arrange that the final
! 169: result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
! 170: return SSB_CONTINUE: if this is the top level, that indicates failure,
! 171: but after a nested subpattern, it causes scanning to continue. */
! 172:
! 173: case OP_ALT:
! 174: yield = SSB_CONTINUE;
! 175: try_next = FALSE;
! 176: break;
! 177:
! 178: case OP_KET:
! 179: case OP_KETRMAX:
! 180: case OP_KETRMIN:
! 181: return SSB_CONTINUE;
! 182:
! 183: /* Skip over callout */
! 184:
! 185: case OP_CALLOUT:
! 186: tcode += 2 + 2*LINK_SIZE;
! 187: break;
! 188:
! 189: /* Skip over lookbehind and negative lookahead assertions */
! 190:
! 191: case OP_ASSERT_NOT:
! 192: case OP_ASSERTBACK:
! 193: case OP_ASSERTBACK_NOT:
! 194: do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
! 195: tcode += 1 + LINK_SIZE;
! 196: break;
! 197:
! 198: /* Skip over an option setting, changing the caseless flag */
! 199:
! 200: case OP_OPT:
! 201: caseless = (tcode[1] & PCRE_CASELESS) != 0;
! 202: tcode += 2;
! 203: break;
! 204:
! 205: /* BRAZERO does the bracket, but carries on. */
! 206:
! 207: case OP_BRAZERO:
! 208: case OP_BRAMINZERO:
! 209: if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
! 210: return SSB_FAIL;
! 211: /* =========================================================================
! 212: See the comment at the head of this function concerning the next line,
! 213: which was an old fudge for the benefit of OS/2.
! 214: dummy = 1;
! 215: ========================================================================= */
! 216: do tcode += GET(tcode,1); while (*tcode == OP_ALT);
! 217: tcode += 1 + LINK_SIZE;
! 218: break;
! 219:
! 220: /* SKIPZERO skips the bracket. */
! 221:
! 222: case OP_SKIPZERO:
! 223: do tcode += GET(tcode,1); while (*tcode == OP_ALT);
! 224: tcode += 1 + LINK_SIZE;
! 225: break;
! 226:
! 227: /* Single-char * or ? sets the bit and tries the next item */
! 228:
! 229: case OP_STAR:
! 230: case OP_MINSTAR:
! 231: case OP_POSSTAR:
! 232: case OP_QUERY:
! 233: case OP_MINQUERY:
! 234: case OP_POSQUERY:
! 235: set_bit(start_bits, tcode[1], caseless, cd);
! 236: tcode += 2;
! 237: #ifdef SUPPORT_UTF8
! 238: if (utf8 && tcode[-1] >= 0xc0)
! 239: tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
! 240: #endif
! 241: break;
! 242:
! 243: /* Single-char upto sets the bit and tries the next */
! 244:
! 245: case OP_UPTO:
! 246: case OP_MINUPTO:
! 247: case OP_POSUPTO:
! 248: set_bit(start_bits, tcode[3], caseless, cd);
! 249: tcode += 4;
! 250: #ifdef SUPPORT_UTF8
! 251: if (utf8 && tcode[-1] >= 0xc0)
! 252: tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
! 253: #endif
! 254: break;
! 255:
! 256: /* At least one single char sets the bit and stops */
! 257:
! 258: case OP_EXACT: /* Fall through */
! 259: tcode += 2;
! 260:
! 261: case OP_CHAR:
! 262: case OP_CHARNC:
! 263: case OP_PLUS:
! 264: case OP_MINPLUS:
! 265: case OP_POSPLUS:
! 266: set_bit(start_bits, tcode[1], caseless, cd);
! 267: try_next = FALSE;
! 268: break;
! 269:
! 270: /* Single character type sets the bits and stops */
! 271:
! 272: case OP_NOT_DIGIT:
! 273: for (c = 0; c < 32; c++)
! 274: start_bits[c] |= ~cd->cbits[c+cbit_digit];
! 275: try_next = FALSE;
! 276: break;
! 277:
! 278: case OP_DIGIT:
! 279: for (c = 0; c < 32; c++)
! 280: start_bits[c] |= cd->cbits[c+cbit_digit];
! 281: try_next = FALSE;
! 282: break;
! 283:
! 284: /* The cbit_space table has vertical tab as whitespace; we have to
! 285: discard it. */
! 286:
! 287: case OP_NOT_WHITESPACE:
! 288: for (c = 0; c < 32; c++)
! 289: {
! 290: int d = cd->cbits[c+cbit_space];
! 291: if (c == 1) d &= ~0x08;
! 292: start_bits[c] |= ~d;
! 293: }
! 294: try_next = FALSE;
! 295: break;
! 296:
! 297: /* The cbit_space table has vertical tab as whitespace; we have to
! 298: discard it. */
! 299:
! 300: case OP_WHITESPACE:
! 301: for (c = 0; c < 32; c++)
! 302: {
! 303: int d = cd->cbits[c+cbit_space];
! 304: if (c == 1) d &= ~0x08;
! 305: start_bits[c] |= d;
! 306: }
! 307: try_next = FALSE;
! 308: break;
! 309:
! 310: case OP_NOT_WORDCHAR:
! 311: for (c = 0; c < 32; c++)
! 312: start_bits[c] |= ~cd->cbits[c+cbit_word];
! 313: try_next = FALSE;
! 314: break;
! 315:
! 316: case OP_WORDCHAR:
! 317: for (c = 0; c < 32; c++)
! 318: start_bits[c] |= cd->cbits[c+cbit_word];
! 319: try_next = FALSE;
! 320: break;
! 321:
! 322: /* One or more character type fudges the pointer and restarts, knowing
! 323: it will hit a single character type and stop there. */
! 324:
! 325: case OP_TYPEPLUS:
! 326: case OP_TYPEMINPLUS:
! 327: tcode++;
! 328: break;
! 329:
! 330: case OP_TYPEEXACT:
! 331: tcode += 3;
! 332: break;
! 333:
! 334: /* Zero or more repeats of character types set the bits and then
! 335: try again. */
! 336:
! 337: case OP_TYPEUPTO:
! 338: case OP_TYPEMINUPTO:
! 339: case OP_TYPEPOSUPTO:
! 340: tcode += 2; /* Fall through */
! 341:
! 342: case OP_TYPESTAR:
! 343: case OP_TYPEMINSTAR:
! 344: case OP_TYPEPOSSTAR:
! 345: case OP_TYPEQUERY:
! 346: case OP_TYPEMINQUERY:
! 347: case OP_TYPEPOSQUERY:
! 348: switch(tcode[1])
! 349: {
! 350: case OP_ANY:
! 351: case OP_ALLANY:
! 352: return SSB_FAIL;
! 353:
! 354: case OP_NOT_DIGIT:
! 355: for (c = 0; c < 32; c++)
! 356: start_bits[c] |= ~cd->cbits[c+cbit_digit];
! 357: break;
! 358:
! 359: case OP_DIGIT:
! 360: for (c = 0; c < 32; c++)
! 361: start_bits[c] |= cd->cbits[c+cbit_digit];
! 362: break;
! 363:
! 364: /* The cbit_space table has vertical tab as whitespace; we have to
! 365: discard it. */
! 366:
! 367: case OP_NOT_WHITESPACE:
! 368: for (c = 0; c < 32; c++)
! 369: {
! 370: int d = cd->cbits[c+cbit_space];
! 371: if (c == 1) d &= ~0x08;
! 372: start_bits[c] |= ~d;
! 373: }
! 374: break;
! 375:
! 376: /* The cbit_space table has vertical tab as whitespace; we have to
! 377: discard it. */
! 378:
! 379: case OP_WHITESPACE:
! 380: for (c = 0; c < 32; c++)
! 381: {
! 382: int d = cd->cbits[c+cbit_space];
! 383: if (c == 1) d &= ~0x08;
! 384: start_bits[c] |= d;
! 385: }
! 386: break;
! 387:
! 388: case OP_NOT_WORDCHAR:
! 389: for (c = 0; c < 32; c++)
! 390: start_bits[c] |= ~cd->cbits[c+cbit_word];
! 391: break;
! 392:
! 393: case OP_WORDCHAR:
! 394: for (c = 0; c < 32; c++)
! 395: start_bits[c] |= cd->cbits[c+cbit_word];
! 396: break;
! 397: }
! 398:
! 399: tcode += 2;
! 400: break;
! 401:
! 402: /* Character class where all the information is in a bit map: set the
! 403: bits and either carry on or not, according to the repeat count. If it was
! 404: a negative class, and we are operating with UTF-8 characters, any byte
! 405: with a value >= 0xc4 is a potentially valid starter because it starts a
! 406: character with a value > 255. */
! 407:
! 408: case OP_NCLASS:
! 409: #ifdef SUPPORT_UTF8
! 410: if (utf8)
! 411: {
! 412: start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
! 413: memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
! 414: }
! 415: #endif
! 416: /* Fall through */
! 417:
! 418: case OP_CLASS:
! 419: {
! 420: tcode++;
! 421:
! 422: /* In UTF-8 mode, the bits in a bit map correspond to character
! 423: values, not to byte values. However, the bit map we are constructing is
! 424: for byte values. So we have to do a conversion for characters whose
! 425: value is > 127. In fact, there are only two possible starting bytes for
! 426: characters in the range 128 - 255. */
! 427:
! 428: #ifdef SUPPORT_UTF8
! 429: if (utf8)
! 430: {
! 431: for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
! 432: for (c = 128; c < 256; c++)
! 433: {
! 434: if ((tcode[c/8] && (1 << (c&7))) != 0)
! 435: {
! 436: int d = (c >> 6) | 0xc0; /* Set bit for this starter */
! 437: start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
! 438: c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
! 439: }
! 440: }
! 441: }
! 442:
! 443: /* In non-UTF-8 mode, the two bit maps are completely compatible. */
! 444:
! 445: else
! 446: #endif
! 447: {
! 448: for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
! 449: }
! 450:
! 451: /* Advance past the bit map, and act on what follows */
! 452:
! 453: tcode += 32;
! 454: switch (*tcode)
! 455: {
! 456: case OP_CRSTAR:
! 457: case OP_CRMINSTAR:
! 458: case OP_CRQUERY:
! 459: case OP_CRMINQUERY:
! 460: tcode++;
! 461: break;
! 462:
! 463: case OP_CRRANGE:
! 464: case OP_CRMINRANGE:
! 465: if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
! 466: else try_next = FALSE;
! 467: break;
! 468:
! 469: default:
! 470: try_next = FALSE;
! 471: break;
! 472: }
! 473: }
! 474: break; /* End of bitmap class handling */
! 475:
! 476: } /* End of switch */
! 477: } /* End of try_next loop */
! 478:
! 479: code += GET(code, 1); /* Advance to next branch */
! 480: }
! 481: while (*code == OP_ALT);
! 482: return yield;
! 483: }
! 484:
! 485:
! 486:
! 487: /*************************************************
! 488: * Study a compiled expression *
! 489: *************************************************/
! 490:
! 491: /* This function is handed a compiled expression that it must study to produce
! 492: information that will speed up the matching. It returns a pcre_extra block
! 493: which then gets handed back to pcre_exec().
! 494:
! 495: Arguments:
! 496: re points to the compiled expression
! 497: options contains option bits
! 498: errorptr points to where to place error messages;
! 499: set NULL unless error
! 500:
! 501: Returns: pointer to a pcre_extra block, with study_data filled in and the
! 502: appropriate flag set;
! 503: NULL on error or if no optimization possible
! 504: */
! 505:
! 506: PCRE_EXP_DEFN pcre_extra *
! 507: pcre_study(const pcre *external_re, int options, const char **errorptr)
! 508: {
! 509: uschar start_bits[32];
! 510: pcre_extra *extra;
! 511: pcre_study_data *study;
! 512: const uschar *tables;
! 513: uschar *code;
! 514: compile_data compile_block;
! 515: const real_pcre *re = (const real_pcre *)external_re;
! 516:
! 517: *errorptr = NULL;
! 518:
! 519: if (re == NULL || re->magic_number != MAGIC_NUMBER)
! 520: {
! 521: *errorptr = "argument is not a compiled regular expression";
! 522: return NULL;
! 523: }
! 524:
! 525: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
! 526: {
! 527: *errorptr = "unknown or incorrect option bit(s) set";
! 528: return NULL;
! 529: }
! 530:
! 531: code = (uschar *)re + re->name_table_offset +
! 532: (re->name_count * re->name_entry_size);
! 533:
! 534: /* For an anchored pattern, or an unanchored pattern that has a first char, or
! 535: a multiline pattern that matches only at "line starts", no further processing
! 536: at present. */
! 537:
! 538: if ((re->options & PCRE_ANCHORED) != 0 ||
! 539: (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
! 540: return NULL;
! 541:
! 542: /* Set the character tables in the block that is passed around */
! 543:
! 544: tables = re->tables;
! 545: if (tables == NULL)
! 546: (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
! 547: (void *)(&tables));
! 548:
! 549: compile_block.lcc = tables + lcc_offset;
! 550: compile_block.fcc = tables + fcc_offset;
! 551: compile_block.cbits = tables + cbits_offset;
! 552: compile_block.ctypes = tables + ctypes_offset;
! 553:
! 554: /* See if we can find a fixed set of initial characters for the pattern. */
! 555:
! 556: memset(start_bits, 0, 32 * sizeof(uschar));
! 557: if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
! 558: (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
! 559:
! 560: /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
! 561: the latter, which is pointed to by the former, which may also get additional
! 562: data set later by the calling program. At the moment, the size of
! 563: pcre_study_data is fixed. We nevertheless save it in a field for returning via
! 564: the pcre_fullinfo() function so that if it becomes variable in the future, we
! 565: don't have to change that code. */
! 566:
! 567: extra = (pcre_extra *)(pcre_malloc)
! 568: (sizeof(pcre_extra) + sizeof(pcre_study_data));
! 569:
! 570: if (extra == NULL)
! 571: {
! 572: *errorptr = "failed to get memory";
! 573: return NULL;
! 574: }
! 575:
! 576: study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
! 577: extra->flags = PCRE_EXTRA_STUDY_DATA;
! 578: extra->study_data = study;
! 579:
! 580: study->size = sizeof(pcre_study_data);
! 581: study->options = PCRE_STUDY_MAPPED;
! 582: memcpy(study->start_bits, start_bits, sizeof(start_bits));
! 583:
! 584: return extra;
! 585: }
! 586:
! 587: /* End of pcre_study.c */
E-mail: