Annotation of parser3/src/lib/pcre/pcre.c, revision 1.1
1.1 ! paf 1: /*************************************************
! 2: * Perl-Compatible Regular Expressions *
! 3: *************************************************/
! 4:
! 5: /*
! 6: This is a library of functions to support regular expressions whose syntax
! 7: and semantics are as close as possible to those of the Perl 5 language. See
! 8: the file Tech.Notes for some information on the internals.
! 9:
! 10: Written by: Philip Hazel <ph10@cam.ac.uk>
! 11:
! 12: Copyright (c) 1997-1999 University of Cambridge
! 13:
! 14: -----------------------------------------------------------------------------
! 15: Permission is granted to anyone to use this software for any purpose on any
! 16: computer system, and to redistribute it freely, subject to the following
! 17: restrictions:
! 18:
! 19: 1. This software is distributed in the hope that it will be useful,
! 20: but WITHOUT ANY WARRANTY; without even the implied warranty of
! 21: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
! 22:
! 23: 2. The origin of this software must not be misrepresented, either by
! 24: explicit claim or by omission.
! 25:
! 26: 3. Altered versions must be plainly marked as such, and must not be
! 27: misrepresented as being the original software.
! 28:
! 29: 4. If PCRE is embedded in any software that is released under the GNU
! 30: General Purpose Licence (GPL), then the terms of that licence shall
! 31: supersede any condition above with which it is incompatible.
! 32: -----------------------------------------------------------------------------
! 33: */
! 34:
! 35:
! 36: /* Define DEBUG to get debugging output on stdout. */
! 37:
! 38: /* #define DEBUG */
! 39:
/* Debugging output goes through the DPRINTF macro so that the call sites need
no #ifdef of their own (there are still compilers about that do not like
indented pre-processor statements). The single macro argument is a complete,
parenthesized printf() argument list, e.g. DPRINTF(("n=%d\n", n)). When DEBUG
is not defined the whole call, arguments included, expands to nothing. */

#ifndef DEBUG
#define DPRINTF(p) /*nothing*/
#else
#define DPRINTF(p) printf p
#endif
! 49:
! 50: /* Include the internals header, which itself includes Standard C headers plus
! 51: the external pcre header. */
! 52:
! 53: #include "internal.h"
! 54:
! 55:
/* This file uses "class" as an ordinary identifier. That is a keyword in C++,
so when the file is compiled as C++ the identifier is renamed. */

#if defined(__cplusplus)
#define class pcre_class
#endif
! 61:
! 62:
/* Capacity, in items, of the nested bracket stacks that are used at compile
time. This should not be set greater than 200. */

#define BRASTACK_SIZE 200
! 67:
! 68:
/* Minimum and maximum repeat counts for the common repeats. In rep_max a
value of 0 means "no upper limit". The two tables are parallel, one entry per
repeat; the values are consistent with the order * *? + +? ? ?? (presumably
the order of the corresponding opcodes - confirm against internal.h). */

static const char rep_min[] = {
  0, 0,     /* minimum 0 */
  1, 1,     /* minimum 1 */
  0, 0      /* minimum 0 */
};

static const char rep_max[] = {
  0, 0,     /* unlimited */
  0, 0,     /* unlimited */
  1, 1      /* at most 1 */
};
! 73:
/* Printable names for the OP_ values, used only by the debugging output (not
every entry is actually used). The order of the strings is significant: the
table is presumably indexed by opcode value, so it has to stay in step with
the OP_ definitions. */

#ifdef DEBUG
static const char *OP_names[] = {
  "End",
  "\\A", "\\B", "\\b", "\\D", "\\d", "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
  "Opt", "^", "$", "Any", "chars", "not",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{",
  "class", "Ref",
  "Alt", "Ket", "KetRmax", "KetRmin",
  "Assert", "Assert not", "AssertB", "AssertB not",
  "Reverse", "Once", "Cond", "Cref",
  "Brazero", "Braminzero", "Bra"
};
#endif
! 91:
! 92: /* Table for handling escaped characters in the range '0'-'z'. Positive returns
! 93: are simple data values; negative values are for special things like \d and so
! 94: on. Zero means further processing is needed (for things like \x), or the escape
! 95: is invalid. */
! 96:
! 97: static const short int escapes[] = {
! 98: 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
! 99: 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
! 100: '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
! 101: 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
! 102: 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
! 103: 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
! 104: '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
! 105: 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
! 106: 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
! 107: 0, 0, -ESC_z /* x - z */
! 108: };
! 109:
! 110: /* Definition to allow mutual recursion */
! 111:
! 112: static BOOL
! 113: compile_regex(int, int, int *, uschar **, const uschar **, const char **,
! 114: BOOL, int, int *, int *, compile_data *);
! 115:
! 116:
! 117:
! 118: /*************************************************
! 119: * Global variables *
! 120: *************************************************/
! 121:
/* PCRE is thread-clean and does not use any global variables in the normal
sense. The one exception is the pair of function pointers below: PCRE calls
its memory allocation and free functions through them. They can be changed by
the caller, but they are shared between all threads. */

void *(*pcre_malloc)(size_t) = malloc;
void  (*pcre_free)(void *)   = free;
! 129:
! 130:
! 131:
! 132:
! 133: /*************************************************
! 134: * Default character tables *
! 135: *************************************************/
! 136:
! 137: /* A default set of character tables is included in the PCRE binary. Its source
! 138: is built by the maketables auxiliary program, which uses the default C ctypes
! 139: functions, and put in the file chartables.c. These tables are used by PCRE
! 140: whenever the caller of pcre_compile() does not provide an alternate set of
! 141: tables. */
! 142:
! 143: #include "chartables.c"
! 144:
! 145:
! 146:
! 147: /*************************************************
! 148: * Return version string *
! 149: *************************************************/
! 150:
/* Two-level stringification: XSTRING() lets its argument be macro-expanded
before STRING() turns the result into a string literal. */

#define STRING(a) # a
#define XSTRING(s) STRING(s)

/* Return the version of the library as a string of the form
"<major>.<minor> <date>", built at compile time from PCRE_MAJOR, PCRE_MINOR
and PCRE_DATE. The string has static storage and must not be modified. */

const char *
pcre_version(void)
{
static const char version[] =
  XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
return version;
}
! 159:
! 160:
! 161:
! 162:
! 163: /*************************************************
! 164: * Return info about a compiled pattern *
! 165: *************************************************/
! 166:
! 167: /* This function picks potentially useful data out of the private
! 168: structure. The public options are passed back in an int - though the
! 169: re->options field has been expanded to a long int, all the public options
! 170: at the low end of it, and so even on 16-bit systems this will still be OK.
! 171: Therefore, I haven't changed the API for pcre_info().
! 172:
! 173: Arguments:
! 174: external_re points to compiled code
! 175: optptr where to pass back the options
! 176: first_char where to pass back the first character,
! 177: or -1 if multiline and all branches start ^,
! 178: or -2 otherwise
! 179:
! 180: Returns: number of identifying extraction brackets
! 181: or negative values on error
! 182: */
! 183:
! 184: int
! 185: pcre_info(const pcre *external_re, int *optptr, int *first_char)
! 186: {
! 187: const real_pcre *re = (const real_pcre *)external_re;
! 188: if (re == NULL) return PCRE_ERROR_NULL;
! 189: if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
! 190: if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
! 191: if (first_char != NULL)
! 192: *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
! 193: ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
! 194: return re->top_bracket;
! 195: }
! 196:
! 197:
! 198:
! 199:
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of chars in printable format: printing characters are
output as themselves, anything else as \x followed by two hex digits. If
requested, printing stops at the end of the subject.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:     nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
int c;

/* Never run past the end of the subject string. */

if (is_subject && length > md->end_subject - p) length = md->end_subject - p;

for (; length > 0; length--)
  {
  c = *p++;
  if (isprint(c))
    printf("%c", c);
  else
    printf("\\x%02x", c);
  }
}
#endif
! 226:
! 227:
! 228:
! 229:
! 230: /*************************************************
! 231: * Handle escapes *
! 232: *************************************************/
! 233:
! 234: /* This function is called when a \ has been encountered. It either returns a
! 235: positive value for a simple escape such as \n, or a negative value which
! 236: encodes one of the more complicated things such as \d. On entry, ptr is
! 237: pointing at the \. On exit, it is on the final character of the escape
! 238: sequence.
! 239:
! 240: Arguments:
! 241: ptrptr points to the pattern position pointer
! 242: errorptr points to the pointer to the error message
! 243: bracount number of previous extracting brackets
! 244: options the options bits
! 245: isclass TRUE if inside a character class
! 246: cd pointer to char tables block
! 247:
! 248: Returns: zero or positive => a data character
! 249: negative => a special escape sequence
! 250: on error, errorptr is set
! 251: */
! 252:
! 253: static int
! 254: check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
! 255: int options, BOOL isclass, compile_data *cd)
! 256: {
! 257: const uschar *ptr = *ptrptr;
! 258: int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
! 259: int i;
! 260:
! 261: if (c == 0) *errorptr = ERR1;
! 262:
! 263: /* Digits or letters may have special meaning; all others are literals. */
! 264:
! 265: else if (c < '0' || c > 'z') {}
! 266:
! 267: /* Do an initial lookup in a table. A non-zero result is something that can be
! 268: returned immediately. Otherwise further processing may be required. */
! 269:
! 270: else if ((i = escapes[c - '0']) != 0) c = i;
! 271:
! 272: /* Escapes that need further processing, or are illegal. */
! 273:
! 274: else
! 275: {
! 276: const uschar *oldptr;
! 277: switch (c)
! 278: {
! 279: /* The handling of escape sequences consisting of a string of digits
! 280: starting with one that is not zero is not straightforward. By experiment,
! 281: the way Perl works seems to be as follows:
! 282:
! 283: Outside a character class, the digits are read as a decimal number. If the
! 284: number is less than 10, or if there are that many previous extracting
! 285: left brackets, then it is a back reference. Otherwise, up to three octal
! 286: digits are read to form an escaped byte. Thus \123 is likely to be octal
! 287: 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
! 288: value is greater than 377, the least significant 8 bits are taken. Inside a
! 289: character class, \ followed by a digit is always an octal number. */
! 290:
! 291: case '1': case '2': case '3': case '4': case '5':
! 292: case '6': case '7': case '8': case '9':
! 293:
! 294: if (!isclass)
! 295: {
! 296: oldptr = ptr;
! 297: c -= '0';
! 298: while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
! 299: c = c * 10 + *(++ptr) - '0';
! 300: if (c < 10 || c <= bracount)
! 301: {
! 302: c = -(ESC_REF + c);
! 303: break;
! 304: }
! 305: ptr = oldptr; /* Put the pointer back and fall through */
! 306: }
! 307:
! 308: /* Handle an octal number following \. If the first digit is 8 or 9, Perl
! 309: generates a binary zero byte and treats the digit as a following literal.
! 310: Thus we have to pull back the pointer by one. */
! 311:
! 312: if ((c = *ptr) >= '8')
! 313: {
! 314: ptr--;
! 315: c = 0;
! 316: break;
! 317: }
! 318:
! 319: /* \0 always starts an octal number, but we may drop through to here with a
! 320: larger first octal digit */
! 321:
! 322: case '0':
! 323: c -= '0';
! 324: while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
! 325: ptr[1] != '8' && ptr[1] != '9')
! 326: c = c * 8 + *(++ptr) - '0';
! 327: break;
! 328:
! 329: /* Special escapes not starting with a digit are straightforward */
! 330:
! 331: case 'x':
! 332: c = 0;
! 333: while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
! 334: {
! 335: ptr++;
! 336: c = c * 16 + cd->lcc[*ptr] -
! 337: (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
! 338: }
! 339: break;
! 340:
! 341: case 'c':
! 342: c = *(++ptr);
! 343: if (c == 0)
! 344: {
! 345: *errorptr = ERR2;
! 346: return 0;
! 347: }
! 348:
! 349: /* A letter is upper-cased; then the 0x40 bit is flipped */
! 350:
! 351: if (c >= 'a' && c <= 'z') c = cd->fcc[c];
! 352: c ^= 0x40;
! 353: break;
! 354:
! 355: /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
! 356: other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
! 357: for Perl compatibility, it is a literal. This code looks a bit odd, but
! 358: there used to be some cases other than the default, and there may be again
! 359: in future, so I haven't "optimized" it. */
! 360:
! 361: default:
! 362: if ((options & PCRE_EXTRA) != 0) switch(c)
! 363: {
! 364: default:
! 365: *errorptr = ERR3;
! 366: break;
! 367: }
! 368: break;
! 369: }
! 370: }
! 371:
! 372: *ptrptr = ptr;
! 373: return c;
! 374: }
! 375:
! 376:
! 377:
! 378: /*************************************************
! 379: * Check for counted repeat *
! 380: *************************************************/
! 381:
! 382: /* This function is called when a '{' is encountered in a place where it might
! 383: start a quantifier. It looks ahead to see if it really is a quantifier or not.
! 384: It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
! 385: where the ddds are digits.
! 386:
! 387: Arguments:
! 388: p pointer to the first char after '{'
! 389: cd pointer to char tables block
! 390:
! 391: Returns: TRUE or FALSE
! 392: */
! 393:
! 394: static BOOL
! 395: is_counted_repeat(const uschar *p, compile_data *cd)
! 396: {
! 397: if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
! 398: while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
! 399: if (*p == '}') return TRUE;
! 400:
! 401: if (*p++ != ',') return FALSE;
! 402: if (*p == '}') return TRUE;
! 403:
! 404: if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
! 405: while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
! 406: return (*p == '}');
! 407: }
! 408:
! 409:
! 410:
! 411: /*************************************************
! 412: * Read repeat counts *
! 413: *************************************************/
! 414:
! 415: /* Read an item of the form {n,m} and return the values. This is called only
! 416: after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
! 417: so the syntax is guaranteed to be correct, but we need to check the values.
! 418:
! 419: Arguments:
! 420: p pointer to first char after '{'
! 421: minp pointer to int for min
! 422: maxp pointer to int for max
! 423: returned as -1 if no max
! 424: errorptr points to pointer to error message
! 425: cd pointer to character tables clock
! 426:
! 427: Returns: pointer to '}' on success;
! 428: current ptr on error, with errorptr set
! 429: */
! 430:
! 431: static const uschar *
! 432: read_repeat_counts(const uschar *p, int *minp, int *maxp,
! 433: const char **errorptr, compile_data *cd)
! 434: {
! 435: int min = 0;
! 436: int max = -1;
! 437:
! 438: while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
! 439:
! 440: if (*p == '}') max = min; else
! 441: {
! 442: if (*(++p) != '}')
! 443: {
! 444: max = 0;
! 445: while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
! 446: if (max < min)
! 447: {
! 448: *errorptr = ERR4;
! 449: return p;
! 450: }
! 451: }
! 452: }
! 453:
! 454: /* Do paranoid checks, then fill in the required variables, and pass back the
! 455: pointer to the terminating '}'. */
! 456:
! 457: if (min > 65535 || max > 65535)
! 458: *errorptr = ERR5;
! 459: else
! 460: {
! 461: *minp = min;
! 462: *maxp = max;
! 463: }
! 464: return p;
! 465: }
! 466:
! 467:
! 468:
! 469: /*************************************************
! 470: * Find the fixed length of a pattern *
! 471: *************************************************/
! 472:
! 473: /* Scan a pattern and compute the fixed length of subject that will match it,
! 474: if the length is fixed. This is needed for dealing with backward assertions.
! 475:
! 476: Arguments:
! 477: code points to the start of the pattern (the bracket)
! 478:
! 479: Returns: the fixed length, or -1 if there is no fixed length
! 480: */
! 481:
! 482: static int
! 483: find_fixedlength(uschar *code)
! 484: {
! 485: int length = -1;
! 486:
! 487: register int branchlength = 0;
! 488: register uschar *cc = code + 3;
! 489:
! 490: /* Scan along the opcodes for this branch. If we get to the end of the
! 491: branch, check the length against that of the other branches. */
! 492:
! 493: for (;;)
! 494: {
! 495: int d;
! 496: register int op = *cc;
! 497: if (op >= OP_BRA) op = OP_BRA;
! 498:
! 499: switch (op)
! 500: {
! 501: case OP_BRA:
! 502: case OP_ONCE:
! 503: case OP_COND:
! 504: d = find_fixedlength(cc);
! 505: if (d < 0) return -1;
! 506: branchlength += d;
! 507: do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
! 508: cc += 3;
! 509: break;
! 510:
! 511: /* Reached end of a branch; if it's a ket it is the end of a nested
! 512: call. If it's ALT it is an alternation in a nested call. If it is
! 513: END it's the end of the outer call. All can be handled by the same code. */
! 514:
! 515: case OP_ALT:
! 516: case OP_KET:
! 517: case OP_KETRMAX:
! 518: case OP_KETRMIN:
! 519: case OP_END:
! 520: if (length < 0) length = branchlength;
! 521: else if (length != branchlength) return -1;
! 522: if (*cc != OP_ALT) return length;
! 523: cc += 3;
! 524: branchlength = 0;
! 525: break;
! 526:
! 527: /* Skip over assertive subpatterns */
! 528:
! 529: case OP_ASSERT:
! 530: case OP_ASSERT_NOT:
! 531: case OP_ASSERTBACK:
! 532: case OP_ASSERTBACK_NOT:
! 533: do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
! 534: cc += 3;
! 535: break;
! 536:
! 537: /* Skip over things that don't match chars */
! 538:
! 539: case OP_REVERSE:
! 540: cc++;
! 541: /* Fall through */
! 542:
! 543: case OP_CREF:
! 544: case OP_OPT:
! 545: cc++;
! 546: /* Fall through */
! 547:
! 548: case OP_SOD:
! 549: case OP_EOD:
! 550: case OP_EODN:
! 551: case OP_CIRC:
! 552: case OP_DOLL:
! 553: case OP_NOT_WORD_BOUNDARY:
! 554: case OP_WORD_BOUNDARY:
! 555: cc++;
! 556: break;
! 557:
! 558: /* Handle char strings */
! 559:
! 560: case OP_CHARS:
! 561: branchlength += *(++cc);
! 562: cc += *cc + 1;
! 563: break;
! 564:
! 565: /* Handle exact repetitions */
! 566:
! 567: case OP_EXACT:
! 568: case OP_TYPEEXACT:
! 569: branchlength += (cc[1] << 8) + cc[2];
! 570: cc += 4;
! 571: break;
! 572:
! 573: /* Handle single-char matchers */
! 574:
! 575: case OP_NOT_DIGIT:
! 576: case OP_DIGIT:
! 577: case OP_NOT_WHITESPACE:
! 578: case OP_WHITESPACE:
! 579: case OP_NOT_WORDCHAR:
! 580: case OP_WORDCHAR:
! 581: case OP_ANY:
! 582: branchlength++;
! 583: cc++;
! 584: break;
! 585:
! 586:
! 587: /* Check a class for variable quantification */
! 588:
! 589: case OP_CLASS:
! 590: cc += (*cc == OP_REF)? 2 : 33;
! 591:
! 592: switch (*cc)
! 593: {
! 594: case OP_CRSTAR:
! 595: case OP_CRMINSTAR:
! 596: case OP_CRQUERY:
! 597: case OP_CRMINQUERY:
! 598: return -1;
! 599:
! 600: case OP_CRRANGE:
! 601: case OP_CRMINRANGE:
! 602: if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
! 603: branchlength += (cc[1] << 8) + cc[2];
! 604: cc += 5;
! 605: break;
! 606:
! 607: default:
! 608: branchlength++;
! 609: }
! 610: break;
! 611:
! 612: /* Anything else is variable length */
! 613:
! 614: default:
! 615: return -1;
! 616: }
! 617: }
! 618: /* Control never gets here */
! 619: }
! 620:
! 621:
! 622:
! 623:
! 624: /*************************************************
! 625: * Compile one branch *
! 626: *************************************************/
! 627:
! 628: /* Scan the pattern, compiling it into the code vector.
! 629:
! 630: Arguments:
! 631: options the option bits
! 632: brackets points to number of brackets used
! 633: code points to the pointer to the current code point
! 634: ptrptr points to the current pattern pointer
! 635: errorptr points to pointer to error message
! 636: optchanged set to the value of the last OP_OPT item compiled
! 637: reqchar set to the last literal character required, else -1
! 638: countlits set to count of mandatory literal characters
! 639: cd contains pointers to tables
! 640:
! 641: Returns: TRUE on success
! 642: FALSE, with *errorptr set on error
! 643: */
! 644:
! 645: static BOOL
! 646: compile_branch(int options, int *brackets, uschar **codeptr,
! 647: const uschar **ptrptr, const char **errorptr, int *optchanged,
! 648: int *reqchar, int *countlits, compile_data *cd)
! 649: {
! 650: int repeat_type, op_type;
! 651: int repeat_min, repeat_max;
! 652: int bravalue, length;
! 653: int greedy_default, greedy_non_default;
! 654: int prevreqchar;
! 655: int condcount = 0;
! 656: int subcountlits = 0;
! 657: register int c;
! 658: register uschar *code = *codeptr;
! 659: uschar *tempcode;
! 660: const uschar *ptr = *ptrptr;
! 661: const uschar *tempptr;
! 662: uschar *previous = NULL;
! 663: uschar class[32];
! 664:
! 665: /* Set up the default and non-default settings for greediness */
! 666:
! 667: greedy_default = ((options & PCRE_UNGREEDY) != 0);
! 668: greedy_non_default = greedy_default ^ 1;
! 669:
! 670: /* Initialize no required char, and count of literals */
! 671:
! 672: *reqchar = prevreqchar = -1;
! 673: *countlits = 0;
! 674:
! 675: /* Switch on next character until the end of the branch */
! 676:
! 677: for (;; ptr++)
! 678: {
! 679: BOOL negate_class;
! 680: int class_charcount;
! 681: int class_lastchar;
! 682: int newoptions;
! 683: int condref;
! 684: int subreqchar;
! 685:
! 686: c = *ptr;
! 687: if ((options & PCRE_EXTENDED) != 0)
! 688: {
! 689: if ((cd->ctypes[c] & ctype_space) != 0) continue;
! 690: if (c == '#')
! 691: {
! 692: while ((c = *(++ptr)) != 0 && c != '\n');
! 693: continue;
! 694: }
! 695: }
! 696:
! 697: switch(c)
! 698: {
! 699: /* The branch terminates at end of string, |, or ). */
! 700:
! 701: case 0:
! 702: case '|':
! 703: case ')':
! 704: *codeptr = code;
! 705: *ptrptr = ptr;
! 706: return TRUE;
! 707:
! 708: /* Handle single-character metacharacters */
! 709:
! 710: case '^':
! 711: previous = NULL;
! 712: *code++ = OP_CIRC;
! 713: break;
! 714:
! 715: case '$':
! 716: previous = NULL;
! 717: *code++ = OP_DOLL;
! 718: break;
! 719:
! 720: case '.':
! 721: previous = code;
! 722: *code++ = OP_ANY;
! 723: break;
! 724:
! 725: /* Character classes. These always build a 32-byte bitmap of the permitted
! 726: characters, except in the special case where there is only one character.
! 727: For negated classes, we build the map as usual, then invert it at the end.
! 728: */
! 729:
! 730: case '[':
! 731: previous = code;
! 732: *code++ = OP_CLASS;
! 733:
! 734: /* If the first character is '^', set the negation flag and skip it. */
! 735:
! 736: if ((c = *(++ptr)) == '^')
! 737: {
! 738: negate_class = TRUE;
! 739: c = *(++ptr);
! 740: }
! 741: else negate_class = FALSE;
! 742:
! 743: /* Keep a count of chars so that we can optimize the case of just a single
! 744: character. */
! 745:
! 746: class_charcount = 0;
! 747: class_lastchar = -1;
! 748:
! 749: /* Initialize the 32-char bit map to all zeros. We have to build the
! 750: map in a temporary bit of store, in case the class contains only 1
! 751: character, because in that case the compiled code doesn't use the
! 752: bit map. */
! 753:
! 754: memset(class, 0, 32 * sizeof(uschar));
! 755:
! 756: /* Process characters until ] is reached. By writing this as a "do" it
! 757: means that an initial ] is taken as a data character. */
! 758:
! 759: do
! 760: {
! 761: if (c == 0)
! 762: {
! 763: *errorptr = ERR6;
! 764: goto FAILED;
! 765: }
! 766:
! 767: /* Backslash may introduce a single character, or it may introduce one
! 768: of the specials, which just set a flag. Escaped items are checked for
! 769: validity in the pre-compiling pass. The sequence \b is a special case.
! 770: Inside a class (and only there) it is treated as backspace. Elsewhere
! 771: it marks a word boundary. Other escapes have preset maps ready to
! 772: or into the one we are building. We assume they have more than one
! 773: character in them, so set class_count bigger than one. */
! 774:
! 775: if (c == '\\')
! 776: {
! 777: c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
! 778: if (-c == ESC_b) c = '\b';
! 779: else if (c < 0)
! 780: {
! 781: register const uschar *cbits = cd->cbits;
! 782: class_charcount = 10;
! 783: switch (-c)
! 784: {
! 785: case ESC_d:
! 786: for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
! 787: continue;
! 788:
! 789: case ESC_D:
! 790: for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
! 791: continue;
! 792:
! 793: case ESC_w:
! 794: for (c = 0; c < 32; c++)
! 795: class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);
! 796: continue;
! 797:
! 798: case ESC_W:
! 799: for (c = 0; c < 32; c++)
! 800: class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);
! 801: continue;
! 802:
! 803: case ESC_s:
! 804: for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
! 805: continue;
! 806:
! 807: case ESC_S:
! 808: for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
! 809: continue;
! 810:
! 811: default:
! 812: *errorptr = ERR7;
! 813: goto FAILED;
! 814: }
! 815: }
! 816: /* Fall through if single character */
! 817: }
! 818:
! 819: /* A single character may be followed by '-' to form a range. However,
! 820: Perl does not permit ']' to be the end of the range. A '-' character
! 821: here is treated as a literal. */
! 822:
! 823: if (ptr[1] == '-' && ptr[2] != ']')
! 824: {
! 825: int d;
! 826: ptr += 2;
! 827: d = *ptr;
! 828:
! 829: if (d == 0)
! 830: {
! 831: *errorptr = ERR6;
! 832: goto FAILED;
! 833: }
! 834:
! 835: /* The second part of a range can be a single-character escape, but
! 836: not any of the other escapes. */
! 837:
! 838: if (d == '\\')
! 839: {
! 840: d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
! 841: if (d < 0)
! 842: {
! 843: if (d == -ESC_b) d = '\b'; else
! 844: {
! 845: *errorptr = ERR7;
! 846: goto FAILED;
! 847: }
! 848: }
! 849: }
! 850:
! 851: if (d < c)
! 852: {
! 853: *errorptr = ERR8;
! 854: goto FAILED;
! 855: }
! 856:
! 857: for (; c <= d; c++)
! 858: {
! 859: class[c/8] |= (1 << (c&7));
! 860: if ((options & PCRE_CASELESS) != 0)
! 861: {
! 862: int uc = cd->fcc[c]; /* flip case */
! 863: class[uc/8] |= (1 << (uc&7));
! 864: }
! 865: class_charcount++; /* in case a one-char range */
! 866: class_lastchar = c;
! 867: }
! 868: continue; /* Go get the next char in the class */
! 869: }
! 870:
! 871: /* Handle a lone single character - we can get here for a normal
! 872: non-escape char, or after \ that introduces a single character. */
! 873:
! 874: class [c/8] |= (1 << (c&7));
! 875: if ((options & PCRE_CASELESS) != 0)
! 876: {
! 877: c = cd->fcc[c]; /* flip case */
! 878: class[c/8] |= (1 << (c&7));
! 879: }
! 880: class_charcount++;
! 881: class_lastchar = c;
! 882: }
! 883:
! 884: /* Loop until ']' reached; the check for end of string happens inside the
! 885: loop. This "while" is the end of the "do" above. */
! 886:
! 887: while ((c = *(++ptr)) != ']');
! 888:
! 889: /* If class_charcount is 1 and class_lastchar is not negative, we saw
! 890: precisely one character. This doesn't need the whole 32-byte bit map.
! 891: We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
! 892: it's negative. */
! 893:
! 894: if (class_charcount == 1 && class_lastchar >= 0)
! 895: {
! 896: if (negate_class)
! 897: {
! 898: code[-1] = OP_NOT;
! 899: }
! 900: else
! 901: {
! 902: code[-1] = OP_CHARS;
! 903: *code++ = 1;
! 904: }
! 905: *code++ = class_lastchar;
! 906: }
! 907:
! 908: /* Otherwise, negate the 32-byte map if necessary, and copy it into
! 909: the code vector. */
! 910:
! 911: else
! 912: {
! 913: if (negate_class)
! 914: for (c = 0; c < 32; c++) code[c] = ~class[c];
! 915: else
! 916: memcpy(code, class, 32);
! 917: code += 32;
! 918: }
! 919: break;
! 920:
! 921: /* Various kinds of repeat */
! 922:
! 923: case '{':
! 924: if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
! 925: ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
! 926: if (*errorptr != NULL) goto FAILED;
! 927: goto REPEAT;
! 928:
! 929: case '*':
! 930: repeat_min = 0;
! 931: repeat_max = -1;
! 932: goto REPEAT;
! 933:
! 934: case '+':
! 935: repeat_min = 1;
! 936: repeat_max = -1;
! 937: goto REPEAT;
! 938:
! 939: case '?':
! 940: repeat_min = 0;
! 941: repeat_max = 1;
! 942:
! 943: REPEAT:
! 944: if (previous == NULL)
! 945: {
! 946: *errorptr = ERR9;
! 947: goto FAILED;
! 948: }
! 949:
! 950: /* If the next character is '?' this is a minimizing repeat, by default,
! 951: but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
! 952: next character. */
! 953:
! 954: if (ptr[1] == '?')
! 955: { repeat_type = greedy_non_default; ptr++; }
! 956: else repeat_type = greedy_default;
! 957:
! 958: /* If previous was a string of characters, chop off the last one and use it
! 959: as the subject of the repeat. If there was only one character, we can
! 960: abolish the previous item altogether. A repeat with a zero minimum wipes
! 961: out any reqchar setting, backing up to the previous value. We must also
! 962: adjust the countlits value. */
! 963:
! 964: if (*previous == OP_CHARS)
! 965: {
! 966: int len = previous[1];
! 967:
! 968: if (repeat_min == 0) *reqchar = prevreqchar;
! 969: *countlits += repeat_min - 1;
! 970:
! 971: if (len == 1)
! 972: {
! 973: c = previous[2];
! 974: code = previous;
! 975: }
! 976: else
! 977: {
! 978: c = previous[len+1];
! 979: previous[1]--;
! 980: code--;
! 981: }
! 982: op_type = 0; /* Use single-char op codes */
! 983: goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
! 984: }
! 985:
! 986: /* If previous was a single negated character ([^a] or similar), we use
! 987: one of the special opcodes, replacing it. The code is shared with single-
! 988: character repeats by adding a suitable offset into repeat_type. */
! 989:
! 990: else if ((int)*previous == OP_NOT)
! 991: {
! 992: op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
! 993: c = previous[1];
! 994: code = previous;
! 995: goto OUTPUT_SINGLE_REPEAT;
! 996: }
! 997:
! 998: /* If previous was a character type match (\d or similar), abolish it and
! 999: create a suitable repeat item. The code is shared with single-character
! 1000: repeats by adding a suitable offset into repeat_type. */
! 1001:
! 1002: else if ((int)*previous < OP_EODN || *previous == OP_ANY)
! 1003: {
! 1004: op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
! 1005: c = *previous;
! 1006: code = previous;
! 1007:
! 1008: OUTPUT_SINGLE_REPEAT:
! 1009:
! 1010: /* If the maximum is zero then the minimum must also be zero; Perl allows
! 1011: this case, so we do too - by simply omitting the item altogether. */
! 1012:
! 1013: if (repeat_max == 0) goto END_REPEAT;
! 1014:
! 1015: /* Combine the op_type with the repeat_type */
! 1016:
! 1017: repeat_type += op_type;
! 1018:
! 1019: /* A minimum of zero is handled either as the special case * or ?, or as
! 1020: an UPTO, with the maximum given. */
! 1021:
! 1022: if (repeat_min == 0)
! 1023: {
! 1024: if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
! 1025: else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
! 1026: else
! 1027: {
! 1028: *code++ = OP_UPTO + repeat_type;
! 1029: *code++ = repeat_max >> 8;
! 1030: *code++ = (repeat_max & 255);
! 1031: }
! 1032: }
! 1033:
! 1034: /* The case {1,} is handled as the special case + */
! 1035:
! 1036: else if (repeat_min == 1 && repeat_max == -1)
! 1037: *code++ = OP_PLUS + repeat_type;
! 1038:
! 1039: /* The case {n,n} is just an EXACT, while the general case {n,m} is
! 1040: handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
! 1041:
! 1042: else
! 1043: {
! 1044: if (repeat_min != 1)
! 1045: {
! 1046: *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
! 1047: *code++ = repeat_min >> 8;
! 1048: *code++ = (repeat_min & 255);
! 1049: }
! 1050:
! 1051: /* If the minimum is 1 and the previous item was a character string,
! 1052: we either have to put back the item that got cancelled if the string
! 1053: length was 1, or add the character back onto the end of a longer
! 1054: string. For a character type nothing need be done; it will just get
! 1055: put back naturally. Note that the final character is always going to
! 1056: get added below. */
! 1057:
! 1058: else if (*previous == OP_CHARS)
! 1059: {
! 1060: if (code == previous) code += 2; else previous[1]++;
! 1061: }
! 1062:
! 1063: /* For a single negated character we also have to put back the
! 1064: item that got cancelled. */
! 1065:
! 1066: else if (*previous == OP_NOT) code++;
! 1067:
! 1068: /* If the maximum is unlimited, insert an OP_STAR. */
! 1069:
! 1070: if (repeat_max < 0)
! 1071: {
! 1072: *code++ = c;
! 1073: *code++ = OP_STAR + repeat_type;
! 1074: }
! 1075:
! 1076: /* Else insert an UPTO if the max is greater than the min. */
! 1077:
! 1078: else if (repeat_max != repeat_min)
! 1079: {
! 1080: *code++ = c;
! 1081: repeat_max -= repeat_min;
! 1082: *code++ = OP_UPTO + repeat_type;
! 1083: *code++ = repeat_max >> 8;
! 1084: *code++ = (repeat_max & 255);
! 1085: }
! 1086: }
! 1087:
! 1088: /* The character or character type itself comes last in all cases. */
! 1089:
! 1090: *code++ = c;
! 1091: }
! 1092:
! 1093: /* If previous was a character class or a back reference, we put the repeat
! 1094: stuff after it, but just skip the item if the repeat was {0,0}. */
! 1095:
! 1096: else if (*previous == OP_CLASS || *previous == OP_REF)
! 1097: {
! 1098: if (repeat_max == 0)
! 1099: {
! 1100: code = previous;
! 1101: goto END_REPEAT;
! 1102: }
! 1103: if (repeat_min == 0 && repeat_max == -1)
! 1104: *code++ = OP_CRSTAR + repeat_type;
! 1105: else if (repeat_min == 1 && repeat_max == -1)
! 1106: *code++ = OP_CRPLUS + repeat_type;
! 1107: else if (repeat_min == 0 && repeat_max == 1)
! 1108: *code++ = OP_CRQUERY + repeat_type;
! 1109: else
! 1110: {
! 1111: *code++ = OP_CRRANGE + repeat_type;
! 1112: *code++ = repeat_min >> 8;
! 1113: *code++ = repeat_min & 255;
! 1114: if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
! 1115: *code++ = repeat_max >> 8;
! 1116: *code++ = repeat_max & 255;
! 1117: }
! 1118: }
! 1119:
! 1120: /* If previous was a bracket group, we may have to replicate it in certain
! 1121: cases. */
! 1122:
! 1123: else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
! 1124: (int)*previous == OP_COND)
! 1125: {
! 1126: register int i;
! 1127: int ketoffset = 0;
! 1128: int len = code - previous;
! 1129: uschar *bralink = NULL;
! 1130:
! 1131: /* If the maximum repeat count is unlimited, find the end of the bracket
! 1132: by scanning through from the start, and compute the offset back to it
! 1133: from the current code pointer. There may be an OP_OPT setting following
! 1134: the final KET, so we can't find the end just by going back from the code
! 1135: pointer. */
! 1136:
! 1137: if (repeat_max == -1)
! 1138: {
! 1139: register uschar *ket = previous;
! 1140: do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
! 1141: ketoffset = code - ket;
! 1142: }
! 1143:
! 1144: /* The case of a zero minimum is special because of the need to stick
! 1145: OP_BRAZERO in front of it, and because the group appears once in the
! 1146: data, whereas in other cases it appears the minimum number of times. For
! 1147: this reason, it is simplest to treat this case separately, as otherwise
! 1148: the code gets far too messy. There are several special subcases when the
! 1149: minimum is zero. */
! 1150:
! 1151: if (repeat_min == 0)
! 1152: {
! 1153: /* If we set up a required char from the bracket, we must back off
! 1154: to the previous value and reset the countlits value too. */
! 1155:
! 1156: if (subcountlits > 0)
! 1157: {
! 1158: *reqchar = prevreqchar;
! 1159: *countlits -= subcountlits;
! 1160: }
! 1161:
! 1162: /* If the maximum is also zero, we just omit the group from the output
! 1163: altogether. */
! 1164:
! 1165: if (repeat_max == 0)
! 1166: {
! 1167: code = previous;
! 1168: goto END_REPEAT;
! 1169: }
! 1170:
! 1171: /* If the maximum is 1 or unlimited, we just have to stick in the
! 1172: BRAZERO and do no more at this point. */
! 1173:
! 1174: if (repeat_max <= 1)
! 1175: {
! 1176: memmove(previous+1, previous, len);
! 1177: code++;
! 1178: *previous++ = OP_BRAZERO + repeat_type;
! 1179: }
! 1180:
! 1181: /* If the maximum is greater than 1 and limited, we have to replicate
! 1182: in a nested fashion, sticking OP_BRAZERO before each set of brackets.
! 1183: The first one has to be handled carefully because it's the original
! 1184: copy, which has to be moved up. The remainder can be handled by code
! 1185: that is common with the non-zero minimum case below. We just have to
! 1186: adjust the value of repeat_max, since one less copy is required. */
! 1187:
! 1188: else
! 1189: {
! 1190: int offset;
! 1191: memmove(previous+4, previous, len);
! 1192: code += 4;
! 1193: *previous++ = OP_BRAZERO + repeat_type;
! 1194: *previous++ = OP_BRA;
! 1195:
! 1196: /* We chain together the bracket offset fields that have to be
! 1197: filled in later when the ends of the brackets are reached. */
! 1198:
! 1199: offset = (bralink == NULL)? 0 : previous - bralink;
! 1200: bralink = previous;
! 1201: *previous++ = offset >> 8;
! 1202: *previous++ = offset & 255;
! 1203: }
! 1204:
! 1205: repeat_max--;
! 1206: }
! 1207:
! 1208: /* If the minimum is greater than zero, replicate the group as many
! 1209: times as necessary, and adjust the maximum to the number of subsequent
! 1210: copies that we need. */
! 1211:
! 1212: else
! 1213: {
! 1214: for (i = 1; i < repeat_min; i++)
! 1215: {
! 1216: memcpy(code, previous, len);
! 1217: code += len;
! 1218: }
! 1219: if (repeat_max > 0) repeat_max -= repeat_min;
! 1220: }
! 1221:
! 1222: /* This code is common to both the zero and non-zero minimum cases. If
! 1223: the maximum is limited, it replicates the group in a nested fashion,
! 1224: remembering the bracket starts on a stack. In the case of a zero minimum,
! 1225: the first one was set up above. In all cases the repeat_max now specifies
! 1226: the number of additional copies needed. */
! 1227:
! 1228: if (repeat_max >= 0)
! 1229: {
! 1230: for (i = repeat_max - 1; i >= 0; i--)
! 1231: {
! 1232: *code++ = OP_BRAZERO + repeat_type;
! 1233:
! 1234: /* All but the final copy start a new nesting, maintaining the
! 1235: chain of brackets outstanding. */
! 1236:
! 1237: if (i != 0)
! 1238: {
! 1239: int offset;
! 1240: *code++ = OP_BRA;
! 1241: offset = (bralink == NULL)? 0 : code - bralink;
! 1242: bralink = code;
! 1243: *code++ = offset >> 8;
! 1244: *code++ = offset & 255;
! 1245: }
! 1246:
! 1247: memcpy(code, previous, len);
! 1248: code += len;
! 1249: }
! 1250:
! 1251: /* Now chain through the pending brackets, and fill in their length
! 1252: fields (which are holding the chain links pro tem). */
! 1253:
! 1254: while (bralink != NULL)
! 1255: {
! 1256: int oldlinkoffset;
! 1257: int offset = code - bralink + 1;
! 1258: uschar *bra = code - offset;
! 1259: oldlinkoffset = (bra[1] << 8) + bra[2];
! 1260: bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
! 1261: *code++ = OP_KET;
! 1262: *code++ = bra[1] = offset >> 8;
! 1263: *code++ = bra[2] = (offset & 255);
! 1264: }
! 1265: }
! 1266:
! 1267: /* If the maximum is unlimited, set a repeater in the final copy. We
! 1268: can't just offset backwards from the current code point, because we
! 1269: don't know if there's been an options resetting after the ket. The
! 1270: correct offset was computed above. */
! 1271:
! 1272: else code[-ketoffset] = OP_KETRMAX + repeat_type;
! 1273: }
! 1274:
! 1275: /* Else there's some kind of shambles */
! 1276:
! 1277: else
! 1278: {
! 1279: *errorptr = ERR11;
! 1280: goto FAILED;
! 1281: }
! 1282:
! 1283: /* In all cases we no longer have a previous item. */
! 1284:
! 1285: END_REPEAT:
! 1286: previous = NULL;
! 1287: break;
! 1288:
! 1289:
! 1290: /* Start of nested bracket sub-expression, or comment or lookahead or
! 1291: lookbehind or option setting or condition. First deal with special things
! 1292: that can come after a bracket; all are introduced by ?, and the appearance
! 1293: of any of them means that this is not a referencing group. They were
! 1294: checked for validity in the first pass over the string, so we don't have to
! 1295: check for syntax errors here. */
! 1296:
! 1297: case '(':
! 1298: newoptions = options;
! 1299: condref = -1;
! 1300:
! 1301: if (*(++ptr) == '?')
! 1302: {
! 1303: int set, unset;
! 1304: int *optset;
! 1305:
! 1306: switch (*(++ptr))
! 1307: {
! 1308: case '#': /* Comment; skip to ket */
! 1309: ptr++;
! 1310: while (*ptr != ')') ptr++;
! 1311: continue;
! 1312:
! 1313: case ':': /* Non-extracting bracket */
! 1314: bravalue = OP_BRA;
! 1315: ptr++;
! 1316: break;
! 1317:
! 1318: case '(':
! 1319: bravalue = OP_COND; /* Conditional group */
! 1320: if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
! 1321: {
! 1322: condref = *ptr - '0';
! 1323: while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
! 1324: ptr++;
! 1325: }
! 1326: else ptr--;
! 1327: break;
! 1328:
! 1329: case '=': /* Positive lookahead */
! 1330: bravalue = OP_ASSERT;
! 1331: ptr++;
! 1332: break;
! 1333:
! 1334: case '!': /* Negative lookahead */
! 1335: bravalue = OP_ASSERT_NOT;
! 1336: ptr++;
! 1337: break;
! 1338:
! 1339: case '<': /* Lookbehinds */
! 1340: switch (*(++ptr))
! 1341: {
! 1342: case '=': /* Positive lookbehind */
! 1343: bravalue = OP_ASSERTBACK;
! 1344: ptr++;
! 1345: break;
! 1346:
! 1347: case '!': /* Negative lookbehind */
! 1348: bravalue = OP_ASSERTBACK_NOT;
! 1349: ptr++;
! 1350: break;
! 1351:
! 1352: default: /* Syntax error */
! 1353: *errorptr = ERR24;
! 1354: goto FAILED;
! 1355: }
! 1356: break;
! 1357:
! 1358: case '>': /* One-time brackets */
! 1359: bravalue = OP_ONCE;
! 1360: ptr++;
! 1361: break;
! 1362:
! 1363: default: /* Option setting */
! 1364: set = unset = 0;
! 1365: optset = &set;
! 1366:
! 1367: while (*ptr != ')' && *ptr != ':')
! 1368: {
! 1369: switch (*ptr++)
! 1370: {
! 1371: case '-': optset = &unset; break;
! 1372:
! 1373: case 'i': *optset |= PCRE_CASELESS; break;
! 1374: case 'm': *optset |= PCRE_MULTILINE; break;
! 1375: case 's': *optset |= PCRE_DOTALL; break;
! 1376: case 'x': *optset |= PCRE_EXTENDED; break;
! 1377: case 'U': *optset |= PCRE_UNGREEDY; break;
! 1378: case 'X': *optset |= PCRE_EXTRA; break;
! 1379:
! 1380: default:
! 1381: *errorptr = ERR12;
! 1382: goto FAILED;
! 1383: }
! 1384: }
! 1385:
! 1386: /* Set up the changed option bits, but don't change anything yet. */
! 1387:
! 1388: newoptions = (options | set) & (~unset);
! 1389:
! 1390: /* If the options ended with ')' this is not the start of a nested
! 1391: group with option changes, so the options change at this level. At top
! 1392: level there is nothing else to be done (the options will in fact have
! 1393: been set from the start of compiling as a result of the first pass) but
! 1394: at an inner level we must compile code to change the ims options if
! 1395: necessary, and pass the new setting back so that it can be put at the
! 1396: start of any following branches, and when this group ends, a resetting
! 1397: item can be compiled. */
! 1398:
! 1399: if (*ptr == ')')
! 1400: {
! 1401: if ((options & PCRE_INGROUP) != 0 &&
! 1402: (options & PCRE_IMS) != (newoptions & PCRE_IMS))
! 1403: {
! 1404: *code++ = OP_OPT;
! 1405: *code++ = *optchanged = newoptions & PCRE_IMS;
! 1406: }
! 1407: options = newoptions; /* Change options at this level */
! 1408: previous = NULL; /* This item can't be repeated */
! 1409: continue; /* It is complete */
! 1410: }
! 1411:
! 1412: /* If the options ended with ':' we are heading into a nested group
! 1413: with possible change of options. Such groups are non-capturing and are
! 1414: not assertions of any kind. All we need to do is skip over the ':';
! 1415: the newoptions value is handled below. */
! 1416:
! 1417: bravalue = OP_BRA;
! 1418: ptr++;
! 1419: }
! 1420: }
! 1421:
! 1422: /* Else we have a referencing group; adjust the opcode. */
! 1423:
! 1424: else
! 1425: {
! 1426: if (++(*brackets) > EXTRACT_MAX)
! 1427: {
! 1428: *errorptr = ERR13;
! 1429: goto FAILED;
! 1430: }
! 1431: bravalue = OP_BRA + *brackets;
! 1432: }
! 1433:
! 1434: /* Process nested bracketed re. Assertions may not be repeated, but other
! 1435: kinds can be. We copy code into a non-register variable in order to be able
! 1436: to pass its address because some compilers complain otherwise. Pass in a
! 1437: new setting for the ims options if they have changed. */
! 1438:
! 1439: previous = (bravalue >= OP_ONCE)? code : NULL;
! 1440: *code = bravalue;
! 1441: tempcode = code;
! 1442:
! 1443: if (!compile_regex(
! 1444: options | PCRE_INGROUP, /* Set for all nested groups */
! 1445: ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
! 1446: newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
! 1447: brackets, /* Bracket level */
! 1448: &tempcode, /* Where to put code (updated) */
! 1449: &ptr, /* Input pointer (updated) */
! 1450: errorptr, /* Where to put an error message */
! 1451: (bravalue == OP_ASSERTBACK ||
! 1452: bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
! 1453: condref, /* Condition reference number */
! 1454: &subreqchar, /* For possible last char */
! 1455: &subcountlits, /* For literal count */
! 1456: cd)) /* Tables block */
! 1457: goto FAILED;
! 1458:
! 1459: /* At the end of compiling, code is still pointing to the start of the
! 1460: group, while tempcode has been updated to point past the end of the group
! 1461: and any option resetting that may follow it. The pattern pointer (ptr)
! 1462: is on the bracket. */
! 1463:
! 1464: /* If this is a conditional bracket, check that there are no more than
! 1465: two branches in the group. */
! 1466:
! 1467: if (bravalue == OP_COND)
! 1468: {
! 1469: uschar *tc = code;
! 1470: condcount = 0;
! 1471:
! 1472: do {
! 1473: condcount++;
! 1474: tc += (tc[1] << 8) | tc[2];
! 1475: }
! 1476: while (*tc != OP_KET);
! 1477:
! 1478: if (condcount > 2)
! 1479: {
! 1480: *errorptr = ERR27;
! 1481: goto FAILED;
! 1482: }
! 1483: }
! 1484:
! 1485: /* Handle updating of the required character. If the subpattern didn't
! 1486: set one, leave it as it was. Otherwise, update it for normal brackets of
! 1487: all kinds, forward assertions, and conditions with two branches. Don't
! 1488: update the literal count for forward assertions, however. If the bracket
! 1489: is followed by a quantifier with zero repeat, we have to back off. Hence
! 1490: the definition of prevreqchar and subcountlits outside the main loop so
! 1491: that they can be accessed for the back off. */
! 1492:
! 1493: if (subreqchar > 0 &&
! 1494: (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
! 1495: (bravalue == OP_COND && condcount == 2)))
! 1496: {
! 1497: prevreqchar = *reqchar;
! 1498: *reqchar = subreqchar;
! 1499: if (bravalue != OP_ASSERT) *countlits += subcountlits;
! 1500: }
! 1501:
! 1502: /* Now update the main code pointer to the end of the group. */
! 1503:
! 1504: code = tempcode;
! 1505:
! 1506: /* Error if hit end of pattern */
! 1507:
! 1508: if (*ptr != ')')
! 1509: {
! 1510: *errorptr = ERR14;
! 1511: goto FAILED;
! 1512: }
! 1513: break;
! 1514:
! 1515: /* Check \ for being a real metacharacter; if not, fall through and handle
! 1516: it as a data character at the start of a string. Escape items are checked
! 1517: for validity in the pre-compiling pass. */
! 1518:
! 1519: case '\\':
! 1520: tempptr = ptr;
! 1521: c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
! 1522:
! 1523: /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
! 1524: are arranged to be the negation of the corresponding OP_values. For the
! 1525: back references, the values are ESC_REF plus the reference number. Only
! 1526: back references and those types that consume a character may be repeated.
! 1527: We can test for values between ESC_b and ESC_Z for the latter; this may
! 1528: have to change if any new ones are ever created. */
! 1529:
! 1530: if (c < 0)
! 1531: {
! 1532: if (-c >= ESC_REF)
! 1533: {
! 1534: previous = code;
! 1535: *code++ = OP_REF;
! 1536: *code++ = -c - ESC_REF;
! 1537: }
! 1538: else
! 1539: {
! 1540: previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
! 1541: *code++ = -c;
! 1542: }
! 1543: continue;
! 1544: }
! 1545:
! 1546: /* Data character: reset and fall through */
! 1547:
! 1548: ptr = tempptr;
! 1549: c = '\\';
! 1550:
! 1551: /* Handle a run of data characters until a metacharacter is encountered.
! 1552: The first character is guaranteed not to be whitespace or # when the
! 1553: extended flag is set. */
! 1554:
! 1555: NORMAL_CHAR:
! 1556: default:
! 1557: previous = code;
! 1558: *code = OP_CHARS;
! 1559: code += 2;
! 1560: length = 0;
! 1561:
! 1562: do
! 1563: {
! 1564: if ((options & PCRE_EXTENDED) != 0)
! 1565: {
! 1566: if ((cd->ctypes[c] & ctype_space) != 0) continue;
! 1567: if (c == '#')
! 1568: {
! 1569: while ((c = *(++ptr)) != 0 && c != '\n');
! 1570: if (c == 0) break;
! 1571: continue;
! 1572: }
! 1573: }
! 1574:
! 1575: /* Backslash may introduce a data char or a metacharacter. Escaped items
! 1576: are checked for validity in the pre-compiling pass. Stop the string
! 1577: before a metaitem. */
! 1578:
! 1579: if (c == '\\')
! 1580: {
! 1581: tempptr = ptr;
! 1582: c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
! 1583: if (c < 0) { ptr = tempptr; break; }
! 1584: }
! 1585:
! 1586: /* Ordinary character or single-char escape */
! 1587:
! 1588: *code++ = c;
! 1589: length++;
! 1590: }
! 1591:
! 1592: /* This "while" is the end of the "do" above. */
! 1593:
! 1594: while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
! 1595:
! 1596: /* Update the last character and the count of literals */
! 1597:
! 1598: prevreqchar = (length > 1)? code[-2] : *reqchar;
! 1599: *reqchar = code[-1];
! 1600: *countlits += length;
! 1601:
! 1602: /* Compute the length and set it in the data vector, and advance to
! 1603: the next state. */
! 1604:
! 1605: previous[1] = length;
! 1606: if (length < 255) ptr--;
! 1607: break;
! 1608: }
! 1609: } /* end of big loop */
! 1610:
! 1611: /* Control never reaches here by falling through, only by a goto for all the
! 1612: error states. Pass back the position in the pattern so that it can be displayed
! 1613: to the user for diagnosing the error. */
! 1614:
! 1615: FAILED:
! 1616: *ptrptr = ptr;
! 1617: return FALSE;
! 1618: }
! 1619:
! 1620:
! 1621:
! 1622:
! 1623: /*************************************************
! 1624: * Compile sequence of alternatives *
! 1625: *************************************************/
! 1626:
! 1627: /* On entry, ptr is pointing past the bracket character, but on return
! 1628: it points to the closing bracket, or vertical bar, or end of string.
! 1629: The code variable is pointing at the byte into which the BRA operator has been
! 1630: stored. If the ims options are changed at the start (for a (?ims: group) or
! 1631: during any branch, we need to insert an OP_OPT item at the start of every
! 1632: following branch to ensure they get set correctly at run time, and also pass
! 1633: the new options into every subsequent branch compile.
! 1634:
! 1635: Argument:
! 1636: options the option bits
! 1637: optchanged new ims options to set as if (?ims) were at the start, or -1
! 1638: for no change
! 1639: brackets -> int containing the number of extracting brackets used
! 1640: codeptr -> the address of the current code pointer
! 1641: ptrptr -> the address of the current pattern pointer
! 1642: errorptr -> pointer to error message
! 1643: lookbehind TRUE if this is a lookbehind assertion
! 1644: condref > 0 for OPT_CREF setting at start of conditional group
! 1645: reqchar -> place to put the last required character, or a negative number
! 1646: countlits -> place to put the shortest literal count of any branch
! 1647: cd points to the data block with tables pointers
! 1648:
! 1649: Returns: TRUE on success
! 1650: */
! 1651:
! 1652: static BOOL
! 1653: compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
! 1654: const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
! 1655: int *reqchar, int *countlits, compile_data *cd)
! 1656: {
! 1657: const uschar *ptr = *ptrptr;
! 1658: uschar *code = *codeptr;
! 1659: uschar *last_branch = code;
! 1660: uschar *start_bracket = code;
! 1661: uschar *reverse_count = NULL;
! 1662: int oldoptions = options & PCRE_IMS;
! 1663: int branchreqchar, branchcountlits;
! 1664:
! 1665: *reqchar = -1;
! 1666: *countlits = PCRE_MAX_POS;
! 1667: code += 3;
! 1668:
! 1669: /* At the start of a reference-based conditional group, insert the reference
! 1670: number as an OP_CREF item. */
! 1671:
! 1672: if (condref > 0)
! 1673: {
! 1674: *code++ = OP_CREF;
! 1675: *code++ = condref;
! 1676: }
! 1677:
! 1678: /* Loop for each alternative branch */
! 1679:
! 1680: for (;;)
! 1681: {
! 1682: int length;
! 1683:
! 1684: /* Handle change of options */
! 1685:
! 1686: if (optchanged >= 0)
! 1687: {
! 1688: *code++ = OP_OPT;
! 1689: *code++ = optchanged;
! 1690: options = (options & ~PCRE_IMS) | optchanged;
! 1691: }
! 1692:
! 1693: /* Set up dummy OP_REVERSE if lookbehind assertion */
! 1694:
! 1695: if (lookbehind)
! 1696: {
! 1697: *code++ = OP_REVERSE;
! 1698: reverse_count = code;
! 1699: *code++ = 0;
! 1700: *code++ = 0;
! 1701: }
! 1702:
! 1703: /* Now compile the branch */
! 1704:
! 1705: if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
! 1706: &branchreqchar, &branchcountlits, cd))
! 1707: {
! 1708: *ptrptr = ptr;
! 1709: return FALSE;
! 1710: }
! 1711:
! 1712: /* Fill in the length of the last branch */
! 1713:
! 1714: length = code - last_branch;
! 1715: last_branch[1] = length >> 8;
! 1716: last_branch[2] = length & 255;
! 1717:
! 1718: /* Save the last required character if all branches have the same; a current
! 1719: value of -1 means unset, while -2 means "previous branch had no last required
! 1720: char". */
! 1721:
! 1722: if (*reqchar != -2)
! 1723: {
! 1724: if (branchreqchar >= 0)
! 1725: {
! 1726: if (*reqchar == -1) *reqchar = branchreqchar;
! 1727: else if (*reqchar != branchreqchar) *reqchar = -2;
! 1728: }
! 1729: else *reqchar = -2;
! 1730: }
! 1731:
! 1732: /* Keep the shortest literal count */
! 1733:
! 1734: if (branchcountlits < *countlits) *countlits = branchcountlits;
! 1735: DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
! 1736:
! 1737: /* If lookbehind, check that this branch matches a fixed-length string,
! 1738: and put the length into the OP_REVERSE item. Temporarily mark the end of
! 1739: the branch with OP_END. */
! 1740:
! 1741: if (lookbehind)
! 1742: {
! 1743: *code = OP_END;
! 1744: length = find_fixedlength(last_branch);
! 1745: DPRINTF(("fixed length = %d\n", length));
! 1746: if (length < 0)
! 1747: {
! 1748: *errorptr = ERR25;
! 1749: *ptrptr = ptr;
! 1750: return FALSE;
! 1751: }
! 1752: reverse_count[0] = (length >> 8);
! 1753: reverse_count[1] = length & 255;
! 1754: }
! 1755:
! 1756: /* Reached end of expression, either ')' or end of pattern. Insert a
! 1757: terminating ket and the length of the whole bracketed item, and return,
! 1758: leaving the pointer at the terminating char. If any of the ims options
! 1759: were changed inside the group, compile a resetting op-code following. */
! 1760:
! 1761: if (*ptr != '|')
! 1762: {
! 1763: length = code - start_bracket;
! 1764: *code++ = OP_KET;
! 1765: *code++ = length >> 8;
! 1766: *code++ = length & 255;
! 1767: if (optchanged >= 0)
! 1768: {
! 1769: *code++ = OP_OPT;
! 1770: *code++ = oldoptions;
! 1771: }
! 1772: *codeptr = code;
! 1773: *ptrptr = ptr;
! 1774: return TRUE;
! 1775: }
! 1776:
! 1777: /* Another branch follows; insert an "or" node and advance the pointer. */
! 1778:
! 1779: *code = OP_ALT;
! 1780: last_branch = code;
! 1781: code += 3;
! 1782: ptr++;
! 1783: }
! 1784: /* Control never reaches here */
! 1785: }
! 1786:
! 1787:
! 1788:
! 1789:
! 1790: /*************************************************
! 1791: * Find first significant op code *
! 1792: *************************************************/
! 1793:
! 1794: /* This is called by several functions that scan a compiled expression looking
! 1795: for a fixed first character, or an anchoring op code etc. It skips over things
! 1796: that do not influence this. For one application, a change of caseless option is
! 1797: important.
! 1798:
! 1799: Arguments:
! 1800: code pointer to the start of the group
! 1801: options pointer to external options
! 1802: optbit the option bit whose changing is significant, or
! 1803: zero if none are
! 1804: optstop TRUE to return on option change, otherwise change the options
! 1805: value and continue
! 1806:
! 1807: Returns: pointer to the first significant opcode
! 1808: */
! 1809:
! 1810: static const uschar*
! 1811: first_significant_code(const uschar *code, int *options, int optbit,
! 1812: BOOL optstop)
! 1813: {
! 1814: for (;;)
! 1815: {
! 1816: switch ((int)*code)
! 1817: {
! 1818: case OP_OPT:
! 1819: if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
! 1820: {
! 1821: if (optstop) return code;
! 1822: *options = (int)code[1];
! 1823: }
! 1824: code += 2;
! 1825: break;
! 1826:
! 1827: case OP_CREF:
! 1828: code += 2;
! 1829: break;
! 1830:
! 1831: case OP_WORD_BOUNDARY:
! 1832: case OP_NOT_WORD_BOUNDARY:
! 1833: code++;
! 1834: break;
! 1835:
! 1836: case OP_ASSERT_NOT:
! 1837: case OP_ASSERTBACK:
! 1838: case OP_ASSERTBACK_NOT:
! 1839: do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
! 1840: code += 3;
! 1841: break;
! 1842:
! 1843: default:
! 1844: return code;
! 1845: }
! 1846: }
! 1847: /* Control never reaches here */
! 1848: }
! 1849:
! 1850:
! 1851:
! 1852:
! 1853: /*************************************************
! 1854: * Check for anchored expression *
! 1855: *************************************************/
! 1856:
! 1857: /* Try to find out if this is an anchored regular expression. Consider each
! 1858: alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
! 1859: all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
! 1860: it's anchored. However, if this is a multiline pattern, then only OP_SOD
! 1861: counts, since OP_CIRC can match in the middle.
! 1862:
! 1863: A branch is also implicitly anchored if it starts with .* and DOTALL is set,
! 1864: because that will try the rest of the pattern at all possible matching points,
! 1865: so there is no point trying them again.
! 1866:
! 1867: Arguments:
! 1868: code points to start of expression (the bracket)
! 1869: options points to the options setting
! 1870:
! 1871: Returns: TRUE or FALSE
! 1872: */
! 1873:
! 1874: static BOOL
! 1875: is_anchored(register const uschar *code, int *options)
! 1876: {
! 1877: do {
! 1878: const uschar *scode = first_significant_code(code + 3, options,
! 1879: PCRE_MULTILINE, FALSE);
! 1880: register int op = *scode;
! 1881: if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
! 1882: { if (!is_anchored(scode, options)) return FALSE; }
! 1883: else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
! 1884: (*options & PCRE_DOTALL) != 0)
! 1885: { if (scode[1] != OP_ANY) return FALSE; }
! 1886: else if (op != OP_SOD &&
! 1887: ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
! 1888: return FALSE;
! 1889: code += (code[1] << 8) + code[2];
! 1890: }
! 1891: while (*code == OP_ALT);
! 1892: return TRUE;
! 1893: }
! 1894:
! 1895:
! 1896:
! 1897: /*************************************************
! 1898: * Check for starting with ^ or .* *
! 1899: *************************************************/
! 1900:
! 1901: /* This is called to find out if every branch starts with ^ or .* so that
! 1902: "first char" processing can be done to speed things up in multiline
! 1903: matching and for non-DOTALL patterns that start with .* (which must start at
! 1904: the beginning or after \n).
! 1905:
! 1906: Argument: points to start of expression (the bracket)
! 1907: Returns: TRUE or FALSE
! 1908: */
! 1909:
! 1910: static BOOL
! 1911: is_startline(const uschar *code)
! 1912: {
! 1913: do {
! 1914: const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
! 1915: register int op = *scode;
! 1916: if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
! 1917: { if (!is_startline(scode)) return FALSE; }
! 1918: else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
! 1919: { if (scode[1] != OP_ANY) return FALSE; }
! 1920: else if (op != OP_CIRC) return FALSE;
! 1921: code += (code[1] << 8) + code[2];
! 1922: }
! 1923: while (*code == OP_ALT);
! 1924: return TRUE;
! 1925: }
! 1926:
! 1927:
! 1928:
! 1929: /*************************************************
! 1930: * Check for fixed first char *
! 1931: *************************************************/
! 1932:
! 1933: /* Try to find out if there is a fixed first character. This is called for
! 1934: unanchored expressions, as it speeds up their processing quite considerably.
! 1935: Consider each alternative branch. If they all start with the same char, or with
! 1936: a bracket all of whose alternatives start with the same char (recurse ad lib),
! 1937: then we return that char, otherwise -1.
! 1938:
! 1939: Arguments:
! 1940: code points to start of expression (the bracket)
! 1941: options pointer to the options (used to check casing changes)
! 1942:
! 1943: Returns: -1 or the fixed first char
! 1944: */
! 1945:
! 1946: static int
! 1947: find_firstchar(const uschar *code, int *options)
! 1948: {
! 1949: register int c = -1;
! 1950: do {
! 1951: int d;
! 1952: const uschar *scode = first_significant_code(code + 3, options,
! 1953: PCRE_CASELESS, TRUE);
! 1954: register int op = *scode;
! 1955:
! 1956: if (op >= OP_BRA) op = OP_BRA;
! 1957:
! 1958: switch(op)
! 1959: {
! 1960: default:
! 1961: return -1;
! 1962:
! 1963: case OP_BRA:
! 1964: case OP_ASSERT:
! 1965: case OP_ONCE:
! 1966: case OP_COND:
! 1967: if ((d = find_firstchar(scode, options)) < 0) return -1;
! 1968: if (c < 0) c = d; else if (c != d) return -1;
! 1969: break;
! 1970:
! 1971: case OP_EXACT: /* Fall through */
! 1972: scode++;
! 1973:
! 1974: case OP_CHARS: /* Fall through */
! 1975: scode++;
! 1976:
! 1977: case OP_PLUS:
! 1978: case OP_MINPLUS:
! 1979: if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
! 1980: break;
! 1981: }
! 1982:
! 1983: code += (code[1] << 8) + code[2];
! 1984: }
! 1985: while (*code == OP_ALT);
! 1986: return c;
! 1987: }
! 1988:
! 1989:
! 1990:
! 1991:
! 1992:
! 1993: /*************************************************
! 1994: * Compile a Regular Expression *
! 1995: *************************************************/
! 1996:
! 1997: /* This function takes a string and returns a pointer to a block of store
! 1998: holding a compiled version of the expression.
! 1999:
! 2000: Arguments:
! 2001: pattern the regular expression
! 2002: options various option bits
! 2003: errorptr pointer to pointer to error text
! 2004: erroroffset ptr offset in pattern where error was detected
! 2005: tables pointer to character tables or NULL
! 2006:
! 2007: Returns: pointer to compiled data block, or NULL on error,
! 2008: with errorptr and erroroffset set
! 2009: */
! 2010:
! 2011: pcre *
! 2012: pcre_compile(const char *pattern, int options, const char **errorptr,
! 2013: int *erroroffset, const unsigned char *tables)
! 2014: {
! 2015: real_pcre *re;
! 2016: int length = 3; /* For initial BRA plus length */
! 2017: int runlength;
! 2018: int c, size, reqchar, countlits;
! 2019: int bracount = 0;
! 2020: int top_backref = 0;
! 2021: int branch_extra = 0;
! 2022: int branch_newextra;
! 2023: unsigned int brastackptr = 0;
! 2024: uschar *code;
! 2025: const uschar *ptr;
! 2026: compile_data compile_block;
! 2027: int brastack[BRASTACK_SIZE];
! 2028: uschar bralenstack[BRASTACK_SIZE];
! 2029:
! 2030: #ifdef DEBUG
! 2031: uschar *code_base, *code_end;
! 2032: #endif
! 2033:
! 2034: /* We can't pass back an error message if errorptr is NULL; I guess the best we
! 2035: can do is just return NULL. */
! 2036:
! 2037: if (errorptr == NULL) return NULL;
! 2038: *errorptr = NULL;
! 2039:
! 2040: /* However, we can give a message for this error */
! 2041:
! 2042: if (erroroffset == NULL)
! 2043: {
! 2044: *errorptr = ERR16;
! 2045: return NULL;
! 2046: }
! 2047: *erroroffset = 0;
! 2048:
! 2049: if ((options & ~PUBLIC_OPTIONS) != 0)
! 2050: {
! 2051: *errorptr = ERR17;
! 2052: return NULL;
! 2053: }
! 2054:
! 2055: /* Set up pointers to the individual character tables */
! 2056:
! 2057: if (tables == NULL) tables = pcre_default_tables;
! 2058: compile_block.lcc = tables + lcc_offset;
! 2059: compile_block.fcc = tables + fcc_offset;
! 2060: compile_block.cbits = tables + cbits_offset;
! 2061: compile_block.ctypes = tables + ctypes_offset;
! 2062:
! 2063: /* Reflect pattern for debugging output */
! 2064:
! 2065: DPRINTF(("------------------------------------------------------------------\n"));
! 2066: DPRINTF(("%s\n", pattern));
! 2067:
! 2068: /* The first thing to do is to make a pass over the pattern to compute the
! 2069: amount of store required to hold the compiled code. This does not have to be
! 2070: perfect as long as errors are overestimates. At the same time we can detect any
! 2071: internal flag settings. Make an attempt to correct for any counted white space
! 2072: if an "extended" flag setting appears late in the pattern. We can't be so
! 2073: clever for #-comments. */
! 2074:
! 2075: ptr = (const uschar *)(pattern - 1);
! 2076: while ((c = *(++ptr)) != 0)
! 2077: {
! 2078: int min, max;
! 2079: int class_charcount;
! 2080:
! 2081: if ((options & PCRE_EXTENDED) != 0)
! 2082: {
! 2083: if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
! 2084: if (c == '#')
! 2085: {
! 2086: while ((c = *(++ptr)) != 0 && c != '\n');
! 2087: continue;
! 2088: }
! 2089: }
! 2090:
! 2091: switch(c)
! 2092: {
! 2093: /* A backslashed item may be an escaped "normal" character or a
! 2094: character type. For a "normal" character, put the pointers and
! 2095: character back so that tests for whitespace etc. in the input
! 2096: are done correctly. */
! 2097:
! 2098: case '\\':
! 2099: {
! 2100: const uschar *save_ptr = ptr;
! 2101: c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
! 2102: if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
! 2103: if (c >= 0)
! 2104: {
! 2105: ptr = save_ptr;
! 2106: c = '\\';
! 2107: goto NORMAL_CHAR;
! 2108: }
! 2109: }
! 2110: length++;
! 2111:
! 2112: /* A back reference needs an additional char, plus either one or 5
! 2113: bytes for a repeat. We also need to keep the value of the highest
! 2114: back reference. */
! 2115:
! 2116: if (c <= -ESC_REF)
! 2117: {
! 2118: int refnum = -c - ESC_REF;
! 2119: if (refnum > top_backref) top_backref = refnum;
! 2120: length++; /* For single back reference */
! 2121: if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
! 2122: {
! 2123: ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
! 2124: if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
! 2125: if ((min == 0 && (max == 1 || max == -1)) ||
! 2126: (min == 1 && max == -1))
! 2127: length++;
! 2128: else length += 5;
! 2129: if (ptr[1] == '?') ptr++;
! 2130: }
! 2131: }
! 2132: continue;
! 2133:
! 2134: case '^':
! 2135: case '.':
! 2136: case '$':
! 2137: case '*': /* These repeats won't be after brackets; */
! 2138: case '+': /* those are handled separately */
! 2139: case '?':
! 2140: length++;
! 2141: continue;
! 2142:
! 2143: /* This covers the cases of repeats after a single char, metachar, class,
! 2144: or back reference. */
! 2145:
! 2146: case '{':
! 2147: if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
! 2148: ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
! 2149: if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
! 2150: if ((min == 0 && (max == 1 || max == -1)) ||
! 2151: (min == 1 && max == -1))
! 2152: length++;
! 2153: else
! 2154: {
! 2155: length--; /* Uncount the original char or metachar */
! 2156: if (min == 1) length++; else if (min > 0) length += 4;
! 2157: if (max > 0) length += 4; else length += 2;
! 2158: }
! 2159: if (ptr[1] == '?') ptr++;
! 2160: continue;
! 2161:
! 2162: /* An alternation contains an offset to the next branch or ket. If any ims
! 2163: options changed in the previous branch(es), and/or if we are in a
! 2164: lookbehind assertion, extra space will be needed at the start of the
! 2165: branch. This is handled by branch_extra. */
! 2166:
! 2167: case '|':
! 2168: length += 3 + branch_extra;
! 2169: continue;
! 2170:
! 2171: /* A character class uses 33 characters. Don't worry about character types
! 2172: that aren't allowed in classes - they'll get picked up during the compile.
! 2173: A character class that contains only one character uses 2 or 3 bytes,
! 2174: depending on whether it is negated or not. Notice this where we can. */
! 2175:
! 2176: case '[':
! 2177: class_charcount = 0;
! 2178: if (*(++ptr) == '^') ptr++;
! 2179: do
! 2180: {
! 2181: if (*ptr == '\\')
! 2182: {
! 2183: int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
! 2184: &compile_block);
! 2185: if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
! 2186: if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
! 2187: }
! 2188: else class_charcount++;
! 2189: ptr++;
! 2190: }
! 2191: while (*ptr != 0 && *ptr != ']');
! 2192:
! 2193: /* Repeats for negated single chars are handled by the general code */
! 2194:
! 2195: if (class_charcount == 1) length += 3; else
! 2196: {
! 2197: length += 33;
! 2198:
! 2199: /* A repeat needs either 1 or 5 bytes. */
! 2200:
! 2201: if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
! 2202: {
! 2203: ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
! 2204: if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
! 2205: if ((min == 0 && (max == 1 || max == -1)) ||
! 2206: (min == 1 && max == -1))
! 2207: length++;
! 2208: else length += 5;
! 2209: if (ptr[1] == '?') ptr++;
! 2210: }
! 2211: }
! 2212: continue;
! 2213:
! 2214: /* Brackets may be genuine groups or special things */
! 2215:
! 2216: case '(':
! 2217: branch_newextra = 0;
! 2218:
! 2219: /* Handle special forms of bracket, which all start (? */
! 2220:
! 2221: if (ptr[1] == '?')
! 2222: {
! 2223: int set, unset;
! 2224: int *optset;
! 2225:
! 2226: switch (c = ptr[2])
! 2227: {
! 2228: /* Skip over comments entirely */
! 2229: case '#':
! 2230: ptr += 3;
! 2231: while (*ptr != 0 && *ptr != ')') ptr++;
! 2232: if (*ptr == 0)
! 2233: {
! 2234: *errorptr = ERR18;
! 2235: goto PCRE_ERROR_RETURN;
! 2236: }
! 2237: continue;
! 2238:
! 2239: /* Non-referencing groups and lookaheads just move the pointer on, and
! 2240: then behave like a non-special bracket, except that they don't increment
! 2241: the count of extracting brackets. Ditto for the "once only" bracket,
! 2242: which is in Perl from version 5.005. */
! 2243:
! 2244: case ':':
! 2245: case '=':
! 2246: case '!':
! 2247: case '>':
! 2248: ptr += 2;
! 2249: break;
! 2250:
! 2251: /* Lookbehinds are in Perl from version 5.005 */
! 2252:
! 2253: case '<':
! 2254: if (ptr[3] == '=' || ptr[3] == '!')
! 2255: {
! 2256: ptr += 3;
! 2257: branch_newextra = 3;
! 2258: length += 3; /* For the first branch */
! 2259: break;
! 2260: }
! 2261: *errorptr = ERR24;
! 2262: goto PCRE_ERROR_RETURN;
! 2263:
! 2264: /* Conditionals are in Perl from version 5.005. The bracket must either
! 2265: be followed by a number (for bracket reference) or by an assertion
! 2266: group. */
! 2267:
! 2268: case '(':
! 2269: if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
! 2270: {
! 2271: ptr += 4;
! 2272: length += 2;
! 2273: while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
! 2274: if (*ptr != ')')
! 2275: {
! 2276: *errorptr = ERR26;
! 2277: goto PCRE_ERROR_RETURN;
! 2278: }
! 2279: }
! 2280: else /* An assertion must follow */
! 2281: {
! 2282: ptr++; /* Can treat like ':' as far as spacing is concerned */
! 2283:
! 2284: if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
! 2285: {
! 2286: ptr += 2; /* To get right offset in message */
! 2287: *errorptr = ERR28;
! 2288: goto PCRE_ERROR_RETURN;
! 2289: }
! 2290: }
! 2291: break;
! 2292:
! 2293: /* Else loop checking valid options until ) is met. Anything else is an
! 2294: error. If we are without any brackets, i.e. at top level, the settings
! 2295: act as if specified in the options, so massage the options immediately.
! 2296: This is for backward compatibility with Perl 5.004. */
! 2297:
! 2298: default:
! 2299: set = unset = 0;
! 2300: optset = &set;
! 2301: ptr += 2;
! 2302:
! 2303: for (;; ptr++)
! 2304: {
! 2305: c = *ptr;
! 2306: switch (c)
! 2307: {
! 2308: case 'i':
! 2309: *optset |= PCRE_CASELESS;
! 2310: continue;
! 2311:
! 2312: case 'm':
! 2313: *optset |= PCRE_MULTILINE;
! 2314: continue;
! 2315:
! 2316: case 's':
! 2317: *optset |= PCRE_DOTALL;
! 2318: continue;
! 2319:
! 2320: case 'x':
! 2321: *optset |= PCRE_EXTENDED;
! 2322: continue;
! 2323:
! 2324: case 'X':
! 2325: *optset |= PCRE_EXTRA;
! 2326: continue;
! 2327:
! 2328: case 'U':
! 2329: *optset |= PCRE_UNGREEDY;
! 2330: continue;
! 2331:
! 2332: case '-':
! 2333: optset = &unset;
! 2334: continue;
! 2335:
! 2336: /* A termination by ')' indicates an options-setting-only item;
! 2337: this is global at top level; otherwise nothing is done here and
! 2338: it is handled during the compiling process on a per-bracket-group
! 2339: basis. */
! 2340:
! 2341: case ')':
! 2342: if (brastackptr == 0)
! 2343: {
! 2344: options = (options | set) & (~unset);
! 2345: set = unset = 0; /* To save length */
! 2346: }
! 2347: /* Fall through */
! 2348:
! 2349: /* A termination by ':' indicates the start of a nested group with
! 2350: the given options set. This is again handled at compile time, but
! 2351: we must allow for compiled space if any of the ims options are
! 2352: set. We also have to allow for resetting space at the end of
! 2353: the group, which is why 4 is added to the length and not just 2.
! 2354: If there are several changes of options within the same group, this
! 2355: will lead to an over-estimate on the length, but this shouldn't
! 2356: matter very much. We also have to allow for resetting options at
! 2357: the start of any alternations, which we do by setting
! 2358: branch_newextra to 2. Finally, we record whether the case-dependent
! 2359: flag ever changes within the regex. This is used by the "required
! 2360: character" code. */
! 2361:
! 2362: case ':':
! 2363: if (((set|unset) & PCRE_IMS) != 0)
! 2364: {
! 2365: length += 4;
! 2366: branch_newextra = 2;
! 2367: if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
! 2368: }
! 2369: goto END_OPTIONS;
! 2370:
! 2371: /* Unrecognized option character */
! 2372:
! 2373: default:
! 2374: *errorptr = ERR12;
! 2375: goto PCRE_ERROR_RETURN;
! 2376: }
! 2377: }
! 2378:
! 2379: /* If we hit a closing bracket, that's it - this is a freestanding
! 2380: option-setting. We need to ensure that branch_extra is updated if
! 2381: necessary. The only values branch_newextra can have here are 0 or 2.
! 2382: If the value is 2, then branch_extra must either be 2 or 5, depending
! 2383: on whether this is a lookbehind group or not. */
! 2384:
! 2385: END_OPTIONS:
! 2386: if (c == ')')
! 2387: {
! 2388: if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
! 2389: branch_extra += branch_newextra;
! 2390: continue;
! 2391: }
! 2392:
! 2393: /* If options were terminated by ':' control comes here. Fall through
! 2394: to handle the group below. */
! 2395: }
! 2396: }
! 2397:
! 2398: /* Extracting brackets must be counted so we can process escapes in a
! 2399: Perlish way. */
! 2400:
! 2401: else bracount++;
! 2402:
! 2403: /* Non-special forms of bracket. Save length for computing whole length
! 2404: at end if there's a repeat that requires duplication of the group. Also
! 2405: save the current value of branch_extra, and start the new group with
! 2406: the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
! 2407: for a lookbehind assertion. */
! 2408:
! 2409: if (brastackptr >= sizeof(brastack)/sizeof(int))
! 2410: {
! 2411: *errorptr = ERR19;
! 2412: goto PCRE_ERROR_RETURN;
! 2413: }
! 2414:
! 2415: bralenstack[brastackptr] = branch_extra;
! 2416: branch_extra = branch_newextra;
! 2417:
! 2418: brastack[brastackptr++] = length;
! 2419: length += 3;
! 2420: continue;
! 2421:
! 2422: /* Handle ket. Look for subsequent max/min; for certain sets of values we
! 2423: have to replicate this bracket up to that many times. If brastackptr is
! 2424: 0 this is an unmatched bracket which will generate an error, but take care
! 2425: not to try to access brastack[-1] when computing the length and restoring
! 2426: the branch_extra value. */
! 2427:
! 2428: case ')':
! 2429: length += 3;
! 2430: {
! 2431: int minval = 1;
! 2432: int maxval = 1;
! 2433: int duplength;
! 2434:
! 2435: if (brastackptr > 0)
! 2436: {
! 2437: duplength = length - brastack[--brastackptr];
! 2438: branch_extra = bralenstack[brastackptr];
! 2439: }
! 2440: else duplength = 0;
! 2441:
! 2442: /* Leave ptr at the final char; for read_repeat_counts this happens
! 2443: automatically; for the others we need an increment. */
! 2444:
! 2445: if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
! 2446: {
! 2447: ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
! 2448: &compile_block);
! 2449: if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
! 2450: }
! 2451: else if (c == '*') { minval = 0; maxval = -1; ptr++; }
! 2452: else if (c == '+') { maxval = -1; ptr++; }
! 2453: else if (c == '?') { minval = 0; ptr++; }
! 2454:
! 2455: /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
! 2456: group, and if the maximum is greater than zero, we have to replicate
! 2457: maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
! 2458: bracket set - hence the 7. */
! 2459:
! 2460: if (minval == 0)
! 2461: {
! 2462: length++;
! 2463: if (maxval > 0) length += (maxval - 1) * (duplength + 7);
! 2464: }
! 2465:
! 2466: /* When the minimum is greater than zero, 1 we have to replicate up to
! 2467: minval-1 times, with no additions required in the copies. Then, if
! 2468: there is a limited maximum we have to replicate up to maxval-1 times
! 2469: allowing for a BRAZERO item before each optional copy and nesting
! 2470: brackets for all but one of the optional copies. */
! 2471:
! 2472: else
! 2473: {
! 2474: length += (minval - 1) * duplength;
! 2475: if (maxval > minval) /* Need this test as maxval=-1 means no limit */
! 2476: length += (maxval - minval) * (duplength + 7) - 6;
! 2477: }
! 2478: }
! 2479: continue;
! 2480:
! 2481: /* Non-special character. For a run of such characters the length required
! 2482: is the number of characters + 2, except that the maximum run length is 255.
! 2483: We won't get a skipped space or a non-data escape or the start of a #
! 2484: comment as the first character, so the length can't be zero. */
! 2485:
! 2486: NORMAL_CHAR:
! 2487: default:
! 2488: length += 2;
! 2489: runlength = 0;
! 2490: do
! 2491: {
! 2492: if ((options & PCRE_EXTENDED) != 0)
! 2493: {
! 2494: if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
! 2495: if (c == '#')
! 2496: {
! 2497: while ((c = *(++ptr)) != 0 && c != '\n');
! 2498: continue;
! 2499: }
! 2500: }
! 2501:
! 2502: /* Backslash may introduce a data char or a metacharacter; stop the
! 2503: string before the latter. */
! 2504:
! 2505: if (c == '\\')
! 2506: {
! 2507: const uschar *saveptr = ptr;
! 2508: c = check_escape(&ptr, errorptr, bracount, options, FALSE,
! 2509: &compile_block);
! 2510: if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
! 2511: if (c < 0) { ptr = saveptr; break; }
! 2512: }
! 2513:
! 2514: /* Ordinary character or single-char escape */
! 2515:
! 2516: runlength++;
! 2517: }
! 2518:
! 2519: /* This "while" is the end of the "do" above. */
! 2520:
! 2521: while (runlength < 255 &&
! 2522: (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
! 2523:
! 2524: ptr--;
! 2525: length += runlength;
! 2526: continue;
! 2527: }
! 2528: }
! 2529:
! 2530: length += 4; /* For final KET and END */
! 2531:
! 2532: if (length > 65539)
! 2533: {
! 2534: *errorptr = ERR20;
! 2535: return NULL;
! 2536: }
! 2537:
! 2538: /* Compute the size of data block needed and get it, either from malloc or
! 2539: externally provided function. We specify "code[0]" in the offsetof() expression
! 2540: rather than just "code", because it has been reported that one broken compiler
! 2541: fails on "code" because it is also an independent variable. It should make no
! 2542: difference to the value of the offsetof(). */
! 2543:
! 2544: size = length + offsetof(real_pcre, code[0]);
! 2545: re = (real_pcre *)(pcre_malloc)(size);
! 2546:
! 2547: if (re == NULL)
! 2548: {
! 2549: *errorptr = ERR21;
! 2550: return NULL;
! 2551: }
! 2552:
! 2553: /* Put in the magic number and the options. */
! 2554:
! 2555: re->magic_number = MAGIC_NUMBER;
! 2556: re->options = options;
! 2557: re->tables = tables;
! 2558:
! 2559: /* Set up a starting, non-extracting bracket, then compile the expression. On
! 2560: error, *errorptr will be set non-NULL, so we don't need to look at the result
! 2561: of the function here. */
! 2562:
! 2563: ptr = (const uschar *)pattern;
! 2564: code = re->code;
! 2565: *code = OP_BRA;
! 2566: bracount = 0;
! 2567: (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
! 2568: &reqchar, &countlits, &compile_block);
! 2569: re->top_bracket = bracount;
! 2570: re->top_backref = top_backref;
! 2571:
! 2572: /* If not reached end of pattern on success, there's an excess bracket. */
! 2573:
! 2574: if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
! 2575:
! 2576: /* Fill in the terminating state and check for disastrous overflow, but
! 2577: if debugging, leave the test till after things are printed out. */
! 2578:
! 2579: *code++ = OP_END;
! 2580:
! 2581: #ifndef DEBUG
! 2582: if (code - re->code > length) *errorptr = ERR23;
! 2583: #endif
! 2584:
! 2585: /* Give an error if there's back reference to a non-existent capturing
! 2586: subpattern. */
! 2587:
! 2588: if (top_backref > re->top_bracket) *errorptr = ERR15;
! 2589:
! 2590: /* Failed to compile */
! 2591:
! 2592: if (*errorptr != NULL)
! 2593: {
! 2594: (pcre_free)(re);
! 2595: PCRE_ERROR_RETURN:
! 2596: *erroroffset = ptr - (const uschar *)pattern;
! 2597: return NULL;
! 2598: }
! 2599:
! 2600: /* If the anchored option was not passed, set flag if we can determine that the
! 2601: pattern is anchored by virtue of ^ characters or \A or anything else (such as
! 2602: starting with .* when DOTALL is set).
! 2603:
! 2604: Otherwise, see if we can determine what the first character has to be, because
! 2605: that speeds up unanchored matches no end. If not, see if we can set the
! 2606: PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
! 2607: start with ^. and also when all branches start with .* for non-DOTALL matches.
! 2608: */
! 2609:
! 2610: if ((options & PCRE_ANCHORED) == 0)
! 2611: {
! 2612: int temp_options = options;
! 2613: if (is_anchored(re->code, &temp_options))
! 2614: re->options |= PCRE_ANCHORED;
! 2615: else
! 2616: {
! 2617: int ch = find_firstchar(re->code, &temp_options);
! 2618: if (ch >= 0)
! 2619: {
! 2620: re->first_char = ch;
! 2621: re->options |= PCRE_FIRSTSET;
! 2622: }
! 2623: else if (is_startline(re->code))
! 2624: re->options |= PCRE_STARTLINE;
! 2625: }
! 2626: }
! 2627:
! 2628: /* Save the last required character if there are at least two literal
! 2629: characters on all paths, or if there is no first character setting. */
! 2630:
! 2631: if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
! 2632: {
! 2633: re->req_char = reqchar;
! 2634: re->options |= PCRE_REQCHSET;
! 2635: }
! 2636:
! 2637: /* Print out the compiled data for debugging */
! 2638:
! 2639: #ifdef DEBUG
! 2640:
! 2641: printf("Length = %d top_bracket = %d top_backref = %d\n",
! 2642: length, re->top_bracket, re->top_backref);
! 2643:
! 2644: if (re->options != 0)
! 2645: {
! 2646: printf("%s%s%s%s%s%s%s%s%s\n",
! 2647: ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
! 2648: ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
! 2649: ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
! 2650: ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
! 2651: ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
! 2652: ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
! 2653: ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
! 2654: ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
! 2655: ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
! 2656: }
! 2657:
! 2658: if ((re->options & PCRE_FIRSTSET) != 0)
! 2659: {
! 2660: if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
! 2661: else printf("First char = \\x%02x\n", re->first_char);
! 2662: }
! 2663:
! 2664: if ((re->options & PCRE_REQCHSET) != 0)
! 2665: {
! 2666: if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
! 2667: else printf("Req char = \\x%02x\n", re->req_char);
! 2668: }
! 2669:
! 2670: code_end = code;
! 2671: code_base = code = re->code;
! 2672:
! 2673: while (code < code_end)
! 2674: {
! 2675: int charlength;
! 2676:
! 2677: printf("%3d ", code - code_base);
! 2678:
! 2679: if (*code >= OP_BRA)
! 2680: {
! 2681: printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
! 2682: code += 2;
! 2683: }
! 2684:
! 2685: else switch(*code)
! 2686: {
! 2687: case OP_OPT:
! 2688: printf(" %.2x %s", code[1], OP_names[*code]);
! 2689: code++;
! 2690: break;
! 2691:
! 2692: case OP_COND:
! 2693: printf("%3d Cond", (code[1] << 8) + code[2]);
! 2694: code += 2;
! 2695: break;
! 2696:
! 2697: case OP_CREF:
! 2698: printf(" %.2d %s", code[1], OP_names[*code]);
! 2699: code++;
! 2700: break;
! 2701:
! 2702: case OP_CHARS:
! 2703: charlength = *(++code);
! 2704: printf("%3d ", charlength);
! 2705: while (charlength-- > 0)
! 2706: if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
! 2707: break;
! 2708:
! 2709: case OP_KETRMAX:
! 2710: case OP_KETRMIN:
! 2711: case OP_ALT:
! 2712: case OP_KET:
! 2713: case OP_ASSERT:
! 2714: case OP_ASSERT_NOT:
! 2715: case OP_ASSERTBACK:
! 2716: case OP_ASSERTBACK_NOT:
! 2717: case OP_ONCE:
! 2718: printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
! 2719: code += 2;
! 2720: break;
! 2721:
! 2722: case OP_REVERSE:
! 2723: printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
! 2724: code += 2;
! 2725: break;
! 2726:
! 2727: case OP_STAR:
! 2728: case OP_MINSTAR:
! 2729: case OP_PLUS:
! 2730: case OP_MINPLUS:
! 2731: case OP_QUERY:
! 2732: case OP_MINQUERY:
! 2733: case OP_TYPESTAR:
! 2734: case OP_TYPEMINSTAR:
! 2735: case OP_TYPEPLUS:
! 2736: case OP_TYPEMINPLUS:
! 2737: case OP_TYPEQUERY:
! 2738: case OP_TYPEMINQUERY:
! 2739: if (*code >= OP_TYPESTAR)
! 2740: printf(" %s", OP_names[code[1]]);
! 2741: else if (isprint(c = code[1])) printf(" %c", c);
! 2742: else printf(" \\x%02x", c);
! 2743: printf("%s", OP_names[*code++]);
! 2744: break;
! 2745:
! 2746: case OP_EXACT:
! 2747: case OP_UPTO:
! 2748: case OP_MINUPTO:
! 2749: if (isprint(c = code[3])) printf(" %c{", c);
! 2750: else printf(" \\x%02x{", c);
! 2751: if (*code != OP_EXACT) printf("0,");
! 2752: printf("%d}", (code[1] << 8) + code[2]);
! 2753: if (*code == OP_MINUPTO) printf("?");
! 2754: code += 3;
! 2755: break;
! 2756:
! 2757: case OP_TYPEEXACT:
! 2758: case OP_TYPEUPTO:
! 2759: case OP_TYPEMINUPTO:
! 2760: printf(" %s{", OP_names[code[3]]);
! 2761: if (*code != OP_TYPEEXACT) printf(",");
! 2762: printf("%d}", (code[1] << 8) + code[2]);
! 2763: if (*code == OP_TYPEMINUPTO) printf("?");
! 2764: code += 3;
! 2765: break;
! 2766:
! 2767: case OP_NOT:
! 2768: if (isprint(c = *(++code))) printf(" [^%c]", c);
! 2769: else printf(" [^\\x%02x]", c);
! 2770: break;
! 2771:
! 2772: case OP_NOTSTAR:
! 2773: case OP_NOTMINSTAR:
! 2774: case OP_NOTPLUS:
! 2775: case OP_NOTMINPLUS:
! 2776: case OP_NOTQUERY:
! 2777: case OP_NOTMINQUERY:
! 2778: if (isprint(c = code[1])) printf(" [^%c]", c);
! 2779: else printf(" [^\\x%02x]", c);
! 2780: printf("%s", OP_names[*code++]);
! 2781: break;
! 2782:
! 2783: case OP_NOTEXACT:
! 2784: case OP_NOTUPTO:
! 2785: case OP_NOTMINUPTO:
! 2786: if (isprint(c = code[3])) printf(" [^%c]{", c);
! 2787: else printf(" [^\\x%02x]{", c);
! 2788: if (*code != OP_NOTEXACT) printf(",");
! 2789: printf("%d}", (code[1] << 8) + code[2]);
! 2790: if (*code == OP_NOTMINUPTO) printf("?");
! 2791: code += 3;
! 2792: break;
! 2793:
! 2794: case OP_REF:
! 2795: printf(" \\%d", *(++code));
! 2796: code ++;
! 2797: goto CLASS_REF_REPEAT;
! 2798:
! 2799: case OP_CLASS:
! 2800: {
! 2801: int i, min, max;
! 2802: code++;
! 2803: printf(" [");
! 2804:
! 2805: for (i = 0; i < 256; i++)
! 2806: {
! 2807: if ((code[i/8] & (1 << (i&7))) != 0)
! 2808: {
! 2809: int j;
! 2810: for (j = i+1; j < 256; j++)
! 2811: if ((code[j/8] & (1 << (j&7))) == 0) break;
! 2812: if (i == '-' || i == ']') printf("\\");
! 2813: if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
! 2814: if (--j > i)
! 2815: {
! 2816: printf("-");
! 2817: if (j == '-' || j == ']') printf("\\");
! 2818: if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
! 2819: }
! 2820: i = j;
! 2821: }
! 2822: }
! 2823: printf("]");
! 2824: code += 32;
! 2825:
! 2826: CLASS_REF_REPEAT:
! 2827:
! 2828: switch(*code)
! 2829: {
! 2830: case OP_CRSTAR:
! 2831: case OP_CRMINSTAR:
! 2832: case OP_CRPLUS:
! 2833: case OP_CRMINPLUS:
! 2834: case OP_CRQUERY:
! 2835: case OP_CRMINQUERY:
! 2836: printf("%s", OP_names[*code]);
! 2837: break;
! 2838:
! 2839: case OP_CRRANGE:
! 2840: case OP_CRMINRANGE:
! 2841: min = (code[1] << 8) + code[2];
! 2842: max = (code[3] << 8) + code[4];
! 2843: if (max == 0) printf("{%d,}", min);
! 2844: else printf("{%d,%d}", min, max);
! 2845: if (*code == OP_CRMINRANGE) printf("?");
! 2846: code += 4;
! 2847: break;
! 2848:
! 2849: default:
! 2850: code--;
! 2851: }
! 2852: }
! 2853: break;
! 2854:
! 2855: /* Anything else is just a one-node item */
! 2856:
! 2857: default:
! 2858: printf(" %s", OP_names[*code]);
! 2859: break;
! 2860: }
! 2861:
! 2862: code++;
! 2863: printf("\n");
! 2864: }
! 2865: printf("------------------------------------------------------------------\n");
! 2866:
! 2867: /* This check is done here in the debugging case so that the code that
! 2868: was compiled can be seen. */
! 2869:
! 2870: if (code - re->code > length)
! 2871: {
! 2872: *errorptr = ERR23;
! 2873: (pcre_free)(re);
! 2874: *erroroffset = ptr - (uschar *)pattern;
! 2875: return NULL;
! 2876: }
! 2877: #endif
! 2878:
! 2879: return (pcre *)re;
! 2880: }
! 2881:
! 2882:
! 2883:
! 2884: /*************************************************
! 2885: * Match a back-reference *
! 2886: *************************************************/
! 2887:
! 2888: /* If a back reference hasn't been set, the length that is passed is greater
! 2889: than the number of characters left in the string, so the match fails.
! 2890:
! 2891: Arguments:
! 2892: offset index into the offset vector
! 2893: eptr points into the subject
! 2894: length length to be matched
! 2895: md points to match data block
! 2896: ims the ims flags
! 2897:
! 2898: Returns: TRUE if matched
! 2899: */
! 2900:
! 2901: static BOOL
! 2902: match_ref(int offset, register const uschar *eptr, int length, match_data *md,
! 2903: unsigned long int ims)
! 2904: {
! 2905: const uschar *p = md->start_subject + md->offset_vector[offset];
! 2906:
! 2907: #ifdef DEBUG
! 2908: if (eptr >= md->end_subject)
! 2909: printf("matching subject <null>");
! 2910: else
! 2911: {
! 2912: printf("matching subject ");
! 2913: pchars(eptr, length, TRUE, md);
! 2914: }
! 2915: printf(" against backref ");
! 2916: pchars(p, length, FALSE, md);
! 2917: printf("\n");
! 2918: #endif
! 2919:
! 2920: /* Always fail if not enough characters left */
! 2921:
! 2922: if (length > md->end_subject - eptr) return FALSE;
! 2923:
! 2924: /* Separate the caselesss case for speed */
! 2925:
! 2926: if ((ims & PCRE_CASELESS) != 0)
! 2927: {
! 2928: while (length-- > 0)
! 2929: if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
! 2930: }
! 2931: else
! 2932: { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
! 2933:
! 2934: return TRUE;
! 2935: }
! 2936:
! 2937:
! 2938:
! 2939: /*************************************************
! 2940: * Match from current position *
! 2941: *************************************************/
! 2942:
! 2943: /* On entry ecode points to the first opcode, and eptr to the first character
! 2944: in the subject string, while eptrb holds the value of eptr at the start of the
! 2945: last bracketed group - used for breaking infinite loops matching zero-length
! 2946: strings.
! 2947:
! 2948: Arguments:
! 2949: eptr pointer in subject
! 2950: ecode position in code
! 2951: offset_top current top pointer
! 2952: md pointer to "static" info for the match
! 2953: ims current /i, /m, and /s options
! 2954: condassert TRUE if called to check a condition assertion
! 2955: eptrb eptr at start of last bracket
! 2956:
! 2957: Returns: TRUE if matched
! 2958: */
! 2959:
! 2960: static BOOL
! 2961: match(register const uschar *eptr, register const uschar *ecode,
! 2962: int offset_top, match_data *md, unsigned long int ims, BOOL condassert,
! 2963: const uschar *eptrb)
! 2964: {
! 2965: unsigned long int original_ims = ims; /* Save for resetting on ')' */
! 2966:
! 2967: for (;;)
! 2968: {
! 2969: int op = (int)*ecode;
! 2970: int min, max, ctype;
! 2971: register int i;
! 2972: register int c;
! 2973: BOOL minimize = FALSE;
! 2974:
! 2975: /* Opening capturing bracket. If there is space in the offset vector, save
! 2976: the current subject position in the working slot at the top of the vector. We
! 2977: mustn't change the current values of the data slot, because they may be set
! 2978: from a previous iteration of this group, and be referred to by a reference
! 2979: inside the group.
! 2980:
! 2981: If the bracket fails to match, we need to restore this value and also the
! 2982: values of the final offsets, in case they were set by a previous iteration of
! 2983: the same bracket.
! 2984:
! 2985: If there isn't enough space in the offset vector, treat this as if it were a
! 2986: non-capturing bracket. Don't worry about setting the flag for the error case
! 2987: here; that is handled in the code for KET. */
! 2988:
! 2989: if (op > OP_BRA)
! 2990: {
! 2991: int number = op - OP_BRA;
! 2992: int offset = number << 1;
! 2993:
! 2994: #ifdef DEBUG
! 2995: printf("start bracket %d subject=", number);
! 2996: pchars(eptr, 16, TRUE, md);
! 2997: printf("\n");
! 2998: #endif
! 2999:
! 3000: if (offset < md->offset_max)
! 3001: {
! 3002: int save_offset1 = md->offset_vector[offset];
! 3003: int save_offset2 = md->offset_vector[offset+1];
! 3004: int save_offset3 = md->offset_vector[md->offset_end - number];
! 3005:
! 3006: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
! 3007: md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
! 3008:
! 3009: do
! 3010: {
! 3011: if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
! 3012: ecode += (ecode[1] << 8) + ecode[2];
! 3013: }
! 3014: while (*ecode == OP_ALT);
! 3015:
! 3016: DPRINTF(("bracket %d failed\n", number));
! 3017:
! 3018: md->offset_vector[offset] = save_offset1;
! 3019: md->offset_vector[offset+1] = save_offset2;
! 3020: md->offset_vector[md->offset_end - number] = save_offset3;
! 3021: return FALSE;
! 3022: }
! 3023:
! 3024: /* Insufficient room for saving captured contents */
! 3025:
! 3026: else op = OP_BRA;
! 3027: }
! 3028:
! 3029: /* Other types of node can be handled by a switch */
! 3030:
! 3031: switch(op)
! 3032: {
! 3033: case OP_BRA: /* Non-capturing bracket: optimized */
! 3034: DPRINTF(("start bracket 0\n"));
! 3035: do
! 3036: {
! 3037: if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
! 3038: ecode += (ecode[1] << 8) + ecode[2];
! 3039: }
! 3040: while (*ecode == OP_ALT);
! 3041: DPRINTF(("bracket 0 failed\n"));
! 3042: return FALSE;
! 3043:
! 3044: /* Conditional group: compilation checked that there are no more than
! 3045: two branches. If the condition is false, skipping the first branch takes us
! 3046: past the end if there is only one branch, but that's OK because that is
! 3047: exactly what going to the ket would do. */
! 3048:
! 3049: case OP_COND:
! 3050: if (ecode[3] == OP_CREF) /* Condition is extraction test */
! 3051: {
! 3052: int offset = ecode[4] << 1; /* Doubled reference number */
! 3053: return match(eptr,
! 3054: ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
! 3055: 5 : 3 + (ecode[1] << 8) + ecode[2]),
! 3056: offset_top, md, ims, FALSE, eptr);
! 3057: }
! 3058:
! 3059: /* The condition is an assertion. Call match() to evaluate it - setting
! 3060: the condassert argument TRUE causes it to stop at the end of an assertion. */
! 3061:
! 3062: else
! 3063: {
! 3064: if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
! 3065: {
! 3066: ecode += 3 + (ecode[4] << 8) + ecode[5];
! 3067: while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
! 3068: }
! 3069: else ecode += (ecode[1] << 8) + ecode[2];
! 3070: return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
! 3071: }
! 3072: /* Control never reaches here */
! 3073:
! 3074: /* Skip over conditional reference data if encountered (should not be) */
! 3075:
! 3076: case OP_CREF:
! 3077: ecode += 2;
! 3078: break;
! 3079:
! 3080: /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
! 3081: an empty string - recursion will then try other alternatives, if any. */
! 3082:
! 3083: case OP_END:
! 3084: if (md->notempty && eptr == md->start_match) return FALSE;
! 3085: md->end_match_ptr = eptr; /* Record where we ended */
! 3086: md->end_offset_top = offset_top; /* and how many extracts were taken */
! 3087: return TRUE;
! 3088:
! 3089: /* Change option settings */
! 3090:
! 3091: case OP_OPT:
! 3092: ims = ecode[1];
! 3093: ecode += 2;
! 3094: DPRINTF(("ims set to %02lx\n", ims));
! 3095: break;
! 3096:
! 3097: /* Assertion brackets. Check the alternative branches in turn - the
! 3098: matching won't pass the KET for an assertion. If any one branch matches,
! 3099: the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
! 3100: start of each branch to move the current point backwards, so the code at
! 3101: this level is identical to the lookahead case. */
! 3102:
! 3103: case OP_ASSERT:
! 3104: case OP_ASSERTBACK:
! 3105: do
! 3106: {
! 3107: if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
! 3108: ecode += (ecode[1] << 8) + ecode[2];
! 3109: }
! 3110: while (*ecode == OP_ALT);
! 3111: if (*ecode == OP_KET) return FALSE;
! 3112:
! 3113: /* If checking an assertion for a condition, return TRUE. */
! 3114:
! 3115: if (condassert) return TRUE;
! 3116:
! 3117: /* Continue from after the assertion, updating the offsets high water
! 3118: mark, since extracts may have been taken during the assertion. */
! 3119:
! 3120: do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
! 3121: ecode += 3;
! 3122: offset_top = md->end_offset_top;
! 3123: continue;
! 3124:
! 3125: /* Negative assertion: all branches must fail to match */
! 3126:
! 3127: case OP_ASSERT_NOT:
! 3128: case OP_ASSERTBACK_NOT:
! 3129: do
! 3130: {
! 3131: if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
! 3132: ecode += (ecode[1] << 8) + ecode[2];
! 3133: }
! 3134: while (*ecode == OP_ALT);
! 3135:
! 3136: if (condassert) return TRUE;
! 3137: ecode += 3;
! 3138: continue;
! 3139:
! 3140: /* Move the subject pointer back. This occurs only at the start of
! 3141: each branch of a lookbehind assertion. If we are too close to the start to
! 3142: move back, this match function fails. */
! 3143:
! 3144: case OP_REVERSE:
! 3145: eptr -= (ecode[1] << 8) + ecode[2];
! 3146: if (eptr < md->start_subject) return FALSE;
! 3147: ecode += 3;
! 3148: break;
! 3149:
! 3150:
! 3151: /* "Once" brackets are like assertion brackets except that after a match,
! 3152: the point in the subject string is not moved back. Thus there can never be
! 3153: a move back into the brackets. Check the alternative branches in turn - the
! 3154: matching won't pass the KET for this kind of subpattern. If any one branch
! 3155: matches, we carry on as at the end of a normal bracket, leaving the subject
! 3156: pointer. */
! 3157:
! 3158: case OP_ONCE:
! 3159: {
! 3160: const uschar *prev = ecode;
! 3161:
! 3162: do
! 3163: {
! 3164: if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
! 3165: ecode += (ecode[1] << 8) + ecode[2];
! 3166: }
! 3167: while (*ecode == OP_ALT);
! 3168:
! 3169: /* If hit the end of the group (which could be repeated), fail */
! 3170:
! 3171: if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
! 3172:
! 3173: /* Continue as from after the assertion, updating the offsets high water
! 3174: mark, since extracts may have been taken. */
! 3175:
! 3176: do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
! 3177:
! 3178: offset_top = md->end_offset_top;
! 3179: eptr = md->end_match_ptr;
! 3180:
! 3181: /* For a non-repeating ket, just continue at this level. This also
! 3182: happens for a repeating ket if no characters were matched in the group.
! 3183: This is the forcible breaking of infinite loops as implemented in Perl
! 3184: 5.005. If there is an options reset, it will get obeyed in the normal
! 3185: course of events. */
! 3186:
! 3187: if (*ecode == OP_KET || eptr == eptrb)
! 3188: {
! 3189: ecode += 3;
! 3190: break;
! 3191: }
! 3192:
! 3193: /* The repeating kets try the rest of the pattern or restart from the
! 3194: preceding bracket, in the appropriate order. We need to reset any options
! 3195: that changed within the bracket before re-running it, so check the next
! 3196: opcode. */
! 3197:
! 3198: if (ecode[3] == OP_OPT)
! 3199: {
! 3200: ims = (ims & ~PCRE_IMS) | ecode[4];
! 3201: DPRINTF(("ims set to %02lx at group repeat\n", ims));
! 3202: }
! 3203:
! 3204: if (*ecode == OP_KETRMIN)
! 3205: {
! 3206: if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
! 3207: match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
! 3208: }
! 3209: else /* OP_KETRMAX */
! 3210: {
! 3211: if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
! 3212: match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
! 3213: }
! 3214: }
! 3215: return FALSE;
! 3216:
! 3217: /* An alternation is the end of a branch; scan along to find the end of the
! 3218: bracketed group and go to there. */
! 3219:
! 3220: case OP_ALT:
! 3221: do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
! 3222: break;
! 3223:
! 3224: /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
! 3225: that it may occur zero times. It may repeat infinitely, or not at all -
! 3226: i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
! 3227: repeat limits are compiled as a number of copies, with the optional ones
! 3228: preceded by BRAZERO or BRAMINZERO. */
! 3229:
! 3230: case OP_BRAZERO:
! 3231: {
! 3232: const uschar *next = ecode+1;
! 3233: if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
! 3234: do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
! 3235: ecode = next + 3;
! 3236: }
! 3237: break;
! 3238:
! 3239: case OP_BRAMINZERO:
! 3240: {
! 3241: const uschar *next = ecode+1;
! 3242: do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
! 3243: if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
! 3244: ecode++;
! 3245: }
! 3246: break;
! 3247:
! 3248: /* End of a group, repeated or non-repeating. If we are at the end of
! 3249: an assertion "group", stop matching and return TRUE, but record the
! 3250: current high water mark for use by positive assertions. Do this also
! 3251: for the "once" (non-backing-up) groups. */
! 3252:
! 3253: case OP_KET:
! 3254: case OP_KETRMIN:
! 3255: case OP_KETRMAX:
! 3256: {
! 3257: const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
! 3258:
! 3259: if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
! 3260: *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
! 3261: *prev == OP_ONCE)
! 3262: {
! 3263: md->end_match_ptr = eptr; /* For ONCE */
! 3264: md->end_offset_top = offset_top;
! 3265: return TRUE;
! 3266: }
! 3267:
! 3268: /* In all other cases except a conditional group we have to check the
! 3269: group number back at the start and if necessary complete handling an
! 3270: extraction by setting the offsets and bumping the high water mark. */
! 3271:
! 3272: if (*prev != OP_COND)
! 3273: {
! 3274: int number = *prev - OP_BRA;
! 3275: int offset = number << 1;
! 3276:
! 3277: DPRINTF(("end bracket %d\n", number));
! 3278:
! 3279: if (number > 0)
! 3280: {
! 3281: if (offset >= md->offset_max) md->offset_overflow = TRUE; else
! 3282: {
! 3283: md->offset_vector[offset] =
! 3284: md->offset_vector[md->offset_end - number];
! 3285: md->offset_vector[offset+1] = eptr - md->start_subject;
! 3286: if (offset_top <= offset) offset_top = offset + 2;
! 3287: }
! 3288: }
! 3289: }
! 3290:
! 3291: /* Reset the value of the ims flags, in case they got changed during
! 3292: the group. */
! 3293:
! 3294: ims = original_ims;
! 3295: DPRINTF(("ims reset to %02lx\n", ims));
! 3296:
! 3297: /* For a non-repeating ket, just continue at this level. This also
! 3298: happens for a repeating ket if no characters were matched in the group.
! 3299: This is the forcible breaking of infinite loops as implemented in Perl
! 3300: 5.005. If there is an options reset, it will get obeyed in the normal
! 3301: course of events. */
! 3302:
! 3303: if (*ecode == OP_KET || eptr == eptrb)
! 3304: {
! 3305: ecode += 3;
! 3306: break;
! 3307: }
! 3308:
! 3309: /* The repeating kets try the rest of the pattern or restart from the
! 3310: preceding bracket, in the appropriate order. */
! 3311:
! 3312: if (*ecode == OP_KETRMIN)
! 3313: {
! 3314: if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
! 3315: match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
! 3316: }
! 3317: else /* OP_KETRMAX */
! 3318: {
! 3319: if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
! 3320: match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
! 3321: }
! 3322: }
! 3323: return FALSE;
! 3324:
! 3325: /* Start of subject unless notbol, or after internal newline if multiline */
! 3326:
! 3327: case OP_CIRC:
! 3328: if (md->notbol && eptr == md->start_subject) return FALSE;
! 3329: if ((ims & PCRE_MULTILINE) != 0)
! 3330: {
! 3331: if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
! 3332: ecode++;
! 3333: break;
! 3334: }
! 3335: /* ... else fall through */
! 3336:
! 3337: /* Start of subject assertion */
! 3338:
! 3339: case OP_SOD:
! 3340: if (eptr != md->start_subject) return FALSE;
! 3341: ecode++;
! 3342: break;
! 3343:
! 3344: /* Assert before internal newline if multiline, or before a terminating
! 3345: newline unless endonly is set, else end of subject unless noteol is set. */
! 3346:
! 3347: case OP_DOLL:
! 3348: if ((ims & PCRE_MULTILINE) != 0)
! 3349: {
! 3350: if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
! 3351: else { if (md->noteol) return FALSE; }
! 3352: ecode++;
! 3353: break;
! 3354: }
! 3355: else
! 3356: {
! 3357: if (md->noteol) return FALSE;
! 3358: if (!md->endonly)
! 3359: {
! 3360: if (eptr < md->end_subject - 1 ||
! 3361: (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
! 3362:
! 3363: ecode++;
! 3364: break;
! 3365: }
! 3366: }
! 3367: /* ... else fall through */
! 3368:
! 3369: /* End of subject assertion (\z) */
! 3370:
! 3371: case OP_EOD:
! 3372: if (eptr < md->end_subject) return FALSE;
! 3373: ecode++;
! 3374: break;
! 3375:
! 3376: /* End of subject or ending \n assertion (\Z) */
! 3377:
! 3378: case OP_EODN:
! 3379: if (eptr < md->end_subject - 1 ||
! 3380: (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
! 3381: ecode++;
! 3382: break;
! 3383:
! 3384: /* Word boundary assertions */
! 3385:
! 3386: case OP_NOT_WORD_BOUNDARY:
! 3387: case OP_WORD_BOUNDARY:
! 3388: {
! 3389: BOOL prev_is_word = (eptr != md->start_subject) &&
! 3390: ((md->ctypes[eptr[-1]] & ctype_word) != 0);
! 3391: BOOL cur_is_word = (eptr < md->end_subject) &&
! 3392: ((md->ctypes[*eptr] & ctype_word) != 0);
! 3393: if ((*ecode++ == OP_WORD_BOUNDARY)?
! 3394: cur_is_word == prev_is_word : cur_is_word != prev_is_word)
! 3395: return FALSE;
! 3396: }
! 3397: break;
! 3398:
! 3399: /* Match a single character type; inline for speed */
! 3400:
! 3401: case OP_ANY:
! 3402: if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
! 3403: return FALSE;
! 3404: if (eptr++ >= md->end_subject) return FALSE;
! 3405: ecode++;
! 3406: break;
! 3407:
! 3408: case OP_NOT_DIGIT:
! 3409: if (eptr >= md->end_subject ||
! 3410: (md->ctypes[*eptr++] & ctype_digit) != 0)
! 3411: return FALSE;
! 3412: ecode++;
! 3413: break;
! 3414:
! 3415: case OP_DIGIT:
! 3416: if (eptr >= md->end_subject ||
! 3417: (md->ctypes[*eptr++] & ctype_digit) == 0)
! 3418: return FALSE;
! 3419: ecode++;
! 3420: break;
! 3421:
! 3422: case OP_NOT_WHITESPACE:
! 3423: if (eptr >= md->end_subject ||
! 3424: (md->ctypes[*eptr++] & ctype_space) != 0)
! 3425: return FALSE;
! 3426: ecode++;
! 3427: break;
! 3428:
! 3429: case OP_WHITESPACE:
! 3430: if (eptr >= md->end_subject ||
! 3431: (md->ctypes[*eptr++] & ctype_space) == 0)
! 3432: return FALSE;
! 3433: ecode++;
! 3434: break;
! 3435:
! 3436: case OP_NOT_WORDCHAR:
! 3437: if (eptr >= md->end_subject ||
! 3438: (md->ctypes[*eptr++] & ctype_word) != 0)
! 3439: return FALSE;
! 3440: ecode++;
! 3441: break;
! 3442:
! 3443: case OP_WORDCHAR:
! 3444: if (eptr >= md->end_subject ||
! 3445: (md->ctypes[*eptr++] & ctype_word) == 0)
! 3446: return FALSE;
! 3447: ecode++;
! 3448: break;
! 3449:
! 3450: /* Match a back reference, possibly repeatedly. Look past the end of the
! 3451: item to see if there is repeat information following. The code is similar
! 3452: to that for character classes, but repeated for efficiency. Then obey
! 3453: similar code to character type repeats - written out again for speed.
! 3454: However, if the referenced string is the empty string, always treat
! 3455: it as matched, any number of times (otherwise there could be infinite
! 3456: loops). */
! 3457:
! 3458: case OP_REF:
! 3459: {
! 3460: int length;
! 3461: int offset = ecode[1] << 1; /* Doubled reference number */
! 3462: ecode += 2; /* Advance past the item */
! 3463:
! 3464: /* If the reference is unset, set the length to be longer than the amount
! 3465: of subject left; this ensures that every attempt at a match fails. We
! 3466: can't just fail here, because of the possibility of quantifiers with zero
! 3467: minima. */
! 3468:
! 3469: length = (offset >= offset_top || md->offset_vector[offset] < 0)?
! 3470: md->end_subject - eptr + 1 :
! 3471: md->offset_vector[offset+1] - md->offset_vector[offset];
! 3472:
! 3473: /* Set up for repetition, or handle the non-repeated case */
! 3474:
! 3475: switch (*ecode)
! 3476: {
! 3477: case OP_CRSTAR:
! 3478: case OP_CRMINSTAR:
! 3479: case OP_CRPLUS:
! 3480: case OP_CRMINPLUS:
! 3481: case OP_CRQUERY:
! 3482: case OP_CRMINQUERY:
! 3483: c = *ecode++ - OP_CRSTAR;
! 3484: minimize = (c & 1) != 0;
! 3485: min = rep_min[c]; /* Pick up values from tables; */
! 3486: max = rep_max[c]; /* zero for max => infinity */
! 3487: if (max == 0) max = PCRE_MAX_POS;
! 3488: break;
! 3489:
! 3490: case OP_CRRANGE:
! 3491: case OP_CRMINRANGE:
! 3492: minimize = (*ecode == OP_CRMINRANGE);
! 3493: min = (ecode[1] << 8) + ecode[2];
! 3494: max = (ecode[3] << 8) + ecode[4];
! 3495: if (max == 0) max = PCRE_MAX_POS;
! 3496: ecode += 5;
! 3497: break;
! 3498:
! 3499: default: /* No repeat follows */
! 3500: if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
! 3501: eptr += length;
! 3502: continue; /* With the main loop */
! 3503: }
! 3504:
! 3505: /* If the length of the reference is zero, just continue with the
! 3506: main loop. */
! 3507:
! 3508: if (length == 0) continue;
! 3509:
! 3510: /* First, ensure the minimum number of matches are present. We get back
! 3511: the length of the reference string explicitly rather than passing the
! 3512: address of eptr, so that eptr can be a register variable. */
! 3513:
! 3514: for (i = 1; i <= min; i++)
! 3515: {
! 3516: if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
! 3517: eptr += length;
! 3518: }
! 3519:
! 3520: /* If min = max, continue at the same level without recursion.
! 3521: They are not both allowed to be zero. */
! 3522:
! 3523: if (min == max) continue;
! 3524:
! 3525: /* If minimizing, keep trying and advancing the pointer */
! 3526:
! 3527: if (minimize)
! 3528: {
! 3529: for (i = min;; i++)
! 3530: {
! 3531: if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
! 3532: return TRUE;
! 3533: if (i >= max || !match_ref(offset, eptr, length, md, ims))
! 3534: return FALSE;
! 3535: eptr += length;
! 3536: }
! 3537: /* Control never gets here */
! 3538: }
! 3539:
! 3540: /* If maximizing, find the longest string and work backwards */
! 3541:
! 3542: else
! 3543: {
! 3544: const uschar *pp = eptr;
! 3545: for (i = min; i < max; i++)
! 3546: {
! 3547: if (!match_ref(offset, eptr, length, md, ims)) break;
! 3548: eptr += length;
! 3549: }
! 3550: while (eptr >= pp)
! 3551: {
! 3552: if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
! 3553: return TRUE;
! 3554: eptr -= length;
! 3555: }
! 3556: return FALSE;
! 3557: }
! 3558: }
! 3559: /* Control never gets here */
! 3560:
! 3561:
! 3562:
! 3563: /* Match a character class, possibly repeatedly. Look past the end of the
! 3564: item to see if there is repeat information following. Then obey similar
! 3565: code to character type repeats - written out again for speed. */
! 3566:
! 3567: case OP_CLASS:
! 3568: {
! 3569: const uschar *data = ecode + 1; /* Save for matching */
! 3570: ecode += 33; /* Advance past the item */
! 3571:
! 3572: switch (*ecode)
! 3573: {
! 3574: case OP_CRSTAR:
! 3575: case OP_CRMINSTAR:
! 3576: case OP_CRPLUS:
! 3577: case OP_CRMINPLUS:
! 3578: case OP_CRQUERY:
! 3579: case OP_CRMINQUERY:
! 3580: c = *ecode++ - OP_CRSTAR;
! 3581: minimize = (c & 1) != 0;
! 3582: min = rep_min[c]; /* Pick up values from tables; */
! 3583: max = rep_max[c]; /* zero for max => infinity */
! 3584: if (max == 0) max = PCRE_MAX_POS;
! 3585: break;
! 3586:
! 3587: case OP_CRRANGE:
! 3588: case OP_CRMINRANGE:
! 3589: minimize = (*ecode == OP_CRMINRANGE);
! 3590: min = (ecode[1] << 8) + ecode[2];
! 3591: max = (ecode[3] << 8) + ecode[4];
! 3592: if (max == 0) max = PCRE_MAX_POS;
! 3593: ecode += 5;
! 3594: break;
! 3595:
! 3596: default: /* No repeat follows */
! 3597: min = max = 1;
! 3598: break;
! 3599: }
! 3600:
! 3601: /* First, ensure the minimum number of matches are present. */
! 3602:
! 3603: for (i = 1; i <= min; i++)
! 3604: {
! 3605: if (eptr >= md->end_subject) return FALSE;
! 3606: c = *eptr++;
! 3607: if ((data[c/8] & (1 << (c&7))) != 0) continue;
! 3608: return FALSE;
! 3609: }
! 3610:
! 3611: /* If max == min we can continue with the main loop without the
! 3612: need to recurse. */
! 3613:
! 3614: if (min == max) continue;
! 3615:
! 3616: /* If minimizing, keep testing the rest of the expression and advancing
! 3617: the pointer while it matches the class. */
! 3618:
! 3619: if (minimize)
! 3620: {
! 3621: for (i = min;; i++)
! 3622: {
! 3623: if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
! 3624: return TRUE;
! 3625: if (i >= max || eptr >= md->end_subject) return FALSE;
! 3626: c = *eptr++;
! 3627: if ((data[c/8] & (1 << (c&7))) != 0) continue;
! 3628: return FALSE;
! 3629: }
! 3630: /* Control never gets here */
! 3631: }
! 3632:
! 3633: /* If maximizing, find the longest possible run, then work backwards. */
! 3634:
! 3635: else
! 3636: {
! 3637: const uschar *pp = eptr;
! 3638: for (i = min; i < max; eptr++, i++)
! 3639: {
! 3640: if (eptr >= md->end_subject) break;
! 3641: c = *eptr;
! 3642: if ((data[c/8] & (1 << (c&7))) != 0) continue;
! 3643: break;
! 3644: }
! 3645:
! 3646: while (eptr >= pp)
! 3647: if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
! 3648: return TRUE;
! 3649: return FALSE;
! 3650: }
! 3651: }
! 3652: /* Control never gets here */
! 3653:
! 3654: /* Match a run of characters */
! 3655:
! 3656: case OP_CHARS:
! 3657: {
! 3658: register int length = ecode[1];
! 3659: ecode += 2;
! 3660:
! 3661: #ifdef DEBUG /* Sigh. Some compilers never learn. */
! 3662: if (eptr >= md->end_subject)
! 3663: printf("matching subject <null> against pattern ");
! 3664: else
! 3665: {
! 3666: printf("matching subject ");
! 3667: pchars(eptr, length, TRUE, md);
! 3668: printf(" against pattern ");
! 3669: }
! 3670: pchars(ecode, length, FALSE, md);
! 3671: printf("\n");
! 3672: #endif
! 3673:
! 3674: if (length > md->end_subject - eptr) return FALSE;
! 3675: if ((ims & PCRE_CASELESS) != 0)
! 3676: {
! 3677: while (length-- > 0)
! 3678: if (md->lcc[*ecode++] != md->lcc[*eptr++])
! 3679: return FALSE;
! 3680: }
! 3681: else
! 3682: {
! 3683: while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
! 3684: }
! 3685: }
! 3686: break;
! 3687:
! 3688: /* Match a single character repeatedly; different opcodes share code. */
! 3689:
! 3690: case OP_EXACT:
! 3691: min = max = (ecode[1] << 8) + ecode[2];
! 3692: ecode += 3;
! 3693: goto REPEATCHAR;
! 3694:
! 3695: case OP_UPTO:
! 3696: case OP_MINUPTO:
! 3697: min = 0;
! 3698: max = (ecode[1] << 8) + ecode[2];
! 3699: minimize = *ecode == OP_MINUPTO;
! 3700: ecode += 3;
! 3701: goto REPEATCHAR;
! 3702:
! 3703: case OP_STAR:
! 3704: case OP_MINSTAR:
! 3705: case OP_PLUS:
! 3706: case OP_MINPLUS:
! 3707: case OP_QUERY:
! 3708: case OP_MINQUERY:
! 3709: c = *ecode++ - OP_STAR;
! 3710: minimize = (c & 1) != 0;
! 3711: min = rep_min[c]; /* Pick up values from tables; */
! 3712: max = rep_max[c]; /* zero for max => infinity */
! 3713: if (max == 0) max = PCRE_MAX_POS;
! 3714:
! 3715: /* Common code for all repeated single-character matches. We can give
! 3716: up quickly if there are fewer than the minimum number of characters left in
! 3717: the subject. */
! 3718:
! 3719: REPEATCHAR:
! 3720: if (min > md->end_subject - eptr) return FALSE;
! 3721: c = *ecode++;
! 3722:
! 3723: /* The code is duplicated for the caseless and caseful cases, for speed,
! 3724: since matching characters is likely to be quite common. First, ensure the
! 3725: minimum number of matches are present. If min = max, continue at the same
! 3726: level without recursing. Otherwise, if minimizing, keep trying the rest of
! 3727: the expression and advancing one matching character if failing, up to the
! 3728: maximum. Alternatively, if maximizing, find the maximum number of
! 3729: characters and work backwards. */
! 3730:
! 3731: DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
! 3732: max, eptr));
! 3733:
! 3734: if ((ims & PCRE_CASELESS) != 0)
! 3735: {
! 3736: c = md->lcc[c];
! 3737: for (i = 1; i <= min; i++)
! 3738: if (c != md->lcc[*eptr++]) return FALSE;
! 3739: if (min == max) continue;
! 3740: if (minimize)
! 3741: {
! 3742: for (i = min;; i++)
! 3743: {
! 3744: if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
! 3745: return TRUE;
! 3746: if (i >= max || eptr >= md->end_subject ||
! 3747: c != md->lcc[*eptr++])
! 3748: return FALSE;
! 3749: }
! 3750: /* Control never gets here */
! 3751: }
! 3752: else
! 3753: {
! 3754: const uschar *pp = eptr;
! 3755: for (i = min; i < max; i++)
! 3756: {
! 3757: if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
! 3758: eptr++;
! 3759: }
! 3760: while (eptr >= pp)
! 3761: if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
! 3762: return TRUE;
! 3763: return FALSE;
! 3764: }
! 3765: /* Control never gets here */
! 3766: }
! 3767:
! 3768: /* Caseful comparisons */
! 3769:
! 3770: else
! 3771: {
! 3772: for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
! 3773: if (min == max) continue;
! 3774: if (minimize)
! 3775: {
! 3776: for (i = min;; i++)
! 3777: {
! 3778: if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
! 3779: return TRUE;
! 3780: if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
! 3781: }
! 3782: /* Control never gets here */
! 3783: }
! 3784: else
! 3785: {
! 3786: const uschar *pp = eptr;
! 3787: for (i = min; i < max; i++)
! 3788: {
! 3789: if (eptr >= md->end_subject || c != *eptr) break;
! 3790: eptr++;
! 3791: }
! 3792: while (eptr >= pp)
! 3793: if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
! 3794: return TRUE;
! 3795: return FALSE;
! 3796: }
! 3797: }
! 3798: /* Control never gets here */
! 3799:
! 3800: /* Match a negated single character */
! 3801:
! 3802: case OP_NOT:
! 3803: if (eptr >= md->end_subject) return FALSE;
! 3804: ecode++;
! 3805: if ((ims & PCRE_CASELESS) != 0)
! 3806: {
! 3807: if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
! 3808: }
! 3809: else
! 3810: {
! 3811: if (*ecode++ == *eptr++) return FALSE;
! 3812: }
! 3813: break;
! 3814:
! 3815: /* Match a negated single character repeatedly. This is almost a repeat of
! 3816: the code for a repeated single character, but I haven't found a nice way of
! 3817: commoning these up that doesn't require a test of the positive/negative
! 3818: option for each character match. Maybe that wouldn't add very much to the
! 3819: time taken, but character matching *is* what this is all about... */
! 3820:
! 3821: case OP_NOTEXACT:
! 3822: min = max = (ecode[1] << 8) + ecode[2];
! 3823: ecode += 3;
! 3824: goto REPEATNOTCHAR;
! 3825:
! 3826: case OP_NOTUPTO:
! 3827: case OP_NOTMINUPTO:
! 3828: min = 0;
! 3829: max = (ecode[1] << 8) + ecode[2];
! 3830: minimize = *ecode == OP_NOTMINUPTO;
! 3831: ecode += 3;
! 3832: goto REPEATNOTCHAR;
! 3833:
! 3834: case OP_NOTSTAR:
! 3835: case OP_NOTMINSTAR:
! 3836: case OP_NOTPLUS:
! 3837: case OP_NOTMINPLUS:
! 3838: case OP_NOTQUERY:
! 3839: case OP_NOTMINQUERY:
! 3840: c = *ecode++ - OP_NOTSTAR;
! 3841: minimize = (c & 1) != 0;
! 3842: min = rep_min[c]; /* Pick up values from tables; */
! 3843: max = rep_max[c]; /* zero for max => infinity */
! 3844: if (max == 0) max = PCRE_MAX_POS;
! 3845:
! 3846: /* Common code for all repeated negated single-character matches. We can give
! 3847: up quickly if there are fewer than the minimum number of characters left in
! 3848: the subject. */
! 3849:
! 3850: REPEATNOTCHAR:
! 3851: if (min > md->end_subject - eptr) return FALSE;
! 3852: c = *ecode++;
! 3853:
! 3854: /* The code is duplicated for the caseless and caseful cases, for speed,
! 3855: since matching characters is likely to be quite common. First, ensure the
! 3856: minimum number of matches are present. If min = max, continue at the same
! 3857: level without recursing. Otherwise, if minimizing, keep trying the rest of
! 3858: the expression and advancing one matching character if failing, up to the
! 3859: maximum. Alternatively, if maximizing, find the maximum number of
! 3860: characters and work backwards. */
! 3861:
! 3862: DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
! 3863: max, eptr));
! 3864:
! 3865: if ((ims & PCRE_CASELESS) != 0)
! 3866: {
! 3867: c = md->lcc[c];
! 3868: for (i = 1; i <= min; i++)
! 3869: if (c == md->lcc[*eptr++]) return FALSE;
! 3870: if (min == max) continue;
! 3871: if (minimize)
! 3872: {
! 3873: for (i = min;; i++)
! 3874: {
! 3875: if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
! 3876: return TRUE;
! 3877: if (i >= max || eptr >= md->end_subject ||
! 3878: c == md->lcc[*eptr++])
! 3879: return FALSE;
! 3880: }
! 3881: /* Control never gets here */
! 3882: }
! 3883: else
! 3884: {
! 3885: const uschar *pp = eptr;
! 3886: for (i = min; i < max; i++)
! 3887: {
! 3888: if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
! 3889: eptr++;
! 3890: }
! 3891: while (eptr >= pp)
! 3892: if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
! 3893: return TRUE;
! 3894: return FALSE;
! 3895: }
! 3896: /* Control never gets here */
! 3897: }
! 3898:
! 3899: /* Caseful comparisons */
! 3900:
! 3901: else
! 3902: {
! 3903: for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
! 3904: if (min == max) continue;
! 3905: if (minimize)
! 3906: {
! 3907: for (i = min;; i++)
! 3908: {
! 3909: if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
! 3910: return TRUE;
! 3911: if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
! 3912: }
! 3913: /* Control never gets here */
! 3914: }
! 3915: else
! 3916: {
! 3917: const uschar *pp = eptr;
! 3918: for (i = min; i < max; i++)
! 3919: {
! 3920: if (eptr >= md->end_subject || c == *eptr) break;
! 3921: eptr++;
! 3922: }
! 3923: while (eptr >= pp)
! 3924: if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
! 3925: return TRUE;
! 3926: return FALSE;
! 3927: }
! 3928: }
! 3929: /* Control never gets here */
! 3930:
! 3931: /* Match a single character type repeatedly; several different opcodes
! 3932: share code. This is very similar to the code for single characters, but we
! 3933: repeat it in the interests of efficiency. */
! 3934:
! 3935: case OP_TYPEEXACT:
! 3936: min = max = (ecode[1] << 8) + ecode[2];
! 3937: minimize = TRUE;
! 3938: ecode += 3;
! 3939: goto REPEATTYPE;
! 3940:
! 3941: case OP_TYPEUPTO:
! 3942: case OP_TYPEMINUPTO:
! 3943: min = 0;
! 3944: max = (ecode[1] << 8) + ecode[2];
! 3945: minimize = *ecode == OP_TYPEMINUPTO;
! 3946: ecode += 3;
! 3947: goto REPEATTYPE;
! 3948:
! 3949: case OP_TYPESTAR:
! 3950: case OP_TYPEMINSTAR:
! 3951: case OP_TYPEPLUS:
! 3952: case OP_TYPEMINPLUS:
! 3953: case OP_TYPEQUERY:
! 3954: case OP_TYPEMINQUERY:
! 3955: c = *ecode++ - OP_TYPESTAR;
! 3956: minimize = (c & 1) != 0;
! 3957: min = rep_min[c]; /* Pick up values from tables; */
! 3958: max = rep_max[c]; /* zero for max => infinity */
! 3959: if (max == 0) max = PCRE_MAX_POS;
! 3960:
! 3961: /* Common code for all repeated single character type matches */
! 3962:
! 3963: REPEATTYPE:
! 3964: ctype = *ecode++; /* Code for the character type */
! 3965:
! 3966: /* First, ensure the minimum number of matches are present. Use inline
! 3967: code for maximizing the speed, and do the type test once at the start
! 3968: (i.e. keep it out of the loop). Also test that there are at least the
! 3969: minimum number of characters before we start. */
! 3970:
! 3971: if (min > md->end_subject - eptr) return FALSE;
! 3972: if (min > 0) switch(ctype)
! 3973: {
! 3974: case OP_ANY:
! 3975: if ((ims & PCRE_DOTALL) == 0)
! 3976: { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
! 3977: else eptr += min;
! 3978: break;
! 3979:
! 3980: case OP_NOT_DIGIT:
! 3981: for (i = 1; i <= min; i++)
! 3982: if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
! 3983: break;
! 3984:
! 3985: case OP_DIGIT:
! 3986: for (i = 1; i <= min; i++)
! 3987: if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
! 3988: break;
! 3989:
! 3990: case OP_NOT_WHITESPACE:
! 3991: for (i = 1; i <= min; i++)
! 3992: if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
! 3993: break;
! 3994:
! 3995: case OP_WHITESPACE:
! 3996: for (i = 1; i <= min; i++)
! 3997: if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
! 3998: break;
! 3999:
! 4000: case OP_NOT_WORDCHAR:
! 4001: for (i = 1; i <= min; i++)
! 4002: if ((md->ctypes[*eptr++] & ctype_word) != 0)
! 4003: return FALSE;
! 4004: break;
! 4005:
! 4006: case OP_WORDCHAR:
! 4007: for (i = 1; i <= min; i++)
! 4008: if ((md->ctypes[*eptr++] & ctype_word) == 0)
! 4009: return FALSE;
! 4010: break;
! 4011: }
! 4012:
! 4013: /* If min = max, continue at the same level without recursing */
! 4014:
! 4015: if (min == max) continue;
! 4016:
! 4017: /* If minimizing, we have to test the rest of the pattern before each
! 4018: subsequent match. */
! 4019:
! 4020: if (minimize)
! 4021: {
! 4022: for (i = min;; i++)
! 4023: {
! 4024: if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
! 4025: if (i >= max || eptr >= md->end_subject) return FALSE;
! 4026:
! 4027: c = *eptr++;
! 4028: switch(ctype)
! 4029: {
! 4030: case OP_ANY:
! 4031: if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
! 4032: break;
! 4033:
! 4034: case OP_NOT_DIGIT:
! 4035: if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
! 4036: break;
! 4037:
! 4038: case OP_DIGIT:
! 4039: if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
! 4040: break;
! 4041:
! 4042: case OP_NOT_WHITESPACE:
! 4043: if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
! 4044: break;
! 4045:
! 4046: case OP_WHITESPACE:
! 4047: if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
! 4048: break;
! 4049:
! 4050: case OP_NOT_WORDCHAR:
! 4051: if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
! 4052: break;
! 4053:
! 4054: case OP_WORDCHAR:
! 4055: if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
! 4056: break;
! 4057: }
! 4058: }
! 4059: /* Control never gets here */
! 4060: }
! 4061:
! 4062: /* If maximizing it is worth using inline code for speed, doing the type
! 4063: test once at the start (i.e. keep it out of the loop). */
! 4064:
! 4065: else
! 4066: {
! 4067: const uschar *pp = eptr;
! 4068: switch(ctype)
! 4069: {
! 4070: case OP_ANY:
! 4071: if ((ims & PCRE_DOTALL) == 0)
! 4072: {
! 4073: for (i = min; i < max; i++)
! 4074: {
! 4075: if (eptr >= md->end_subject || *eptr == '\n') break;
! 4076: eptr++;
! 4077: }
! 4078: }
! 4079: else
! 4080: {
! 4081: c = max - min;
! 4082: if (c > md->end_subject - eptr) c = md->end_subject - eptr;
! 4083: eptr += c;
! 4084: }
! 4085: break;
! 4086:
! 4087: case OP_NOT_DIGIT:
! 4088: for (i = min; i < max; i++)
! 4089: {
! 4090: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
! 4091: break;
! 4092: eptr++;
! 4093: }
! 4094: break;
! 4095:
! 4096: case OP_DIGIT:
! 4097: for (i = min; i < max; i++)
! 4098: {
! 4099: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
! 4100: break;
! 4101: eptr++;
! 4102: }
! 4103: break;
! 4104:
! 4105: case OP_NOT_WHITESPACE:
! 4106: for (i = min; i < max; i++)
! 4107: {
! 4108: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
! 4109: break;
! 4110: eptr++;
! 4111: }
! 4112: break;
! 4113:
! 4114: case OP_WHITESPACE:
! 4115: for (i = min; i < max; i++)
! 4116: {
! 4117: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
! 4118: break;
! 4119: eptr++;
! 4120: }
! 4121: break;
! 4122:
! 4123: case OP_NOT_WORDCHAR:
! 4124: for (i = min; i < max; i++)
! 4125: {
! 4126: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
! 4127: break;
! 4128: eptr++;
! 4129: }
! 4130: break;
! 4131:
! 4132: case OP_WORDCHAR:
! 4133: for (i = min; i < max; i++)
! 4134: {
! 4135: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
! 4136: break;
! 4137: eptr++;
! 4138: }
! 4139: break;
! 4140: }
! 4141:
! 4142: while (eptr >= pp)
! 4143: if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
! 4144: return TRUE;
! 4145: return FALSE;
! 4146: }
! 4147: /* Control never gets here */
! 4148:
! 4149: /* There's been some horrible disaster. */
! 4150:
! 4151: default:
! 4152: DPRINTF(("Unknown opcode %d\n", *ecode));
! 4153: md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
! 4154: return FALSE;
! 4155: }
! 4156:
! 4157: /* Do not stick any code in here without much thought; it is assumed
! 4158: that "continue" in the code above comes out to here to repeat the main
! 4159: loop. */
! 4160:
! 4161: } /* End of main loop */
! 4162: /* Control never reaches here */
! 4163: }
! 4164:
! 4165:
! 4166:
! 4167:
! 4168: /*************************************************
! 4169: * Execute a Regular Expression *
! 4170: *************************************************/
! 4171:
! 4172: /* This function applies a compiled re to a subject string and picks out
! 4173: portions of the string if it matches. Two elements in the vector are set for
! 4174: each substring: the offsets to the start and end of the substring.
! 4175:
! 4176: Arguments:
! 4177: external_re points to the compiled expression
! 4178: external_extra points to "hints" from pcre_study() or is NULL
! 4179: subject points to the subject string
! 4180: length length of subject string (may contain binary zeros)
! 4181: start_offset where to start in the subject string
! 4182: options option bits
! 4183: offsets points to a vector of ints to be filled in with offsets
! 4184: offsetcount the number of elements in the vector
! 4185:
! 4186: Returns: > 0 => success; value is the number of elements filled in
! 4187: = 0 => success, but offsets is not big enough
! 4188: -1 => failed to match
! 4189: < -1 => some kind of unexpected problem
! 4190: */
! 4191:
! 4192: int
! 4193: pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
! 4194: const char *subject, int length, int start_offset, int options, int *offsets,
! 4195: int offsetcount)
! 4196: {
! 4197: int resetcount, ocount;
! 4198: int first_char = -1;
! 4199: int req_char = -1;
! 4200: int req_char2 = -1;
! 4201: unsigned long int ims = 0;
! 4202: match_data match_block;
! 4203: const uschar *start_bits = NULL;
! 4204: const uschar *start_match = (const uschar *)subject + start_offset;
! 4205: const uschar *end_subject;
! 4206: const uschar *req_char_ptr = start_match - 1;
! 4207: const real_pcre *re = (const real_pcre *)external_re;
! 4208: const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
! 4209: BOOL using_temporary_offsets = FALSE;
! 4210: BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
! 4211: BOOL startline = (re->options & PCRE_STARTLINE) != 0;
! 4212:
! 4213: if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
! 4214:
! 4215: if (re == NULL || subject == NULL ||
! 4216: (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
! 4217: if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
! 4218:
! 4219: match_block.start_subject = (const uschar *)subject;
! 4220: match_block.end_subject = match_block.start_subject + length;
! 4221: end_subject = match_block.end_subject;
! 4222:
! 4223: match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
! 4224:
! 4225: match_block.notbol = (options & PCRE_NOTBOL) != 0;
! 4226: match_block.noteol = (options & PCRE_NOTEOL) != 0;
! 4227: match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
! 4228:
! 4229: match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
! 4230:
! 4231: match_block.lcc = re->tables + lcc_offset;
! 4232: match_block.ctypes = re->tables + ctypes_offset;
! 4233:
! 4234: /* The ims options can vary during the matching as a result of the presence
! 4235: of (?ims) items in the pattern. They are kept in a local variable so that
! 4236: restoring at the exit of a group is easy. */
! 4237:
! 4238: ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
! 4239:
! 4240: /* If the expression has got more back references than the offsets supplied can
! 4241: hold, we get a temporary bit of working store to use during the matching.
! 4242: Otherwise, we can use the vector supplied, rounding down its size to a multiple
! 4243: of 3. */
! 4244:
! 4245: ocount = offsetcount - (offsetcount % 3);
! 4246:
! 4247: if (re->top_backref > 0 && re->top_backref >= ocount/3)
! 4248: {
! 4249: ocount = re->top_backref * 3 + 3;
! 4250: match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
! 4251: if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
! 4252: using_temporary_offsets = TRUE;
! 4253: DPRINTF(("Got memory to hold back references\n"));
! 4254: }
! 4255: else match_block.offset_vector = offsets;
! 4256:
! 4257: match_block.offset_end = ocount;
! 4258: match_block.offset_max = (2*ocount)/3;
! 4259: match_block.offset_overflow = FALSE;
! 4260:
! 4261: /* Compute the minimum number of offsets that we need to reset each time. Doing
! 4262: this makes a huge difference to execution time when there aren't many brackets
! 4263: in the pattern. */
! 4264:
! 4265: resetcount = 2 + re->top_bracket * 2;
! 4266: if (resetcount > offsetcount) resetcount = ocount;
! 4267:
! 4268: /* Reset the working variable associated with each extraction. These should
! 4269: never be used unless previously set, but they get saved and restored, and so we
! 4270: initialize them to avoid reading uninitialized locations. */
! 4271:
! 4272: if (match_block.offset_vector != NULL)
! 4273: {
! 4274: register int *iptr = match_block.offset_vector + ocount;
! 4275: register int *iend = iptr - resetcount/2 + 1;
! 4276: while (--iptr >= iend) *iptr = -1;
! 4277: }
! 4278:
! 4279: /* Set up the first character to match, if available. The first_char value is
! 4280: never set for an anchored regular expression, but the anchoring may be forced
! 4281: at run time, so we have to test for anchoring. The first char may be unset for
! 4282: an unanchored pattern, of course. If there's no first char and the pattern was
! 4283: studied, there may be a bitmap of possible first characters. */
! 4284:
! 4285: if (!anchored)
! 4286: {
! 4287: if ((re->options & PCRE_FIRSTSET) != 0)
! 4288: {
! 4289: first_char = re->first_char;
! 4290: if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
! 4291: }
! 4292: else
! 4293: if (!startline && extra != NULL &&
! 4294: (extra->options & PCRE_STUDY_MAPPED) != 0)
! 4295: start_bits = extra->start_bits;
! 4296: }
! 4297:
! 4298: /* For anchored or unanchored matches, there may be a "last known required
! 4299: character" set. If the PCRE_CASELESS is set, implying that the match starts
! 4300: caselessly, or if there are any changes of this flag within the regex, set up
! 4301: both cases of the character. Otherwise set the two values the same, which will
! 4302: avoid duplicate testing (which takes significant time). This covers the vast
! 4303: majority of cases. It will be suboptimal when the case flag changes in a regex
! 4304: and the required character in fact is caseful. */
! 4305:
! 4306: if ((re->options & PCRE_REQCHSET) != 0)
! 4307: {
! 4308: req_char = re->req_char;
! 4309: req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
! 4310: (re->tables + fcc_offset)[req_char] : req_char;
! 4311: }
! 4312:
! 4313: /* Loop for handling unanchored repeated matching attempts; for anchored regexs
! 4314: the loop runs just once. */
! 4315:
! 4316: do
! 4317: {
! 4318: int rc;
! 4319: register int *iptr = match_block.offset_vector;
! 4320: register int *iend = iptr + resetcount;
! 4321:
! 4322: /* Reset the maximum number of extractions we might see. */
! 4323:
! 4324: while (iptr < iend) *iptr++ = -1;
! 4325:
! 4326: /* Advance to a unique first char if possible */
! 4327:
! 4328: if (first_char >= 0)
! 4329: {
! 4330: if ((ims & PCRE_CASELESS) != 0)
! 4331: while (start_match < end_subject &&
! 4332: match_block.lcc[*start_match] != first_char)
! 4333: start_match++;
! 4334: else
! 4335: while (start_match < end_subject && *start_match != first_char)
! 4336: start_match++;
! 4337: }
! 4338:
! 4339: /* Or to just after \n for a multiline match if possible */
! 4340:
! 4341: else if (startline)
! 4342: {
! 4343: if (start_match > match_block.start_subject + start_offset)
! 4344: {
! 4345: while (start_match < end_subject && start_match[-1] != '\n')
! 4346: start_match++;
! 4347: }
! 4348: }
! 4349:
! 4350: /* Or to a non-unique first char after study */
! 4351:
! 4352: else if (start_bits != NULL)
! 4353: {
! 4354: while (start_match < end_subject)
! 4355: {
! 4356: register int c = *start_match;
! 4357: if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
! 4358: }
! 4359: }
! 4360:
! 4361: #ifdef DEBUG /* Sigh. Some compilers never learn. */
! 4362: printf(">>>> Match against: ");
! 4363: pchars(start_match, end_subject - start_match, TRUE, &match_block);
! 4364: printf("\n");
! 4365: #endif
! 4366:
! 4367: /* If req_char is set, we know that that character must appear in the subject
! 4368: for the match to succeed. If the first character is set, req_char must be
! 4369: later in the subject; otherwise the test starts at the match point. This
! 4370: optimization can save a huge amount of backtracking in patterns with nested
! 4371: unlimited repeats that aren't going to match. We don't know what the state of
! 4372: case matching may be when this character is hit, so test for it in both its
! 4373: cases if necessary. However, the different cased versions will not be set up
! 4374: unless PCRE_CASELESS was given or the casing state changes within the regex.
! 4375: Writing separate code makes it go faster, as does using an autoincrement and
! 4376: backing off on a match. */
! 4377:
! 4378: if (req_char >= 0)
! 4379: {
! 4380: register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
! 4381:
! 4382: /* We don't need to repeat the search if we haven't yet reached the
! 4383: place we found it at last time. */
! 4384:
! 4385: if (p > req_char_ptr)
! 4386: {
! 4387: /* Do a single test if no case difference is set up */
! 4388:
! 4389: if (req_char == req_char2)
! 4390: {
! 4391: while (p < end_subject)
! 4392: {
! 4393: if (*p++ == req_char) { p--; break; }
! 4394: }
! 4395: }
! 4396:
! 4397: /* Otherwise test for either case */
! 4398:
! 4399: else
! 4400: {
! 4401: while (p < end_subject)
! 4402: {
! 4403: register int pp = *p++;
! 4404: if (pp == req_char || pp == req_char2) { p--; break; }
! 4405: }
! 4406: }
! 4407:
! 4408: /* If we can't find the required character, break the matching loop */
! 4409:
! 4410: if (p >= end_subject) break;
! 4411:
! 4412: /* If we have found the required character, save the point where we
! 4413: found it, so that we don't search again next time round the loop if
! 4414: the start hasn't passed this character yet. */
! 4415:
! 4416: req_char_ptr = p;
! 4417: }
! 4418: }
! 4419:
! 4420: /* When a match occurs, substrings will be set for all internal extractions;
! 4421: we just need to set up the whole thing as substring 0 before returning. If
! 4422: there were too many extractions, set the return code to zero. In the case
! 4423: where we had to get some local store to hold offsets for backreferences, copy
! 4424: those back references that we can. In this case there need not be overflow
! 4425: if certain parts of the pattern were not used. */
! 4426:
! 4427: match_block.start_match = start_match;
! 4428: if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
! 4429: continue;
! 4430:
! 4431: /* Copy the offset information from temporary store if necessary */
! 4432:
! 4433: if (using_temporary_offsets)
! 4434: {
! 4435: if (offsetcount >= 4)
! 4436: {
! 4437: memcpy(offsets + 2, match_block.offset_vector + 2,
! 4438: (offsetcount - 2) * sizeof(int));
! 4439: DPRINTF(("Copied offsets from temporary memory\n"));
! 4440: }
! 4441: if (match_block.end_offset_top > offsetcount)
! 4442: match_block.offset_overflow = TRUE;
! 4443:
! 4444: DPRINTF(("Freeing temporary memory\n"));
! 4445: (pcre_free)(match_block.offset_vector);
! 4446: }
! 4447:
! 4448: rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
! 4449:
! 4450: if (match_block.offset_end < 2) rc = 0; else
! 4451: {
! 4452: offsets[0] = start_match - match_block.start_subject;
! 4453: offsets[1] = match_block.end_match_ptr - match_block.start_subject;
! 4454: }
! 4455:
! 4456: DPRINTF((">>>> returning %d\n", rc));
! 4457: return rc;
! 4458: }
! 4459:
! 4460: /* This "while" is the end of the "do" above */
! 4461:
! 4462: while (!anchored &&
! 4463: match_block.errorcode == PCRE_ERROR_NOMATCH &&
! 4464: start_match++ < end_subject);
! 4465:
! 4466: if (using_temporary_offsets)
! 4467: {
! 4468: DPRINTF(("Freeing temporary memory\n"));
! 4469: (pcre_free)(match_block.offset_vector);
! 4470: }
! 4471:
! 4472: DPRINTF((">>>> returning %d\n", match_block.errorcode));
! 4473:
! 4474: return match_block.errorcode;
! 4475: }
! 4476:
! 4477: /* End of pcre.c */
E-mail: