Annotation of win32/pcre/pcre_exec.c, revision 1.1
1.1 ! misha 1: /*************************************************
! 2: * Perl-Compatible Regular Expressions *
! 3: *************************************************/
! 4:
! 5: /* PCRE is a library of functions to support regular expressions whose syntax
! 6: and semantics are as close as possible to those of the Perl 5 language.
! 7:
! 8: Written by Philip Hazel
! 9: Copyright (c) 1997-2008 University of Cambridge
! 10:
! 11: -----------------------------------------------------------------------------
! 12: Redistribution and use in source and binary forms, with or without
! 13: modification, are permitted provided that the following conditions are met:
! 14:
! 15: * Redistributions of source code must retain the above copyright notice,
! 16: this list of conditions and the following disclaimer.
! 17:
! 18: * Redistributions in binary form must reproduce the above copyright
! 19: notice, this list of conditions and the following disclaimer in the
! 20: documentation and/or other materials provided with the distribution.
! 21:
! 22: * Neither the name of the University of Cambridge nor the names of its
! 23: contributors may be used to endorse or promote products derived from
! 24: this software without specific prior written permission.
! 25:
! 26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
! 27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
! 28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
! 29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
! 30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
! 31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
! 32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
! 33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
! 34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
! 35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
! 36: POSSIBILITY OF SUCH DAMAGE.
! 37: -----------------------------------------------------------------------------
! 38: */
! 39:
! 40:
! 41: /* This module contains pcre_exec(), the externally visible function that does
! 42: pattern matching using an NFA algorithm, trying to mimic Perl as closely as
! 43: possible. There are also some static supporting functions. */
! 44:
! 45: #ifdef HAVE_CONFIG_H
! 46: #include "config.h"
! 47: #endif
! 48:
! 49: #define NLBLOCK md /* Block containing newline information */
! 50: #define PSSTART start_subject /* Field containing processed string start */
! 51: #define PSEND end_subject /* Field containing processed string end */
! 52:
! 53: #include "pcre_internal.h"
! 54:
! 55: /* Undefine some potentially clashing cpp symbols */
! 56:
! 57: #undef min
! 58: #undef max
! 59:
! 60: /* Flag bits for the match() function */
! 61:
! 62: #define match_condassert 0x01 /* Called to check a condition assertion */
! 63: #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
! 64:
! 65: /* Non-error returns from the match() function. Error returns are externally
! 66: defined PCRE_ERROR_xxx codes, which are all negative. */
! 67:
! 68: #define MATCH_MATCH 1
! 69: #define MATCH_NOMATCH 0
! 70:
! 71: /* Special internal returns from the match() function. Make them sufficiently
! 72: negative to avoid the external error codes. */
! 73:
! 74: #define MATCH_COMMIT (-999)
! 75: #define MATCH_PRUNE (-998)
! 76: #define MATCH_SKIP (-997)
! 77: #define MATCH_THEN (-996)
! 78:
! 79: /* Maximum number of ints of offset to save on the stack for recursive calls.
! 80: If the offset vector is bigger, malloc is used. This should be a multiple of 3,
! 81: because the offset vector is always a multiple of 3 long. */
! 82:
! 83: #define REC_STACK_SAVE_MAX 30
! 84:
/* Lower and upper repeat counts for the six common repeat forms. The two
tables run in parallel: entry n of rep_min pairs with entry n of rep_max. In
rep_max a value of 0 means "no upper limit" (infinity).
NOTE(review): presumably indexed by the repeat opcode relative to the first
common-repeat opcode - confirm at the use sites. */

static const char rep_min[] = {
  0, 0,      /* entries 0-1: zero or more */
  1, 1,      /* entries 2-3: one or more */
  0, 0 };    /* entries 4-5: zero or one */

static const char rep_max[] = {
  0, 0,      /* entries 0-1: unlimited */
  0, 0,      /* entries 2-3: unlimited */
  1, 1 };    /* entries 4-5: at most one */
! 89:
! 90:
! 91:
! 92: #ifdef DEBUG
! 93: /*************************************************
! 94: * Debugging function to print chars *
! 95: *************************************************/
! 96:
! 97: /* Print a sequence of chars in printable format, stopping at the end of the
! 98: subject if the requested.
! 99:
! 100: Arguments:
! 101: p points to characters
! 102: length number to print
! 103: is_subject TRUE if printing from within md->start_subject
! 104: md pointer to matching data block, if is_subject is TRUE
! 105:
! 106: Returns: nothing
! 107: */
! 108:
! 109: static void
! 110: pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
! 111: {
! 112: unsigned int c;
! 113: if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
! 114: while (length-- > 0)
! 115: if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
! 116: }
! 117: #endif
! 118:
! 119:
! 120:
! 121: /*************************************************
! 122: * Match a back-reference *
! 123: *************************************************/
! 124:
! 125: /* If a back reference hasn't been set, the length that is passed is greater
! 126: than the number of characters left in the string, so the match fails.
! 127:
! 128: Arguments:
! 129: offset index into the offset vector
! 130: eptr points into the subject
! 131: length length to be matched
! 132: md points to match data block
! 133: ims the ims flags
! 134:
! 135: Returns: TRUE if matched
! 136: */
! 137:
! 138: static BOOL
! 139: match_ref(int offset, register USPTR eptr, int length, match_data *md,
! 140: unsigned long int ims)
! 141: {
! 142: USPTR p = md->start_subject + md->offset_vector[offset];
! 143:
! 144: #ifdef DEBUG
! 145: if (eptr >= md->end_subject)
! 146: printf("matching subject <null>");
! 147: else
! 148: {
! 149: printf("matching subject ");
! 150: pchars(eptr, length, TRUE, md);
! 151: }
! 152: printf(" against backref ");
! 153: pchars(p, length, FALSE, md);
! 154: printf("\n");
! 155: #endif
! 156:
! 157: /* Always fail if not enough characters left */
! 158:
! 159: if (length > md->end_subject - eptr) return FALSE;
! 160:
! 161: /* Separate the caselesss case for speed */
! 162:
! 163: if ((ims & PCRE_CASELESS) != 0)
! 164: {
! 165: while (length-- > 0)
! 166: if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
! 167: }
! 168: else
! 169: { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
! 170:
! 171: return TRUE;
! 172: }
! 173:
! 174:
! 175:
! 176: /***************************************************************************
! 177: ****************************************************************************
! 178: RECURSION IN THE match() FUNCTION
! 179:
! 180: The match() function is highly recursive, though not every recursive call
! 181: increases the recursive depth. Nevertheless, some regular expressions can cause
! 182: it to recurse to a great depth. I was writing for Unix, so I just let it call
! 183: itself recursively. This uses the stack for saving everything that has to be
! 184: saved for a recursive call. On Unix, the stack can be large, and this works
! 185: fine.
! 186:
! 187: It turns out that on some non-Unix-like systems there are problems with
! 188: programs that use a lot of stack. (This despite the fact that every last chip
! 189: has oodles of memory these days, and techniques for extending the stack have
! 190: been known for decades.) So....
! 191:
! 192: There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
! 193: calls by keeping local variables that need to be preserved in blocks of memory
! 194: obtained from malloc() instead of on the stack. Macros are used to
! 195: achieve this so that the actual code doesn't look very different to what it
! 196: always used to.
! 197:
! 198: The original heap-recursive code used longjmp(). However, it seems that this
! 199: can be very slow on some operating systems. Following a suggestion from Stan
! 200: Switzer, the use of longjmp() has been abolished, at the cost of having to
! 201: provide a unique number for each call to RMATCH. There is no way of generating
! 202: a sequence of numbers at compile time in C. I have given them names, to make
! 203: them stand out more clearly.
! 204:
! 205: Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
! 206: FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
! 207: tests. Furthermore, not using longjmp() means that local dynamic variables
! 208: don't have indeterminate values; this has meant that the frame size can be
! 209: reduced because the result can be "passed back" by straight setting of the
! 210: variable instead of being passed in the frame.
! 211: ****************************************************************************
! 212: ***************************************************************************/
! 213:
/* Identifiers for the individual RMATCH call sites, numbered consecutively
from 1. Whenever an entry is added to or removed from this list, the code at
HEAP_RETURN below must be updated in sync. */

enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,
  RM7,     RM8,  RM9,  RM10, RM11, RM12,
  RM13,    RM14, RM15, RM16, RM17, RM18,
  RM19,    RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30,
  RM31,    RM32, RM33, RM34, RM35, RM36,
  RM37,    RM38, RM39, RM40, RM41, RM42,
  RM43,    RM44, RM45, RM46, RM47, RM48,
  RM49,    RM50, RM51, RM52, RM53, RM54
};
! 223:
! 224: /* These versions of the macros use the stack, as normal. There are debugging
! 225: versions and production versions. Note that the "rw" argument of RMATCH isn't
! 226: actually used in this definition. */
! 227:
! 228: #ifndef NO_RECURSE
! 229: #define REGISTER register
! 230:
! 231: #ifdef DEBUG
! 232: #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
! 233: { \
! 234: printf("match() called in line %d\n", __LINE__); \
! 235: rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
! 236: printf("to line %d\n", __LINE__); \
! 237: }
! 238: #define RRETURN(ra) \
! 239: { \
! 240: printf("match() returned %d from line %d ", ra, __LINE__); \
! 241: return ra; \
! 242: }
! 243: #else
! 244: #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
! 245: rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
! 246: #define RRETURN(ra) return ra
! 247: #endif
! 248:
! 249: #else
! 250:
! 251:
/* These versions of the macros manage a private stack on the heap. Note that
the "rd" argument of RMATCH isn't actually used in this definition. It's the md
argument of match(), which never changes.

RMATCH obtains a new frame from pcre_stack_malloc, records in the current frame
(Xwhere) which call site to resume at, fills in the new frame's argument
fields, makes it the current frame and jumps to HEAP_RECURSE. The label L_<rw>
that it plants is the point at which execution resumes afterwards. If the
frame cannot be obtained, the current "call" gives up straight away with
PCRE_ERROR_NOMEMORY rather than writing through a NULL pointer; being a
negative return, this is handed back through the enclosing frames in the same
way as the other error returns.

RRETURN releases the current frame and makes the previous one current. If there
is a previous frame, the result is left in rrc and control goes to HEAP_RETURN;
otherwise this was the top-level frame and match() really returns. */

#define REGISTER

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
! 290:
! 291:
! 292: /* Structure for remembering the local variables in a private frame */
! 293:
! 294: typedef struct heapframe {
! 295: struct heapframe *Xprevframe;
! 296:
! 297: /* Function arguments that may change */
! 298:
! 299: const uschar *Xeptr;
! 300: const uschar *Xecode;
! 301: const uschar *Xmstart;
! 302: int Xoffset_top;
! 303: long int Xims;
! 304: eptrblock *Xeptrb;
! 305: int Xflags;
! 306: unsigned int Xrdepth;
! 307:
! 308: /* Function local variables */
! 309:
! 310: const uschar *Xcallpat;
! 311: const uschar *Xcharptr;
! 312: const uschar *Xdata;
! 313: const uschar *Xnext;
! 314: const uschar *Xpp;
! 315: const uschar *Xprev;
! 316: const uschar *Xsaved_eptr;
! 317:
! 318: recursion_info Xnew_recursive;
! 319:
! 320: BOOL Xcur_is_word;
! 321: BOOL Xcondition;
! 322: BOOL Xprev_is_word;
! 323:
! 324: unsigned long int Xoriginal_ims;
! 325:
! 326: #ifdef SUPPORT_UCP
! 327: int Xprop_type;
! 328: int Xprop_value;
! 329: int Xprop_fail_result;
! 330: int Xprop_category;
! 331: int Xprop_chartype;
! 332: int Xprop_script;
! 333: int Xoclength;
! 334: uschar Xocchars[8];
! 335: #endif
! 336:
! 337: int Xctype;
! 338: unsigned int Xfc;
! 339: int Xfi;
! 340: int Xlength;
! 341: int Xmax;
! 342: int Xmin;
! 343: int Xnumber;
! 344: int Xoffset;
! 345: int Xop;
! 346: int Xsave_capture_last;
! 347: int Xsave_offset1, Xsave_offset2, Xsave_offset3;
! 348: int Xstacksave[REC_STACK_SAVE_MAX];
! 349:
! 350: eptrblock Xnewptrb;
! 351:
! 352: /* Where to jump back to */
! 353:
! 354: int Xwhere;
! 355:
! 356: } heapframe;
! 357:
! 358: #endif
! 359:
! 360:
! 361: /***************************************************************************
! 362: ***************************************************************************/
! 363:
! 364:
! 365:
! 366: /*************************************************
! 367: * Match from current position *
! 368: *************************************************/
! 369:
! 370: /* This function is called recursively in many circumstances. Whenever it
! 371: returns a negative (error) response, the outer incarnation must also return the
! 372: same response.
! 373:
! 374: Performance note: It might be tempting to extract commonly used fields from the
! 375: md structure (e.g. utf8, end_subject) into individual variables to improve
! 376: performance. Tests using gcc on a SPARC disproved this; in the first case, it
! 377: made performance worse.
! 378:
! 379: Arguments:
! 380: eptr pointer to current character in subject
! 381: ecode pointer to current position in compiled code
! 382: mstart pointer to the current match start position (can be modified
! 383: by encountering \K)
! 384: offset_top current top pointer
! 385: md pointer to "static" info for the match
! 386: ims current /i, /m, and /s options
! 387: eptrb pointer to chain of blocks containing eptr at start of
! 388: brackets - for testing for empty matches
! 389: flags can contain
! 390: match_condassert - this is an assertion condition
! 391: match_cbegroup - this is the start of an unlimited repeat
! 392: group that can match an empty string
! 393: rdepth the recursion depth
! 394:
! 395: Returns: MATCH_MATCH if matched ) these values are >= 0
! 396: MATCH_NOMATCH if failed to match )
! 397: a negative PCRE_ERROR_xxx value if aborted by an error condition
! 398: (e.g. stopped by repeated call or recursion limit)
! 399: */
! 400:
! 401: static int
! 402: match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
! 403: int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
! 404: int flags, unsigned int rdepth)
! 405: {
! 406: /* These variables do not need to be preserved over recursion in this function,
! 407: so they can be ordinary variables in all cases. Mark some of them with
! 408: "register" because they are used a lot in loops. */
! 409:
! 410: register int rrc; /* Returns from recursive calls */
! 411: register int i; /* Used for loops not involving calls to RMATCH() */
! 412: register unsigned int c; /* Character values not kept over RMATCH() calls */
! 413: register BOOL utf8; /* Local copy of UTF-8 flag for speed */
! 414:
! 415: BOOL minimize, possessive; /* Quantifier options */
! 416:
! 417: /* When recursion is not being used, all "local" variables that have to be
! 418: preserved over calls to RMATCH() are part of a "frame" which is obtained from
! 419: heap storage. Set up the top-level frame here; others are obtained from the
! 420: heap whenever RMATCH() does a "recursion". See the macro definitions above. */
! 421:
! 422: #ifdef NO_RECURSE
! 423: heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
! 424: frame->Xprevframe = NULL; /* Marks the top level */
! 425:
! 426: /* Copy in the original argument variables */
! 427:
! 428: frame->Xeptr = eptr;
! 429: frame->Xecode = ecode;
! 430: frame->Xmstart = mstart;
! 431: frame->Xoffset_top = offset_top;
! 432: frame->Xims = ims;
! 433: frame->Xeptrb = eptrb;
! 434: frame->Xflags = flags;
! 435: frame->Xrdepth = rdepth;
! 436:
! 437: /* This is where control jumps back to to effect "recursion" */
! 438:
! 439: HEAP_RECURSE:
! 440:
! 441: /* Macros make the argument variables come from the current frame */
! 442:
! 443: #define eptr frame->Xeptr
! 444: #define ecode frame->Xecode
! 445: #define mstart frame->Xmstart
! 446: #define offset_top frame->Xoffset_top
! 447: #define ims frame->Xims
! 448: #define eptrb frame->Xeptrb
! 449: #define flags frame->Xflags
! 450: #define rdepth frame->Xrdepth
! 451:
! 452: /* Ditto for the local variables */
! 453:
! 454: #ifdef SUPPORT_UTF8
! 455: #define charptr frame->Xcharptr
! 456: #endif
! 457: #define callpat frame->Xcallpat
! 458: #define data frame->Xdata
! 459: #define next frame->Xnext
! 460: #define pp frame->Xpp
! 461: #define prev frame->Xprev
! 462: #define saved_eptr frame->Xsaved_eptr
! 463:
! 464: #define new_recursive frame->Xnew_recursive
! 465:
! 466: #define cur_is_word frame->Xcur_is_word
! 467: #define condition frame->Xcondition
! 468: #define prev_is_word frame->Xprev_is_word
! 469:
! 470: #define original_ims frame->Xoriginal_ims
! 471:
! 472: #ifdef SUPPORT_UCP
! 473: #define prop_type frame->Xprop_type
! 474: #define prop_value frame->Xprop_value
! 475: #define prop_fail_result frame->Xprop_fail_result
! 476: #define prop_category frame->Xprop_category
! 477: #define prop_chartype frame->Xprop_chartype
! 478: #define prop_script frame->Xprop_script
! 479: #define oclength frame->Xoclength
! 480: #define occhars frame->Xocchars
! 481: #endif
! 482:
! 483: #define ctype frame->Xctype
! 484: #define fc frame->Xfc
! 485: #define fi frame->Xfi
! 486: #define length frame->Xlength
! 487: #define max frame->Xmax
! 488: #define min frame->Xmin
! 489: #define number frame->Xnumber
! 490: #define offset frame->Xoffset
! 491: #define op frame->Xop
! 492: #define save_capture_last frame->Xsave_capture_last
! 493: #define save_offset1 frame->Xsave_offset1
! 494: #define save_offset2 frame->Xsave_offset2
! 495: #define save_offset3 frame->Xsave_offset3
! 496: #define stacksave frame->Xstacksave
! 497:
! 498: #define newptrb frame->Xnewptrb
! 499:
! 500: /* When recursion is being used, local variables are allocated on the stack and
! 501: get preserved during recursion in the normal way. In this environment, fi and
! 502: i, and fc and c, can be the same variables. */
! 503:
! 504: #else /* NO_RECURSE not defined */
! 505: #define fi i
! 506: #define fc c
! 507:
! 508:
! 509: #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
! 510: const uschar *charptr; /* in small blocks of the code. My normal */
! 511: #endif /* style of coding would have declared */
! 512: const uschar *callpat; /* them within each of those blocks. */
! 513: const uschar *data; /* However, in order to accommodate the */
! 514: const uschar *next; /* version of this code that uses an */
! 515: USPTR pp; /* external "stack" implemented on the */
! 516: const uschar *prev; /* heap, it is easier to declare them all */
! 517: USPTR saved_eptr; /* here, so the declarations can be cut */
! 518: /* out in a block. The only declarations */
! 519: recursion_info new_recursive; /* within blocks below are for variables */
! 520: /* that do not have to be preserved over */
! 521: BOOL cur_is_word; /* a recursive call to RMATCH(). */
! 522: BOOL condition;
! 523: BOOL prev_is_word;
! 524:
! 525: unsigned long int original_ims;
! 526:
! 527: #ifdef SUPPORT_UCP
! 528: int prop_type;
! 529: int prop_value;
! 530: int prop_fail_result;
! 531: int prop_category;
! 532: int prop_chartype;
! 533: int prop_script;
! 534: int oclength;
! 535: uschar occhars[8];
! 536: #endif
! 537:
! 538: int ctype;
! 539: int length;
! 540: int max;
! 541: int min;
! 542: int number;
! 543: int offset;
! 544: int op;
! 545: int save_capture_last;
! 546: int save_offset1, save_offset2, save_offset3;
! 547: int stacksave[REC_STACK_SAVE_MAX];
! 548:
! 549: eptrblock newptrb;
! 550: #endif /* NO_RECURSE */
! 551:
! 552: /* These statements are here to stop the compiler complaining about uninitialized
! 553: variables. */
! 554:
! 555: #ifdef SUPPORT_UCP
! 556: prop_value = 0;
! 557: prop_fail_result = 0;
! 558: #endif
! 559:
! 560:
! 561: /* This label is used for tail recursion, which is used in a few cases even
! 562: when NO_RECURSE is not defined, in order to reduce the amount of stack that is
! 563: used. Thanks to Ian Taylor for noticing this possibility and sending the
! 564: original patch. */
! 565:
! 566: TAIL_RECURSE:
! 567:
! 568: /* OK, now we can get on with the real code of the function. Recursive calls
! 569: are specified by the macro RMATCH and RRETURN is used to return. When
! 570: NO_RECURSE is *not* defined, these just turn into a recursive call to match()
! 571: and a "return", respectively (possibly with some debugging if DEBUG is
! 572: defined). However, RMATCH isn't like a function call because it's quite a
! 573: complicated macro. It has to be used in one particular way. This shouldn't,
! 574: however, impact performance when true recursion is being used. */
! 575:
! 576: #ifdef SUPPORT_UTF8
! 577: utf8 = md->utf8; /* Local copy of the flag */
! 578: #else
! 579: utf8 = FALSE;
! 580: #endif
! 581:
! 582: /* First check that we haven't called match() too many times, or that we
! 583: haven't exceeded the recursive call limit. */
! 584:
! 585: if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
! 586: if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
! 587:
! 588: original_ims = ims; /* Save for resetting on ')' */
! 589:
! 590: /* At the start of a group with an unlimited repeat that may match an empty
! 591: string, the match_cbegroup flag is set. When this is the case, add the current
! 592: subject pointer to the chain of such remembered pointers, to be checked when we
! 593: hit the closing ket, in order to break infinite loops that match no characters.
! 594: When match() is called in other circumstances, don't add to the chain. The
! 595: match_cbegroup flag must NOT be used with tail recursion, because the memory
! 596: block that is used is on the stack, so a new one may be required for each
! 597: match(). */
! 598:
! 599: if ((flags & match_cbegroup) != 0)
! 600: {
! 601: newptrb.epb_saved_eptr = eptr;
! 602: newptrb.epb_prev = eptrb;
! 603: eptrb = &newptrb;
! 604: }
! 605:
! 606: /* Now start processing the opcodes. */
! 607:
! 608: for (;;)
! 609: {
! 610: minimize = possessive = FALSE;
! 611: op = *ecode;
! 612:
! 613: /* For partial matching, remember if we ever hit the end of the subject after
! 614: matching at least one subject character. */
! 615:
! 616: if (md->partial &&
! 617: eptr >= md->end_subject &&
! 618: eptr > mstart)
! 619: md->hitend = TRUE;
! 620:
! 621: switch(op)
! 622: {
! 623: case OP_FAIL:
! 624: RRETURN(MATCH_NOMATCH);
! 625:
! 626: case OP_PRUNE:
! 627: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
! 628: ims, eptrb, flags, RM51);
! 629: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 630: RRETURN(MATCH_PRUNE);
! 631:
! 632: case OP_COMMIT:
! 633: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
! 634: ims, eptrb, flags, RM52);
! 635: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 636: RRETURN(MATCH_COMMIT);
! 637:
! 638: case OP_SKIP:
! 639: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
! 640: ims, eptrb, flags, RM53);
! 641: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 642: md->start_match_ptr = eptr; /* Pass back current position */
! 643: RRETURN(MATCH_SKIP);
! 644:
! 645: case OP_THEN:
! 646: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
! 647: ims, eptrb, flags, RM54);
! 648: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 649: RRETURN(MATCH_THEN);
! 650:
! 651: /* Handle a capturing bracket. If there is space in the offset vector, save
! 652: the current subject position in the working slot at the top of the vector.
! 653: We mustn't change the current values of the data slot, because they may be
! 654: set from a previous iteration of this group, and be referred to by a
! 655: reference inside the group.
! 656:
! 657: If the bracket fails to match, we need to restore this value and also the
! 658: values of the final offsets, in case they were set by a previous iteration
! 659: of the same bracket.
! 660:
! 661: If there isn't enough space in the offset vector, treat this as if it were
! 662: a non-capturing bracket. Don't worry about setting the flag for the error
! 663: case here; that is handled in the code for KET. */
! 664:
! 665: case OP_CBRA:
! 666: case OP_SCBRA:
! 667: number = GET2(ecode, 1+LINK_SIZE);
! 668: offset = number << 1;
! 669:
! 670: #ifdef DEBUG
! 671: printf("start bracket %d\n", number);
! 672: printf("subject=");
! 673: pchars(eptr, 16, TRUE, md);
! 674: printf("\n");
! 675: #endif
! 676:
! 677: if (offset < md->offset_max)
! 678: {
! 679: save_offset1 = md->offset_vector[offset];
! 680: save_offset2 = md->offset_vector[offset+1];
! 681: save_offset3 = md->offset_vector[md->offset_end - number];
! 682: save_capture_last = md->capture_last;
! 683:
! 684: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
! 685: md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
! 686:
! 687: flags = (op == OP_SCBRA)? match_cbegroup : 0;
! 688: do
! 689: {
! 690: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
! 691: ims, eptrb, flags, RM1);
! 692: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
! 693: md->capture_last = save_capture_last;
! 694: ecode += GET(ecode, 1);
! 695: }
! 696: while (*ecode == OP_ALT);
! 697:
! 698: DPRINTF(("bracket %d failed\n", number));
! 699:
! 700: md->offset_vector[offset] = save_offset1;
! 701: md->offset_vector[offset+1] = save_offset2;
! 702: md->offset_vector[md->offset_end - number] = save_offset3;
! 703:
! 704: RRETURN(MATCH_NOMATCH);
! 705: }
! 706:
! 707: /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
! 708: as a non-capturing bracket. */
! 709:
! 710: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
! 711: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
! 712:
! 713: DPRINTF(("insufficient capture room: treat as non-capturing\n"));
! 714:
! 715: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
! 716: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
! 717:
! 718: /* Non-capturing bracket. Loop for all the alternatives. When we get to the
! 719: final alternative within the brackets, we would return the result of a
! 720: recursive call to match() whatever happened. We can reduce stack usage by
! 721: turning this into a tail recursion, except in the case when match_cbegroup
! 722: is set.*/
! 723:
! 724: case OP_BRA:
! 725: case OP_SBRA:
! 726: DPRINTF(("start non-capturing bracket\n"));
! 727: flags = (op >= OP_SBRA)? match_cbegroup : 0;
! 728: for (;;)
! 729: {
! 730: if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
! 731: {
! 732: if (flags == 0) /* Not a possibly empty group */
! 733: {
! 734: ecode += _pcre_OP_lengths[*ecode];
! 735: DPRINTF(("bracket 0 tail recursion\n"));
! 736: goto TAIL_RECURSE;
! 737: }
! 738:
! 739: /* Possibly empty group; can't use tail recursion. */
! 740:
! 741: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
! 742: eptrb, flags, RM48);
! 743: RRETURN(rrc);
! 744: }
! 745:
! 746: /* For non-final alternatives, continue the loop for a NOMATCH result;
! 747: otherwise return. */
! 748:
! 749: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
! 750: eptrb, flags, RM2);
! 751: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
! 752: ecode += GET(ecode, 1);
! 753: }
! 754: /* Control never reaches here. */
! 755:
! 756: /* Conditional group: compilation checked that there are no more than
! 757: two branches. If the condition is false, skipping the first branch takes us
! 758: past the end if there is only one branch, but that's OK because that is
! 759: exactly what going to the ket would do. As there is only one branch to be
! 760: obeyed, we can use tail recursion to avoid using another stack frame. */
! 761:
! 762: case OP_COND:
! 763: case OP_SCOND:
! 764: if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
! 765: {
! 766: offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
! 767: condition = md->recursive != NULL &&
! 768: (offset == RREF_ANY || offset == md->recursive->group_num);
! 769: ecode += condition? 3 : GET(ecode, 1);
! 770: }
! 771:
! 772: else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
! 773: {
! 774: offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
! 775: condition = offset < offset_top && md->offset_vector[offset] >= 0;
! 776: ecode += condition? 3 : GET(ecode, 1);
! 777: }
! 778:
! 779: else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
! 780: {
! 781: condition = FALSE;
! 782: ecode += GET(ecode, 1);
! 783: }
! 784:
! 785: /* The condition is an assertion. Call match() to evaluate it - setting
! 786: the final argument match_condassert causes it to stop at the end of an
! 787: assertion. */
! 788:
! 789: else
! 790: {
! 791: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
! 792: match_condassert, RM3);
! 793: if (rrc == MATCH_MATCH)
! 794: {
! 795: condition = TRUE;
! 796: ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
! 797: while (*ecode == OP_ALT) ecode += GET(ecode, 1);
! 798: }
! 799: else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
! 800: {
! 801: RRETURN(rrc); /* Need braces because of following else */
! 802: }
! 803: else
! 804: {
! 805: condition = FALSE;
! 806: ecode += GET(ecode, 1);
! 807: }
! 808: }
! 809:
! 810: /* We are now at the branch that is to be obeyed. As there is only one,
! 811: we can use tail recursion to avoid using another stack frame, except when
! 812: match_cbegroup is required for an unlimited repeat of a possibly empty
! 813: group. If the second alternative doesn't exist, we can just plough on. */
! 814:
! 815: if (condition || *ecode == OP_ALT)
! 816: {
! 817: ecode += 1 + LINK_SIZE;
! 818: if (op == OP_SCOND) /* Possibly empty group */
! 819: {
! 820: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
! 821: RRETURN(rrc);
! 822: }
! 823: else /* Group must match something */
! 824: {
! 825: flags = 0;
! 826: goto TAIL_RECURSE;
! 827: }
! 828: }
! 829: else /* Condition false & no 2nd alternative */
! 830: {
! 831: ecode += 1 + LINK_SIZE;
! 832: }
! 833: break;
! 834:
! 835:
! 836: /* End of the pattern, either real or forced. If we are in a top-level
! 837: recursion, we should restore the offsets appropriately and continue from
! 838: after the call. */
! 839:
! 840: case OP_ACCEPT:
! 841: case OP_END:
! 842: if (md->recursive != NULL && md->recursive->group_num == 0)
! 843: {
! 844: recursion_info *rec = md->recursive;
! 845: DPRINTF(("End of pattern in a (?0) recursion\n"));
! 846: md->recursive = rec->prevrec;
! 847: memmove(md->offset_vector, rec->offset_save,
! 848: rec->saved_max * sizeof(int));
! 849: mstart = rec->save_start;
! 850: ims = original_ims;
! 851: ecode = rec->after_call;
! 852: break;
! 853: }
! 854:
! 855: /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
! 856: string - backtracking will then try other alternatives, if any. */
! 857:
! 858: if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
! 859: md->end_match_ptr = eptr; /* Record where we ended */
! 860: md->end_offset_top = offset_top; /* and how many extracts were taken */
! 861: md->start_match_ptr = mstart; /* and the start (\K can modify) */
! 862: RRETURN(MATCH_MATCH);
! 863:
! 864: /* Change option settings */
! 865:
! 866: case OP_OPT:
! 867: ims = ecode[1];
! 868: ecode += 2;
! 869: DPRINTF(("ims set to %02lx\n", ims));
! 870: break;
! 871:
! 872: /* Assertion brackets. Check the alternative branches in turn - the
! 873: matching won't pass the KET for an assertion. If any one branch matches,
! 874: the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
! 875: start of each branch to move the current point backwards, so the code at
! 876: this level is identical to the lookahead case. */
! 877:
! 878: case OP_ASSERT:
! 879: case OP_ASSERTBACK:
! 880: do
! 881: {
! 882: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
! 883: RM4);
! 884: if (rrc == MATCH_MATCH) break;
! 885: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
! 886: ecode += GET(ecode, 1);
! 887: }
! 888: while (*ecode == OP_ALT);
! 889: if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
! 890:
! 891: /* If checking an assertion for a condition, return MATCH_MATCH. */
! 892:
! 893: if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
! 894:
! 895: /* Continue from after the assertion, updating the offsets high water
! 896: mark, since extracts may have been taken during the assertion. */
! 897:
! 898: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
! 899: ecode += 1 + LINK_SIZE;
! 900: offset_top = md->end_offset_top;
! 901: continue;
! 902:
! 903: /* Negative assertion: all branches must fail to match */
! 904:
! 905: case OP_ASSERT_NOT:
! 906: case OP_ASSERTBACK_NOT:
! 907: do
! 908: {
! 909: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
! 910: RM5);
! 911: if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
! 912: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
! 913: ecode += GET(ecode,1);
! 914: }
! 915: while (*ecode == OP_ALT);
! 916:
! 917: if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
! 918:
! 919: ecode += 1 + LINK_SIZE;
! 920: continue;
! 921:
! 922: /* Move the subject pointer back. This occurs only at the start of
! 923: each branch of a lookbehind assertion. If we are too close to the start to
! 924: move back, this match function fails. When working with UTF-8 we move
! 925: back a number of characters, not bytes. */
! 926:
! 927: case OP_REVERSE:
! 928: #ifdef SUPPORT_UTF8
! 929: if (utf8)
! 930: {
! 931: i = GET(ecode, 1);
! 932: while (i-- > 0)
! 933: {
! 934: eptr--;
! 935: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
! 936: BACKCHAR(eptr);
! 937: }
! 938: }
! 939: else
! 940: #endif
! 941:
! 942: /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
! 943:
! 944: {
! 945: eptr -= GET(ecode, 1);
! 946: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
! 947: }
! 948:
! 949: /* Skip to next op code */
! 950:
! 951: ecode += 1 + LINK_SIZE;
! 952: break;
! 953:
! 954: /* The callout item calls an external function, if one is provided, passing
! 955: details of the match so far. This is mainly for debugging, though the
! 956: function is able to force a failure. */
! 957:
! 958: case OP_CALLOUT:
! 959: if (pcre_callout != NULL)
! 960: {
! 961: pcre_callout_block cb;
! 962: cb.version = 1; /* Version 1 of the callout block */
! 963: cb.callout_number = ecode[1];
! 964: cb.offset_vector = md->offset_vector;
! 965: cb.subject = (PCRE_SPTR)md->start_subject;
! 966: cb.subject_length = md->end_subject - md->start_subject;
! 967: cb.start_match = mstart - md->start_subject;
! 968: cb.current_position = eptr - md->start_subject;
! 969: cb.pattern_position = GET(ecode, 2);
! 970: cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
! 971: cb.capture_top = offset_top/2;
! 972: cb.capture_last = md->capture_last;
! 973: cb.callout_data = md->callout_data;
! 974: if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
! 975: if (rrc < 0) RRETURN(rrc);
! 976: }
! 977: ecode += 2 + 2*LINK_SIZE;
! 978: break;
! 979:
! 980: /* Recursion either matches the current regex, or some subexpression. The
! 981: offset data is the offset to the starting bracket from the start of the
! 982: whole pattern. (This is so that it works from duplicated subpatterns.)
! 983:
! 984: If there are any capturing brackets started but not finished, we have to
! 985: save their starting points and reinstate them after the recursion. However,
! 986: we don't know how many such there are (offset_top records the completed
! 987: total) so we just have to save all the potential data. There may be up to
! 988: 65535 such values, which is too large to put on the stack, but using malloc
! 989: for small numbers seems expensive. As a compromise, the stack is used when
! 990: there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
! 991: is used. If the malloc fails, the offsets cannot be saved, so the match is
! 992: abandoned at this point by returning PCRE_ERROR_NOMEMORY instead of
! 993: attempting the recursion.
! 994:
! 995: There are also other values that have to be saved. We use a chained
! 996: sequence of blocks that actually live on the stack. Thanks to Robin Houston
! 997: for the original version of this logic. */
! 998:
! 999: case OP_RECURSE:
! 1000: {
! 1001: callpat = md->start_code + GET(ecode, 1);
! 1002: new_recursive.group_num = (callpat == md->start_code)? 0 :
! 1003: GET2(callpat, 1 + LINK_SIZE);
! 1004:
! 1005: /* Add to "recursing stack" */
! 1006:
! 1007: new_recursive.prevrec = md->recursive;
! 1008: md->recursive = &new_recursive;
! 1009:
! 1010: /* Find where to continue from afterwards */
! 1011:
! 1012: ecode += 1 + LINK_SIZE;
! 1013: new_recursive.after_call = ecode;
! 1014:
! 1015: /* Now save the offset data. */
! 1016:
! 1017: new_recursive.saved_max = md->offset_end;
! 1018: if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
! 1019: new_recursive.offset_save = stacksave;
! 1020: else
! 1021: {
! 1022: new_recursive.offset_save =
! 1023: (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
! 1024: if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
! 1025: }
! 1026:
! 1027: memcpy(new_recursive.offset_save, md->offset_vector,
! 1028: new_recursive.saved_max * sizeof(int));
! 1029: new_recursive.save_start = mstart;
! 1030: mstart = eptr;
! 1031:
! 1032: /* OK, now we can do the recursion. For each top-level alternative we
! 1033: restore the offset and recursion data. */
! 1034:
! 1035: DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
! 1036: flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
! 1037: do
! 1038: {
! 1039: RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
! 1040: md, ims, eptrb, flags, RM6);
! 1041: if (rrc == MATCH_MATCH)
! 1042: {
! 1043: DPRINTF(("Recursion matched\n"));
! 1044: md->recursive = new_recursive.prevrec;
! 1045: if (new_recursive.offset_save != stacksave)
! 1046: (pcre_free)(new_recursive.offset_save);
! 1047: RRETURN(MATCH_MATCH);
! 1048: }
! 1049: else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
! 1050: {
! 1051: DPRINTF(("Recursion gave error %d\n", rrc));
! 1052: RRETURN(rrc);
! 1053: }
! 1054:
! 1055: md->recursive = &new_recursive;
! 1056: memcpy(md->offset_vector, new_recursive.offset_save,
! 1057: new_recursive.saved_max * sizeof(int));
! 1058: callpat += GET(callpat, 1);
! 1059: }
! 1060: while (*callpat == OP_ALT);
! 1061:
! 1062: DPRINTF(("Recursion didn't match\n"));
! 1063: md->recursive = new_recursive.prevrec;
! 1064: if (new_recursive.offset_save != stacksave)
! 1065: (pcre_free)(new_recursive.offset_save);
! 1066: RRETURN(MATCH_NOMATCH);
! 1067: }
! 1068: /* Control never reaches here */
! 1069:
! 1070: /* "Once" brackets are like assertion brackets except that after a match,
! 1071: the point in the subject string is not moved back. Thus there can never be
! 1072: a move back into the brackets. Friedl calls these "atomic" subpatterns.
! 1073: Check the alternative branches in turn - the matching won't pass the KET
! 1074: for this kind of subpattern. If any one branch matches, we carry on as at
! 1075: the end of a normal bracket, leaving the subject pointer. */
! 1076:
! 1077: case OP_ONCE:
! 1078: prev = ecode;
! 1079: saved_eptr = eptr;
! 1080:
! 1081: do
! 1082: {
! 1083: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
! 1084: if (rrc == MATCH_MATCH) break;
! 1085: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
! 1086: ecode += GET(ecode,1);
! 1087: }
! 1088: while (*ecode == OP_ALT);
! 1089:
! 1090: /* If we hit the end of the group (which could be repeated), fail */
! 1091:
! 1092: if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
! 1093:
! 1094: /* Continue from after the atomic group, updating the offsets high water
! 1095: mark, since extracts may have been taken within it. */
! 1096:
! 1097: do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
! 1098:
! 1099: offset_top = md->end_offset_top;
! 1100: eptr = md->end_match_ptr;
! 1101:
! 1102: /* For a non-repeating ket, just continue at this level. This also
! 1103: happens for a repeating ket if no characters were matched in the group.
! 1104: This is the forcible breaking of infinite loops as implemented in Perl
! 1105: 5.005. If there is an options reset, it will get obeyed in the normal
! 1106: course of events. */
! 1107:
! 1108: if (*ecode == OP_KET || eptr == saved_eptr)
! 1109: {
! 1110: ecode += 1+LINK_SIZE;
! 1111: break;
! 1112: }
! 1113:
! 1114: /* The repeating kets try the rest of the pattern or restart from the
! 1115: preceding bracket, in the appropriate order. The second "call" of match()
! 1116: uses tail recursion, to avoid using another stack frame. We need to reset
! 1117: any options that changed within the bracket before re-running it, so
! 1118: check the next opcode. */
! 1119:
! 1120: if (ecode[1+LINK_SIZE] == OP_OPT)
! 1121: {
! 1122: ims = (ims & ~PCRE_IMS) | ecode[4];
! 1123: DPRINTF(("ims set to %02lx at group repeat\n", ims));
! 1124: }
! 1125:
! 1126: if (*ecode == OP_KETRMIN)
! 1127: {
! 1128: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
! 1129: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 1130: ecode = prev;
! 1131: flags = 0;
! 1132: goto TAIL_RECURSE;
! 1133: }
! 1134: else /* OP_KETRMAX */
! 1135: {
! 1136: RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
! 1137: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 1138: ecode += 1 + LINK_SIZE;
! 1139: flags = 0;
! 1140: goto TAIL_RECURSE;
! 1141: }
! 1142: /* Control never gets here */
! 1143:
! 1144: /* An alternation is the end of a branch; scan along to find the end of the
! 1145: bracketed group and go to there. */
! 1146:
! 1147: case OP_ALT:
! 1148: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
! 1149: break;
! 1150:
! 1151: /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
! 1152: indicating that it may occur zero times. It may repeat infinitely, or not
! 1153: at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
! 1154: with fixed upper repeat limits are compiled as a number of copies, with the
! 1155: optional ones preceded by BRAZERO or BRAMINZERO. */
! 1156:
! 1157: case OP_BRAZERO:
! 1158: {
! 1159: next = ecode+1;
! 1160: RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
! 1161: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 1162: do next += GET(next,1); while (*next == OP_ALT);
! 1163: ecode = next + 1 + LINK_SIZE;
! 1164: }
! 1165: break;
! 1166:
! 1167: case OP_BRAMINZERO:
! 1168: {
! 1169: next = ecode+1;
! 1170: do next += GET(next, 1); while (*next == OP_ALT);
! 1171: RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
! 1172: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 1173: ecode++;
! 1174: }
! 1175: break;
! 1176:
! 1177: case OP_SKIPZERO:
! 1178: {
! 1179: next = ecode+1;
! 1180: do next += GET(next,1); while (*next == OP_ALT);
! 1181: ecode = next + 1 + LINK_SIZE;
! 1182: }
! 1183: break;
! 1184:
! 1185: /* End of a group, repeated or non-repeating. */
! 1186:
! 1187: case OP_KET:
! 1188: case OP_KETRMIN:
! 1189: case OP_KETRMAX:
! 1190: prev = ecode - GET(ecode, 1);
! 1191:
! 1192: /* If this was a group that remembered the subject start, in order to break
! 1193: infinite repeats of empty string matches, retrieve the subject start from
! 1194: the chain. Otherwise, set it NULL. */
! 1195:
! 1196: if (*prev >= OP_SBRA)
! 1197: {
! 1198: saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
! 1199: eptrb = eptrb->epb_prev; /* Backup to previous group */
! 1200: }
! 1201: else saved_eptr = NULL;
! 1202:
! 1203: /* If we are at the end of an assertion group, stop matching and return
! 1204: MATCH_MATCH, but record the current high water mark for use by positive
! 1205: assertions. Do this also for the "once" (atomic) groups. */
! 1206:
! 1207: if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
! 1208: *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
! 1209: *prev == OP_ONCE)
! 1210: {
! 1211: md->end_match_ptr = eptr; /* For ONCE */
! 1212: md->end_offset_top = offset_top;
! 1213: RRETURN(MATCH_MATCH);
! 1214: }
! 1215:
! 1216: /* For capturing groups we have to check the group number back at the start
! 1217: and if necessary complete handling an extraction by setting the offsets and
! 1218: bumping the high water mark. Note that whole-pattern recursion is coded as
! 1219: a recurse into group 0, so it won't be picked up here. Instead, we catch it
! 1220: when the OP_END is reached. Other recursion is handled here. */
! 1221:
! 1222: if (*prev == OP_CBRA || *prev == OP_SCBRA)
! 1223: {
! 1224: number = GET2(prev, 1+LINK_SIZE);
! 1225: offset = number << 1;
! 1226:
! 1227: #ifdef DEBUG
! 1228: printf("end bracket %d", number);
! 1229: printf("\n");
! 1230: #endif
! 1231:
! 1232: md->capture_last = number;
! 1233: if (offset >= md->offset_max) md->offset_overflow = TRUE; else
! 1234: {
! 1235: md->offset_vector[offset] =
! 1236: md->offset_vector[md->offset_end - number];
! 1237: md->offset_vector[offset+1] = eptr - md->start_subject;
! 1238: if (offset_top <= offset) offset_top = offset + 2;
! 1239: }
! 1240:
! 1241: /* Handle a recursively called group. Restore the offsets
! 1242: appropriately and continue from after the call. */
! 1243:
! 1244: if (md->recursive != NULL && md->recursive->group_num == number)
! 1245: {
! 1246: recursion_info *rec = md->recursive;
! 1247: DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
! 1248: md->recursive = rec->prevrec;
! 1249: mstart = rec->save_start;
! 1250: memcpy(md->offset_vector, rec->offset_save,
! 1251: rec->saved_max * sizeof(int));
! 1252: ecode = rec->after_call;
! 1253: ims = original_ims;
! 1254: break;
! 1255: }
! 1256: }
! 1257:
! 1258: /* For both capturing and non-capturing groups, reset the value of the ims
! 1259: flags, in case they got changed during the group. */
! 1260:
! 1261: ims = original_ims;
! 1262: DPRINTF(("ims reset to %02lx\n", ims));
! 1263:
! 1264: /* For a non-repeating ket, just continue at this level. This also
! 1265: happens for a repeating ket if no characters were matched in the group.
! 1266: This is the forcible breaking of infinite loops as implemented in Perl
! 1267: 5.005. If there is an options reset, it will get obeyed in the normal
! 1268: course of events. */
! 1269:
! 1270: if (*ecode == OP_KET || eptr == saved_eptr)
! 1271: {
! 1272: ecode += 1 + LINK_SIZE;
! 1273: break;
! 1274: }
! 1275:
! 1276: /* The repeating kets try the rest of the pattern or restart from the
! 1277: preceding bracket, in the appropriate order. In the second case, we can use
! 1278: tail recursion to avoid using another stack frame, unless we have an
! 1279: unlimited repeat of a group that can match an empty string. */
! 1280:
! 1281: flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
! 1282:
! 1283: if (*ecode == OP_KETRMIN)
! 1284: {
! 1285: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
! 1286: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 1287: if (flags != 0) /* Could match an empty string */
! 1288: {
! 1289: RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
! 1290: RRETURN(rrc);
! 1291: }
! 1292: ecode = prev;
! 1293: goto TAIL_RECURSE;
! 1294: }
! 1295: else /* OP_KETRMAX */
! 1296: {
! 1297: RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
! 1298: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 1299: ecode += 1 + LINK_SIZE;
! 1300: flags = 0;
! 1301: goto TAIL_RECURSE;
! 1302: }
! 1303: /* Control never gets here */
! 1304:
! 1305: /* Start of subject unless notbol, or after internal newline if multiline */
! 1306:
! 1307: case OP_CIRC:
! 1308: if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
! 1309: if ((ims & PCRE_MULTILINE) != 0)
! 1310: {
! 1311: if (eptr != md->start_subject &&
! 1312: (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
! 1313: RRETURN(MATCH_NOMATCH);
! 1314: ecode++;
! 1315: break;
! 1316: }
! 1317: /* ... else fall through */
! 1318:
! 1319: /* Start of subject assertion */
! 1320:
! 1321: case OP_SOD:
! 1322: if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
! 1323: ecode++;
! 1324: break;
! 1325:
! 1326: /* Start of match assertion */
! 1327:
! 1328: case OP_SOM:
! 1329: if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
! 1330: ecode++;
! 1331: break;
! 1332:
! 1333: /* Reset the start of match point */
! 1334:
! 1335: case OP_SET_SOM:
! 1336: mstart = eptr;
! 1337: ecode++;
! 1338: break;
! 1339:
! 1340: /* Assert before internal newline if multiline, or before a terminating
! 1341: newline unless endonly is set, else end of subject unless noteol is set. */
! 1342:
! 1343: case OP_DOLL:
! 1344: if ((ims & PCRE_MULTILINE) != 0)
! 1345: {
! 1346: if (eptr < md->end_subject)
! 1347: { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
! 1348: else
! 1349: { if (md->noteol) RRETURN(MATCH_NOMATCH); }
! 1350: ecode++;
! 1351: break;
! 1352: }
! 1353: else
! 1354: {
! 1355: if (md->noteol) RRETURN(MATCH_NOMATCH);
! 1356: if (!md->endonly)
! 1357: {
! 1358: if (eptr != md->end_subject &&
! 1359: (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
! 1360: RRETURN(MATCH_NOMATCH);
! 1361: ecode++;
! 1362: break;
! 1363: }
! 1364: }
! 1365: /* ... else fall through for endonly */
! 1366:
! 1367: /* End of subject assertion (\z) */
! 1368:
! 1369: case OP_EOD:
! 1370: if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
! 1371: ecode++;
! 1372: break;
! 1373:
! 1374: /* End of subject or ending \n assertion (\Z) */
! 1375:
! 1376: case OP_EODN:
! 1377: if (eptr != md->end_subject &&
! 1378: (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
! 1379: RRETURN(MATCH_NOMATCH);
! 1380: ecode++;
! 1381: break;
! 1382:
! 1383: /* Word boundary assertions */
! 1384:
! 1385: case OP_NOT_WORD_BOUNDARY:
! 1386: case OP_WORD_BOUNDARY:
! 1387: {
! 1388:
! 1389: /* Find out if the previous and current characters are "word" characters.
! 1390: It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
! 1391: be "non-word" characters. */
! 1392:
! 1393: #ifdef SUPPORT_UTF8
! 1394: if (utf8)
! 1395: {
! 1396: if (eptr == md->start_subject) prev_is_word = FALSE; else
! 1397: {
! 1398: const uschar *lastptr = eptr - 1;
! 1399: while((*lastptr & 0xc0) == 0x80) lastptr--;
! 1400: GETCHAR(c, lastptr);
! 1401: prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
! 1402: }
! 1403: if (eptr >= md->end_subject) cur_is_word = FALSE; else
! 1404: {
! 1405: GETCHAR(c, eptr);
! 1406: cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
! 1407: }
! 1408: }
! 1409: else
! 1410: #endif
! 1411:
! 1412: /* More streamlined when not in UTF-8 mode */
! 1413:
! 1414: {
! 1415: prev_is_word = (eptr != md->start_subject) &&
! 1416: ((md->ctypes[eptr[-1]] & ctype_word) != 0);
! 1417: cur_is_word = (eptr < md->end_subject) &&
! 1418: ((md->ctypes[*eptr] & ctype_word) != 0);
! 1419: }
! 1420:
! 1421: /* Now see if the situation is what we want */
! 1422:
! 1423: if ((*ecode++ == OP_WORD_BOUNDARY)?
! 1424: cur_is_word == prev_is_word : cur_is_word != prev_is_word)
! 1425: RRETURN(MATCH_NOMATCH);
! 1426: }
! 1427: break;
! 1428:
! 1429: /* Match a single character type; inline for speed */
! 1430:
! 1431: case OP_ANY:
! 1432: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
! 1433: /* Fall through */
! 1434:
! 1435: case OP_ALLANY:
! 1436: if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 1437: if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
! 1438: ecode++;
! 1439: break;
! 1440:
! 1441: /* Match a single byte, even in UTF-8 mode. This opcode really does match
! 1442: any byte, even newline, independent of the setting of PCRE_DOTALL. */
! 1443:
! 1444: case OP_ANYBYTE:
! 1445: if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 1446: ecode++;
! 1447: break;
! 1448:
! 1449: case OP_NOT_DIGIT:
! 1450: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 1451: GETCHARINCTEST(c, eptr);
! 1452: if (
! 1453: #ifdef SUPPORT_UTF8
! 1454: c < 256 &&
! 1455: #endif
! 1456: (md->ctypes[c] & ctype_digit) != 0
! 1457: )
! 1458: RRETURN(MATCH_NOMATCH);
! 1459: ecode++;
! 1460: break;
! 1461:
! 1462: case OP_DIGIT:
! 1463: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 1464: GETCHARINCTEST(c, eptr);
! 1465: if (
! 1466: #ifdef SUPPORT_UTF8
! 1467: c >= 256 ||
! 1468: #endif
! 1469: (md->ctypes[c] & ctype_digit) == 0
! 1470: )
! 1471: RRETURN(MATCH_NOMATCH);
! 1472: ecode++;
! 1473: break;
! 1474:
! 1475: case OP_NOT_WHITESPACE:
! 1476: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 1477: GETCHARINCTEST(c, eptr);
! 1478: if (
! 1479: #ifdef SUPPORT_UTF8
! 1480: c < 256 &&
! 1481: #endif
! 1482: (md->ctypes[c] & ctype_space) != 0
! 1483: )
! 1484: RRETURN(MATCH_NOMATCH);
! 1485: ecode++;
! 1486: break;
! 1487:
! 1488: case OP_WHITESPACE:
! 1489: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 1490: GETCHARINCTEST(c, eptr);
! 1491: if (
! 1492: #ifdef SUPPORT_UTF8
! 1493: c >= 256 ||
! 1494: #endif
! 1495: (md->ctypes[c] & ctype_space) == 0
! 1496: )
! 1497: RRETURN(MATCH_NOMATCH);
! 1498: ecode++;
! 1499: break;
! 1500:
! 1501: case OP_NOT_WORDCHAR:
! 1502: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 1503: GETCHARINCTEST(c, eptr);
! 1504: if (
! 1505: #ifdef SUPPORT_UTF8
! 1506: c < 256 &&
! 1507: #endif
! 1508: (md->ctypes[c] & ctype_word) != 0
! 1509: )
! 1510: RRETURN(MATCH_NOMATCH);
! 1511: ecode++;
! 1512: break;
! 1513:
! 1514: case OP_WORDCHAR:
! 1515: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 1516: GETCHARINCTEST(c, eptr);
! 1517: if (
! 1518: #ifdef SUPPORT_UTF8
! 1519: c >= 256 ||
! 1520: #endif
! 1521: (md->ctypes[c] & ctype_word) == 0
! 1522: )
! 1523: RRETURN(MATCH_NOMATCH);
! 1524: ecode++;
! 1525: break;
! 1526:
! 1527: case OP_ANYNL:
! 1528: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 1529: GETCHARINCTEST(c, eptr);
! 1530: switch(c)
! 1531: {
! 1532: default: RRETURN(MATCH_NOMATCH);
! 1533: case 0x000d:
! 1534: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
! 1535: break;
! 1536:
! 1537: case 0x000a:
! 1538: break;
! 1539:
! 1540: case 0x000b:
! 1541: case 0x000c:
! 1542: case 0x0085:
! 1543: case 0x2028:
! 1544: case 0x2029:
! 1545: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
! 1546: break;
! 1547: }
! 1548: ecode++;
! 1549: break;
! 1550:
! 1551: case OP_NOT_HSPACE:
! 1552: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 1553: GETCHARINCTEST(c, eptr);
! 1554: switch(c)
! 1555: {
! 1556: default: break;
! 1557: case 0x09: /* HT */
! 1558: case 0x20: /* SPACE */
! 1559: case 0xa0: /* NBSP */
! 1560: case 0x1680: /* OGHAM SPACE MARK */
! 1561: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
! 1562: case 0x2000: /* EN QUAD */
! 1563: case 0x2001: /* EM QUAD */
! 1564: case 0x2002: /* EN SPACE */
! 1565: case 0x2003: /* EM SPACE */
! 1566: case 0x2004: /* THREE-PER-EM SPACE */
! 1567: case 0x2005: /* FOUR-PER-EM SPACE */
! 1568: case 0x2006: /* SIX-PER-EM SPACE */
! 1569: case 0x2007: /* FIGURE SPACE */
! 1570: case 0x2008: /* PUNCTUATION SPACE */
! 1571: case 0x2009: /* THIN SPACE */
! 1572: case 0x200A: /* HAIR SPACE */
! 1573: case 0x202f: /* NARROW NO-BREAK SPACE */
! 1574: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
! 1575: case 0x3000: /* IDEOGRAPHIC SPACE */
! 1576: RRETURN(MATCH_NOMATCH);
! 1577: }
! 1578: ecode++;
! 1579: break;
! 1580:
! 1581: case OP_HSPACE:
! 1582: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 1583: GETCHARINCTEST(c, eptr);
! 1584: switch(c)
! 1585: {
! 1586: default: RRETURN(MATCH_NOMATCH);
! 1587: case 0x09: /* HT */
! 1588: case 0x20: /* SPACE */
! 1589: case 0xa0: /* NBSP */
! 1590: case 0x1680: /* OGHAM SPACE MARK */
! 1591: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
! 1592: case 0x2000: /* EN QUAD */
! 1593: case 0x2001: /* EM QUAD */
! 1594: case 0x2002: /* EN SPACE */
! 1595: case 0x2003: /* EM SPACE */
! 1596: case 0x2004: /* THREE-PER-EM SPACE */
! 1597: case 0x2005: /* FOUR-PER-EM SPACE */
! 1598: case 0x2006: /* SIX-PER-EM SPACE */
! 1599: case 0x2007: /* FIGURE SPACE */
! 1600: case 0x2008: /* PUNCTUATION SPACE */
! 1601: case 0x2009: /* THIN SPACE */
! 1602: case 0x200A: /* HAIR SPACE */
! 1603: case 0x202f: /* NARROW NO-BREAK SPACE */
! 1604: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
! 1605: case 0x3000: /* IDEOGRAPHIC SPACE */
! 1606: break;
! 1607: }
! 1608: ecode++;
! 1609: break;
! 1610:
! 1611: case OP_NOT_VSPACE:
! 1612: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 1613: GETCHARINCTEST(c, eptr);
! 1614: switch(c)
! 1615: {
! 1616: default: break;
! 1617: case 0x0a: /* LF */
! 1618: case 0x0b: /* VT */
! 1619: case 0x0c: /* FF */
! 1620: case 0x0d: /* CR */
! 1621: case 0x85: /* NEL */
! 1622: case 0x2028: /* LINE SEPARATOR */
! 1623: case 0x2029: /* PARAGRAPH SEPARATOR */
! 1624: RRETURN(MATCH_NOMATCH);
! 1625: }
! 1626: ecode++;
! 1627: break;
! 1628:
! 1629: case OP_VSPACE:
! 1630: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 1631: GETCHARINCTEST(c, eptr);
! 1632: switch(c)
! 1633: {
! 1634: default: RRETURN(MATCH_NOMATCH);
! 1635: case 0x0a: /* LF */
! 1636: case 0x0b: /* VT */
! 1637: case 0x0c: /* FF */
! 1638: case 0x0d: /* CR */
! 1639: case 0x85: /* NEL */
! 1640: case 0x2028: /* LINE SEPARATOR */
! 1641: case 0x2029: /* PARAGRAPH SEPARATOR */
! 1642: break;
! 1643: }
! 1644: ecode++;
! 1645: break;
! 1646:
! 1647: #ifdef SUPPORT_UCP
! 1648: /* Check the next character by Unicode property. We will get here only
! 1649: if the support is in the binary; otherwise a compile-time error occurs. */
! 1650:
! 1651: case OP_PROP:
! 1652: case OP_NOTPROP:
! 1653: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 1654: GETCHARINCTEST(c, eptr);
! 1655: {
! 1656: int chartype, script;
! 1657: int category = _pcre_ucp_findprop(c, &chartype, &script);
! 1658:
! 1659: switch(ecode[1])
! 1660: {
! 1661: case PT_ANY:
! 1662: if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
! 1663: break;
! 1664:
! 1665: case PT_LAMP:
! 1666: if ((chartype == ucp_Lu ||
! 1667: chartype == ucp_Ll ||
! 1668: chartype == ucp_Lt) == (op == OP_NOTPROP))
! 1669: RRETURN(MATCH_NOMATCH);
! 1670: break;
! 1671:
! 1672: case PT_GC:
! 1673: if ((ecode[2] != category) == (op == OP_PROP))
! 1674: RRETURN(MATCH_NOMATCH);
! 1675: break;
! 1676:
! 1677: case PT_PC:
! 1678: if ((ecode[2] != chartype) == (op == OP_PROP))
! 1679: RRETURN(MATCH_NOMATCH);
! 1680: break;
! 1681:
! 1682: case PT_SC:
! 1683: if ((ecode[2] != script) == (op == OP_PROP))
! 1684: RRETURN(MATCH_NOMATCH);
! 1685: break;
! 1686:
! 1687: default:
! 1688: RRETURN(PCRE_ERROR_INTERNAL);
! 1689: }
! 1690:
! 1691: ecode += 3;
! 1692: }
! 1693: break;
! 1694:
! 1695: /* Match an extended Unicode sequence. We will get here only if the support
! 1696: is in the binary; otherwise a compile-time error occurs. */
! 1697:
! 1698: case OP_EXTUNI:
! 1699: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 1700: GETCHARINCTEST(c, eptr);
! 1701: {
! 1702: int chartype, script;
! 1703: int category = _pcre_ucp_findprop(c, &chartype, &script);
! 1704: if (category == ucp_M) RRETURN(MATCH_NOMATCH);
! 1705: while (eptr < md->end_subject)
! 1706: {
! 1707: int len = 1;
! 1708: if (!utf8) c = *eptr; else
! 1709: {
! 1710: GETCHARLEN(c, eptr, len);
! 1711: }
! 1712: category = _pcre_ucp_findprop(c, &chartype, &script);
! 1713: if (category != ucp_M) break;
! 1714: eptr += len;
! 1715: }
! 1716: }
! 1717: ecode++;
! 1718: break;
! 1719: #endif
! 1720:
! 1721:
! 1722: /* Match a back reference, possibly repeatedly. Look past the end of the
! 1723: item to see if there is repeat information following. The code is similar
! 1724: to that for character classes, but repeated for efficiency. Then obey
! 1725: similar code to character type repeats - written out again for speed.
! 1726: However, if the referenced string is the empty string, always treat
! 1727: it as matched, any number of times (otherwise there could be infinite
! 1728: loops). */
! 1729:
! 1730: case OP_REF:
! 1731: {
! 1732: offset = GET2(ecode, 1) << 1; /* Doubled ref number */
! 1733: ecode += 3;
! 1734:
! 1735: /* If the reference is unset, there are two possibilities:
! 1736:
! 1737: (a) In the default, Perl-compatible state, set the length to be longer
! 1738: than the amount of subject left; this ensures that every attempt at a
! 1739: match fails. We can't just fail here, because of the possibility of
! 1740: quantifiers with zero minima.
! 1741:
! 1742: (b) If the JavaScript compatibility flag is set, set the length to zero
! 1743: so that the back reference matches an empty string.
! 1744:
! 1745: Otherwise, set the length to the length of what was matched by the
! 1746: referenced subpattern. */
! 1747:
! 1748: if (offset >= offset_top || md->offset_vector[offset] < 0)
! 1749: length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
! 1750: else
! 1751: length = md->offset_vector[offset+1] - md->offset_vector[offset];
! 1752:
! 1753: /* Set up for repetition, or handle the non-repeated case */
! 1754:
! 1755: switch (*ecode)
! 1756: {
! 1757: case OP_CRSTAR:
! 1758: case OP_CRMINSTAR:
! 1759: case OP_CRPLUS:
! 1760: case OP_CRMINPLUS:
! 1761: case OP_CRQUERY:
! 1762: case OP_CRMINQUERY:
! 1763: c = *ecode++ - OP_CRSTAR;
! 1764: minimize = (c & 1) != 0;
! 1765: min = rep_min[c]; /* Pick up values from tables; */
! 1766: max = rep_max[c]; /* zero for max => infinity */
! 1767: if (max == 0) max = INT_MAX;
! 1768: break;
! 1769:
! 1770: case OP_CRRANGE:
! 1771: case OP_CRMINRANGE:
! 1772: minimize = (*ecode == OP_CRMINRANGE);
! 1773: min = GET2(ecode, 1);
! 1774: max = GET2(ecode, 3);
! 1775: if (max == 0) max = INT_MAX;
! 1776: ecode += 5;
! 1777: break;
! 1778:
! 1779: default: /* No repeat follows */
! 1780: if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
! 1781: eptr += length;
! 1782: continue; /* With the main loop */
! 1783: }
! 1784:
! 1785: /* If the length of the reference is zero, just continue with the
! 1786: main loop. */
! 1787:
! 1788: if (length == 0) continue;
! 1789:
! 1790: /* First, ensure the minimum number of matches are present. We get back
! 1791: the length of the reference string explicitly rather than passing the
! 1792: address of eptr, so that eptr can be a register variable. */
! 1793:
! 1794: for (i = 1; i <= min; i++)
! 1795: {
! 1796: if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
! 1797: eptr += length;
! 1798: }
! 1799:
! 1800: /* If min = max, continue at the same level without recursion.
! 1801: They are not both allowed to be zero. */
! 1802:
! 1803: if (min == max) continue;
! 1804:
! 1805: /* If minimizing, keep trying and advancing the pointer */
! 1806:
! 1807: if (minimize)
! 1808: {
! 1809: for (fi = min;; fi++)
! 1810: {
! 1811: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
! 1812: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 1813: if (fi >= max || !match_ref(offset, eptr, length, md, ims))
! 1814: RRETURN(MATCH_NOMATCH);
! 1815: eptr += length;
! 1816: }
! 1817: /* Control never gets here */
! 1818: }
! 1819:
! 1820: /* If maximizing, find the longest string and work backwards */
! 1821:
! 1822: else
! 1823: {
! 1824: pp = eptr;
! 1825: for (i = min; i < max; i++)
! 1826: {
! 1827: if (!match_ref(offset, eptr, length, md, ims)) break;
! 1828: eptr += length;
! 1829: }
! 1830: while (eptr >= pp)
! 1831: {
! 1832: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
! 1833: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 1834: eptr -= length;
! 1835: }
! 1836: RRETURN(MATCH_NOMATCH);
! 1837: }
! 1838: }
! 1839: /* Control never gets here */
! 1840:
! 1841:
! 1842:
! 1843: /* Match a bit-mapped character class, possibly repeatedly. This op code is
! 1844: used when all the characters in the class have values in the range 0-255,
! 1845: and either the matching is caseful, or the characters are in the range
! 1846: 0-127 when UTF-8 processing is enabled. The only difference between
! 1847: OP_CLASS and OP_NCLASS occurs when a data character outside the range is
! 1848: encountered.
! 1849:
! 1850: First, look past the end of the item to see if there is repeat information
! 1851: following. Then obey similar code to character type repeats - written out
! 1852: again for speed. */
! 1853:
! 1854: case OP_NCLASS:
! 1855: case OP_CLASS:
! 1856: {
! 1857: data = ecode + 1; /* Save for matching */
! 1858: ecode += 33; /* Advance past the item */
! 1859:
! 1860: switch (*ecode)
! 1861: {
! 1862: case OP_CRSTAR:
! 1863: case OP_CRMINSTAR:
! 1864: case OP_CRPLUS:
! 1865: case OP_CRMINPLUS:
! 1866: case OP_CRQUERY:
! 1867: case OP_CRMINQUERY:
! 1868: c = *ecode++ - OP_CRSTAR;
! 1869: minimize = (c & 1) != 0;
! 1870: min = rep_min[c]; /* Pick up values from tables; */
! 1871: max = rep_max[c]; /* zero for max => infinity */
! 1872: if (max == 0) max = INT_MAX;
! 1873: break;
! 1874:
! 1875: case OP_CRRANGE:
! 1876: case OP_CRMINRANGE:
! 1877: minimize = (*ecode == OP_CRMINRANGE);
! 1878: min = GET2(ecode, 1);
! 1879: max = GET2(ecode, 3);
! 1880: if (max == 0) max = INT_MAX;
! 1881: ecode += 5;
! 1882: break;
! 1883:
! 1884: default: /* No repeat follows */
! 1885: min = max = 1;
! 1886: break;
! 1887: }
! 1888:
! 1889: /* First, ensure the minimum number of matches are present. */
! 1890:
! 1891: #ifdef SUPPORT_UTF8
! 1892: /* UTF-8 mode */
! 1893: if (utf8)
! 1894: {
! 1895: for (i = 1; i <= min; i++)
! 1896: {
! 1897: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 1898: GETCHARINC(c, eptr);
! 1899: if (c > 255)
! 1900: {
! 1901: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
! 1902: }
! 1903: else
! 1904: {
! 1905: if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
! 1906: }
! 1907: }
! 1908: }
! 1909: else
! 1910: #endif
! 1911: /* Not UTF-8 mode */
! 1912: {
! 1913: for (i = 1; i <= min; i++)
! 1914: {
! 1915: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 1916: c = *eptr++;
! 1917: if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
! 1918: }
! 1919: }
! 1920:
! 1921: /* If max == min we can continue with the main loop without the
! 1922: need to recurse. */
! 1923:
! 1924: if (min == max) continue;
! 1925:
! 1926: /* If minimizing, keep testing the rest of the expression and advancing
! 1927: the pointer while it matches the class. */
! 1928:
! 1929: if (minimize)
! 1930: {
! 1931: #ifdef SUPPORT_UTF8
! 1932: /* UTF-8 mode */
! 1933: if (utf8)
! 1934: {
! 1935: for (fi = min;; fi++)
! 1936: {
! 1937: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
! 1938: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 1939: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 1940: GETCHARINC(c, eptr);
! 1941: if (c > 255)
! 1942: {
! 1943: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
! 1944: }
! 1945: else
! 1946: {
! 1947: if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
! 1948: }
! 1949: }
! 1950: }
! 1951: else
! 1952: #endif
! 1953: /* Not UTF-8 mode */
! 1954: {
! 1955: for (fi = min;; fi++)
! 1956: {
! 1957: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
! 1958: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 1959: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 1960: c = *eptr++;
! 1961: if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
! 1962: }
! 1963: }
! 1964: /* Control never gets here */
! 1965: }
! 1966:
! 1967: /* If maximizing, find the longest possible run, then work backwards. */
! 1968:
! 1969: else
! 1970: {
! 1971: pp = eptr;
! 1972:
! 1973: #ifdef SUPPORT_UTF8
! 1974: /* UTF-8 mode */
! 1975: if (utf8)
! 1976: {
! 1977: for (i = min; i < max; i++)
! 1978: {
! 1979: int len = 1;
! 1980: if (eptr >= md->end_subject) break;
! 1981: GETCHARLEN(c, eptr, len);
! 1982: if (c > 255)
! 1983: {
! 1984: if (op == OP_CLASS) break;
! 1985: }
! 1986: else
! 1987: {
! 1988: if ((data[c/8] & (1 << (c&7))) == 0) break;
! 1989: }
! 1990: eptr += len;
! 1991: }
! 1992: for (;;)
! 1993: {
! 1994: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
! 1995: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 1996: if (eptr-- == pp) break; /* Stop if tried at original pos */
! 1997: BACKCHAR(eptr);
! 1998: }
! 1999: }
! 2000: else
! 2001: #endif
! 2002: /* Not UTF-8 mode */
! 2003: {
! 2004: for (i = min; i < max; i++)
! 2005: {
! 2006: if (eptr >= md->end_subject) break;
! 2007: c = *eptr;
! 2008: if ((data[c/8] & (1 << (c&7))) == 0) break;
! 2009: eptr++;
! 2010: }
! 2011: while (eptr >= pp)
! 2012: {
! 2013: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
! 2014: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2015: eptr--;
! 2016: }
! 2017: }
! 2018:
! 2019: RRETURN(MATCH_NOMATCH);
! 2020: }
! 2021: }
! 2022: /* Control never gets here */
! 2023:
! 2024:
! 2025: /* Match an extended character class. This opcode is encountered only
! 2026: in UTF-8 mode, because that's the only time it is compiled. */
! 2027:
! 2028: #ifdef SUPPORT_UTF8
! 2029: case OP_XCLASS:
! 2030: {
! 2031: data = ecode + 1 + LINK_SIZE; /* Save for matching */
! 2032: ecode += GET(ecode, 1); /* Advance past the item */
! 2033:
! 2034: switch (*ecode)
! 2035: {
! 2036: case OP_CRSTAR:
! 2037: case OP_CRMINSTAR:
! 2038: case OP_CRPLUS:
! 2039: case OP_CRMINPLUS:
! 2040: case OP_CRQUERY:
! 2041: case OP_CRMINQUERY:
! 2042: c = *ecode++ - OP_CRSTAR;
! 2043: minimize = (c & 1) != 0;
! 2044: min = rep_min[c]; /* Pick up values from tables; */
! 2045: max = rep_max[c]; /* zero for max => infinity */
! 2046: if (max == 0) max = INT_MAX;
! 2047: break;
! 2048:
! 2049: case OP_CRRANGE:
! 2050: case OP_CRMINRANGE:
! 2051: minimize = (*ecode == OP_CRMINRANGE);
! 2052: min = GET2(ecode, 1);
! 2053: max = GET2(ecode, 3);
! 2054: if (max == 0) max = INT_MAX;
! 2055: ecode += 5;
! 2056: break;
! 2057:
! 2058: default: /* No repeat follows */
! 2059: min = max = 1;
! 2060: break;
! 2061: }
! 2062:
! 2063: /* First, ensure the minimum number of matches are present. */
! 2064:
! 2065: for (i = 1; i <= min; i++)
! 2066: {
! 2067: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 2068: GETCHARINC(c, eptr);
! 2069: if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
! 2070: }
! 2071:
! 2072: /* If max == min we can continue with the main loop without the
! 2073: need to recurse. */
! 2074:
! 2075: if (min == max) continue;
! 2076:
! 2077: /* If minimizing, keep testing the rest of the expression and advancing
! 2078: the pointer while it matches the class. */
! 2079:
! 2080: if (minimize)
! 2081: {
! 2082: for (fi = min;; fi++)
! 2083: {
! 2084: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
! 2085: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2086: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 2087: GETCHARINC(c, eptr);
! 2088: if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
! 2089: }
! 2090: /* Control never gets here */
! 2091: }
! 2092:
! 2093: /* If maximizing, find the longest possible run, then work backwards. */
! 2094:
! 2095: else
! 2096: {
! 2097: pp = eptr;
! 2098: for (i = min; i < max; i++)
! 2099: {
! 2100: int len = 1;
! 2101: if (eptr >= md->end_subject) break;
! 2102: GETCHARLEN(c, eptr, len);
! 2103: if (!_pcre_xclass(c, data)) break;
! 2104: eptr += len;
! 2105: }
! 2106: for(;;)
! 2107: {
! 2108: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
! 2109: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2110: if (eptr-- == pp) break; /* Stop if tried at original pos */
! 2111: if (utf8) BACKCHAR(eptr);
! 2112: }
! 2113: RRETURN(MATCH_NOMATCH);
! 2114: }
! 2115:
! 2116: /* Control never gets here */
! 2117: }
! 2118: #endif /* End of XCLASS */
! 2119:
! 2120: /* Match a single character, casefully */
! 2121:
! 2122: case OP_CHAR:
! 2123: #ifdef SUPPORT_UTF8
! 2124: if (utf8)
! 2125: {
! 2126: length = 1;
! 2127: ecode++;
! 2128: GETCHARLEN(fc, ecode, length);
! 2129: if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
! 2130: while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
! 2131: }
! 2132: else
! 2133: #endif
! 2134:
! 2135: /* Non-UTF-8 mode */
! 2136: {
! 2137: if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
! 2138: if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
! 2139: ecode += 2;
! 2140: }
! 2141: break;
! 2142:
! 2143: /* Match a single character, caselessly */
! 2144:
! 2145: case OP_CHARNC:
! 2146: #ifdef SUPPORT_UTF8
! 2147: if (utf8)
! 2148: {
! 2149: length = 1;
! 2150: ecode++;
! 2151: GETCHARLEN(fc, ecode, length);
! 2152:
! 2153: if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
! 2154:
! 2155: /* If the pattern character's value is < 128, we have only one byte, and
! 2156: can use the fast lookup table. */
! 2157:
! 2158: if (fc < 128)
! 2159: {
! 2160: if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
! 2161: }
! 2162:
! 2163: /* Otherwise we must pick up the subject character */
! 2164:
! 2165: else
! 2166: {
! 2167: unsigned int dc;
! 2168: GETCHARINC(dc, eptr);
! 2169: ecode += length;
! 2170:
! 2171: /* If we have Unicode property support, we can use it to test the other
! 2172: case of the character, if there is one. */
! 2173:
! 2174: if (fc != dc)
! 2175: {
! 2176: #ifdef SUPPORT_UCP
! 2177: if (dc != _pcre_ucp_othercase(fc))
! 2178: #endif
! 2179: RRETURN(MATCH_NOMATCH);
! 2180: }
! 2181: }
! 2182: }
! 2183: else
! 2184: #endif /* SUPPORT_UTF8 */
! 2185:
! 2186: /* Non-UTF-8 mode */
! 2187: {
! 2188: if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
! 2189: if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
! 2190: ecode += 2;
! 2191: }
! 2192: break;
! 2193:
! 2194: /* Match a single character repeatedly. */
! 2195:
! 2196: case OP_EXACT:
! 2197: min = max = GET2(ecode, 1);
! 2198: ecode += 3;
! 2199: goto REPEATCHAR;
! 2200:
! 2201: case OP_POSUPTO:
! 2202: possessive = TRUE;
! 2203: /* Fall through */
! 2204:
! 2205: case OP_UPTO:
! 2206: case OP_MINUPTO:
! 2207: min = 0;
! 2208: max = GET2(ecode, 1);
! 2209: minimize = *ecode == OP_MINUPTO;
! 2210: ecode += 3;
! 2211: goto REPEATCHAR;
! 2212:
! 2213: case OP_POSSTAR:
! 2214: possessive = TRUE;
! 2215: min = 0;
! 2216: max = INT_MAX;
! 2217: ecode++;
! 2218: goto REPEATCHAR;
! 2219:
! 2220: case OP_POSPLUS:
! 2221: possessive = TRUE;
! 2222: min = 1;
! 2223: max = INT_MAX;
! 2224: ecode++;
! 2225: goto REPEATCHAR;
! 2226:
! 2227: case OP_POSQUERY:
! 2228: possessive = TRUE;
! 2229: min = 0;
! 2230: max = 1;
! 2231: ecode++;
! 2232: goto REPEATCHAR;
! 2233:
! 2234: case OP_STAR:
! 2235: case OP_MINSTAR:
! 2236: case OP_PLUS:
! 2237: case OP_MINPLUS:
! 2238: case OP_QUERY:
! 2239: case OP_MINQUERY:
! 2240: c = *ecode++ - OP_STAR;
! 2241: minimize = (c & 1) != 0;
! 2242: min = rep_min[c]; /* Pick up values from tables; */
! 2243: max = rep_max[c]; /* zero for max => infinity */
! 2244: if (max == 0) max = INT_MAX;
! 2245:
! 2246: /* Common code for all repeated single-character matches. We can give
! 2247: up quickly if there are fewer than the minimum number of characters left in
! 2248: the subject. */
! 2249:
! 2250: REPEATCHAR:
! 2251: #ifdef SUPPORT_UTF8
! 2252: if (utf8)
! 2253: {
! 2254: length = 1;
! 2255: charptr = ecode;
! 2256: GETCHARLEN(fc, ecode, length);
! 2257: if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
! 2258: ecode += length;
! 2259:
! 2260: /* Handle multibyte character matching specially here. There is
! 2261: support for caseless matching if UCP support is present. */
! 2262:
! 2263: if (length > 1)
! 2264: {
! 2265: #ifdef SUPPORT_UCP
! 2266: unsigned int othercase;
! 2267: if ((ims & PCRE_CASELESS) != 0 &&
! 2268: (othercase = _pcre_ucp_othercase(fc)) != NOTACHAR)
! 2269: oclength = _pcre_ord2utf8(othercase, occhars);
! 2270: else oclength = 0;
! 2271: #endif /* SUPPORT_UCP */
! 2272:
! 2273: for (i = 1; i <= min; i++)
! 2274: {
! 2275: if (memcmp(eptr, charptr, length) == 0) eptr += length;
! 2276: #ifdef SUPPORT_UCP
! 2277: /* Need braces because of following else */
! 2278: else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
! 2279: else
! 2280: {
! 2281: if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
! 2282: eptr += oclength;
! 2283: }
! 2284: #else /* without SUPPORT_UCP */
! 2285: else { RRETURN(MATCH_NOMATCH); }
! 2286: #endif /* SUPPORT_UCP */
! 2287: }
! 2288:
! 2289: if (min == max) continue;
! 2290:
! 2291: if (minimize)
! 2292: {
! 2293: for (fi = min;; fi++)
! 2294: {
! 2295: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
! 2296: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2297: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 2298: if (memcmp(eptr, charptr, length) == 0) eptr += length;
! 2299: #ifdef SUPPORT_UCP
! 2300: /* Need braces because of following else */
! 2301: else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
! 2302: else
! 2303: {
! 2304: if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
! 2305: eptr += oclength;
! 2306: }
! 2307: #else /* without SUPPORT_UCP */
! 2308: else { RRETURN (MATCH_NOMATCH); }
! 2309: #endif /* SUPPORT_UCP */
! 2310: }
! 2311: /* Control never gets here */
! 2312: }
! 2313:
! 2314: else /* Maximize */
! 2315: {
! 2316: pp = eptr;
! 2317: for (i = min; i < max; i++)
! 2318: {
! 2319: if (eptr > md->end_subject - length) break;
! 2320: if (memcmp(eptr, charptr, length) == 0) eptr += length;
! 2321: #ifdef SUPPORT_UCP
! 2322: else if (oclength == 0) break;
! 2323: else
! 2324: {
! 2325: if (memcmp(eptr, occhars, oclength) != 0) break;
! 2326: eptr += oclength;
! 2327: }
! 2328: #else /* without SUPPORT_UCP */
! 2329: else break;
! 2330: #endif /* SUPPORT_UCP */
! 2331: }
! 2332:
! 2333: if (possessive) continue;
! 2334: for(;;)
! 2335: {
! 2336: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
! 2337: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2338: if (eptr == pp) RRETURN(MATCH_NOMATCH);
! 2339: #ifdef SUPPORT_UCP
! 2340: eptr--;
! 2341: BACKCHAR(eptr);
! 2342: #else /* without SUPPORT_UCP */
! 2343: eptr -= length;
! 2344: #endif /* SUPPORT_UCP */
! 2345: }
! 2346: }
! 2347: /* Control never gets here */
! 2348: }
! 2349:
! 2350: /* If the length of a UTF-8 character is 1, we fall through here, and
! 2351: obey the code as for non-UTF-8 characters below, though in this case the
! 2352: value of fc will always be < 128. */
! 2353: }
! 2354: else
! 2355: #endif /* SUPPORT_UTF8 */
! 2356:
! 2357: /* When not in UTF-8 mode, load a single-byte character. */
! 2358: {
! 2359: if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
! 2360: fc = *ecode++;
! 2361: }
! 2362:
! 2363: /* The value of fc at this point is always less than 256, though we may or
! 2364: may not be in UTF-8 mode. The code is duplicated for the caseless and
! 2365: caseful cases, for speed, since matching characters is likely to be quite
! 2366: common. First, ensure the minimum number of matches are present. If min =
! 2367: max, continue at the same level without recursing. Otherwise, if
! 2368: minimizing, keep trying the rest of the expression and advancing one
! 2369: matching character if failing, up to the maximum. Alternatively, if
! 2370: maximizing, find the maximum number of characters and work backwards. */
! 2371:
! 2372: DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
! 2373: max, eptr));
! 2374:
! 2375: if ((ims & PCRE_CASELESS) != 0)
! 2376: {
! 2377: fc = md->lcc[fc];
! 2378: for (i = 1; i <= min; i++)
! 2379: if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
! 2380: if (min == max) continue;
! 2381: if (minimize)
! 2382: {
! 2383: for (fi = min;; fi++)
! 2384: {
! 2385: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
! 2386: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2387: if (fi >= max || eptr >= md->end_subject ||
! 2388: fc != md->lcc[*eptr++])
! 2389: RRETURN(MATCH_NOMATCH);
! 2390: }
! 2391: /* Control never gets here */
! 2392: }
! 2393: else /* Maximize */
! 2394: {
! 2395: pp = eptr;
! 2396: for (i = min; i < max; i++)
! 2397: {
! 2398: if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
! 2399: eptr++;
! 2400: }
! 2401: if (possessive) continue;
! 2402: while (eptr >= pp)
! 2403: {
! 2404: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
! 2405: eptr--;
! 2406: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2407: }
! 2408: RRETURN(MATCH_NOMATCH);
! 2409: }
! 2410: /* Control never gets here */
! 2411: }
! 2412:
! 2413: /* Caseful comparisons (single-byte only; multi-byte characters are handled above) */
! 2414:
! 2415: else
! 2416: {
! 2417: for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
! 2418: if (min == max) continue;
! 2419: if (minimize)
! 2420: {
! 2421: for (fi = min;; fi++)
! 2422: {
! 2423: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
! 2424: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2425: if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
! 2426: RRETURN(MATCH_NOMATCH);
! 2427: }
! 2428: /* Control never gets here */
! 2429: }
! 2430: else /* Maximize */
! 2431: {
! 2432: pp = eptr;
! 2433: for (i = min; i < max; i++)
! 2434: {
! 2435: if (eptr >= md->end_subject || fc != *eptr) break;
! 2436: eptr++;
! 2437: }
! 2438: if (possessive) continue;
! 2439: while (eptr >= pp)
! 2440: {
! 2441: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
! 2442: eptr--;
! 2443: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2444: }
! 2445: RRETURN(MATCH_NOMATCH);
! 2446: }
! 2447: }
! 2448: /* Control never gets here */
! 2449:
! 2450: /* Match a negated single one-byte character. The subject character we are
! 2451: checking can be multibyte. */
! 2452:
! 2453: case OP_NOT:
! 2454: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 2455: ecode++;
! 2456: GETCHARINCTEST(c, eptr);
! 2457: if ((ims & PCRE_CASELESS) != 0)
! 2458: {
! 2459: #ifdef SUPPORT_UTF8
! 2460: if (c < 256)
! 2461: #endif
! 2462: c = md->lcc[c];
! 2463: if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
! 2464: }
! 2465: else
! 2466: {
! 2467: if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
! 2468: }
! 2469: break;
! 2470:
! 2471: /* Match a negated single one-byte character repeatedly. This is almost a
! 2472: repeat of the code for a repeated single character, but I haven't found a
! 2473: nice way of commoning these up that doesn't require a test of the
! 2474: positive/negative option for each character match. Maybe that wouldn't add
! 2475: very much to the time taken, but character matching *is* what this is all
! 2476: about... */
! 2477:
! 2478: case OP_NOTEXACT:
! 2479: min = max = GET2(ecode, 1);
! 2480: ecode += 3;
! 2481: goto REPEATNOTCHAR;
! 2482:
! 2483: case OP_NOTUPTO:
! 2484: case OP_NOTMINUPTO:
! 2485: min = 0;
! 2486: max = GET2(ecode, 1);
! 2487: minimize = *ecode == OP_NOTMINUPTO;
! 2488: ecode += 3;
! 2489: goto REPEATNOTCHAR;
! 2490:
! 2491: case OP_NOTPOSSTAR:
! 2492: possessive = TRUE;
! 2493: min = 0;
! 2494: max = INT_MAX;
! 2495: ecode++;
! 2496: goto REPEATNOTCHAR;
! 2497:
! 2498: case OP_NOTPOSPLUS:
! 2499: possessive = TRUE;
! 2500: min = 1;
! 2501: max = INT_MAX;
! 2502: ecode++;
! 2503: goto REPEATNOTCHAR;
! 2504:
! 2505: case OP_NOTPOSQUERY:
! 2506: possessive = TRUE;
! 2507: min = 0;
! 2508: max = 1;
! 2509: ecode++;
! 2510: goto REPEATNOTCHAR;
! 2511:
! 2512: case OP_NOTPOSUPTO:
! 2513: possessive = TRUE;
! 2514: min = 0;
! 2515: max = GET2(ecode, 1);
! 2516: ecode += 3;
! 2517: goto REPEATNOTCHAR;
! 2518:
! 2519: case OP_NOTSTAR:
! 2520: case OP_NOTMINSTAR:
! 2521: case OP_NOTPLUS:
! 2522: case OP_NOTMINPLUS:
! 2523: case OP_NOTQUERY:
! 2524: case OP_NOTMINQUERY:
! 2525: c = *ecode++ - OP_NOTSTAR;
! 2526: minimize = (c & 1) != 0;
! 2527: min = rep_min[c]; /* Pick up values from tables; */
! 2528: max = rep_max[c]; /* zero for max => infinity */
! 2529: if (max == 0) max = INT_MAX;
! 2530:
! 2531: /* Common code for all repeated single-byte matches. We can give up quickly
! 2532: if there are fewer than the minimum number of bytes left in the
! 2533: subject. */
! 2534:
! 2535: REPEATNOTCHAR:
! 2536: if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
! 2537: fc = *ecode++;
! 2538:
! 2539: /* The code is duplicated for the caseless and caseful cases, for speed,
! 2540: since matching characters is likely to be quite common. First, ensure the
! 2541: minimum number of matches are present. If min = max, continue at the same
! 2542: level without recursing. Otherwise, if minimizing, keep trying the rest of
! 2543: the expression and advancing one matching character if failing, up to the
! 2544: maximum. Alternatively, if maximizing, find the maximum number of
! 2545: characters and work backwards. */
! 2546:
! 2547: DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
! 2548: max, eptr));
! 2549:
! 2550: if ((ims & PCRE_CASELESS) != 0)
! 2551: {
! 2552: fc = md->lcc[fc];
! 2553:
! 2554: #ifdef SUPPORT_UTF8
! 2555: /* UTF-8 mode */
! 2556: if (utf8)
! 2557: {
! 2558: register unsigned int d;
! 2559: for (i = 1; i <= min; i++)
! 2560: {
! 2561: GETCHARINC(d, eptr);
! 2562: if (d < 256) d = md->lcc[d];
! 2563: if (fc == d) RRETURN(MATCH_NOMATCH);
! 2564: }
! 2565: }
! 2566: else
! 2567: #endif
! 2568:
! 2569: /* Not UTF-8 mode */
! 2570: {
! 2571: for (i = 1; i <= min; i++)
! 2572: if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
! 2573: }
! 2574:
! 2575: if (min == max) continue;
! 2576:
! 2577: if (minimize)
! 2578: {
! 2579: #ifdef SUPPORT_UTF8
! 2580: /* UTF-8 mode */
! 2581: if (utf8)
! 2582: {
! 2583: register unsigned int d;
! 2584: for (fi = min;; fi++)
! 2585: {
! 2586: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
! 2587: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2588: GETCHARINC(d, eptr);
! 2589: if (d < 256) d = md->lcc[d];
! 2590: if (fi >= max || eptr >= md->end_subject || fc == d)
! 2591: RRETURN(MATCH_NOMATCH);
! 2592: }
! 2593: }
! 2594: else
! 2595: #endif
! 2596: /* Not UTF-8 mode */
! 2597: {
! 2598: for (fi = min;; fi++)
! 2599: {
! 2600: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
! 2601: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2602: if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
! 2603: RRETURN(MATCH_NOMATCH);
! 2604: }
! 2605: }
! 2606: /* Control never gets here */
! 2607: }
! 2608:
! 2609: /* Maximize case */
! 2610:
! 2611: else
! 2612: {
! 2613: pp = eptr;
! 2614:
! 2615: #ifdef SUPPORT_UTF8
! 2616: /* UTF-8 mode */
! 2617: if (utf8)
! 2618: {
! 2619: register unsigned int d;
! 2620: for (i = min; i < max; i++)
! 2621: {
! 2622: int len = 1;
! 2623: if (eptr >= md->end_subject) break;
! 2624: GETCHARLEN(d, eptr, len);
! 2625: if (d < 256) d = md->lcc[d];
! 2626: if (fc == d) break;
! 2627: eptr += len;
! 2628: }
! 2629: if (possessive) continue;
! 2630: for(;;)
! 2631: {
! 2632: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
! 2633: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2634: if (eptr-- == pp) break; /* Stop if tried at original pos */
! 2635: BACKCHAR(eptr);
! 2636: }
! 2637: }
! 2638: else
! 2639: #endif
! 2640: /* Not UTF-8 mode */
! 2641: {
! 2642: for (i = min; i < max; i++)
! 2643: {
! 2644: if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
! 2645: eptr++;
! 2646: }
! 2647: if (possessive) continue;
! 2648: while (eptr >= pp)
! 2649: {
! 2650: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
! 2651: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2652: eptr--;
! 2653: }
! 2654: }
! 2655:
! 2656: RRETURN(MATCH_NOMATCH);
! 2657: }
! 2658: /* Control never gets here */
! 2659: }
! 2660:
! 2661: /* Caseful comparisons */
! 2662:
! 2663: else
! 2664: {
! 2665: #ifdef SUPPORT_UTF8
! 2666: /* UTF-8 mode */
! 2667: if (utf8)
! 2668: {
! 2669: register unsigned int d;
! 2670: for (i = 1; i <= min; i++)
! 2671: {
! 2672: GETCHARINC(d, eptr);
! 2673: if (fc == d) RRETURN(MATCH_NOMATCH);
! 2674: }
! 2675: }
! 2676: else
! 2677: #endif
! 2678: /* Not UTF-8 mode */
! 2679: {
! 2680: for (i = 1; i <= min; i++)
! 2681: if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
! 2682: }
! 2683:
! 2684: if (min == max) continue;
! 2685:
! 2686: if (minimize)
! 2687: {
! 2688: #ifdef SUPPORT_UTF8
! 2689: /* UTF-8 mode */
! 2690: if (utf8)
! 2691: {
! 2692: register unsigned int d;
! 2693: for (fi = min;; fi++)
! 2694: {
! 2695: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
! 2696: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2697: GETCHARINC(d, eptr);
! 2698: if (fi >= max || eptr >= md->end_subject || fc == d)
! 2699: RRETURN(MATCH_NOMATCH);
! 2700: }
! 2701: }
! 2702: else
! 2703: #endif
! 2704: /* Not UTF-8 mode */
! 2705: {
! 2706: for (fi = min;; fi++)
! 2707: {
! 2708: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
! 2709: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2710: if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
! 2711: RRETURN(MATCH_NOMATCH);
! 2712: }
! 2713: }
! 2714: /* Control never gets here */
! 2715: }
! 2716:
! 2717: /* Maximize case */
! 2718:
! 2719: else
! 2720: {
! 2721: pp = eptr;
! 2722:
! 2723: #ifdef SUPPORT_UTF8
! 2724: /* UTF-8 mode */
! 2725: if (utf8)
! 2726: {
! 2727: register unsigned int d;
! 2728: for (i = min; i < max; i++)
! 2729: {
! 2730: int len = 1;
! 2731: if (eptr >= md->end_subject) break;
! 2732: GETCHARLEN(d, eptr, len);
! 2733: if (fc == d) break;
! 2734: eptr += len;
! 2735: }
! 2736: if (possessive) continue;
! 2737: for(;;)
! 2738: {
! 2739: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
! 2740: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2741: if (eptr-- == pp) break; /* Stop if tried at original pos */
! 2742: BACKCHAR(eptr);
! 2743: }
! 2744: }
! 2745: else
! 2746: #endif
! 2747: /* Not UTF-8 mode */
! 2748: {
! 2749: for (i = min; i < max; i++)
! 2750: {
! 2751: if (eptr >= md->end_subject || fc == *eptr) break;
! 2752: eptr++;
! 2753: }
! 2754: if (possessive) continue;
! 2755: while (eptr >= pp)
! 2756: {
! 2757: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
! 2758: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2759: eptr--;
! 2760: }
! 2761: }
! 2762:
! 2763: RRETURN(MATCH_NOMATCH);
! 2764: }
! 2765: }
! 2766: /* Control never gets here */
! 2767:
! 2768: /* Match a single character type repeatedly; several different opcodes
! 2769: share code. This is very similar to the code for single characters, but we
! 2770: repeat it in the interests of efficiency. */
! 2771:
! 2772: case OP_TYPEEXACT:
! 2773: min = max = GET2(ecode, 1);
! 2774: minimize = TRUE;
! 2775: ecode += 3;
! 2776: goto REPEATTYPE;
! 2777:
! 2778: case OP_TYPEUPTO:
! 2779: case OP_TYPEMINUPTO:
! 2780: min = 0;
! 2781: max = GET2(ecode, 1);
! 2782: minimize = *ecode == OP_TYPEMINUPTO;
! 2783: ecode += 3;
! 2784: goto REPEATTYPE;
! 2785:
! 2786: case OP_TYPEPOSSTAR:
! 2787: possessive = TRUE;
! 2788: min = 0;
! 2789: max = INT_MAX;
! 2790: ecode++;
! 2791: goto REPEATTYPE;
! 2792:
! 2793: case OP_TYPEPOSPLUS:
! 2794: possessive = TRUE;
! 2795: min = 1;
! 2796: max = INT_MAX;
! 2797: ecode++;
! 2798: goto REPEATTYPE;
! 2799:
! 2800: case OP_TYPEPOSQUERY:
! 2801: possessive = TRUE;
! 2802: min = 0;
! 2803: max = 1;
! 2804: ecode++;
! 2805: goto REPEATTYPE;
! 2806:
! 2807: case OP_TYPEPOSUPTO:
! 2808: possessive = TRUE;
! 2809: min = 0;
! 2810: max = GET2(ecode, 1);
! 2811: ecode += 3;
! 2812: goto REPEATTYPE;
! 2813:
! 2814: case OP_TYPESTAR:
! 2815: case OP_TYPEMINSTAR:
! 2816: case OP_TYPEPLUS:
! 2817: case OP_TYPEMINPLUS:
! 2818: case OP_TYPEQUERY:
! 2819: case OP_TYPEMINQUERY:
! 2820: c = *ecode++ - OP_TYPESTAR;
! 2821: minimize = (c & 1) != 0;
! 2822: min = rep_min[c]; /* Pick up values from tables; */
! 2823: max = rep_max[c]; /* zero for max => infinity */
! 2824: if (max == 0) max = INT_MAX;
! 2825:
! 2826: /* Common code for all repeated single character type matches. Note that
! 2827: in UTF-8 mode, '.' matches a character of any length, but for the other
! 2828: character types, the valid characters are all one-byte long. */
! 2829:
! 2830: REPEATTYPE:
! 2831: ctype = *ecode++; /* Code for the character type */
! 2832:
! 2833: #ifdef SUPPORT_UCP
! 2834: if (ctype == OP_PROP || ctype == OP_NOTPROP)
! 2835: {
! 2836: prop_fail_result = ctype == OP_NOTPROP;
! 2837: prop_type = *ecode++;
! 2838: prop_value = *ecode++;
! 2839: }
! 2840: else prop_type = -1;
! 2841: #endif
! 2842:
! 2843: /* First, ensure the minimum number of matches are present. Use inline
! 2844: code for maximizing the speed, and do the type test once at the start
! 2845: (i.e. keep it out of the loop). Also we can test that there are at least
! 2846: the minimum number of bytes before we start. This isn't as effective in
! 2847: UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
! 2848: is tidier. Also separate the UCP code, which can be the same for both UTF-8
! 2849: and single-bytes. */
! 2850:
! 2851: if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
! 2852: if (min > 0)
! 2853: {
! 2854: #ifdef SUPPORT_UCP
! 2855: if (prop_type >= 0)
! 2856: {
! 2857: switch(prop_type)
! 2858: {
! 2859: case PT_ANY:
! 2860: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
! 2861: for (i = 1; i <= min; i++)
! 2862: {
! 2863: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 2864: GETCHARINCTEST(c, eptr);
! 2865: }
! 2866: break;
! 2867:
! 2868: case PT_LAMP:
! 2869: for (i = 1; i <= min; i++)
! 2870: {
! 2871: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 2872: GETCHARINCTEST(c, eptr);
! 2873: prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
! 2874: if ((prop_chartype == ucp_Lu ||
! 2875: prop_chartype == ucp_Ll ||
! 2876: prop_chartype == ucp_Lt) == prop_fail_result)
! 2877: RRETURN(MATCH_NOMATCH);
! 2878: }
! 2879: break;
! 2880:
! 2881: case PT_GC:
! 2882: for (i = 1; i <= min; i++)
! 2883: {
! 2884: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 2885: GETCHARINCTEST(c, eptr);
! 2886: prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
! 2887: if ((prop_category == prop_value) == prop_fail_result)
! 2888: RRETURN(MATCH_NOMATCH);
! 2889: }
! 2890: break;
! 2891:
! 2892: case PT_PC:
! 2893: for (i = 1; i <= min; i++)
! 2894: {
! 2895: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 2896: GETCHARINCTEST(c, eptr);
! 2897: prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
! 2898: if ((prop_chartype == prop_value) == prop_fail_result)
! 2899: RRETURN(MATCH_NOMATCH);
! 2900: }
! 2901: break;
! 2902:
! 2903: case PT_SC:
! 2904: for (i = 1; i <= min; i++)
! 2905: {
! 2906: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 2907: GETCHARINCTEST(c, eptr);
! 2908: prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
! 2909: if ((prop_script == prop_value) == prop_fail_result)
! 2910: RRETURN(MATCH_NOMATCH);
! 2911: }
! 2912: break;
! 2913:
! 2914: default:
! 2915: RRETURN(PCRE_ERROR_INTERNAL);
! 2916: }
! 2917: }
! 2918:
! 2919: /* Match extended Unicode sequences. We will get here only if the
! 2920: support is in the binary; otherwise a compile-time error occurs. */
! 2921:
! 2922: else if (ctype == OP_EXTUNI)
! 2923: {
! 2924: for (i = 1; i <= min; i++)
! 2925: {
! 2926: GETCHARINCTEST(c, eptr);
! 2927: prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
! 2928: if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
! 2929: while (eptr < md->end_subject)
! 2930: {
! 2931: int len = 1;
! 2932: if (!utf8) c = *eptr; else
! 2933: {
! 2934: GETCHARLEN(c, eptr, len);
! 2935: }
! 2936: prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
! 2937: if (prop_category != ucp_M) break;
! 2938: eptr += len;
! 2939: }
! 2940: }
! 2941: }
! 2942:
! 2943: else
! 2944: #endif /* SUPPORT_UCP */
! 2945:
! 2946: /* Handle all other cases when the coding is UTF-8 */
! 2947:
! 2948: #ifdef SUPPORT_UTF8
! 2949: if (utf8) switch(ctype)
! 2950: {
! 2951: case OP_ANY:
! 2952: for (i = 1; i <= min; i++)
! 2953: {
! 2954: if (eptr >= md->end_subject || IS_NEWLINE(eptr))
! 2955: RRETURN(MATCH_NOMATCH);
! 2956: eptr++;
! 2957: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
! 2958: }
! 2959: break;
! 2960:
! 2961: case OP_ALLANY:
! 2962: for (i = 1; i <= min; i++)
! 2963: {
! 2964: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 2965: eptr++;
! 2966: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
! 2967: }
! 2968: break;
! 2969:
! 2970: case OP_ANYBYTE:
! 2971: eptr += min;
! 2972: break;
! 2973:
! 2974: case OP_ANYNL:
! 2975: for (i = 1; i <= min; i++)
! 2976: {
! 2977: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 2978: GETCHARINC(c, eptr);
! 2979: switch(c)
! 2980: {
! 2981: default: RRETURN(MATCH_NOMATCH);
! 2982: case 0x000d:
! 2983: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
! 2984: break;
! 2985:
! 2986: case 0x000a:
! 2987: break;
! 2988:
! 2989: case 0x000b:
! 2990: case 0x000c:
! 2991: case 0x0085:
! 2992: case 0x2028:
! 2993: case 0x2029:
! 2994: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
! 2995: break;
! 2996: }
! 2997: }
! 2998: break;
! 2999:
! 3000: case OP_NOT_HSPACE:
! 3001: for (i = 1; i <= min; i++)
! 3002: {
! 3003: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 3004: GETCHARINC(c, eptr);
! 3005: switch(c)
! 3006: {
! 3007: default: break;
! 3008: case 0x09: /* HT */
! 3009: case 0x20: /* SPACE */
! 3010: case 0xa0: /* NBSP */
! 3011: case 0x1680: /* OGHAM SPACE MARK */
! 3012: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
! 3013: case 0x2000: /* EN QUAD */
! 3014: case 0x2001: /* EM QUAD */
! 3015: case 0x2002: /* EN SPACE */
! 3016: case 0x2003: /* EM SPACE */
! 3017: case 0x2004: /* THREE-PER-EM SPACE */
! 3018: case 0x2005: /* FOUR-PER-EM SPACE */
! 3019: case 0x2006: /* SIX-PER-EM SPACE */
! 3020: case 0x2007: /* FIGURE SPACE */
! 3021: case 0x2008: /* PUNCTUATION SPACE */
! 3022: case 0x2009: /* THIN SPACE */
! 3023: case 0x200A: /* HAIR SPACE */
! 3024: case 0x202f: /* NARROW NO-BREAK SPACE */
! 3025: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
! 3026: case 0x3000: /* IDEOGRAPHIC SPACE */
! 3027: RRETURN(MATCH_NOMATCH);
! 3028: }
! 3029: }
! 3030: break;
! 3031:
! 3032: case OP_HSPACE:
! 3033: for (i = 1; i <= min; i++)
! 3034: {
! 3035: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 3036: GETCHARINC(c, eptr);
! 3037: switch(c)
! 3038: {
! 3039: default: RRETURN(MATCH_NOMATCH);
! 3040: case 0x09: /* HT */
! 3041: case 0x20: /* SPACE */
! 3042: case 0xa0: /* NBSP */
! 3043: case 0x1680: /* OGHAM SPACE MARK */
! 3044: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
! 3045: case 0x2000: /* EN QUAD */
! 3046: case 0x2001: /* EM QUAD */
! 3047: case 0x2002: /* EN SPACE */
! 3048: case 0x2003: /* EM SPACE */
! 3049: case 0x2004: /* THREE-PER-EM SPACE */
! 3050: case 0x2005: /* FOUR-PER-EM SPACE */
! 3051: case 0x2006: /* SIX-PER-EM SPACE */
! 3052: case 0x2007: /* FIGURE SPACE */
! 3053: case 0x2008: /* PUNCTUATION SPACE */
! 3054: case 0x2009: /* THIN SPACE */
! 3055: case 0x200A: /* HAIR SPACE */
! 3056: case 0x202f: /* NARROW NO-BREAK SPACE */
! 3057: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
! 3058: case 0x3000: /* IDEOGRAPHIC SPACE */
! 3059: break;
! 3060: }
! 3061: }
! 3062: break;
! 3063:
! 3064: case OP_NOT_VSPACE:
! 3065: for (i = 1; i <= min; i++)
! 3066: {
! 3067: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 3068: GETCHARINC(c, eptr);
! 3069: switch(c)
! 3070: {
! 3071: default: break;
! 3072: case 0x0a: /* LF */
! 3073: case 0x0b: /* VT */
! 3074: case 0x0c: /* FF */
! 3075: case 0x0d: /* CR */
! 3076: case 0x85: /* NEL */
! 3077: case 0x2028: /* LINE SEPARATOR */
! 3078: case 0x2029: /* PARAGRAPH SEPARATOR */
! 3079: RRETURN(MATCH_NOMATCH);
! 3080: }
! 3081: }
! 3082: break;
! 3083:
! 3084: case OP_VSPACE:
! 3085: for (i = 1; i <= min; i++)
! 3086: {
! 3087: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 3088: GETCHARINC(c, eptr);
! 3089: switch(c)
! 3090: {
! 3091: default: RRETURN(MATCH_NOMATCH);
! 3092: case 0x0a: /* LF */
! 3093: case 0x0b: /* VT */
! 3094: case 0x0c: /* FF */
! 3095: case 0x0d: /* CR */
! 3096: case 0x85: /* NEL */
! 3097: case 0x2028: /* LINE SEPARATOR */
! 3098: case 0x2029: /* PARAGRAPH SEPARATOR */
! 3099: break;
! 3100: }
! 3101: }
! 3102: break;
! 3103:
! 3104: case OP_NOT_DIGIT:
! 3105: for (i = 1; i <= min; i++)
! 3106: {
! 3107: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 3108: GETCHARINC(c, eptr);
! 3109: if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
! 3110: RRETURN(MATCH_NOMATCH);
! 3111: }
! 3112: break;
! 3113:
! 3114: case OP_DIGIT:
! 3115: for (i = 1; i <= min; i++)
! 3116: {
! 3117: if (eptr >= md->end_subject ||
! 3118: *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
! 3119: RRETURN(MATCH_NOMATCH);
! 3120: /* No need to skip more bytes - we know it's a 1-byte character */
! 3121: }
! 3122: break;
! 3123:
! 3124: case OP_NOT_WHITESPACE:
! 3125: for (i = 1; i <= min; i++)
! 3126: {
! 3127: if (eptr >= md->end_subject ||
! 3128: (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
! 3129: RRETURN(MATCH_NOMATCH);
! 3130: while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
! 3131: }
! 3132: break;
! 3133:
! 3134: case OP_WHITESPACE:
! 3135: for (i = 1; i <= min; i++)
! 3136: {
! 3137: if (eptr >= md->end_subject ||
! 3138: *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
! 3139: RRETURN(MATCH_NOMATCH);
! 3140: /* No need to skip more bytes - we know it's a 1-byte character */
! 3141: }
! 3142: break;
! 3143:
! 3144: case OP_NOT_WORDCHAR:
! 3145: for (i = 1; i <= min; i++)
! 3146: {
! 3147: if (eptr >= md->end_subject ||
! 3148: (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
! 3149: RRETURN(MATCH_NOMATCH);
! 3150: while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
! 3151: }
! 3152: break;
! 3153:
! 3154: case OP_WORDCHAR:
! 3155: for (i = 1; i <= min; i++)
! 3156: {
! 3157: if (eptr >= md->end_subject ||
! 3158: *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
! 3159: RRETURN(MATCH_NOMATCH);
! 3160: /* No need to skip more bytes - we know it's a 1-byte character */
! 3161: }
! 3162: break;
! 3163:
! 3164: default:
! 3165: RRETURN(PCRE_ERROR_INTERNAL);
! 3166: } /* End switch(ctype) */
! 3167:
! 3168: else
! 3169: #endif /* SUPPORT_UTF8 */
! 3170:
! 3171: /* Code for the non-UTF-8 case for minimum matching of operators other
! 3172: than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
! 3173: number of bytes present, as this was tested above. */
! 3174:
! 3175: switch(ctype)
! 3176: {
! 3177: case OP_ANY:
! 3178: for (i = 1; i <= min; i++)
! 3179: {
! 3180: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
! 3181: eptr++;
! 3182: }
! 3183: break;
! 3184:
! 3185: case OP_ALLANY:
! 3186: eptr += min;
! 3187: break;
! 3188:
! 3189: case OP_ANYBYTE:
! 3190: eptr += min;
! 3191: break;
! 3192:
! 3193: /* Because of the CRLF case, we can't assume the minimum number of
! 3194: bytes are present in this case. */
! 3195:
! 3196: case OP_ANYNL:
! 3197: for (i = 1; i <= min; i++)
! 3198: {
! 3199: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 3200: switch(*eptr++)
! 3201: {
! 3202: default: RRETURN(MATCH_NOMATCH);
! 3203: case 0x000d:
! 3204: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
! 3205: break;
! 3206: case 0x000a:
! 3207: break;
! 3208:
! 3209: case 0x000b:
! 3210: case 0x000c:
! 3211: case 0x0085:
! 3212: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
! 3213: break;
! 3214: }
! 3215: }
! 3216: break;
! 3217:
! 3218: case OP_NOT_HSPACE:
! 3219: for (i = 1; i <= min; i++)
! 3220: {
! 3221: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 3222: switch(*eptr++)
! 3223: {
! 3224: default: break;
! 3225: case 0x09: /* HT */
! 3226: case 0x20: /* SPACE */
! 3227: case 0xa0: /* NBSP */
! 3228: RRETURN(MATCH_NOMATCH);
! 3229: }
! 3230: }
! 3231: break;
! 3232:
! 3233: case OP_HSPACE:
! 3234: for (i = 1; i <= min; i++)
! 3235: {
! 3236: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 3237: switch(*eptr++)
! 3238: {
! 3239: default: RRETURN(MATCH_NOMATCH);
! 3240: case 0x09: /* HT */
! 3241: case 0x20: /* SPACE */
! 3242: case 0xa0: /* NBSP */
! 3243: break;
! 3244: }
! 3245: }
! 3246: break;
! 3247:
! 3248: case OP_NOT_VSPACE:
! 3249: for (i = 1; i <= min; i++)
! 3250: {
! 3251: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 3252: switch(*eptr++)
! 3253: {
! 3254: default: break;
! 3255: case 0x0a: /* LF */
! 3256: case 0x0b: /* VT */
! 3257: case 0x0c: /* FF */
! 3258: case 0x0d: /* CR */
! 3259: case 0x85: /* NEL */
! 3260: RRETURN(MATCH_NOMATCH);
! 3261: }
! 3262: }
! 3263: break;
! 3264:
! 3265: case OP_VSPACE:
! 3266: for (i = 1; i <= min; i++)
! 3267: {
! 3268: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 3269: switch(*eptr++)
! 3270: {
! 3271: default: RRETURN(MATCH_NOMATCH);
! 3272: case 0x0a: /* LF */
! 3273: case 0x0b: /* VT */
! 3274: case 0x0c: /* FF */
! 3275: case 0x0d: /* CR */
! 3276: case 0x85: /* NEL */
! 3277: break;
! 3278: }
! 3279: }
! 3280: break;
! 3281:
! 3282: case OP_NOT_DIGIT:
! 3283: for (i = 1; i <= min; i++)
! 3284: if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
! 3285: break;
! 3286:
! 3287: case OP_DIGIT:
! 3288: for (i = 1; i <= min; i++)
! 3289: if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
! 3290: break;
! 3291:
! 3292: case OP_NOT_WHITESPACE:
! 3293: for (i = 1; i <= min; i++)
! 3294: if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
! 3295: break;
! 3296:
! 3297: case OP_WHITESPACE:
! 3298: for (i = 1; i <= min; i++)
! 3299: if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
! 3300: break;
! 3301:
! 3302: case OP_NOT_WORDCHAR:
! 3303: for (i = 1; i <= min; i++)
! 3304: if ((md->ctypes[*eptr++] & ctype_word) != 0)
! 3305: RRETURN(MATCH_NOMATCH);
! 3306: break;
! 3307:
! 3308: case OP_WORDCHAR:
! 3309: for (i = 1; i <= min; i++)
! 3310: if ((md->ctypes[*eptr++] & ctype_word) == 0)
! 3311: RRETURN(MATCH_NOMATCH);
! 3312: break;
! 3313:
! 3314: default:
! 3315: RRETURN(PCRE_ERROR_INTERNAL);
! 3316: }
! 3317: }
! 3318:
! 3319: /* If min = max, continue at the same level without recursing */
! 3320:
! 3321: if (min == max) continue;
! 3322:
! 3323: /* If minimizing, we have to test the rest of the pattern before each
! 3324: subsequent match. Again, separate the UTF-8 case for speed, and also
! 3325: separate the UCP cases. */
! 3326:
! 3327: if (minimize)
! 3328: {
! 3329: #ifdef SUPPORT_UCP
! 3330: if (prop_type >= 0)
! 3331: {
! 3332: switch(prop_type)
! 3333: {
! 3334: case PT_ANY:
! 3335: for (fi = min;; fi++)
! 3336: {
! 3337: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
! 3338: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 3339: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 3340: GETCHARINC(c, eptr);
! 3341: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
! 3342: }
! 3343: /* Control never gets here */
! 3344:
! 3345: case PT_LAMP:
! 3346: for (fi = min;; fi++)
! 3347: {
! 3348: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
! 3349: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 3350: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 3351: GETCHARINC(c, eptr);
! 3352: prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
! 3353: if ((prop_chartype == ucp_Lu ||
! 3354: prop_chartype == ucp_Ll ||
! 3355: prop_chartype == ucp_Lt) == prop_fail_result)
! 3356: RRETURN(MATCH_NOMATCH);
! 3357: }
! 3358: /* Control never gets here */
! 3359:
! 3360: case PT_GC:
! 3361: for (fi = min;; fi++)
! 3362: {
! 3363: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
! 3364: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 3365: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 3366: GETCHARINC(c, eptr);
! 3367: prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
! 3368: if ((prop_category == prop_value) == prop_fail_result)
! 3369: RRETURN(MATCH_NOMATCH);
! 3370: }
! 3371: /* Control never gets here */
! 3372:
! 3373: case PT_PC:
! 3374: for (fi = min;; fi++)
! 3375: {
! 3376: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
! 3377: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 3378: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 3379: GETCHARINC(c, eptr);
! 3380: prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
! 3381: if ((prop_chartype == prop_value) == prop_fail_result)
! 3382: RRETURN(MATCH_NOMATCH);
! 3383: }
! 3384: /* Control never gets here */
! 3385:
! 3386: case PT_SC:
! 3387: for (fi = min;; fi++)
! 3388: {
! 3389: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
! 3390: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 3391: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 3392: GETCHARINC(c, eptr);
! 3393: prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
! 3394: if ((prop_script == prop_value) == prop_fail_result)
! 3395: RRETURN(MATCH_NOMATCH);
! 3396: }
! 3397: /* Control never gets here */
! 3398:
! 3399: default:
! 3400: RRETURN(PCRE_ERROR_INTERNAL);
! 3401: }
! 3402: }
! 3403:
! 3404: /* Match extended Unicode sequences. We will get here only if the
! 3405: support is in the binary; otherwise a compile-time error occurs. */
! 3406:
! 3407: else if (ctype == OP_EXTUNI)
! 3408: {
! 3409: for (fi = min;; fi++)
! 3410: {
! 3411: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
! 3412: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 3413: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
! 3414: GETCHARINCTEST(c, eptr);
! 3415: prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
! 3416: if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
! 3417: while (eptr < md->end_subject)
! 3418: {
! 3419: int len = 1;
! 3420: if (!utf8) c = *eptr; else
! 3421: {
! 3422: GETCHARLEN(c, eptr, len);
! 3423: }
! 3424: prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
! 3425: if (prop_category != ucp_M) break;
! 3426: eptr += len;
! 3427: }
! 3428: }
! 3429: }
! 3430:
! 3431: else
! 3432: #endif /* SUPPORT_UCP */
! 3433:
! 3434: #ifdef SUPPORT_UTF8
! 3435: /* UTF-8 mode */
! 3436: if (utf8)
! 3437: {
! 3438: for (fi = min;; fi++)
! 3439: {
! 3440: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
! 3441: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 3442: if (fi >= max || eptr >= md->end_subject ||
! 3443: (ctype == OP_ANY && IS_NEWLINE(eptr)))
! 3444: RRETURN(MATCH_NOMATCH);
! 3445:
! 3446: GETCHARINC(c, eptr);
! 3447: switch(ctype)
! 3448: {
! 3449: case OP_ANY: /* This is the non-NL case */
! 3450: case OP_ALLANY:
! 3451: case OP_ANYBYTE:
! 3452: break;
! 3453:
! 3454: case OP_ANYNL:
! 3455: switch(c)
! 3456: {
! 3457: default: RRETURN(MATCH_NOMATCH);
! 3458: case 0x000d:
! 3459: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
! 3460: break;
! 3461: case 0x000a:
! 3462: break;
! 3463:
! 3464: case 0x000b:
! 3465: case 0x000c:
! 3466: case 0x0085:
! 3467: case 0x2028:
! 3468: case 0x2029:
! 3469: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
! 3470: break;
! 3471: }
! 3472: break;
! 3473:
! 3474: case OP_NOT_HSPACE:
! 3475: switch(c)
! 3476: {
! 3477: default: break;
! 3478: case 0x09: /* HT */
! 3479: case 0x20: /* SPACE */
! 3480: case 0xa0: /* NBSP */
! 3481: case 0x1680: /* OGHAM SPACE MARK */
! 3482: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
! 3483: case 0x2000: /* EN QUAD */
! 3484: case 0x2001: /* EM QUAD */
! 3485: case 0x2002: /* EN SPACE */
! 3486: case 0x2003: /* EM SPACE */
! 3487: case 0x2004: /* THREE-PER-EM SPACE */
! 3488: case 0x2005: /* FOUR-PER-EM SPACE */
! 3489: case 0x2006: /* SIX-PER-EM SPACE */
! 3490: case 0x2007: /* FIGURE SPACE */
! 3491: case 0x2008: /* PUNCTUATION SPACE */
! 3492: case 0x2009: /* THIN SPACE */
! 3493: case 0x200A: /* HAIR SPACE */
! 3494: case 0x202f: /* NARROW NO-BREAK SPACE */
! 3495: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
! 3496: case 0x3000: /* IDEOGRAPHIC SPACE */
! 3497: RRETURN(MATCH_NOMATCH);
! 3498: }
! 3499: break;
! 3500:
! 3501: case OP_HSPACE:
! 3502: switch(c)
! 3503: {
! 3504: default: RRETURN(MATCH_NOMATCH);
! 3505: case 0x09: /* HT */
! 3506: case 0x20: /* SPACE */
! 3507: case 0xa0: /* NBSP */
! 3508: case 0x1680: /* OGHAM SPACE MARK */
! 3509: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
! 3510: case 0x2000: /* EN QUAD */
! 3511: case 0x2001: /* EM QUAD */
! 3512: case 0x2002: /* EN SPACE */
! 3513: case 0x2003: /* EM SPACE */
! 3514: case 0x2004: /* THREE-PER-EM SPACE */
! 3515: case 0x2005: /* FOUR-PER-EM SPACE */
! 3516: case 0x2006: /* SIX-PER-EM SPACE */
! 3517: case 0x2007: /* FIGURE SPACE */
! 3518: case 0x2008: /* PUNCTUATION SPACE */
! 3519: case 0x2009: /* THIN SPACE */
! 3520: case 0x200A: /* HAIR SPACE */
! 3521: case 0x202f: /* NARROW NO-BREAK SPACE */
! 3522: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
! 3523: case 0x3000: /* IDEOGRAPHIC SPACE */
! 3524: break;
! 3525: }
! 3526: break;
! 3527:
! 3528: case OP_NOT_VSPACE:
! 3529: switch(c)
! 3530: {
! 3531: default: break;
! 3532: case 0x0a: /* LF */
! 3533: case 0x0b: /* VT */
! 3534: case 0x0c: /* FF */
! 3535: case 0x0d: /* CR */
! 3536: case 0x85: /* NEL */
! 3537: case 0x2028: /* LINE SEPARATOR */
! 3538: case 0x2029: /* PARAGRAPH SEPARATOR */
! 3539: RRETURN(MATCH_NOMATCH);
! 3540: }
! 3541: break;
! 3542:
! 3543: case OP_VSPACE:
! 3544: switch(c)
! 3545: {
! 3546: default: RRETURN(MATCH_NOMATCH);
! 3547: case 0x0a: /* LF */
! 3548: case 0x0b: /* VT */
! 3549: case 0x0c: /* FF */
! 3550: case 0x0d: /* CR */
! 3551: case 0x85: /* NEL */
! 3552: case 0x2028: /* LINE SEPARATOR */
! 3553: case 0x2029: /* PARAGRAPH SEPARATOR */
! 3554: break;
! 3555: }
! 3556: break;
! 3557:
! 3558: case OP_NOT_DIGIT:
! 3559: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
! 3560: RRETURN(MATCH_NOMATCH);
! 3561: break;
! 3562:
! 3563: case OP_DIGIT:
! 3564: if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
! 3565: RRETURN(MATCH_NOMATCH);
! 3566: break;
! 3567:
! 3568: case OP_NOT_WHITESPACE:
! 3569: if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
! 3570: RRETURN(MATCH_NOMATCH);
! 3571: break;
! 3572:
! 3573: case OP_WHITESPACE:
! 3574: if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
! 3575: RRETURN(MATCH_NOMATCH);
! 3576: break;
! 3577:
! 3578: case OP_NOT_WORDCHAR:
! 3579: if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
! 3580: RRETURN(MATCH_NOMATCH);
! 3581: break;
! 3582:
! 3583: case OP_WORDCHAR:
! 3584: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
! 3585: RRETURN(MATCH_NOMATCH);
! 3586: break;
! 3587:
! 3588: default:
! 3589: RRETURN(PCRE_ERROR_INTERNAL);
! 3590: }
! 3591: }
! 3592: }
! 3593: else
! 3594: #endif
! 3595: /* Not UTF-8 mode */
! 3596: {
! 3597: for (fi = min;; fi++)
! 3598: {
! 3599: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
! 3600: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 3601: if (fi >= max || eptr >= md->end_subject ||
! 3602: (ctype == OP_ANY && IS_NEWLINE(eptr)))
! 3603: RRETURN(MATCH_NOMATCH);
! 3604:
! 3605: c = *eptr++;
! 3606: switch(ctype)
! 3607: {
! 3608: case OP_ANY: /* This is the non-NL case */
! 3609: case OP_ALLANY:
! 3610: case OP_ANYBYTE:
! 3611: break;
! 3612:
! 3613: case OP_ANYNL:
! 3614: switch(c)
! 3615: {
! 3616: default: RRETURN(MATCH_NOMATCH);
! 3617: case 0x000d:
! 3618: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
! 3619: break;
! 3620:
! 3621: case 0x000a:
! 3622: break;
! 3623:
! 3624: case 0x000b:
! 3625: case 0x000c:
! 3626: case 0x0085:
! 3627: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
! 3628: break;
! 3629: }
! 3630: break;
! 3631:
! 3632: case OP_NOT_HSPACE:
! 3633: switch(c)
! 3634: {
! 3635: default: break;
! 3636: case 0x09: /* HT */
! 3637: case 0x20: /* SPACE */
! 3638: case 0xa0: /* NBSP */
! 3639: RRETURN(MATCH_NOMATCH);
! 3640: }
! 3641: break;
! 3642:
! 3643: case OP_HSPACE:
! 3644: switch(c)
! 3645: {
! 3646: default: RRETURN(MATCH_NOMATCH);
! 3647: case 0x09: /* HT */
! 3648: case 0x20: /* SPACE */
! 3649: case 0xa0: /* NBSP */
! 3650: break;
! 3651: }
! 3652: break;
! 3653:
! 3654: case OP_NOT_VSPACE:
! 3655: switch(c)
! 3656: {
! 3657: default: break;
! 3658: case 0x0a: /* LF */
! 3659: case 0x0b: /* VT */
! 3660: case 0x0c: /* FF */
! 3661: case 0x0d: /* CR */
! 3662: case 0x85: /* NEL */
! 3663: RRETURN(MATCH_NOMATCH);
! 3664: }
! 3665: break;
! 3666:
! 3667: case OP_VSPACE:
! 3668: switch(c)
! 3669: {
! 3670: default: RRETURN(MATCH_NOMATCH);
! 3671: case 0x0a: /* LF */
! 3672: case 0x0b: /* VT */
! 3673: case 0x0c: /* FF */
! 3674: case 0x0d: /* CR */
! 3675: case 0x85: /* NEL */
! 3676: break;
! 3677: }
! 3678: break;
! 3679:
! 3680: case OP_NOT_DIGIT:
! 3681: if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
! 3682: break;
! 3683:
! 3684: case OP_DIGIT:
! 3685: if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
! 3686: break;
! 3687:
! 3688: case OP_NOT_WHITESPACE:
! 3689: if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
! 3690: break;
! 3691:
! 3692: case OP_WHITESPACE:
! 3693: if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
! 3694: break;
! 3695:
! 3696: case OP_NOT_WORDCHAR:
! 3697: if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
! 3698: break;
! 3699:
! 3700: case OP_WORDCHAR:
! 3701: if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
! 3702: break;
! 3703:
! 3704: default:
! 3705: RRETURN(PCRE_ERROR_INTERNAL);
! 3706: }
! 3707: }
! 3708: }
! 3709: /* Control never gets here */
! 3710: }
! 3711:
! 3712: /* If maximizing, it is worth using inline code for speed, doing the type
! 3713: test once at the start (i.e. keep it out of the loop). Again, keep the
! 3714: UTF-8 and UCP stuff separate. */
! 3715:
! 3716: else
! 3717: {
! 3718: pp = eptr; /* Remember where we started */
! 3719:
! 3720: #ifdef SUPPORT_UCP
! 3721: if (prop_type >= 0)
! 3722: {
! 3723: switch(prop_type)
! 3724: {
! 3725: case PT_ANY:
! 3726: for (i = min; i < max; i++)
! 3727: {
! 3728: int len = 1;
! 3729: if (eptr >= md->end_subject) break;
! 3730: GETCHARLEN(c, eptr, len);
! 3731: if (prop_fail_result) break;
! 3732: eptr+= len;
! 3733: }
! 3734: break;
! 3735:
! 3736: case PT_LAMP:
! 3737: for (i = min; i < max; i++)
! 3738: {
! 3739: int len = 1;
! 3740: if (eptr >= md->end_subject) break;
! 3741: GETCHARLEN(c, eptr, len);
! 3742: prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
! 3743: if ((prop_chartype == ucp_Lu ||
! 3744: prop_chartype == ucp_Ll ||
! 3745: prop_chartype == ucp_Lt) == prop_fail_result)
! 3746: break;
! 3747: eptr+= len;
! 3748: }
! 3749: break;
! 3750:
! 3751: case PT_GC:
! 3752: for (i = min; i < max; i++)
! 3753: {
! 3754: int len = 1;
! 3755: if (eptr >= md->end_subject) break;
! 3756: GETCHARLEN(c, eptr, len);
! 3757: prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
! 3758: if ((prop_category == prop_value) == prop_fail_result)
! 3759: break;
! 3760: eptr+= len;
! 3761: }
! 3762: break;
! 3763:
! 3764: case PT_PC:
! 3765: for (i = min; i < max; i++)
! 3766: {
! 3767: int len = 1;
! 3768: if (eptr >= md->end_subject) break;
! 3769: GETCHARLEN(c, eptr, len);
! 3770: prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
! 3771: if ((prop_chartype == prop_value) == prop_fail_result)
! 3772: break;
! 3773: eptr+= len;
! 3774: }
! 3775: break;
! 3776:
! 3777: case PT_SC:
! 3778: for (i = min; i < max; i++)
! 3779: {
! 3780: int len = 1;
! 3781: if (eptr >= md->end_subject) break;
! 3782: GETCHARLEN(c, eptr, len);
! 3783: prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
! 3784: if ((prop_script == prop_value) == prop_fail_result)
! 3785: break;
! 3786: eptr+= len;
! 3787: }
! 3788: break;
! 3789: }
! 3790:
! 3791: /* eptr is now past the end of the maximum run */
! 3792:
! 3793: if (possessive) continue;
! 3794: for(;;)
! 3795: {
! 3796: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
! 3797: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 3798: if (eptr-- == pp) break; /* Stop if tried at original pos */
! 3799: if (utf8) BACKCHAR(eptr);
! 3800: }
! 3801: }
! 3802:
! 3803: /* Match extended Unicode sequences. We will get here only if the
! 3804: support is in the binary; otherwise a compile-time error occurs. */
! 3805:
! 3806: else if (ctype == OP_EXTUNI)
! 3807: {
! 3808: for (i = min; i < max; i++)
! 3809: {
! 3810: if (eptr >= md->end_subject) break;
! 3811: GETCHARINCTEST(c, eptr);
! 3812: prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
! 3813: if (prop_category == ucp_M) break;
! 3814: while (eptr < md->end_subject)
! 3815: {
! 3816: int len = 1;
! 3817: if (!utf8) c = *eptr; else
! 3818: {
! 3819: GETCHARLEN(c, eptr, len);
! 3820: }
! 3821: prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
! 3822: if (prop_category != ucp_M) break;
! 3823: eptr += len;
! 3824: }
! 3825: }
! 3826:
! 3827: /* eptr is now past the end of the maximum run */
! 3828:
! 3829: if (possessive) continue;
! 3830: for(;;)
! 3831: {
! 3832: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
! 3833: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 3834: if (eptr-- == pp) break; /* Stop if tried at original pos */
! 3835: for (;;) /* Move back over one extended */
! 3836: {
! 3837: int len = 1;
! 3838: if (!utf8) c = *eptr; else
! 3839: {
! 3840: BACKCHAR(eptr);
! 3841: GETCHARLEN(c, eptr, len);
! 3842: }
! 3843: prop_category = _pcre_ucp_findprop(c, &prop_chartype, &prop_script);
! 3844: if (prop_category != ucp_M) break;
! 3845: eptr--;
! 3846: }
! 3847: }
! 3848: }
! 3849:
! 3850: else
! 3851: #endif /* SUPPORT_UCP */
! 3852:
! 3853: #ifdef SUPPORT_UTF8
! 3854: /* UTF-8 mode */
! 3855:
! 3856: if (utf8)
! 3857: {
! 3858: switch(ctype)
! 3859: {
! 3860: case OP_ANY:
! 3861: if (max < INT_MAX)
! 3862: {
! 3863: for (i = min; i < max; i++)
! 3864: {
! 3865: if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
! 3866: eptr++;
! 3867: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
! 3868: }
! 3869: }
! 3870:
! 3871: /* Handle unlimited UTF-8 repeat */
! 3872:
! 3873: else
! 3874: {
! 3875: for (i = min; i < max; i++)
! 3876: {
! 3877: if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
! 3878: eptr++;
! 3879: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
! 3880: }
! 3881: }
! 3882: break;
! 3883:
! 3884: case OP_ALLANY:
! 3885: if (max < INT_MAX)
! 3886: {
! 3887: for (i = min; i < max; i++)
! 3888: {
! 3889: if (eptr >= md->end_subject) break;
! 3890: eptr++;
! 3891: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
! 3892: }
! 3893: }
! 3894: else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
! 3895: break;
! 3896:
! 3897: /* The byte case is the same as non-UTF8 */
! 3898:
! 3899: case OP_ANYBYTE:
! 3900: c = max - min;
! 3901: if (c > (unsigned int)(md->end_subject - eptr))
! 3902: c = md->end_subject - eptr;
! 3903: eptr += c;
! 3904: break;
! 3905:
! 3906: case OP_ANYNL:
! 3907: for (i = min; i < max; i++)
! 3908: {
! 3909: int len = 1;
! 3910: if (eptr >= md->end_subject) break;
! 3911: GETCHARLEN(c, eptr, len);
! 3912: if (c == 0x000d)
! 3913: {
! 3914: if (++eptr >= md->end_subject) break;
! 3915: if (*eptr == 0x000a) eptr++;
! 3916: }
! 3917: else
! 3918: {
! 3919: if (c != 0x000a &&
! 3920: (md->bsr_anycrlf ||
! 3921: (c != 0x000b && c != 0x000c &&
! 3922: c != 0x0085 && c != 0x2028 && c != 0x2029)))
! 3923: break;
! 3924: eptr += len;
! 3925: }
! 3926: }
! 3927: break;
! 3928:
! 3929: case OP_NOT_HSPACE:
! 3930: case OP_HSPACE:
! 3931: for (i = min; i < max; i++)
! 3932: {
! 3933: BOOL gotspace;
! 3934: int len = 1;
! 3935: if (eptr >= md->end_subject) break;
! 3936: GETCHARLEN(c, eptr, len);
! 3937: switch(c)
! 3938: {
! 3939: default: gotspace = FALSE; break;
! 3940: case 0x09: /* HT */
! 3941: case 0x20: /* SPACE */
! 3942: case 0xa0: /* NBSP */
! 3943: case 0x1680: /* OGHAM SPACE MARK */
! 3944: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
! 3945: case 0x2000: /* EN QUAD */
! 3946: case 0x2001: /* EM QUAD */
! 3947: case 0x2002: /* EN SPACE */
! 3948: case 0x2003: /* EM SPACE */
! 3949: case 0x2004: /* THREE-PER-EM SPACE */
! 3950: case 0x2005: /* FOUR-PER-EM SPACE */
! 3951: case 0x2006: /* SIX-PER-EM SPACE */
! 3952: case 0x2007: /* FIGURE SPACE */
! 3953: case 0x2008: /* PUNCTUATION SPACE */
! 3954: case 0x2009: /* THIN SPACE */
! 3955: case 0x200A: /* HAIR SPACE */
! 3956: case 0x202f: /* NARROW NO-BREAK SPACE */
! 3957: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
! 3958: case 0x3000: /* IDEOGRAPHIC SPACE */
! 3959: gotspace = TRUE;
! 3960: break;
! 3961: }
! 3962: if (gotspace == (ctype == OP_NOT_HSPACE)) break;
! 3963: eptr += len;
! 3964: }
! 3965: break;
! 3966:
! 3967: case OP_NOT_VSPACE:
! 3968: case OP_VSPACE:
! 3969: for (i = min; i < max; i++)
! 3970: {
! 3971: BOOL gotspace;
! 3972: int len = 1;
! 3973: if (eptr >= md->end_subject) break;
! 3974: GETCHARLEN(c, eptr, len);
! 3975: switch(c)
! 3976: {
! 3977: default: gotspace = FALSE; break;
! 3978: case 0x0a: /* LF */
! 3979: case 0x0b: /* VT */
! 3980: case 0x0c: /* FF */
! 3981: case 0x0d: /* CR */
! 3982: case 0x85: /* NEL */
! 3983: case 0x2028: /* LINE SEPARATOR */
! 3984: case 0x2029: /* PARAGRAPH SEPARATOR */
! 3985: gotspace = TRUE;
! 3986: break;
! 3987: }
! 3988: if (gotspace == (ctype == OP_NOT_VSPACE)) break;
! 3989: eptr += len;
! 3990: }
! 3991: break;
! 3992:
! 3993: case OP_NOT_DIGIT:
! 3994: for (i = min; i < max; i++)
! 3995: {
! 3996: int len = 1;
! 3997: if (eptr >= md->end_subject) break;
! 3998: GETCHARLEN(c, eptr, len);
! 3999: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
! 4000: eptr+= len;
! 4001: }
! 4002: break;
! 4003:
! 4004: case OP_DIGIT:
! 4005: for (i = min; i < max; i++)
! 4006: {
! 4007: int len = 1;
! 4008: if (eptr >= md->end_subject) break;
! 4009: GETCHARLEN(c, eptr, len);
! 4010: if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
! 4011: eptr+= len;
! 4012: }
! 4013: break;
! 4014:
! 4015: case OP_NOT_WHITESPACE:
! 4016: for (i = min; i < max; i++)
! 4017: {
! 4018: int len = 1;
! 4019: if (eptr >= md->end_subject) break;
! 4020: GETCHARLEN(c, eptr, len);
! 4021: if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
! 4022: eptr+= len;
! 4023: }
! 4024: break;
! 4025:
! 4026: case OP_WHITESPACE:
! 4027: for (i = min; i < max; i++)
! 4028: {
! 4029: int len = 1;
! 4030: if (eptr >= md->end_subject) break;
! 4031: GETCHARLEN(c, eptr, len);
! 4032: if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
! 4033: eptr+= len;
! 4034: }
! 4035: break;
! 4036:
! 4037: case OP_NOT_WORDCHAR:
! 4038: for (i = min; i < max; i++)
! 4039: {
! 4040: int len = 1;
! 4041: if (eptr >= md->end_subject) break;
! 4042: GETCHARLEN(c, eptr, len);
! 4043: if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
! 4044: eptr+= len;
! 4045: }
! 4046: break;
! 4047:
! 4048: case OP_WORDCHAR:
! 4049: for (i = min; i < max; i++)
! 4050: {
! 4051: int len = 1;
! 4052: if (eptr >= md->end_subject) break;
! 4053: GETCHARLEN(c, eptr, len);
! 4054: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
! 4055: eptr+= len;
! 4056: }
! 4057: break;
! 4058:
! 4059: default:
! 4060: RRETURN(PCRE_ERROR_INTERNAL);
! 4061: }
! 4062:
! 4063: /* eptr is now past the end of the maximum run */
! 4064:
! 4065: if (possessive) continue;
! 4066: for(;;)
! 4067: {
! 4068: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
! 4069: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 4070: if (eptr-- == pp) break; /* Stop if tried at original pos */
! 4071: BACKCHAR(eptr);
! 4072: }
! 4073: }
! 4074: else
! 4075: #endif /* SUPPORT_UTF8 */
! 4076:
! 4077: /* Not UTF-8 mode */
! 4078: {
! 4079: switch(ctype)
! 4080: {
! 4081: case OP_ANY:
! 4082: for (i = min; i < max; i++)
! 4083: {
! 4084: if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
! 4085: eptr++;
! 4086: }
! 4087: break;
! 4088:
! 4089: case OP_ALLANY:
! 4090: case OP_ANYBYTE:
! 4091: c = max - min;
! 4092: if (c > (unsigned int)(md->end_subject - eptr))
! 4093: c = md->end_subject - eptr;
! 4094: eptr += c;
! 4095: break;
! 4096:
! 4097: case OP_ANYNL:
! 4098: for (i = min; i < max; i++)
! 4099: {
! 4100: if (eptr >= md->end_subject) break;
! 4101: c = *eptr;
! 4102: if (c == 0x000d)
! 4103: {
! 4104: if (++eptr >= md->end_subject) break;
! 4105: if (*eptr == 0x000a) eptr++;
! 4106: }
! 4107: else
! 4108: {
! 4109: if (c != 0x000a &&
! 4110: (md->bsr_anycrlf ||
! 4111: (c != 0x000b && c != 0x000c && c != 0x0085)))
! 4112: break;
! 4113: eptr++;
! 4114: }
! 4115: }
! 4116: break;
! 4117:
! 4118: case OP_NOT_HSPACE:
! 4119: for (i = min; i < max; i++)
! 4120: {
! 4121: if (eptr >= md->end_subject) break;
! 4122: c = *eptr;
! 4123: if (c == 0x09 || c == 0x20 || c == 0xa0) break;
! 4124: eptr++;
! 4125: }
! 4126: break;
! 4127:
! 4128: case OP_HSPACE:
! 4129: for (i = min; i < max; i++)
! 4130: {
! 4131: if (eptr >= md->end_subject) break;
! 4132: c = *eptr;
! 4133: if (c != 0x09 && c != 0x20 && c != 0xa0) break;
! 4134: eptr++;
! 4135: }
! 4136: break;
! 4137:
! 4138: case OP_NOT_VSPACE:
! 4139: for (i = min; i < max; i++)
! 4140: {
! 4141: if (eptr >= md->end_subject) break;
! 4142: c = *eptr;
! 4143: if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
! 4144: break;
! 4145: eptr++;
! 4146: }
! 4147: break;
! 4148:
! 4149: case OP_VSPACE:
! 4150: for (i = min; i < max; i++)
! 4151: {
! 4152: if (eptr >= md->end_subject) break;
! 4153: c = *eptr;
! 4154: if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
! 4155: break;
! 4156: eptr++;
! 4157: }
! 4158: break;
! 4159:
! 4160: case OP_NOT_DIGIT:
! 4161: for (i = min; i < max; i++)
! 4162: {
! 4163: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
! 4164: break;
! 4165: eptr++;
! 4166: }
! 4167: break;
! 4168:
! 4169: case OP_DIGIT:
! 4170: for (i = min; i < max; i++)
! 4171: {
! 4172: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
! 4173: break;
! 4174: eptr++;
! 4175: }
! 4176: break;
! 4177:
! 4178: case OP_NOT_WHITESPACE:
! 4179: for (i = min; i < max; i++)
! 4180: {
! 4181: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
! 4182: break;
! 4183: eptr++;
! 4184: }
! 4185: break;
! 4186:
! 4187: case OP_WHITESPACE:
! 4188: for (i = min; i < max; i++)
! 4189: {
! 4190: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
! 4191: break;
! 4192: eptr++;
! 4193: }
! 4194: break;
! 4195:
! 4196: case OP_NOT_WORDCHAR:
! 4197: for (i = min; i < max; i++)
! 4198: {
! 4199: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
! 4200: break;
! 4201: eptr++;
! 4202: }
! 4203: break;
! 4204:
! 4205: case OP_WORDCHAR:
! 4206: for (i = min; i < max; i++)
! 4207: {
! 4208: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
! 4209: break;
! 4210: eptr++;
! 4211: }
! 4212: break;
! 4213:
! 4214: default:
! 4215: RRETURN(PCRE_ERROR_INTERNAL);
! 4216: }
! 4217:
! 4218: /* eptr is now past the end of the maximum run */
! 4219:
! 4220: if (possessive) continue;
! 4221: while (eptr >= pp)
! 4222: {
! 4223: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
! 4224: eptr--;
! 4225: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 4226: }
! 4227: }
! 4228:
! 4229: /* Get here if we can't make it match with any permitted repetitions */
! 4230:
! 4231: RRETURN(MATCH_NOMATCH);
! 4232: }
! 4233: /* Control never gets here */
! 4234:
! 4235: /* There's been some horrible disaster. Arrival here can only mean there is
! 4236: something seriously wrong in the code above or the OP_xxx definitions. */
! 4237:
! 4238: default:
! 4239: DPRINTF(("Unknown opcode %d\n", *ecode));
! 4240: RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
! 4241: }
! 4242:
! 4243: /* Do not stick any code in here without much thought; it is assumed
! 4244: that "continue" in the code above comes out to here to repeat the main
! 4245: loop. */
! 4246:
! 4247: } /* End of main loop */
! 4248: /* Control never reaches here */
! 4249:
! 4250:
! 4251: /* When compiling to use the heap rather than the stack for recursive calls to
! 4252: match(), the RRETURN() macro jumps here. The number that is saved in
! 4253: frame->Xwhere indicates which label we actually want to return to. */
! 4254:
! 4255: #ifdef NO_RECURSE
! 4256: #define LBL(val) case val: goto L_RM##val;
! 4257: HEAP_RETURN:
! 4258: switch (frame->Xwhere)
! 4259: {
! 4260: LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
! 4261: LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
! 4262: LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
! 4263: LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
! 4264: LBL(53) LBL(54)
! 4265: #ifdef SUPPORT_UTF8
! 4266: LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
! 4267: LBL(32) LBL(34) LBL(42) LBL(46)
! 4268: #ifdef SUPPORT_UCP
! 4269: LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
! 4270: #endif /* SUPPORT_UCP */
! 4271: #endif /* SUPPORT_UTF8 */
! 4272: default:
! 4273: DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
! 4274: return PCRE_ERROR_INTERNAL;
! 4275: }
! 4276: #undef LBL
! 4277: #endif /* NO_RECURSE */
! 4278: }
! 4279:
! 4280:
! 4281: /***************************************************************************
! 4282: ****************************************************************************
! 4283: RECURSION IN THE match() FUNCTION
! 4284:
! 4285: Undefine all the macros that were defined above to handle this, so that these short names are ordinary identifiers again in the code that follows (pcre_exec() below uses ims, length, flags and pp as normal variables). */
! 4286:
! 4287: #ifdef NO_RECURSE /* the names in this group are removed only in the NO_RECURSE (heap-frame) build */
! 4288: #undef eptr
! 4289: #undef ecode
! 4290: #undef mstart
! 4291: #undef offset_top
! 4292: #undef ims
! 4293: #undef eptrb
! 4294: #undef flags
! 4295:
! 4296: #undef callpat
! 4297: #undef charptr
! 4298: #undef data
! 4299: #undef next
! 4300: #undef pp
! 4301: #undef prev
! 4302: #undef saved_eptr
! 4303:
! 4304: #undef new_recursive
! 4305:
! 4306: #undef cur_is_word
! 4307: #undef condition
! 4308: #undef prev_is_word
! 4309:
! 4310: #undef original_ims
! 4311:
! 4312: #undef ctype
! 4313: #undef length
! 4314: #undef max
! 4315: #undef min
! 4316: #undef number
! 4317: #undef offset
! 4318: #undef op
! 4319: #undef save_capture_last
! 4320: #undef save_offset1
! 4321: #undef save_offset2
! 4322: #undef save_offset3
! 4323: #undef stacksave
! 4324:
! 4325: #undef newptrb
! 4326:
! 4327: #endif /* NO_RECURSE */
! 4328:
! 4329: /* These two are defined as macros in both cases (with and without NO_RECURSE), so they are always undefined here */
! 4330:
! 4331: #undef fc
! 4332: #undef fi
! 4333:
! 4334: /***************************************************************************
! 4335: ***************************************************************************/
! 4336:
! 4337:
! 4338:
! 4339: /*************************************************
! 4340: * Execute a Regular Expression *
! 4341: *************************************************/
! 4342:
! 4343: /* This function applies a compiled re to a subject string and picks out
! 4344: portions of the string if it matches. Two elements in the vector are set for
! 4345: each substring: the offsets to the start and end of the substring.
! 4346:
! 4347: Arguments:
! 4348: argument_re points to the compiled expression (must not be NULL)
! 4349: extra_data points to extra data or is NULL
! 4350: subject points to the subject string (must not be NULL)
! 4351: length length of subject string (may contain binary zeros); not range-checked here
! 4352: start_offset where to start in the subject string; not range-checked here, so the caller must keep it within 0..length
! 4353: options option bits
! 4354: offsets points to a vector of ints to be filled in with offsets (may be NULL only if offsetcount is 0)
! 4355: offsetcount the number of elements in the vector (must not be negative)
! 4356:
! 4357: Returns: > 0 => success; value is the number of offset pairs set (whole match plus captured substrings)
! 4358: = 0 => success, but offsets is not big enough
! 4359: -1 => failed to match
! 4360: < -1 => some kind of unexpected problem
! 4361: */
! 4362:
! 4363: PCRE_EXP_DEFN int
! 4364: pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
! 4365: PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
! 4366: int offsetcount)
! 4367: {
! 4368: int rc, resetcount, ocount;
! 4369: int first_byte = -1;
! 4370: int req_byte = -1;
! 4371: int req_byte2 = -1;
! 4372: int newline;
! 4373: unsigned long int ims;
! 4374: BOOL using_temporary_offsets = FALSE;
! 4375: BOOL anchored;
! 4376: BOOL startline;
! 4377: BOOL firstline;
! 4378: BOOL first_byte_caseless = FALSE;
! 4379: BOOL req_byte_caseless = FALSE;
! 4380: BOOL utf8;
! 4381: match_data match_block;
! 4382: match_data *md = &match_block;
! 4383: const uschar *tables;
! 4384: const uschar *start_bits = NULL;
! 4385: USPTR start_match = (USPTR)subject + start_offset;
! 4386: USPTR end_subject;
! 4387: USPTR req_byte_ptr = start_match - 1; /* below every possible search start, so the first req_byte search always runs */
! 4388:
! 4389: pcre_study_data internal_study;
! 4390: const pcre_study_data *study;
! 4391:
! 4392: real_pcre internal_re;
! 4393: const real_pcre *external_re = (const real_pcre *)argument_re;
! 4394: const real_pcre *re = external_re;
! 4395:
! 4396: /* Plausibility checks. NOTE(review): start_offset and length are not validated here; the caller must supply 0 <= start_offset <= length. */
! 4397:
! 4398: if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
! 4399: if (re == NULL || subject == NULL ||
! 4400: (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
! 4401: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
! 4402:
! 4403: /* Fish out the optional data from the extra_data structure, first setting
! 4404: the default values. */
! 4405:
! 4406: study = NULL;
! 4407: md->match_limit = MATCH_LIMIT;
! 4408: md->match_limit_recursion = MATCH_LIMIT_RECURSION;
! 4409: md->callout_data = NULL;
! 4410:
! 4411: /* The table pointer is always in native byte order. */
! 4412:
! 4413: tables = external_re->tables;
! 4414:
! 4415: if (extra_data != NULL)
! 4416: {
! 4417: register unsigned int flags = extra_data->flags;
! 4418: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
! 4419: study = (const pcre_study_data *)extra_data->study_data;
! 4420: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
! 4421: md->match_limit = extra_data->match_limit;
! 4422: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
! 4423: md->match_limit_recursion = extra_data->match_limit_recursion;
! 4424: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
! 4425: md->callout_data = extra_data->callout_data;
! 4426: if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
! 4427: }
! 4428:
! 4429: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
! 4430: is a feature that makes it possible to save compiled regex and re-use them
! 4431: in other programs later. */
! 4432:
! 4433: if (tables == NULL) tables = _pcre_default_tables;
! 4434:
! 4435: /* Check that the first field in the block is the magic number. If it is not,
! 4436: test for a regex that was compiled on a host of opposite endianness. If this is
! 4437: the case, flipped values are put in internal_re and internal_study if there was
! 4438: study data too. */
! 4439:
! 4440: if (re->magic_number != MAGIC_NUMBER)
! 4441: {
! 4442: re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
! 4443: if (re == NULL) return PCRE_ERROR_BADMAGIC;
! 4444: if (study != NULL) study = &internal_study;
! 4445: }
! 4446:
! 4447: /* Set up other data */
! 4448:
! 4449: anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
! 4450: startline = (re->flags & PCRE_STARTLINE) != 0;
! 4451: firstline = (re->options & PCRE_FIRSTLINE) != 0;
! 4452:
! 4453: /* The code starts after the real_pcre block and the capture name table. */
! 4454:
! 4455: md->start_code = (const uschar *)external_re + re->name_table_offset +
! 4456: re->name_count * re->name_entry_size;
! 4457:
! 4458: md->start_subject = (USPTR)subject;
! 4459: md->start_offset = start_offset;
! 4460: md->end_subject = md->start_subject + length;
! 4461: end_subject = md->end_subject;
! 4462:
! 4463: md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
! 4464: utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
! 4465: md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
! 4466:
! 4467: md->notbol = (options & PCRE_NOTBOL) != 0;
! 4468: md->noteol = (options & PCRE_NOTEOL) != 0;
! 4469: md->notempty = (options & PCRE_NOTEMPTY) != 0;
! 4470: md->partial = (options & PCRE_PARTIAL) != 0;
! 4471: md->hitend = FALSE;
! 4472:
! 4473: md->recursive = NULL; /* No recursion at top level */
! 4474:
! 4475: md->lcc = tables + lcc_offset;
! 4476: md->ctypes = tables + ctypes_offset;
! 4477:
! 4478: /* Handle different \R options. A run-time setting overrides the compile-time one; setting both BSR bits at run time is rejected. */
! 4479:
! 4480: switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
! 4481: {
! 4482: case 0:
! 4483: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
! 4484: md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
! 4485: else
! 4486: #ifdef BSR_ANYCRLF
! 4487: md->bsr_anycrlf = TRUE;
! 4488: #else
! 4489: md->bsr_anycrlf = FALSE;
! 4490: #endif
! 4491: break;
! 4492:
! 4493: case PCRE_BSR_ANYCRLF:
! 4494: md->bsr_anycrlf = TRUE;
! 4495: break;
! 4496:
! 4497: case PCRE_BSR_UNICODE:
! 4498: md->bsr_anycrlf = FALSE;
! 4499: break;
! 4500:
! 4501: default: return PCRE_ERROR_BADNEWLINE;
! 4502: }
! 4503:
! 4504: /* Handle different types of newline. The three bits give eight cases. If
! 4505: nothing is set at run time, whatever was used at compile time applies. */
! 4506:
! 4507: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
! 4508: (pcre_uint32)options) & PCRE_NEWLINE_BITS)
! 4509: {
! 4510: case 0: newline = NEWLINE; break; /* Compile-time default */
! 4511: case PCRE_NEWLINE_CR: newline = '\r'; break;
! 4512: case PCRE_NEWLINE_LF: newline = '\n'; break;
! 4513: case PCRE_NEWLINE_CR+
! 4514: PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
! 4515: case PCRE_NEWLINE_ANY: newline = -1; break;
! 4516: case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
! 4517: default: return PCRE_ERROR_BADNEWLINE;
! 4518: }
! 4519:
! 4520: if (newline == -2)
! 4521: {
! 4522: md->nltype = NLTYPE_ANYCRLF;
! 4523: }
! 4524: else if (newline < 0)
! 4525: {
! 4526: md->nltype = NLTYPE_ANY;
! 4527: }
! 4528: else
! 4529: {
! 4530: md->nltype = NLTYPE_FIXED;
! 4531: if (newline > 255) /* two-byte sequence packed as (first << 8) | second */
! 4532: {
! 4533: md->nllen = 2;
! 4534: md->nl[0] = (newline >> 8) & 255;
! 4535: md->nl[1] = newline & 255;
! 4536: }
! 4537: else
! 4538: {
! 4539: md->nllen = 1;
! 4540: md->nl[0] = newline;
! 4541: }
! 4542: }
! 4543:
! 4544: /* Partial matching is supported only for a restricted set of regexes at the
! 4545: moment. */
! 4546:
! 4547: if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
! 4548: return PCRE_ERROR_BADPARTIAL;
! 4549:
! 4550: /* Check a UTF-8 string if required. Unfortunately there's no way of passing
! 4551: back the character offset. */
! 4552:
! 4553: #ifdef SUPPORT_UTF8
! 4554: if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
! 4555: {
! 4556: if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
! 4557: return PCRE_ERROR_BADUTF8;
! 4558: if (start_offset > 0 && start_offset < length)
! 4559: {
! 4560: int tb = ((uschar *)subject)[start_offset];
! 4561: if (tb > 127)
! 4562: {
! 4563: tb &= 0xc0;
! 4564: if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
! 4565: }
! 4566: }
! 4567: }
! 4568: #endif
! 4569:
! 4570: /* The ims options can vary during the matching as a result of the presence
! 4571: of (?ims) items in the pattern. They are kept in a local variable so that
! 4572: restoring at the exit of a group is easy. */
! 4573:
! 4574: ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
! 4575:
! 4576: /* If the expression has got more back references than the offsets supplied can
! 4577: hold, we get a temporary chunk of working store to use during the matching.
! 4578: Otherwise, we can use the vector supplied, rounding down its size to a multiple
! 4579: of 3. No early return may follow the allocation below: every later exit goes through the code after ENDLOOP, which frees it. */
! 4580:
! 4581: ocount = offsetcount - (offsetcount % 3);
! 4582:
! 4583: if (re->top_backref > 0 && re->top_backref >= ocount/3)
! 4584: {
! 4585: ocount = re->top_backref * 3 + 3;
! 4586: md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
! 4587: if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
! 4588: using_temporary_offsets = TRUE;
! 4589: DPRINTF(("Got memory to hold back references\n"));
! 4590: }
! 4591: else md->offset_vector = offsets;
! 4592:
! 4593: md->offset_end = ocount;
! 4594: md->offset_max = (2*ocount)/3;
! 4595: md->offset_overflow = FALSE;
! 4596: md->capture_last = -1;
! 4597:
! 4598: /* Compute the minimum number of offsets that we need to reset each time. Doing
! 4599: this makes a huge difference to execution time when there aren't many brackets
! 4600: in the pattern. */
! 4601:
! 4602: resetcount = 2 + re->top_bracket * 2;
! 4603: if (resetcount > offsetcount) resetcount = ocount;
! 4604:
! 4605: /* Reset the working variable associated with each extraction. These should
! 4606: never be used unless previously set, but they get saved and restored, and so we
! 4607: initialize them to avoid reading uninitialized locations. */
! 4608:
! 4609: if (md->offset_vector != NULL)
! 4610: {
! 4611: register int *iptr = md->offset_vector + ocount;
! 4612: register int *iend = iptr - resetcount/2 + 1;
! 4613: while (--iptr >= iend) *iptr = -1;
! 4614: }
! 4615:
! 4616: /* Set up the first character to match, if available. The first_byte value is
! 4617: never set for an anchored regular expression, but the anchoring may be forced
! 4618: at run time, so we have to test for anchoring. The first char may be unset for
! 4619: an unanchored pattern, of course. If there's no first char and the pattern was
! 4620: studied, there may be a bitmap of possible first characters. */
! 4621:
! 4622: if (!anchored)
! 4623: {
! 4624: if ((re->flags & PCRE_FIRSTSET) != 0)
! 4625: {
! 4626: first_byte = re->first_byte & 255;
! 4627: if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
! 4628: first_byte = md->lcc[first_byte];
! 4629: }
! 4630: else
! 4631: if (!startline && study != NULL &&
! 4632: (study->options & PCRE_STUDY_MAPPED) != 0)
! 4633: start_bits = study->start_bits;
! 4634: }
! 4635:
! 4636: /* For anchored or unanchored matches, there may be a "last known required
! 4637: character" set. */
! 4638:
! 4639: if ((re->flags & PCRE_REQCHSET) != 0)
! 4640: {
! 4641: req_byte = re->req_byte & 255;
! 4642: req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
! 4643: req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
! 4644: }
! 4645:
! 4646:
! 4647: /* ==========================================================================*/
! 4648:
! 4649: /* Loop for handling unanchored repeated matching attempts; for anchored regexes
! 4650: the loop runs just once. */
! 4651:
! 4652: for(;;)
! 4653: {
! 4654: USPTR save_end_subject = end_subject;
! 4655: USPTR new_start_match;
! 4656:
! 4657: /* Reset the maximum number of extractions we might see. */
! 4658:
! 4659: if (md->offset_vector != NULL)
! 4660: {
! 4661: register int *iptr = md->offset_vector;
! 4662: register int *iend = iptr + resetcount;
! 4663: while (iptr < iend) *iptr++ = -1;
! 4664: }
! 4665:
! 4666: /* Advance to a unique first char if possible. If firstline is TRUE, the
! 4667: start of the match is constrained to the first line of a multiline string.
! 4668: That is, the match must be before or at the first newline. Implement this by
! 4669: temporarily adjusting end_subject so that we stop scanning at a newline. If
! 4670: the match fails at the newline, later code breaks this loop. */
! 4671:
! 4672: if (firstline)
! 4673: {
! 4674: USPTR t = start_match;
! 4675: while (t < md->end_subject && !IS_NEWLINE(t)) t++;
! 4676: end_subject = t;
! 4677: }
! 4678:
! 4679: /* Now test for a unique first byte */
! 4680:
! 4681: if (first_byte >= 0)
! 4682: {
! 4683: if (first_byte_caseless)
! 4684: while (start_match < end_subject &&
! 4685: md->lcc[*start_match] != first_byte)
! 4686: { NEXTCHAR(start_match); }
! 4687: else
! 4688: while (start_match < end_subject && *start_match != first_byte)
! 4689: { NEXTCHAR(start_match); }
! 4690: }
! 4691:
! 4692: /* Or to just after a linebreak for a multiline match if possible. The scan must stop AT end_subject (strict "<"): start_match[-1] is read below and match() is then called at start_match, so advancing beyond end_subject would read past the subject. */
! 4693:
! 4694: else if (startline)
! 4695: {
! 4696: if (start_match > md->start_subject + start_offset)
! 4697: {
! 4698: while (start_match < end_subject && !WAS_NEWLINE(start_match))
! 4699: { NEXTCHAR(start_match); }
! 4700:
! 4701: /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
! 4702: and we are now at a LF, advance the match position by one more character.
! 4703: */
! 4704:
! 4705: if (start_match[-1] == '\r' &&
! 4706: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
! 4707: start_match < end_subject &&
! 4708: *start_match == '\n')
! 4709: start_match++;
! 4710: }
! 4711: }
! 4712:
! 4713: /* Or to a non-unique first char after study */
! 4714:
! 4715: else if (start_bits != NULL)
! 4716: {
! 4717: while (start_match < end_subject)
! 4718: {
! 4719: register unsigned int c = *start_match;
! 4720: if ((start_bits[c/8] & (1 << (c&7))) == 0)
! 4721: { NEXTCHAR(start_match); }
! 4722: else break;
! 4723: }
! 4724: }
! 4725:
! 4726: /* Restore fudged end_subject */
! 4727:
! 4728: end_subject = save_end_subject;
! 4729:
! 4730: #ifdef DEBUG /* Sigh. Some compilers never learn. */
! 4731: printf(">>>> Match against: ");
! 4732: pchars(start_match, end_subject - start_match, TRUE, md);
! 4733: printf("\n");
! 4734: #endif
! 4735:
! 4736: /* If req_byte is set, we know that that character must appear in the subject
! 4737: for the match to succeed. If the first character is set, req_byte must be
! 4738: later in the subject; otherwise the test starts at the match point. This
! 4739: optimization can save a huge amount of backtracking in patterns with nested
! 4740: unlimited repeats that aren't going to match. Writing separate code for
! 4741: cased/caseless versions makes it go faster, as does using an autoincrement
! 4742: and backing off on a match.
! 4743:
! 4744: HOWEVER: when the subject string is very, very long, searching to its end can
! 4745: take a long time, and give bad performance on quite ordinary patterns. This
! 4746: showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
! 4747: string... so we don't do this when the string is sufficiently long.
! 4748:
! 4749: ALSO: this processing is disabled when partial matching is requested.
! 4750: */
! 4751:
! 4752: if (req_byte >= 0 &&
! 4753: end_subject - start_match < REQ_BYTE_MAX &&
! 4754: !md->partial)
! 4755: {
! 4756: register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
! 4757:
! 4758: /* We don't need to repeat the search if we haven't yet reached the
! 4759: place we found it at last time. */
! 4760:
! 4761: if (p > req_byte_ptr)
! 4762: {
! 4763: if (req_byte_caseless)
! 4764: {
! 4765: while (p < end_subject)
! 4766: {
! 4767: register int pp = *p++;
! 4768: if (pp == req_byte || pp == req_byte2) { p--; break; }
! 4769: }
! 4770: }
! 4771: else
! 4772: {
! 4773: while (p < end_subject)
! 4774: {
! 4775: if (*p++ == req_byte) { p--; break; }
! 4776: }
! 4777: }
! 4778:
! 4779: /* If we can't find the required character, break the matching loop,
! 4780: forcing a match failure. */
! 4781:
! 4782: if (p >= end_subject)
! 4783: {
! 4784: rc = MATCH_NOMATCH;
! 4785: break;
! 4786: }
! 4787:
! 4788: /* If we have found the required character, save the point where we
! 4789: found it, so that we don't search again next time round the loop if
! 4790: the start hasn't passed this character yet. */
! 4791:
! 4792: req_byte_ptr = p;
! 4793: }
! 4794: }
! 4795:
! 4796: /* OK, we can now run the match. */
! 4797:
! 4798: md->start_match_ptr = start_match;
! 4799: md->match_call_count = 0;
! 4800: rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
! 4801:
! 4802: switch(rc)
! 4803: {
! 4804: /* NOMATCH and PRUNE advance by one character. THEN at this level acts
! 4805: exactly like PRUNE. */
! 4806:
! 4807: case MATCH_NOMATCH:
! 4808: case MATCH_PRUNE:
! 4809: case MATCH_THEN:
! 4810: new_start_match = start_match + 1;
! 4811: #ifdef SUPPORT_UTF8
! 4812: if (utf8)
! 4813: while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
! 4814: new_start_match++;
! 4815: #endif
! 4816: break;
! 4817:
! 4818: /* SKIP passes back the next starting point explicitly. */
! 4819:
! 4820: case MATCH_SKIP:
! 4821: new_start_match = md->start_match_ptr;
! 4822: break;
! 4823:
! 4824: /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
! 4825:
! 4826: case MATCH_COMMIT:
! 4827: rc = MATCH_NOMATCH;
! 4828: goto ENDLOOP;
! 4829:
! 4830: /* Any other return is either a successful match or some kind of error; both leave the loop. */
! 4831:
! 4832: default:
! 4833: goto ENDLOOP;
! 4834: }
! 4835:
! 4836: /* Control reaches here for the various types of "no match at this point"
! 4837: result. Reset the code to MATCH_NOMATCH for subsequent checking. */
! 4838:
! 4839: rc = MATCH_NOMATCH;
! 4840:
! 4841: /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
! 4842: newline in the subject (though it may continue over the newline). Therefore,
! 4843: if we have just failed to match, starting at a newline, do not continue. */
! 4844:
! 4845: if (firstline && IS_NEWLINE(start_match)) break;
! 4846:
! 4847: /* Advance to new matching position */
! 4848:
! 4849: start_match = new_start_match;
! 4850:
! 4851: /* Break the loop if the pattern is anchored or if we have passed the end of
! 4852: the subject. */
! 4853:
! 4854: if (anchored || start_match > end_subject) break;
! 4855:
! 4856: /* If we have just passed a CR and we are now at a LF, and the pattern does
! 4857: not contain any explicit matches for \r or \n, and the newline option is CRLF
! 4858: or ANY or ANYCRLF, advance the match position by one more character. */
! 4859:
! 4860: if (start_match[-1] == '\r' &&
! 4861: start_match < end_subject &&
! 4862: *start_match == '\n' &&
! 4863: (re->flags & PCRE_HASCRORLF) == 0 &&
! 4864: (md->nltype == NLTYPE_ANY ||
! 4865: md->nltype == NLTYPE_ANYCRLF ||
! 4866: md->nllen == 2))
! 4867: start_match++;
! 4868:
! 4869: } /* End of for(;;) "bumpalong" loop */
! 4870:
! 4871: /* ==========================================================================*/
! 4872:
! 4873: /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
! 4874: conditions is true:
! 4875:
! 4876: (1) The pattern is anchored or the match was failed by (*COMMIT);
! 4877:
! 4878: (2) We are past the end of the subject;
! 4879:
! 4880: (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
! 4881: this option requests that a match occur at or before the first newline in
! 4882: the subject.
! 4883:
! 4884: When we have a match and the offset vector is big enough to deal with any
! 4885: backreferences, captured substring offsets will already be set up. In the case
! 4886: where we had to get some local store to hold offsets for backreference
! 4887: processing, copy those that we can. In this case there need not be overflow if
! 4888: certain parts of the pattern were not used, even though there are more
! 4889: capturing parentheses than vector slots. */
! 4890:
! 4891: ENDLOOP:
! 4892:
! 4893: if (rc == MATCH_MATCH)
! 4894: {
! 4895: if (using_temporary_offsets)
! 4896: {
! 4897: if (offsetcount >= 4)
! 4898: {
! 4899: memcpy(offsets + 2, md->offset_vector + 2,
! 4900: (offsetcount - 2) * sizeof(int));
! 4901: DPRINTF(("Copied offsets from temporary memory\n"));
! 4902: }
! 4903: if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
! 4904: DPRINTF(("Freeing temporary memory\n"));
! 4905: (pcre_free)(md->offset_vector);
! 4906: }
! 4907:
! 4908: /* Set the return code to the number of captured strings, or 0 if there are
! 4909: too many to fit into the vector. */
! 4910:
! 4911: rc = md->offset_overflow? 0 : md->end_offset_top/2;
! 4912:
! 4913: /* If there is space, set up the whole thing as substring 0. The value of
! 4914: md->start_match_ptr might be modified if \K was encountered on the successful
! 4915: matching path. */
! 4916:
! 4917: if (offsetcount < 2) rc = 0; else
! 4918: {
! 4919: offsets[0] = md->start_match_ptr - md->start_subject;
! 4920: offsets[1] = md->end_match_ptr - md->start_subject;
! 4921: }
! 4922:
! 4923: DPRINTF((">>>> returning %d\n", rc));
! 4924: return rc;
! 4925: }
! 4926:
! 4927: /* Control gets here if there has been an error, or if the overall match
! 4928: attempt has failed at all permitted starting positions. */
! 4929:
! 4930: if (using_temporary_offsets)
! 4931: {
! 4932: DPRINTF(("Freeing temporary memory\n"));
! 4933: (pcre_free)(md->offset_vector);
! 4934: }
! 4935:
! 4936: if (rc != MATCH_NOMATCH)
! 4937: {
! 4938: DPRINTF((">>>> error: returning %d\n", rc));
! 4939: return rc;
! 4940: }
! 4941: else if (md->partial && md->hitend)
! 4942: {
! 4943: DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
! 4944: return PCRE_ERROR_PARTIAL;
! 4945: }
! 4946: else
! 4947: {
! 4948: DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
! 4949: return PCRE_ERROR_NOMATCH;
! 4950: }
! 4951: }
! 4952:
! 4953: /* End of pcre_exec.c */
E-mail: