Annotation of win32/pcre/pcre_exec.c, revision 1.9
1.1 misha 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
1.9 ! moko 9: Copyright (c) 1997-2018 University of Cambridge
1.1 misha 10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40: /* This module contains pcre_exec(), the externally visible function that does
41: pattern matching using an NFA algorithm, trying to mimic Perl as closely as
42: possible. There are also some static supporting functions. */
43:
44: #ifdef HAVE_CONFIG_H
45: #include "config.h"
46: #endif
47:
48: #define NLBLOCK md /* Block containing newline information */
49: #define PSSTART start_subject /* Field containing processed string start */
50: #define PSEND end_subject /* Field containing processed string end */
51:
52: #include "pcre_internal.h"
53:
54: /* Undefine some potentially clashing cpp symbols */
55:
56: #undef min
57: #undef max
58:
1.7 misha 59: /* The md->capture_last field uses the lower 16 bits for the last captured
60: substring (which can never be greater than 65535) and a bit in the top half
61: to mean "capture vector overflowed". This odd way of doing things was
62: implemented when it was realized that preserving and restoring the overflow bit
63: whenever the last capture number was saved/restored made for a neater
64: interface, and doing it this way saved on (a) another variable, which would
65: have increased the stack frame size (a big NO-NO in PCRE) and (b) another
66: separate set of save/restore instructions. The following defines are used in
67: implementing this. */
68:
69: #define CAPLMASK 0x0000ffff /* The bits used for last_capture */
70: #define OVFLMASK 0xffff0000 /* The bits used for the overflow flag */
71: #define OVFLBIT 0x00010000 /* The bit that is set for overflow */
72:
1.6 misha 73: /* Values for setting in md->match_function_type to indicate two special types
74: of call to match(). We do it this way to save on using another stack variable,
75: as stack usage is to be discouraged. */
1.1 misha 76:
1.6 misha 77: #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
78: #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
1.1 misha 79:
80: /* Non-error returns from the match() function. Error returns are externally
81: defined PCRE_ERROR_xxx codes, which are all negative. */
82:
83: #define MATCH_MATCH 1
84: #define MATCH_NOMATCH 0
85:
86: /* Special internal returns from the match() function. Make them sufficiently
87: negative to avoid the external error codes. */
88:
1.4 misha 89: #define MATCH_ACCEPT (-999)
1.7 misha 90: #define MATCH_KETRPOS (-998)
91: #define MATCH_ONCE (-997)
92: /* The next 5 must be kept together and in sequence so that a test that checks
93: for any one of them can use a range. */
94: #define MATCH_COMMIT (-996)
1.6 misha 95: #define MATCH_PRUNE (-995)
96: #define MATCH_SKIP (-994)
97: #define MATCH_SKIP_ARG (-993)
98: #define MATCH_THEN (-992)
1.7 misha 99: #define MATCH_BACKTRACK_MAX MATCH_THEN
100: #define MATCH_BACKTRACK_MIN MATCH_COMMIT
1.1 misha 101:
102: /* Maximum number of ints of offset to save on the stack for recursive calls.
103: If the offset vector is bigger, malloc is used. This should be a multiple of 3,
104: because the offset vector is always a multiple of 3 long. */
105:
106: #define REC_STACK_SAVE_MAX 30
107:
108: /* Min and max values for the common repeats; for the maxima, 0 => infinity */
109:
1.8 moko 110: static const char rep_min[] = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, };
111: static const char rep_max[] = { 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, };
1.1 misha 112:
1.4 misha 113: #ifdef PCRE_DEBUG
1.1 misha 114: /*************************************************
115: * Debugging function to print chars *
116: *************************************************/
117:
118: /* Print a sequence of chars in printable format, stopping at the end of the
119: subject if requested.
120:
121: Arguments:
122: p points to characters
123: length number to print
124: is_subject TRUE if printing from within md->start_subject
125: md pointer to matching data block (also read for md->utf when is_subject is FALSE)
126:
127: Returns: nothing
128: */
129:
130: static void
1.6 misha 131: pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
1.1 misha 132: {
1.7 misha 133: pcre_uint32 c; /* Character currently being printed */
134: BOOL utf = md->utf; /* Not used by name below; presumably read inside UCHAR21INCTEST - confirm */
1.1 misha 135: if (is_subject && length > md->end_subject - p) length = md->end_subject - p; /* Clamp to the end of the subject */
136: while (length-- > 0)
1.8 moko 137: if (isprint(c = UCHAR21INCTEST(p))) printf("%c", (char)c); else printf("\\x{%02x}", c); /* Non-printing values are shown in hex. NOTE(review): isprint() receives the raw value - confirm it always fits unsigned char */
1.1 misha 138: }
139: #endif /* PCRE_DEBUG */
140:
141:
142:
143: /*************************************************
144: * Match a back-reference *
145: *************************************************/
146:
1.6 misha 147: /* Normally, if a back reference hasn't been set, the length that is passed is
148: negative, so the match always fails. However, in JavaScript compatibility mode,
149: the length passed is zero. Note that in caseless UTF-8 mode, the number of
150: subject bytes matched may be different to the number of reference bytes.
1.1 misha 151:
152: Arguments:
153: offset index into the offset vector
1.6 misha 154: eptr pointer into the subject
155: length length of reference to be matched (number of bytes)
1.1 misha 156: md points to match data block
1.6 misha 157: caseless TRUE if caseless
1.1 misha 158:
1.7 misha 159: Returns: >= 0 the number of subject bytes matched
160: -1 no match
161: -2 partial match; always given if at end subject
1.1 misha 162: */
163:
1.6 misha 164: static int
165: match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
166: BOOL caseless)
1.1 misha 167: {
1.6 misha 168: PCRE_PUCHAR eptr_start = eptr; /* Kept so the matched length can be returned */
169: register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset]; /* Start of the reference text */
1.8 moko 170: #if defined SUPPORT_UTF && defined SUPPORT_UCP
1.7 misha 171: BOOL utf = md->utf;
172: #endif
1.1 misha 173:
1.4 misha 174: #ifdef PCRE_DEBUG
1.1 misha 175: if (eptr >= md->end_subject)
176: printf("matching subject <null>");
177: else
178: {
179: printf("matching subject ");
180: pchars(eptr, length, TRUE, md);
181: }
182: printf(" against backref ");
183: pchars(p, length, FALSE, md);
184: printf("\n");
185: #endif /* PCRE_DEBUG */
186:
1.7 misha 187: /* Always fail if reference not set (and not JavaScript compatible - in that
188: case the length is passed as zero). */
1.1 misha 189:
1.6 misha 190: if (length < 0) return -1;
1.1 misha 191:
1.2 misha 192: /* Separate the caseless case for speed. In UTF-8 mode we can only do this
193: properly if Unicode properties are supported. Otherwise, we can check only
194: ASCII characters. */
1.1 misha 195:
1.6 misha 196: if (caseless)
1.1 misha 197: {
1.8 moko 198: #if defined SUPPORT_UTF && defined SUPPORT_UCP
1.7 misha 199: if (utf)
1.2 misha 200: {
1.6 misha 201: /* Match characters up to the end of the reference. NOTE: the number of
1.7 misha 202: data units matched may differ, because in UTF-8 there are some characters
203: whose upper and lower case versions have different numbers of bytes.
204: For example, U+023A (2 bytes in UTF-8) is the upper case version of U+2C65
205: (3 bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a
206: sequence of two of the latter. It is important, therefore, to check the
207: length along the reference, not along the subject (earlier code did this
208: wrong). */
1.6 misha 209:
210: PCRE_PUCHAR endptr = p + length; /* End of the reference, not of the subject */
211: while (p < endptr)
1.2 misha 212: {
1.7 misha 213: pcre_uint32 c, d; /* c: subject character, d: reference character */
214: const ucd_record *ur;
215: if (eptr >= md->end_subject) return -2; /* Partial match */
1.2 misha 216: GETCHARINC(c, eptr);
217: GETCHARINC(d, p);
1.7 misha 218: ur = GET_UCD(d);
219: if (c != d && c != d + ur->other_case) /* Neither identical nor d plus its other_case delta */
220: {
221: const pcre_uint32 *pp = PRIV(ucd_caseless_sets) + ur->caseset; /* Fall back to d's caseless set */
222: for (;;)
223: {
224: if (c < *pp) return -1; /* NOTE(review): assumes ascending sets ending in a value above any c - confirm */
225: if (c == *pp++) break; /* c is a member of the set */
226: }
227: }
1.2 misha 228: }
229: }
230: else
231: #endif
232:
233: /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
234: is no UCP support. */
1.6 misha 235: {
236: while (length-- > 0)
237: {
1.7 misha 238: pcre_uint32 cc, cp; /* cc: subject data unit, cp: reference data unit */
239: if (eptr >= md->end_subject) return -2; /* Partial match */
1.8 moko 240: cc = UCHAR21TEST(eptr);
241: cp = UCHAR21TEST(p);
1.7 misha 242: if (TABLE_GET(cp, md->lcc, cp) != TABLE_GET(cc, md->lcc, cc)) return -1; /* Compare after mapping through md->lcc */
1.6 misha 243: p++;
244: eptr++;
245: }
246: }
1.1 misha 247: }
1.2 misha 248:
249: /* In the caseful case, we can just compare the bytes, whether or not we
250: are in UTF-8 mode. */
251:
1.1 misha 252: else
1.6 misha 253: {
1.7 misha 254: while (length-- > 0)
255: {
256: if (eptr >= md->end_subject) return -2; /* Partial match */
1.8 moko 257: if (UCHAR21INCTEST(p) != UCHAR21INCTEST(eptr)) return -1; /* Presumably advances both pointers - confirm macro in pcre_internal.h */
1.7 misha 258: }
1.6 misha 259: }
1.1 misha 260:
1.6 misha 261: return (int)(eptr - eptr_start); /* Subject data units consumed */
1.1 misha 262: }
263:
264:
265:
266: /***************************************************************************
267: ****************************************************************************
268: RECURSION IN THE match() FUNCTION
269:
270: The match() function is highly recursive, though not every recursive call
271: increases the recursive depth. Nevertheless, some regular expressions can cause
272: it to recurse to a great depth. I was writing for Unix, so I just let it call
273: itself recursively. This uses the stack for saving everything that has to be
274: saved for a recursive call. On Unix, the stack can be large, and this works
275: fine.
276:
277: It turns out that on some non-Unix-like systems there are problems with
278: programs that use a lot of stack. (This despite the fact that every last chip
279: has oodles of memory these days, and techniques for extending the stack have
280: been known for decades.) So....
281:
282: There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
283: calls by keeping local variables that need to be preserved in blocks of memory
284: obtained from malloc() instead of on the stack. Macros are used to
285: achieve this so that the actual code doesn't look very different to what it
286: always used to.
287:
288: The original heap-recursive code used longjmp(). However, it seems that this
289: can be very slow on some operating systems. Following a suggestion from Stan
290: Switzer, the use of longjmp() has been abolished, at the cost of having to
291: provide a unique number for each call to RMATCH. There is no way of generating
292: a sequence of numbers at compile time in C. I have given them names, to make
293: them stand out more clearly.
294:
295: Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
296: FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
297: tests. Furthermore, not using longjmp() means that local dynamic variables
298: don't have indeterminate values; this has meant that the frame size can be
299: reduced because the result can be "passed back" by straight setting of the
300: variable instead of being passed in the frame.
301: ****************************************************************************
302: ***************************************************************************/
303:
304: /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
305: below must be updated in sync. */
306:
307: enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
308: RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
309: RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
310: RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
311: RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
1.4 misha 312: RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
1.8 moko 313: RM61, RM62, RM63, RM64, RM65, RM66, RM67 };
1.1 misha 314:
315: /* These versions of the macros use the stack, as normal. There are debugging
316: versions and production versions. Note that the "rw" argument of RMATCH isn't
1.4 misha 317: actually used in this definition. */
1.1 misha 318:
319: #ifndef NO_RECURSE
320: #define REGISTER register
321:
1.4 misha 322: #ifdef PCRE_DEBUG
1.6 misha 323: #define RMATCH(ra,rb,rc,rd,re,rw) \
1.1 misha 324: { \
325: printf("match() called in line %d\n", __LINE__); \
1.6 misha 326: rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); /* Genuine recursive call; result lands in rrc */ \
1.1 misha 327: printf("to line %d\n", __LINE__); \
328: }
329: #define RRETURN(ra) \
330: { \
1.7 misha 331: printf("match() returned %d from line %d\n", ra, __LINE__); \
1.1 misha 332: return ra; \
333: }
334: #else /* PCRE_DEBUG not defined */
1.6 misha 335: #define RMATCH(ra,rb,rc,rd,re,rw) \
336: rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
1.1 misha 337: #define RRETURN(ra) return ra
338: #endif /* PCRE_DEBUG */
339:
340: #else
341:
342:
343: /* These versions of the macros manage a private stack on the heap. Note that
344: the "rd" argument of RMATCH isn't actually used in this definition. It's the md
345: argument of match(), which never changes. */
346:
347: #define REGISTER
348:
1.6 misha 349: #define RMATCH(ra,rb,rc,rd,re,rw)\
1.1 misha 350: {\
1.7 misha 351: heapframe *newframe = frame->Xnextframe; /* Reuse an already allocated frame if there is one */\
352: if (newframe == NULL)\
353: {\
354: newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
355: if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
356: newframe->Xnextframe = NULL;\
357: frame->Xnextframe = newframe; /* Chain it so a later RMATCH from this frame can reuse it */\
358: }\
359: frame->Xwhere = rw; /* Remember this call site's RM number in the calling frame */\
1.1 misha 360: newframe->Xeptr = ra;\
361: newframe->Xecode = rb;\
362: newframe->Xmstart = mstart;\
363: newframe->Xoffset_top = rc;\
1.6 misha 364: newframe->Xeptrb = re;\
1.1 misha 365: newframe->Xrdepth = frame->Xrdepth + 1;\
366: newframe->Xprevframe = frame; /* Link back to the calling frame for RRETURN */\
367: frame = newframe;\
368: DPRINTF(("restarting from line %d\n", __LINE__));\
369: goto HEAP_RECURSE;\
370: L_##rw: /* Label named after this call site's RM number */\
371: DPRINTF(("jumped back to line %d\n", __LINE__));\
372: }
373:
374: #define RRETURN(ra)\
375: {\
1.4 misha 376: heapframe *oldframe = frame;\
377: frame = oldframe->Xprevframe; /* Step back to the calling frame */\
1.1 misha 378: if (frame != NULL)\
379: {\
380: rrc = ra;\
381: goto HEAP_RETURN;\
382: }\
383: return ra; /* No calling frame: really return from match() */\
384: }
385:
386:
387: /* Structure for remembering the local variables in a private frame */
388:
389: typedef struct heapframe {
390: struct heapframe *Xprevframe; /* Calling frame; set by RMATCH and followed by RRETURN */
1.7 misha 391: struct heapframe *Xnextframe; /* Next frame in the chain; allocated on demand by RMATCH */
1.1 misha 392:
393: /* Function arguments that may change */
394:
1.6 misha 395: PCRE_PUCHAR Xeptr;
396: const pcre_uchar *Xecode;
397: PCRE_PUCHAR Xmstart;
1.1 misha 398: int Xoffset_top;
399: eptrblock *Xeptrb;
400: unsigned int Xrdepth; /* RMATCH stores the calling frame's Xrdepth + 1 here */
401:
402: /* Function local variables */
403:
1.6 misha 404: PCRE_PUCHAR Xcallpat;
405: #ifdef SUPPORT_UTF
406: PCRE_PUCHAR Xcharptr;
407: #endif
408: PCRE_PUCHAR Xdata;
409: PCRE_PUCHAR Xnext;
410: PCRE_PUCHAR Xpp;
411: PCRE_PUCHAR Xprev;
412: PCRE_PUCHAR Xsaved_eptr;
1.1 misha 413:
414: recursion_info Xnew_recursive;
415:
416: BOOL Xcur_is_word;
417: BOOL Xcondition;
418: BOOL Xprev_is_word;
419:
420: #ifdef SUPPORT_UCP
421: int Xprop_type;
1.7 misha 422: unsigned int Xprop_value;
1.1 misha 423: int Xprop_fail_result;
424: int Xoclength;
1.6 misha 425: pcre_uchar Xocchars[6];
1.1 misha 426: #endif
427:
1.3 misha 428: int Xcodelink;
1.1 misha 429: int Xctype;
430: unsigned int Xfc;
431: int Xfi;
432: int Xlength;
433: int Xmax;
434: int Xmin;
1.7 misha 435: unsigned int Xnumber;
1.1 misha 436: int Xoffset;
1.7 misha 437: unsigned int Xop;
438: pcre_int32 Xsave_capture_last;
1.1 misha 439: int Xsave_offset1, Xsave_offset2, Xsave_offset3;
440: int Xstacksave[REC_STACK_SAVE_MAX];
441:
442: eptrblock Xnewptrb;
443:
444: /* Where to jump back to */
445:
446: int Xwhere; /* RM number stored by RMATCH before it jumps to HEAP_RECURSE */
447:
448: } heapframe;
449:
450: #endif
451:
452:
453: /***************************************************************************
454: ***************************************************************************/
455:
456:
457:
458: /*************************************************
459: * Match from current position *
460: *************************************************/
461:
462: /* This function is called recursively in many circumstances. Whenever it
463: returns a negative (error) response, the outer incarnation must also return the
1.4 misha 464: same response. */
465:
466: /* These macros pack up tests that are used for partial matching, and which
1.6 misha 467: appear several times in the code. We set the "hit end" flag if the pointer is
1.4 misha 468: at the end of the subject and also past the start of the subject (i.e.
469: something has been matched). For hard partial matching, we then return
470: immediately. The second one is used when we already know we are past the end of
471: the subject. */
472:
473: #define CHECK_PARTIAL()\
1.5 misha 474: if (md->partial != 0 && eptr >= md->end_subject && \
475: eptr > md->start_used_ptr) \
476: { \
477: md->hitend = TRUE; /* Record that the end of the subject was reached */ \
1.6 misha 478: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); /* Hard partial matching: return immediately */ \
1.4 misha 479: }
1.1 misha 480:
1.4 misha 481: #define SCHECK_PARTIAL()\
1.5 misha 482: if (md->partial != 0 && eptr > md->start_used_ptr) /* No end_subject test: caller already knows eptr is past the end */ \
483: { \
484: md->hitend = TRUE; \
1.6 misha 485: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); /* Hard partial matching: return immediately */ \
1.4 misha 486: }
487:
488:
489: /* Performance note: It might be tempting to extract commonly used fields from
1.6 misha 490: the md structure (e.g. utf, end_subject) into individual variables to improve
1.1 misha 491: performance. Tests using gcc on a SPARC disproved this; in the first case, it
492: made performance worse.
493:
494: Arguments:
495: eptr pointer to current character in subject
496: ecode pointer to current position in compiled code
497: mstart pointer to the current match start position (can be modified
498: by encountering \K)
499: offset_top current top pointer
500: md pointer to "static" info for the match
501: eptrb pointer to chain of blocks containing eptr at start of
502: brackets - for testing for empty matches
503: rdepth the recursion depth
504:
505: Returns: MATCH_MATCH if matched ) these values are >= 0
506: MATCH_NOMATCH if failed to match )
1.4 misha 507: a negative MATCH_xxx value for PRUNE, SKIP, etc
1.1 misha 508: a negative PCRE_ERROR_xxx value if aborted by an error condition
509: (e.g. stopped by repeated call or recursion limit)
510: */
511:
512: static int
1.6 misha 513: match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
514: PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
515: unsigned int rdepth)
1.1 misha 516: {
517: /* These variables do not need to be preserved over recursion in this function,
518: so they can be ordinary variables in all cases. Mark some of them with
519: "register" because they are used a lot in loops. */
520:
521: register int rrc; /* Returns from recursive calls */
522: register int i; /* Used for loops not involving calls to RMATCH() */
1.7 misha 523: register pcre_uint32 c; /* Character values not kept over RMATCH() calls */
1.6 misha 524: register BOOL utf; /* Local copy of UTF flag for speed */
1.1 misha 525:
526: BOOL minimize, possessive; /* Quantifier options */
1.6 misha 527: BOOL caseless;
1.3 misha 528: int condcode;
1.1 misha 529:
530: /* When recursion is not being used, all "local" variables that have to be
1.6 misha 531: preserved over calls to RMATCH() are part of a "frame". We set up the top-level
532: frame on the stack here; subsequent instantiations are obtained from the heap
533: whenever RMATCH() does a "recursion". See the macro definitions above. Putting
534: the top-level on the stack rather than malloc-ing them all gives a performance
535: boost in many cases where there is not much "recursion". */
1.1 misha 536:
537: #ifdef NO_RECURSE
1.7 misha 538: heapframe *frame = (heapframe *)md->match_frames_base;
1.1 misha 539:
540: /* Copy in the original argument variables */
541:
542: frame->Xeptr = eptr;
543: frame->Xecode = ecode;
544: frame->Xmstart = mstart;
545: frame->Xoffset_top = offset_top;
546: frame->Xeptrb = eptrb;
547: frame->Xrdepth = rdepth;
548:
549: /* This is where control jumps back to in order to effect "recursion" */
550:
551: HEAP_RECURSE:
552:
553: /* Macros make the argument variables come from the current frame */
554:
555: #define eptr frame->Xeptr
556: #define ecode frame->Xecode
557: #define mstart frame->Xmstart
558: #define offset_top frame->Xoffset_top
559: #define eptrb frame->Xeptrb
560: #define rdepth frame->Xrdepth
561:
562: /* Ditto for the local variables */
563:
1.6 misha 564: #ifdef SUPPORT_UTF
1.1 misha 565: #define charptr frame->Xcharptr
566: #endif
567: #define callpat frame->Xcallpat
1.3 misha 568: #define codelink frame->Xcodelink
1.1 misha 569: #define data frame->Xdata
570: #define next frame->Xnext
571: #define pp frame->Xpp
572: #define prev frame->Xprev
573: #define saved_eptr frame->Xsaved_eptr
574:
575: #define new_recursive frame->Xnew_recursive
576:
577: #define cur_is_word frame->Xcur_is_word
578: #define condition frame->Xcondition
579: #define prev_is_word frame->Xprev_is_word
580:
581: #ifdef SUPPORT_UCP
582: #define prop_type frame->Xprop_type
583: #define prop_value frame->Xprop_value
584: #define prop_fail_result frame->Xprop_fail_result
585: #define oclength frame->Xoclength
586: #define occhars frame->Xocchars
587: #endif
588:
589: #define ctype frame->Xctype
590: #define fc frame->Xfc
591: #define fi frame->Xfi
592: #define length frame->Xlength
593: #define max frame->Xmax
594: #define min frame->Xmin
595: #define number frame->Xnumber
596: #define offset frame->Xoffset
597: #define op frame->Xop
598: #define save_capture_last frame->Xsave_capture_last
599: #define save_offset1 frame->Xsave_offset1
600: #define save_offset2 frame->Xsave_offset2
601: #define save_offset3 frame->Xsave_offset3
602: #define stacksave frame->Xstacksave
603:
604: #define newptrb frame->Xnewptrb
605:
606: /* When recursion is being used, local variables are allocated on the stack and
607: get preserved during recursion in the normal way. In this environment, fi and
608: i, and fc and c, can be the same variables. */
609:
610: #else /* NO_RECURSE not defined */
611: #define fi i
612: #define fc c
613:
1.6 misha 614: /* Many of the following variables are used only in small blocks of the code.
615: My normal style of coding would have declared them within each of those blocks.
616: However, in order to accommodate the version of this code that uses an external
617: "stack" implemented on the heap, it is easier to declare them all here, so the
618: declarations can be cut out in a block. The only declarations within blocks
619: below are for variables that do not have to be preserved over a recursive call
620: to RMATCH(). */
621:
622: #ifdef SUPPORT_UTF
623: const pcre_uchar *charptr;
624: #endif
625: const pcre_uchar *callpat;
626: const pcre_uchar *data;
627: const pcre_uchar *next;
628: PCRE_PUCHAR pp;
629: const pcre_uchar *prev;
630: PCRE_PUCHAR saved_eptr;
631:
632: recursion_info new_recursive;
1.1 misha 633:
1.6 misha 634: BOOL cur_is_word;
1.1 misha 635: BOOL condition;
636: BOOL prev_is_word;
637:
638: #ifdef SUPPORT_UCP
639: int prop_type;
1.7 misha 640: unsigned int prop_value;
1.1 misha 641: int prop_fail_result;
642: int oclength;
1.6 misha 643: pcre_uchar occhars[6];
1.1 misha 644: #endif
645:
1.3 misha 646: int codelink;
1.1 misha 647: int ctype;
648: int length;
649: int max;
650: int min;
1.7 misha 651: unsigned int number;
1.1 misha 652: int offset;
1.7 misha 653: unsigned int op;
654: pcre_int32 save_capture_last;
1.1 misha 655: int save_offset1, save_offset2, save_offset3;
656: int stacksave[REC_STACK_SAVE_MAX];
657:
658: eptrblock newptrb;
1.6 misha 659:
660: /* There is a special fudge for calling match() in a way that causes it to
661: measure the size of its basic stack frame when the stack is being used for
662: recursion. The second argument (ecode) being NULL triggers this behaviour. It
663: cannot normally ever be NULL. The return is the negated value of the frame
664: size. */
665:
666: if (ecode == NULL)
667: {
668: if (rdepth == 0)
669: return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
670: else
671: {
1.9 ! moko 672: int len = (int)((char *)&rdepth - (char *)eptr);
1.6 misha 673: return (len > 0)? -len : len;
674: }
675: }
1.1 misha 676: #endif /* NO_RECURSE */
677:
1.6 misha 678: /* To save space on the stack and in the heap frame, I have doubled up on some
679: of the local variables that are used only in localised parts of the code, but
680: still need to be preserved over recursive calls of match(). These macros define
681: the alternative names that are used. */
682:
683: #define allow_zero cur_is_word
684: #define cbegroup condition
685: #define code_offset codelink
686: #define condassert condition
687: #define matched_once prev_is_word
688: #define foc number
689: #define save_mark data
690:
1.1 misha 691: /* These statements are here to stop the compiler complaining about uninitialized
692: variables. */
693:
694: #ifdef SUPPORT_UCP
695: prop_value = 0;
696: prop_fail_result = 0;
697: #endif
698:
699:
700: /* This label is used for tail recursion, which is used in a few cases even
701: when NO_RECURSE is not defined, in order to reduce the amount of stack that is
702: used. Thanks to Ian Taylor for noticing this possibility and sending the
703: original patch. */
704:
705: TAIL_RECURSE:
706:
707: /* OK, now we can get on with the real code of the function. Recursive calls
708: are specified by the macro RMATCH and RRETURN is used to return. When
709: NO_RECURSE is *not* defined, these just turn into a recursive call to match()
1.4 misha 710: and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
1.1 misha 711: defined). However, RMATCH isn't like a function call because it's quite a
712: complicated macro. It has to be used in one particular way. This shouldn't,
713: however, impact performance when true recursion is being used. */
714:
1.6 misha 715: #ifdef SUPPORT_UTF
716: utf = md->utf; /* Local copy of the flag */
1.1 misha 717: #else
1.6 misha 718: utf = FALSE;
1.1 misha 719: #endif
720:
721: /* First check that we haven't called match() too many times, or that we
722: haven't exceeded the recursive call limit. */
723:
724: if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
725: if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
726:
727: /* At the start of a group with an unlimited repeat that may match an empty
1.6 misha 728: string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
729: done this way to save having to use another function argument, which would take
730: up space on the stack. See also MATCH_CONDASSERT below.
731:
732: When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
733: such remembered pointers, to be checked when we hit the closing ket, in order
734: to break infinite loops that match no characters. When match() is called in
735: other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
736: NOT be used with tail recursion, because the memory block that is used is on
737: the stack, so a new one may be required for each match(). */
1.1 misha 738:
1.6 misha 739: if (md->match_function_type == MATCH_CBEGROUP)
1.1 misha 740: {
741: newptrb.epb_saved_eptr = eptr;
742: newptrb.epb_prev = eptrb;
743: eptrb = &newptrb;
1.6 misha 744: md->match_function_type = 0;
1.1 misha 745: }
746:
747: /* Now start processing the opcodes. */
748:
749: for (;;)
750: {
751: minimize = possessive = FALSE;
752: op = *ecode;
753:
1.4 misha 754: switch(op)
755: {
756: case OP_MARK:
1.6 misha 757: md->nomatch_mark = ecode + 2;
758: md->mark = NULL; /* In case previously set by assertion */
759: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
760: eptrb, RM55);
761: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
762: md->mark == NULL) md->mark = ecode + 2;
1.4 misha 763:
764: /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
765: argument, and we must check whether that argument matches this MARK's
766: argument. It is passed back in md->start_match_ptr (an overloading of that
767: variable). If it does match, we reset that variable to the current subject
768: position and return MATCH_SKIP. Otherwise, pass back the return code
769: unaltered. */
770:
1.6 misha 771: else if (rrc == MATCH_SKIP_ARG &&
1.7 misha 772: STRCMP_UC_UC_TEST(ecode + 2, md->start_match_ptr) == 0)
1.4 misha 773: {
774: md->start_match_ptr = eptr;
775: RRETURN(MATCH_SKIP);
776: }
777: RRETURN(rrc);
1.1 misha 778:
779: case OP_FAIL:
1.6 misha 780: RRETURN(MATCH_NOMATCH);
1.4 misha 781:
782: case OP_COMMIT:
1.6 misha 783: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
784: eptrb, RM52);
1.7 misha 785: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 786: RRETURN(MATCH_COMMIT);
1.1 misha 787:
788: case OP_PRUNE:
1.6 misha 789: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
790: eptrb, RM51);
1.7 misha 791: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 792: RRETURN(MATCH_PRUNE);
1.1 misha 793:
1.4 misha 794: case OP_PRUNE_ARG:
1.6 misha 795: md->nomatch_mark = ecode + 2;
796: md->mark = NULL; /* In case previously set by assertion */
797: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
798: eptrb, RM56);
799: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
800: md->mark == NULL) md->mark = ecode + 2;
1.7 misha 801: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 802: RRETURN(MATCH_PRUNE);
1.1 misha 803:
804: case OP_SKIP:
1.6 misha 805: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
806: eptrb, RM53);
1.7 misha 807: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misha 808: md->start_match_ptr = eptr; /* Pass back current position */
1.6 misha 809: RRETURN(MATCH_SKIP);
810:
811: /* Note that, for Perl compatibility, SKIP with an argument does NOT set
1.7 misha 812: nomatch_mark. When a pattern match ends with a SKIP_ARG for which there was
813: not a matching mark, we have to re-run the match, ignoring the SKIP_ARG
814: that failed and any that precede it (either they also failed, or were not
815: triggered). To do this, we maintain a count of executed SKIP_ARGs. If a
816: SKIP_ARG gets to top level, the match is re-run with md->ignore_skip_arg
817: set to the count of the one that failed. */
1.4 misha 818:
819: case OP_SKIP_ARG:
1.7 misha 820: md->skip_arg_count++;
821: if (md->skip_arg_count <= md->ignore_skip_arg)
1.6 misha 822: {
823: ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
824: break;
825: }
826: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
827: eptrb, RM57);
1.7 misha 828: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 829:
830: /* Pass back the current skip name by overloading md->start_match_ptr and
831: returning the special MATCH_SKIP_ARG return code. This will either be
1.6 misha 832: caught by a matching MARK, or get to the top, where it causes a rematch
1.7 misha 833: with md->ignore_skip_arg set to the value of md->skip_arg_count. */
1.4 misha 834:
835: md->start_match_ptr = ecode + 2;
836: RRETURN(MATCH_SKIP_ARG);
1.1 misha 837:
1.6 misha 838: /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
839: the branch in which it occurs can be determined. Overload the start of
840: match pointer to do this. */
1.5 misha 841:
1.1 misha 842: case OP_THEN:
1.6 misha 843: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
844: eptrb, RM54);
1.1 misha 845: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 846: md->start_match_ptr = ecode;
847: RRETURN(MATCH_THEN);
1.4 misha 848:
849: case OP_THEN_ARG:
1.6 misha 850: md->nomatch_mark = ecode + 2;
851: md->mark = NULL; /* In case previously set by assertion */
852: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
853: md, eptrb, RM58);
854: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
855: md->mark == NULL) md->mark = ecode + 2;
1.4 misha 856: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 857: md->start_match_ptr = ecode;
1.1 misha 858: RRETURN(MATCH_THEN);
859:
1.6 misha 860: /* Handle an atomic group that does not contain any capturing parentheses.
861: This can be handled like an assertion. Prior to 8.13, all atomic groups
862: were handled this way. In 8.13, the code was changed as below for ONCE, so
863: that backups pass through the group and thereby reset captured values.
864: However, this uses a lot more stack, so in 8.20, atomic groups that do not
865: contain any captures generate OP_ONCE_NC, which can be handled in the old,
866: less stack intensive way.
867:
868: Check the alternative branches in turn - the matching won't pass the KET
869: for this kind of subpattern. If any one branch matches, we carry on as at
870: the end of a normal bracket, leaving the subject pointer, but resetting
871: the start-of-match value in case it was changed by \K. */
872:
873: case OP_ONCE_NC:
874: prev = ecode;
875: saved_eptr = eptr;
876: save_mark = md->mark;
877: do
878: {
879: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
880: if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
881: {
882: mstart = md->start_match_ptr;
883: break;
884: }
885: if (rrc == MATCH_THEN)
886: {
887: next = ecode + GET(ecode,1);
888: if (md->start_match_ptr < next &&
889: (*ecode == OP_ALT || *next == OP_ALT))
890: rrc = MATCH_NOMATCH;
891: }
892:
893: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
894: ecode += GET(ecode,1);
895: md->mark = save_mark;
896: }
897: while (*ecode == OP_ALT);
898:
899: /* If we hit the end of the group (which could be repeated), fail */
900:
901: if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
902:
903: /* Continue as from after the group, updating the offsets high water
904: mark, since extracts may have been taken. */
905:
906: do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
907:
908: offset_top = md->end_offset_top;
909: eptr = md->end_match_ptr;
910:
911: /* For a non-repeating ket, just continue at this level. This also
912: happens for a repeating ket if no characters were matched in the group.
913: This is the forcible breaking of infinite loops as implemented in Perl
914: 5.005. */
915:
916: if (*ecode == OP_KET || eptr == saved_eptr)
917: {
918: ecode += 1+LINK_SIZE;
919: break;
920: }
921:
922: /* The repeating kets try the rest of the pattern or restart from the
923: preceding bracket, in the appropriate order. The second "call" of match()
924: uses tail recursion, to avoid using another stack frame. */
925:
926: if (*ecode == OP_KETRMIN)
927: {
928: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
929: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
930: ecode = prev;
931: goto TAIL_RECURSE;
932: }
933: else /* OP_KETRMAX */
934: {
935: RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
936: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
937: ecode += 1 + LINK_SIZE;
938: goto TAIL_RECURSE;
939: }
940: /* Control never gets here */
941:
942: /* Handle a capturing bracket, other than those that are possessive with an
943: unlimited repeat. If there is space in the offset vector, save the current
944: subject position in the working slot at the top of the vector. We mustn't
945: change the current values of the data slot, because they may be set from a
946: previous iteration of this group, and be referred to by a reference inside
947: the group. A failure to match might occur after the group has succeeded,
948: if something later on doesn't match. For this reason, we need to restore
949: the working value and also the values of the final offsets, in case they
950: were set by a previous iteration of the same bracket.
1.1 misha 951:
952: If there isn't enough space in the offset vector, treat this as if it were
953: a non-capturing bracket. Don't worry about setting the flag for the error
954: case here; that is handled in the code for KET. */
955:
956: case OP_CBRA:
957: case OP_SCBRA:
958: number = GET2(ecode, 1+LINK_SIZE);
959: offset = number << 1;
960:
1.4 misha 961: #ifdef PCRE_DEBUG
1.1 misha 962: printf("start bracket %d\n", number);
963: printf("subject=");
964: pchars(eptr, 16, TRUE, md);
965: printf("\n");
966: #endif
967:
968: if (offset < md->offset_max)
969: {
970: save_offset1 = md->offset_vector[offset];
971: save_offset2 = md->offset_vector[offset+1];
972: save_offset3 = md->offset_vector[md->offset_end - number];
973: save_capture_last = md->capture_last;
1.6 misha 974: save_mark = md->mark;
1.1 misha 975:
976: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1.4 misha 977: md->offset_vector[md->offset_end - number] =
978: (int)(eptr - md->start_subject);
1.1 misha 979:
1.6 misha 980: for (;;)
1.1 misha 981: {
1.6 misha 982: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
983: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
984: eptrb, RM1);
985: if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
986:
987: /* If we backed up to a THEN, check whether it is within the current
988: branch by comparing the address of the THEN that is passed back with
989: the end of the branch. If it is within the current branch, and the
990: branch is one of two or more alternatives (it either starts or ends
991: with OP_ALT), we have reached the limit of THEN's action, so convert
992: the return code to NOMATCH, which will cause normal backtracking to
993: happen from now on. Otherwise, THEN is passed back to an outer
994: alternative. This implements Perl's treatment of parenthesized groups,
995: where a group not containing | does not affect the current alternative,
996: that is, (X) is NOT the same as (X|(*F)). */
997:
998: if (rrc == MATCH_THEN)
999: {
1000: next = ecode + GET(ecode,1);
1001: if (md->start_match_ptr < next &&
1002: (*ecode == OP_ALT || *next == OP_ALT))
1003: rrc = MATCH_NOMATCH;
1004: }
1005:
1006: /* Anything other than NOMATCH is passed back. */
1007:
1008: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misha 1009: md->capture_last = save_capture_last;
1010: ecode += GET(ecode, 1);
1.6 misha 1011: md->mark = save_mark;
1012: if (*ecode != OP_ALT) break;
1.1 misha 1013: }
1014:
1015: DPRINTF(("bracket %d failed\n", number));
1016: md->offset_vector[offset] = save_offset1;
1017: md->offset_vector[offset+1] = save_offset2;
1018: md->offset_vector[md->offset_end - number] = save_offset3;
1019:
1.6 misha 1020: /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
1021:
1022: RRETURN(rrc);
1.1 misha 1023: }
1024:
1025: /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
1026: as a non-capturing bracket. */
1027:
1028: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1029: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1030:
1031: DPRINTF(("insufficient capture room: treat as non-capturing\n"));
1032:
1033: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1034: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1035:
1.6 misha 1036: /* Non-capturing or atomic group, except for possessive with unlimited
1037: repeat and ONCE group with no captures. Loop for all the alternatives.
1.1 misha 1038:
1.6 misha 1039: When we get to the final alternative within the brackets, we used to return
1040: the result of a recursive call to match() whatever happened so it was
1041: possible to reduce stack usage by turning this into a tail recursion,
1042: except in the case of a possibly empty group. However, now that there is
1043: the possibility of (*THEN) occurring in the final alternative, this
1044: optimization is no longer always possible.
1045:
1046: We can optimize if we know there are no (*THEN)s in the pattern; at present
1047: this is the best that can be done.
1048:
1049: MATCH_ONCE is returned when the end of an atomic group is successfully
1050: reached, but subsequent matching fails. It passes back up the tree (causing
1051: captured values to be reset) until the original atomic group level is
1052: reached. This is tested by comparing md->once_target with the start of the
1053: group. At this point, the return is converted into MATCH_NOMATCH so that
1054: previous backup points can be taken. */
1055:
1056: case OP_ONCE:
1.1 misha 1057: case OP_BRA:
1058: case OP_SBRA:
1059: DPRINTF(("start non-capturing bracket\n"));
1.6 misha 1060:
1.1 misha 1061: for (;;)
1062: {
1.7 misha 1063: if (op >= OP_SBRA || op == OP_ONCE)
1064: md->match_function_type = MATCH_CBEGROUP;
1.6 misha 1065:
1066: /* If this is not a possibly empty group, and there are no (*THEN)s in
1067: the pattern, and this is the final alternative, optimize as described
1068: above. */
1069:
1070: else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1.1 misha 1071: {
1.6 misha 1072: ecode += PRIV(OP_lengths)[*ecode];
1073: goto TAIL_RECURSE;
1074: }
1075:
1076: /* In all other cases, we have to make another call to match(). */
1077:
1078: save_mark = md->mark;
1.7 misha 1079: save_capture_last = md->capture_last;
1.6 misha 1080: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
1081: RM2);
1082:
1083: /* See comment in the code for capturing groups above about handling
1084: THEN. */
1085:
1086: if (rrc == MATCH_THEN)
1087: {
1088: next = ecode + GET(ecode,1);
1089: if (md->start_match_ptr < next &&
1090: (*ecode == OP_ALT || *next == OP_ALT))
1091: rrc = MATCH_NOMATCH;
1092: }
1093:
1094: if (rrc != MATCH_NOMATCH)
1095: {
1096: if (rrc == MATCH_ONCE)
1.1 misha 1097: {
1.6 misha 1098: const pcre_uchar *scode = ecode;
1099: if (*scode != OP_ONCE) /* If not at start, find it */
1100: {
1101: while (*scode == OP_ALT) scode += GET(scode, 1);
1102: scode -= GET(scode, 1);
1103: }
1104: if (md->once_target == scode) rrc = MATCH_NOMATCH;
1.1 misha 1105: }
1.6 misha 1106: RRETURN(rrc);
1107: }
1108: ecode += GET(ecode, 1);
1109: md->mark = save_mark;
1110: if (*ecode != OP_ALT) break;
1.7 misha 1111: md->capture_last = save_capture_last;
1.6 misha 1112: }
1113:
1114: RRETURN(MATCH_NOMATCH);
1115:
1116: /* Handle possessive capturing brackets with an unlimited repeat. We come
1117: here from BRAZERO with allow_zero set TRUE. The offset_vector values are
1118: handled similarly to the normal case above. However, the matching is
1119: different. The end of these brackets will always be OP_KETRPOS, which
1120: returns MATCH_KETRPOS without going further in the pattern. By this means
1121: we can handle the group by iteration rather than recursion, thereby
1122: reducing the amount of stack needed. */
1.1 misha 1123:
1.6 misha 1124: case OP_CBRAPOS:
1125: case OP_SCBRAPOS:
1126: allow_zero = FALSE;
1.1 misha 1127:
1.6 misha 1128: POSSESSIVE_CAPTURE:
1129: number = GET2(ecode, 1+LINK_SIZE);
1130: offset = number << 1;
1131:
1132: #ifdef PCRE_DEBUG
1133: printf("start possessive bracket %d\n", number);
1134: printf("subject=");
1135: pchars(eptr, 16, TRUE, md);
1136: printf("\n");
1137: #endif
1138:
1.8 moko 1139: if (offset >= md->offset_max) goto POSSESSIVE_NON_CAPTURE;
1140:
1141: matched_once = FALSE;
1142: code_offset = (int)(ecode - md->start_code);
1.6 misha 1143:
1.8 moko 1144: save_offset1 = md->offset_vector[offset];
1145: save_offset2 = md->offset_vector[offset+1];
1146: save_offset3 = md->offset_vector[md->offset_end - number];
1147: save_capture_last = md->capture_last;
1.6 misha 1148:
1.8 moko 1149: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1.6 misha 1150:
1.8 moko 1151: /* Each time round the loop, save the current subject position for use
1152: when the group matches. For MATCH_MATCH, the group has matched, so we
1153: restart it with a new subject starting position, remembering that we had
1154: at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
1155: usual. If we haven't matched any alternatives in any iteration, check to
1156: see if a previous iteration matched. If so, the group has matched;
1157: continue from afterwards. Otherwise it has failed; restore the previous
1158: capture values before returning NOMATCH. */
1.6 misha 1159:
1.8 moko 1160: for (;;)
1161: {
1162: md->offset_vector[md->offset_end - number] =
1163: (int)(eptr - md->start_subject);
1164: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1165: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1166: eptrb, RM63);
1167: if (rrc == MATCH_KETRPOS)
1.6 misha 1168: {
1.8 moko 1169: offset_top = md->end_offset_top;
1170: ecode = md->start_code + code_offset;
1171: save_capture_last = md->capture_last;
1172: matched_once = TRUE;
1173: mstart = md->start_match_ptr; /* In case \K changed it */
1174: if (eptr == md->end_match_ptr) /* Matched an empty string */
1.6 misha 1175: {
1.8 moko 1176: do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1177: break;
1.6 misha 1178: }
1.8 moko 1179: eptr = md->end_match_ptr;
1180: continue;
1.6 misha 1181: }
1182:
1.8 moko 1183: /* See comment in the code for capturing groups above about handling
1184: THEN. */
1.6 misha 1185:
1.8 moko 1186: if (rrc == MATCH_THEN)
1.6 misha 1187: {
1.8 moko 1188: next = ecode + GET(ecode,1);
1189: if (md->start_match_ptr < next &&
1190: (*ecode == OP_ALT || *next == OP_ALT))
1191: rrc = MATCH_NOMATCH;
1.1 misha 1192: }
1193:
1.8 moko 1194: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1195: md->capture_last = save_capture_last;
1196: ecode += GET(ecode, 1);
1197: if (*ecode != OP_ALT) break;
1.6 misha 1198: }
1199:
1.8 moko 1200: if (!matched_once)
1201: {
1202: md->offset_vector[offset] = save_offset1;
1203: md->offset_vector[offset+1] = save_offset2;
1204: md->offset_vector[md->offset_end - number] = save_offset3;
1205: }
1.6 misha 1206:
1.8 moko 1207: if (allow_zero || matched_once)
1208: {
1209: ecode += 1 + LINK_SIZE;
1210: break;
1211: }
1.1 misha 1212:
1.8 moko 1213: RRETURN(MATCH_NOMATCH);
1.6 misha 1214:
1215: /* Non-capturing possessive bracket with unlimited repeat. We come here
1216: from BRAZERO with allow_zero = TRUE. The code is similar to the above,
1217: without the capturing complication. It is written out separately for speed
1218: and cleanliness. */
1219:
1220: case OP_BRAPOS:
1221: case OP_SBRAPOS:
1222: allow_zero = FALSE;
1223:
1224: POSSESSIVE_NON_CAPTURE:
1225: matched_once = FALSE;
1226: code_offset = (int)(ecode - md->start_code);
1.7 misha 1227: save_capture_last = md->capture_last;
1.6 misha 1228:
1229: for (;;)
1230: {
1231: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
1232: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
1233: eptrb, RM48);
1234: if (rrc == MATCH_KETRPOS)
1235: {
1236: offset_top = md->end_offset_top;
1237: ecode = md->start_code + code_offset;
1238: matched_once = TRUE;
1.8 moko 1239: mstart = md->start_match_ptr; /* In case \K reset it */
1240: if (eptr == md->end_match_ptr) /* Matched an empty string */
1241: {
1242: do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1243: break;
1244: }
1245: eptr = md->end_match_ptr;
1.6 misha 1246: continue;
1247: }
1248:
1249: /* See comment in the code for capturing groups above about handling
1250: THEN. */
1251:
1252: if (rrc == MATCH_THEN)
1253: {
1254: next = ecode + GET(ecode,1);
1255: if (md->start_match_ptr < next &&
1256: (*ecode == OP_ALT || *next == OP_ALT))
1257: rrc = MATCH_NOMATCH;
1258: }
1259:
1260: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misha 1261: ecode += GET(ecode, 1);
1.6 misha 1262: if (*ecode != OP_ALT) break;
1.7 misha 1263: md->capture_last = save_capture_last;
1.1 misha 1264: }
1.6 misha 1265:
1266: if (matched_once || allow_zero)
1267: {
1268: ecode += 1 + LINK_SIZE;
1269: break;
1270: }
1271: RRETURN(MATCH_NOMATCH);
1272:
1.1 misha 1273: /* Control never reaches here. */
1274:
1.8 moko 1275: /* Conditional group: compilation checked that there are no more than two
1276: branches. If the condition is false, skipping the first branch takes us
1277: past the end of the item if there is only one branch, but that's exactly
1278: what we want. */
1.1 misha 1279:
1280: case OP_COND:
1281: case OP_SCOND:
1.8 moko 1282:
1283: /* The variable codelink will be added to ecode when the condition is
1284: false, to get to the second branch. Setting it to the offset to the ALT
1285: or KET, then incrementing ecode achieves this effect. We now have ecode
1286: pointing to the condition or callout. */
1287:
1288: codelink = GET(ecode, 1); /* Offset to the second branch */
1289: ecode += 1 + LINK_SIZE; /* From this opcode */
1.3 misha 1290:
1291: /* Because of the way auto-callout works during compile, a callout item is
1292: inserted between OP_COND and an assertion condition. */
1293:
1.8 moko 1294: if (*ecode == OP_CALLOUT)
1.3 misha 1295: {
1.6 misha 1296: if (PUBL(callout) != NULL)
1.3 misha 1297: {
1.6 misha 1298: PUBL(callout_block) cb;
1299: cb.version = 2; /* Version 2 of the callout block */
1.8 moko 1300: cb.callout_number = ecode[1];
1.3 misha 1301: cb.offset_vector = md->offset_vector;
1.7 misha 1302: #if defined COMPILE_PCRE8
1.3 misha 1303: cb.subject = (PCRE_SPTR)md->start_subject;
1.7 misha 1304: #elif defined COMPILE_PCRE16
1.6 misha 1305: cb.subject = (PCRE_SPTR16)md->start_subject;
1.7 misha 1306: #elif defined COMPILE_PCRE32
1307: cb.subject = (PCRE_SPTR32)md->start_subject;
1.6 misha 1308: #endif
1.4 misha 1309: cb.subject_length = (int)(md->end_subject - md->start_subject);
1310: cb.start_match = (int)(mstart - md->start_subject);
1311: cb.current_position = (int)(eptr - md->start_subject);
1.8 moko 1312: cb.pattern_position = GET(ecode, 2);
1313: cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1.3 misha 1314: cb.capture_top = offset_top/2;
1.7 misha 1315: cb.capture_last = md->capture_last & CAPLMASK;
1316: /* Internal change requires this for API compatibility. */
1317: if (cb.capture_last == 0) cb.capture_last = -1;
1.3 misha 1318: cb.callout_data = md->callout_data;
1.6 misha 1319: cb.mark = md->nomatch_mark;
1320: if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1.3 misha 1321: if (rrc < 0) RRETURN(rrc);
1322: }
1.8 moko 1323:
1324: /* Advance ecode past the callout, so it now points to the condition. We
1325: must adjust codelink so that the value of ecode+codelink is unchanged. */
1326:
1.6 misha 1327: ecode += PRIV(OP_lengths)[OP_CALLOUT];
1.7 misha 1328: codelink -= PRIV(OP_lengths)[OP_CALLOUT];
1.3 misha 1329: }
1330:
1.8 moko 1331: /* Test the various possible conditions */
1.3 misha 1332:
1.8 moko 1333: condition = FALSE;
1334: switch(condcode = *ecode)
1.1 misha 1335: {
1.8 moko 1336: case OP_RREF: /* Numbered group recursion test */
1337: if (md->recursive != NULL) /* Not recursing => FALSE */
1.4 misha 1338: {
1.8 moko 1339: unsigned int recno = GET2(ecode, 1); /* Recursion group number*/
1340: condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1.4 misha 1341: }
1.8 moko 1342: break;
1343:
1344: case OP_DNRREF: /* Duplicate named group recursion test */
1345: if (md->recursive != NULL)
1.4 misha 1346: {
1.8 moko 1347: int count = GET2(ecode, 1 + IMM2_SIZE);
1348: pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1349: while (count-- > 0)
1350: {
1351: unsigned int recno = GET2(slot, 0);
1352: condition = recno == md->recursive->group_num;
1353: if (condition) break;
1354: slot += md->name_entry_size;
1.4 misha 1355: }
1356: }
1.8 moko 1357: break;
1.1 misha 1358:
1.8 moko 1359: case OP_CREF: /* Numbered group used test */
1360: offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1.1 misha 1361: condition = offset < offset_top && md->offset_vector[offset] >= 0;
1.8 moko 1362: break;
1.4 misha 1363:
1.8 moko 1364: case OP_DNCREF: /* Duplicate named group used test */
1.4 misha 1365: {
1.8 moko 1366: int count = GET2(ecode, 1 + IMM2_SIZE);
1367: pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
1368: while (count-- > 0)
1369: {
1370: offset = GET2(slot, 0) << 1;
1371: condition = offset < offset_top && md->offset_vector[offset] >= 0;
1372: if (condition) break;
1373: slot += md->name_entry_size;
1.4 misha 1374: }
1375: }
1.8 moko 1376: break;
1.4 misha 1377:
1.8 moko 1378: case OP_DEF: /* DEFINE - always false */
1379: case OP_FAIL: /* From optimized (?!) condition */
1380: break;
1.1 misha 1381:
1.8 moko 1382: /* The condition is an assertion. Call match() to evaluate it - setting
1383: md->match_function_type to MATCH_CONDASSERT causes it to stop at the end
1384: of an assertion. */
1.1 misha 1385:
1.8 moko 1386: default:
1.6 misha 1387: md->match_function_type = MATCH_CONDASSERT;
1.8 moko 1388: RMATCH(eptr, ecode, offset_top, md, NULL, RM3);
1.1 misha 1389: if (rrc == MATCH_MATCH)
1390: {
1.6 misha 1391: if (md->end_offset_top > offset_top)
1392: offset_top = md->end_offset_top; /* Captures may have happened */
1.1 misha 1393: condition = TRUE;
1.8 moko 1394:
1395: /* Advance ecode past the assertion to the start of the first branch,
1396: but adjust it so that the general choosing code below works. If the
1397: assertion has a quantifier that allows zero repeats we must skip over
1398: the BRAZERO. This is a lunatic thing to do, but somebody did! */
1399:
1400: if (*ecode == OP_BRAZERO) ecode++;
1401: ecode += GET(ecode, 1);
1.1 misha 1402: while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1.8 moko 1403: ecode += 1 + LINK_SIZE - PRIV(OP_lengths)[condcode];
1.1 misha 1404: }
1.6 misha 1405:
1406: /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
1.8 moko 1407: assertion; it is therefore treated as NOMATCH. Any other return is an
1408: error. */
1.6 misha 1409:
1410: else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1.1 misha 1411: {
1412: RRETURN(rrc); /* Need braces because of following else */
1413: }
1.8 moko 1414: break;
1.1 misha 1415: }
1416:
1.8 moko 1417: /* Choose branch according to the condition */
1.1 misha 1418:
1.8 moko 1419: ecode += condition? PRIV(OP_lengths)[condcode] : codelink;
1420:
1421: /* We are now at the branch that is to be obeyed. As there is only one, we
1422: can use tail recursion to avoid using another stack frame, except when
1423: there is unlimited repeat of a possibly empty group. In the latter case, a
1424: recursive call to match() is always required, unless the second alternative
1425: doesn't exist, in which case we can just plough on. Note that, for
1426: compatibility with Perl, the | in a conditional group is NOT treated as
1427: creating two alternatives. If a THEN is encountered in the branch, it
1428: propagates out to the enclosing alternative (unless nested in a deeper set
1429: of alternatives, of course). */
1430:
1431: if (condition || ecode[-(1+LINK_SIZE)] == OP_ALT)
1.1 misha 1432: {
1.6 misha 1433: if (op != OP_SCOND)
1.1 misha 1434: {
1435: goto TAIL_RECURSE;
1436: }
1.6 misha 1437:
1438: md->match_function_type = MATCH_CBEGROUP;
1.8 moko 1439: RMATCH(eptr, ecode, offset_top, md, eptrb, RM49);
1.6 misha 1440: RRETURN(rrc);
1.1 misha 1441: }
1.6 misha 1442:
1443: /* Condition false & no alternative; continue after the group. */
1444:
1445: else
1.1 misha 1446: {
1447: }
1448: break;
1449:
1450:
1.4 misha 1451: /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1452: to close any currently open capturing brackets. */
1453:
1454: case OP_CLOSE:
1.7 misha 1455: number = GET2(ecode, 1); /* Must be less than 65536 */
1.4 misha 1456: offset = number << 1;
1457:
1458: #ifdef PCRE_DEBUG
1459: printf("end bracket %d at *ACCEPT", number);
1460: printf("\n");
1461: #endif
1462:
1.7 misha 1463: md->capture_last = (md->capture_last & OVFLMASK) | number;
1464: if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1.4 misha 1465: {
1466: md->offset_vector[offset] =
1467: md->offset_vector[md->offset_end - number];
1468: md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1.8 moko 1469:
1470: /* If this group is at or above the current highwater mark, ensure that
1471: any groups between the current high water mark and this group are marked
1472: unset and then update the high water mark. */
1473:
1474: if (offset >= offset_top)
1475: {
1476: register int *iptr = md->offset_vector + offset_top;
1477: register int *iend = md->offset_vector + offset;
1478: while (iptr < iend) *iptr++ = -1;
1479: offset_top = offset + 2;
1480: }
1.4 misha 1481: }
1.6 misha 1482: ecode += 1 + IMM2_SIZE;
1.4 misha 1483: break;
1484:
1485:
1.6 misha 1486: /* End of the pattern, either real or forced. */
1.1 misha 1487:
1.6 misha 1488: case OP_END:
1.1 misha 1489: case OP_ACCEPT:
1.6 misha 1490: case OP_ASSERT_ACCEPT:
1.1 misha 1491:
1.6 misha 1492: /* If we have matched an empty string, fail if not in an assertion and not
1493: in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
1494: is set and we have matched at the start of the subject. In both cases,
1495: backtracking will then try other alternatives, if any. */
1496:
1497: if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
1498: md->recursive == NULL &&
1499: (md->notempty ||
1500: (md->notempty_atstart &&
1501: mstart == md->start_subject + md->start_offset)))
1502: RRETURN(MATCH_NOMATCH);
1.4 misha 1503:
1504: /* Otherwise, we have a match. */
1.1 misha 1505:
1506: md->end_match_ptr = eptr; /* Record where we ended */
1507: md->end_offset_top = offset_top; /* and how many extracts were taken */
1508: md->start_match_ptr = mstart; /* and the start (\K can modify) */
1.4 misha 1509:
1510: /* For some reason, the macros don't work properly if an expression is
1.6 misha 1511: given as the argument to RRETURN when the heap is in use. */
1.4 misha 1512:
1513: rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1.6 misha 1514: RRETURN(rrc);
1.1 misha 1515:
1516: /* Assertion brackets. Check the alternative branches in turn - the
1517: matching won't pass the KET for an assertion. If any one branch matches,
1518: the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1519: start of each branch to move the current point backwards, so the code at
1.6 misha 1520: this level is identical to the lookahead case. When the assertion is part
1521: of a condition, we want to return immediately afterwards. The caller of
1522: this incarnation of the match() function will have set MATCH_CONDASSERT in
1523: md->match_function_type, and one of these opcodes will be the first opcode
1524: that is processed. We use a local variable that is preserved over calls to
1525: match() to remember this case. */
1.1 misha 1526:
1527: case OP_ASSERT:
1528: case OP_ASSERTBACK:
1.6 misha 1529: save_mark = md->mark;
1530: if (md->match_function_type == MATCH_CONDASSERT)
1531: {
1532: condassert = TRUE;
1533: md->match_function_type = 0;
1534: }
1535: else condassert = FALSE;
1536:
1.7 misha 1537: /* Loop for each branch */
1538:
1.1 misha 1539: do
1540: {
1.6 misha 1541: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1.7 misha 1542:
1543: /* A match means that the assertion is true; break out of the loop
1544: that matches its alternatives. */
1545:
1.4 misha 1546: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1547: {
1548: mstart = md->start_match_ptr; /* In case \K reset it */
1549: break;
1550: }
1.6 misha 1551:
1.7 misha 1552: /* If not matched, restore the previous mark setting. */
1553:
1554: md->mark = save_mark;
1555:
1556: /* See comment in the code for capturing groups above about handling
1557: THEN. */
1558:
1559: if (rrc == MATCH_THEN)
1560: {
1561: next = ecode + GET(ecode,1);
1562: if (md->start_match_ptr < next &&
1563: (*ecode == OP_ALT || *next == OP_ALT))
1564: rrc = MATCH_NOMATCH;
1565: }
1566:
1567: /* Anything other than NOMATCH causes the entire assertion to fail,
1568: passing back the return code. This includes COMMIT, SKIP, PRUNE and an
1569: uncaptured THEN, which means they take their normal effect. This
1570: consistent approach does not always have exactly the same effect as in
1571: Perl. */
1.6 misha 1572:
1.7 misha 1573: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misha 1574: ecode += GET(ecode, 1);
1575: }
1.7 misha 1576: while (*ecode == OP_ALT); /* Continue for next alternative */
1577:
1578: /* If we have tried all the alternative branches, the assertion has
1579: failed. If not, we broke out after a match. */
1.6 misha 1580:
1581: if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1.1 misha 1582:
1583: /* If checking an assertion for a condition, return MATCH_MATCH. */
1584:
1.6 misha 1585: if (condassert) RRETURN(MATCH_MATCH);
1.1 misha 1586:
1.7 misha 1587: /* Continue from after a successful assertion, updating the offsets high
1588: water mark, since extracts may have been taken during the assertion. */
1.1 misha 1589:
1590: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1591: ecode += 1 + LINK_SIZE;
1592: offset_top = md->end_offset_top;
1593: continue;
1594:
1.7 misha 1595: /* Negative assertion: all branches must fail to match for the assertion to
1596: succeed. */
1.1 misha 1597:
1598: case OP_ASSERT_NOT:
1599: case OP_ASSERTBACK_NOT:
1.6 misha 1600: save_mark = md->mark;
1601: if (md->match_function_type == MATCH_CONDASSERT)
1602: {
1603: condassert = TRUE;
1604: md->match_function_type = 0;
1605: }
1606: else condassert = FALSE;
1607:
1.7 misha 1608: /* Loop for each alternative branch. */
1609:
1.1 misha 1610: do
1611: {
1.6 misha 1612: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
1.7 misha 1613: md->mark = save_mark; /* Always restore the mark setting */
1614:
1615: switch(rrc)
1.4 misha 1616: {
1.7 misha 1617: case MATCH_MATCH: /* A successful match means */
1618: case MATCH_ACCEPT: /* the assertion has failed. */
1619: RRETURN(MATCH_NOMATCH);
1620:
1621: case MATCH_NOMATCH: /* Carry on with next branch */
1622: break;
1623:
1624: /* See comment in the code for capturing groups above about handling
1625: THEN. */
1626:
1627: case MATCH_THEN:
1628: next = ecode + GET(ecode,1);
1629: if (md->start_match_ptr < next &&
1630: (*ecode == OP_ALT || *next == OP_ALT))
1631: {
1632: rrc = MATCH_NOMATCH;
1633: break;
1634: }
1635: /* Otherwise fall through. */
1636:
1637: /* COMMIT, SKIP, PRUNE, and an uncaptured THEN cause the whole
1638: assertion to fail to match, without considering any more alternatives.
1639: Failing to match means the assertion is true. This is a consistent
1640: approach, but does not always have the same effect as in Perl. */
1641:
1642: case MATCH_COMMIT:
1643: case MATCH_SKIP:
1644: case MATCH_SKIP_ARG:
1645: case MATCH_PRUNE:
1.4 misha 1646: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1.7 misha 1647: goto NEG_ASSERT_TRUE; /* Break out of alternation loop */
1648:
1649: /* Anything else is an error */
1650:
1651: default:
1652: RRETURN(rrc);
1.4 misha 1653: }
1.6 misha 1654:
1.7 misha 1655: /* Continue with next branch */
1.6 misha 1656:
1.1 misha 1657: ecode += GET(ecode,1);
1658: }
1659: while (*ecode == OP_ALT);
1660:
1.7 misha 1661: /* All branches in the assertion failed to match. */
1662:
1663: NEG_ASSERT_TRUE:
1.6 misha 1664: if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1.7 misha 1665: ecode += 1 + LINK_SIZE; /* Continue with current branch */
1.1 misha 1666: continue;
1667:
1668: /* Move the subject pointer back. This occurs only at the start of
1669: each branch of a lookbehind assertion. If we are too close to the start to
1670: move back, this match function fails. When working with UTF-8 we move
1671: back a number of characters, not bytes. */
1672:
1673: case OP_REVERSE:
1.6 misha 1674: #ifdef SUPPORT_UTF
1675: if (utf)
1.1 misha 1676: {
1677: i = GET(ecode, 1);
1678: while (i-- > 0)
1679: {
1680: eptr--;
1.6 misha 1681: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1.1 misha 1682: BACKCHAR(eptr);
1683: }
1684: }
1685: else
1686: #endif
1687:
1688: /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1689:
1690: {
1691: eptr -= GET(ecode, 1);
1.6 misha 1692: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1.1 misha 1693: }
1694:
1.4 misha 1695: /* Save the earliest consulted character, then skip to next op code */
1.1 misha 1696:
1.4 misha 1697: if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1.1 misha 1698: ecode += 1 + LINK_SIZE;
1699: break;
1700:
1701: /* The callout item calls an external function, if one is provided, passing
1702: details of the match so far. This is mainly for debugging, though the
1703: function is able to force a failure. */
1704:
1705: case OP_CALLOUT:
1.6 misha 1706: if (PUBL(callout) != NULL)
1.1 misha 1707: {
1.6 misha 1708: PUBL(callout_block) cb;
1709: cb.version = 2; /* Version 2 of the callout block */
1.1 misha 1710: cb.callout_number = ecode[1];
1711: cb.offset_vector = md->offset_vector;
1.7 misha 1712: #if defined COMPILE_PCRE8
1.1 misha 1713: cb.subject = (PCRE_SPTR)md->start_subject;
1.7 misha 1714: #elif defined COMPILE_PCRE16
1.6 misha 1715: cb.subject = (PCRE_SPTR16)md->start_subject;
1.7 misha 1716: #elif defined COMPILE_PCRE32
1717: cb.subject = (PCRE_SPTR32)md->start_subject;
1.6 misha 1718: #endif
1.4 misha 1719: cb.subject_length = (int)(md->end_subject - md->start_subject);
1720: cb.start_match = (int)(mstart - md->start_subject);
1721: cb.current_position = (int)(eptr - md->start_subject);
1.1 misha 1722: cb.pattern_position = GET(ecode, 2);
1723: cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1724: cb.capture_top = offset_top/2;
1.7 misha 1725: cb.capture_last = md->capture_last & CAPLMASK;
1726: /* Internal change requires this for API compatibility. */
1727: if (cb.capture_last == 0) cb.capture_last = -1;
1.1 misha 1728: cb.callout_data = md->callout_data;
1.6 misha 1729: cb.mark = md->nomatch_mark;
1730: if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1.1 misha 1731: if (rrc < 0) RRETURN(rrc);
1732: }
1733: ecode += 2 + 2*LINK_SIZE;
1734: break;
1735:
1736: /* Recursion either matches the current regex, or some subexpression. The
1737: offset data is the offset to the starting bracket from the start of the
1738: whole pattern. (This is so that it works from duplicated subpatterns.)
1739:
1.6 misha 1740: The state of the capturing groups is preserved over recursion, and
1741: re-instated afterwards. We don't know how many are started and not yet
1742: finished (offset_top records the completed total) so we just have to save
1743: all the potential data. There may be up to 65535 such values, which is too
1744: large to put on the stack, but using malloc for small numbers seems
1745: expensive. As a compromise, the stack is used when there are no more than
1746: REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1.1 misha 1747:
1748: There are also other values that have to be saved. We use a chained
1749: sequence of blocks that actually live on the stack. Thanks to Robin Houston
1.6 misha 1750: for the original version of this logic. It has, however, been hacked around
1751: a lot, so he is not to blame for the current way it works. */
1.1 misha 1752:
1753: case OP_RECURSE:
1754: {
1.6 misha 1755: recursion_info *ri;
1.7 misha 1756: unsigned int recno;
1.6 misha 1757:
1.1 misha 1758: callpat = md->start_code + GET(ecode, 1);
1.6 misha 1759: recno = (callpat == md->start_code)? 0 :
1.1 misha 1760: GET2(callpat, 1 + LINK_SIZE);
1761:
1.6 misha 1762: /* Check for repeating a recursion without advancing the subject pointer.
1763: This should catch convoluted mutual recursions. (Some simple cases are
1764: caught at compile time.) */
1765:
1766: for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
1767: if (recno == ri->group_num && eptr == ri->subject_position)
1768: RRETURN(PCRE_ERROR_RECURSELOOP);
1769:
1.1 misha 1770: /* Add to "recursing stack" */
1771:
1.6 misha 1772: new_recursive.group_num = recno;
1.7 misha 1773: new_recursive.saved_capture_last = md->capture_last;
1.6 misha 1774: new_recursive.subject_position = eptr;
1.1 misha 1775: new_recursive.prevrec = md->recursive;
1776: md->recursive = &new_recursive;
1777:
1.6 misha 1778: /* Where to continue from afterwards */
1.1 misha 1779:
1780: ecode += 1 + LINK_SIZE;
1781:
1.6 misha 1782: /* Now save the offset data */
1.1 misha 1783:
1784: new_recursive.saved_max = md->offset_end;
1785: if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1786: new_recursive.offset_save = stacksave;
1787: else
1788: {
1789: new_recursive.offset_save =
1.6 misha 1790: (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1.1 misha 1791: if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1792: }
1793: memcpy(new_recursive.offset_save, md->offset_vector,
1794: new_recursive.saved_max * sizeof(int));
1795:
1.6 misha 1796: /* OK, now we can do the recursion. After processing each alternative,
1.7 misha 1797: restore the offset data and the last captured value. If there were nested
1798: recursions, md->recursive might be changed, so reset it before looping.
1799: */
1.1 misha 1800:
1801: DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1.6 misha 1802: cbegroup = (*callpat >= OP_SBRA);
1.1 misha 1803: do
1804: {
1.6 misha 1805: if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
1806: RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
1807: md, eptrb, RM6);
1808: memcpy(md->offset_vector, new_recursive.offset_save,
1809: new_recursive.saved_max * sizeof(int));
1.7 misha 1810: md->capture_last = new_recursive.saved_capture_last;
1.6 misha 1811: md->recursive = new_recursive.prevrec;
1.4 misha 1812: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1.1 misha 1813: {
1814: DPRINTF(("Recursion matched\n"));
1815: if (new_recursive.offset_save != stacksave)
1.6 misha 1816: (PUBL(free))(new_recursive.offset_save);
1817:
1818: /* Set where we got to in the subject, and reset the start in case
1819: it was changed by \K. This *is* propagated back out of a recursion,
1820: for Perl compatibility. */
1821:
1822: eptr = md->end_match_ptr;
1823: mstart = md->start_match_ptr;
1824: goto RECURSION_MATCHED; /* Exit loop; end processing */
1.1 misha 1825: }
1.6 misha 1826:
1.7 misha 1827: /* PCRE does not allow THEN, SKIP, PRUNE or COMMIT to escape beyond a
1828: recursion; they cause a NOMATCH for the entire recursion. These codes
1829: are defined in a range that can be tested for. */
1830:
1831: if (rrc >= MATCH_BACKTRACK_MIN && rrc <= MATCH_BACKTRACK_MAX)
1.8 moko 1832: {
1833: if (new_recursive.offset_save != stacksave)
1834: (PUBL(free))(new_recursive.offset_save);
1.7 misha 1835: RRETURN(MATCH_NOMATCH);
1.8 moko 1836: }
1.7 misha 1837:
1838: /* Any return code other than NOMATCH is an error. */
1.6 misha 1839:
1.7 misha 1840: if (rrc != MATCH_NOMATCH)
1.1 misha 1841: {
1842: DPRINTF(("Recursion gave error %d\n", rrc));
1.3 misha 1843: if (new_recursive.offset_save != stacksave)
1.6 misha 1844: (PUBL(free))(new_recursive.offset_save);
1.1 misha 1845: RRETURN(rrc);
1846: }
1847:
1848: md->recursive = &new_recursive;
1849: callpat += GET(callpat, 1);
1850: }
1851: while (*callpat == OP_ALT);
1852:
1853: DPRINTF(("Recursion didn't match\n"));
1854: md->recursive = new_recursive.prevrec;
1855: if (new_recursive.offset_save != stacksave)
1.6 misha 1856: (PUBL(free))(new_recursive.offset_save);
1857: RRETURN(MATCH_NOMATCH);
1.1 misha 1858: }
1859:
1.6 misha 1860: RECURSION_MATCHED:
1861: break;
1.1 misha 1862:
1863: /* An alternation is the end of a branch; scan along to find the end of the
1864: bracketed group and go to there. */
1865:
1866: case OP_ALT:
1867: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1868: break;
1869:
1870: /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1871: indicating that it may occur zero times. It may repeat infinitely, or not
1872: at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1873: with fixed upper repeat limits are compiled as a number of copies, with the
1874: optional ones preceded by BRAZERO or BRAMINZERO. */
1875:
1876: case OP_BRAZERO:
1.6 misha 1877: next = ecode + 1;
1878: RMATCH(eptr, next, offset_top, md, eptrb, RM10);
1879: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1880: do next += GET(next, 1); while (*next == OP_ALT);
1881: ecode = next + 1 + LINK_SIZE;
1.1 misha 1882: break;
1883:
1884: case OP_BRAMINZERO:
1.6 misha 1885: next = ecode + 1;
1886: do next += GET(next, 1); while (*next == OP_ALT);
1887: RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
1888: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1889: ecode++;
1.1 misha 1890: break;
1891:
1892: case OP_SKIPZERO:
1.6 misha 1893: next = ecode+1;
1894: do next += GET(next,1); while (*next == OP_ALT);
1895: ecode = next + 1 + LINK_SIZE;
1.1 misha 1896: break;
1897:
1.6 misha 1898: /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
1899: here; just jump to the group, with allow_zero set TRUE. */
1900:
1901: case OP_BRAPOSZERO:
1902: op = *(++ecode);
1903: allow_zero = TRUE;
1904: if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
1905: goto POSSESSIVE_NON_CAPTURE;
1906:
1.1 misha 1907: /* End of a group, repeated or non-repeating. */
1908:
1909: case OP_KET:
1910: case OP_KETRMIN:
1911: case OP_KETRMAX:
1.6 misha 1912: case OP_KETRPOS:
1.1 misha 1913: prev = ecode - GET(ecode, 1);
1914:
1915: /* If this was a group that remembered the subject start, in order to break
1916: infinite repeats of empty string matches, retrieve the subject start from
1917: the chain. Otherwise, set it NULL. */
1918:
1.6 misha 1919: if (*prev >= OP_SBRA || *prev == OP_ONCE)
1.1 misha 1920: {
1921: saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1922: eptrb = eptrb->epb_prev; /* Backup to previous group */
1923: }
1924: else saved_eptr = NULL;
1925:
1.6 misha 1926: /* If we are at the end of an assertion group or a non-capturing atomic
1927: group, stop matching and return MATCH_MATCH, but record the current high
1928: water mark for use by positive assertions. We also need to record the match
1929: start in case it was changed by \K. */
1930:
1931: if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
1932: *prev == OP_ONCE_NC)
1.1 misha 1933: {
1.6 misha 1934: md->end_match_ptr = eptr; /* For ONCE_NC */
1.1 misha 1935: md->end_offset_top = offset_top;
1.4 misha 1936: md->start_match_ptr = mstart;
1.6 misha 1937: RRETURN(MATCH_MATCH); /* Sets md->mark */
1.1 misha 1938: }
1939:
1940: /* For capturing groups we have to check the group number back at the start
1941: and if necessary complete handling an extraction by setting the offsets and
1.6 misha 1942: bumping the high water mark. Whole-pattern recursion is coded as a recurse
1943: into group 0, so it won't be picked up here. Instead, we catch it when the
1944: OP_END is reached. Other recursion is handled here. We just have to record
1945: the current subject position and start match pointer and give a MATCH
1946: return. */
1.1 misha 1947:
1.6 misha 1948: if (*prev == OP_CBRA || *prev == OP_SCBRA ||
1949: *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1.1 misha 1950: {
1951: number = GET2(prev, 1+LINK_SIZE);
1952: offset = number << 1;
1953:
1.4 misha 1954: #ifdef PCRE_DEBUG
1.1 misha 1955: printf("end bracket %d", number);
1956: printf("\n");
1957: #endif
1958:
1.6 misha 1959: /* Handle a recursively called group. */
1960:
1961: if (md->recursive != NULL && md->recursive->group_num == number)
1962: {
1963: md->end_match_ptr = eptr;
1964: md->start_match_ptr = mstart;
1965: RRETURN(MATCH_MATCH);
1966: }
1967:
1968: /* Deal with capturing */
1969:
1.7 misha 1970: md->capture_last = (md->capture_last & OVFLMASK) | number;
1971: if (offset >= md->offset_max) md->capture_last |= OVFLBIT; else
1.1 misha 1972: {
1.6 misha 1973: /* If offset is greater than offset_top, it means that we are
1974: "skipping" a capturing group, and that group's offsets must be marked
1975: unset. In earlier versions of PCRE, all the offsets were unset at the
1976: start of matching, but this doesn't work because atomic groups and
1977: assertions can cause a value to be set that should later be unset.
1978: Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
1979: part of the atomic group, but this is not on the final matching path,
1980: so must be unset when 2 is set. (If there is no group 2, there is no
1981: problem, because offset_top will then be 2, indicating no capture.) */
1982:
1983: if (offset > offset_top)
1984: {
1985: register int *iptr = md->offset_vector + offset_top;
1986: register int *iend = md->offset_vector + offset;
1987: while (iptr < iend) *iptr++ = -1;
1988: }
1989:
1990: /* Now make the extraction */
1991:
1.1 misha 1992: md->offset_vector[offset] =
1993: md->offset_vector[md->offset_end - number];
1.4 misha 1994: md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1.1 misha 1995: if (offset_top <= offset) offset_top = offset + 2;
1996: }
1.6 misha 1997: }
1.1 misha 1998:
1.8 moko 1999: /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
2000: and return the MATCH_KETRPOS. This makes it possible to do the repeats one
2001: at a time from the outer level, thus saving stack. This must precede the
2002: empty string test - in this case that test is done at the outer level. */
2003:
2004: if (*ecode == OP_KETRPOS)
2005: {
2006: md->start_match_ptr = mstart; /* In case \K reset it */
2007: md->end_match_ptr = eptr;
2008: md->end_offset_top = offset_top;
2009: RRETURN(MATCH_KETRPOS);
2010: }
2011:
1.6 misha 2012: /* For an ordinary non-repeating ket, just continue at this level. This
2013: also happens for a repeating ket if no characters were matched in the
2014: group. This is the forcible breaking of infinite loops as implemented in
2015: Perl 5.005. For a non-repeating atomic group that includes captures,
2016: establish a backup point by processing the rest of the pattern at a lower
2017: level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
2018: original OP_ONCE level, thereby bypassing intermediate backup points, but
2019: resetting any captures that happened along the way. */
1.1 misha 2020:
1.6 misha 2021: if (*ecode == OP_KET || eptr == saved_eptr)
2022: {
2023: if (*prev == OP_ONCE)
1.1 misha 2024: {
1.6 misha 2025: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
2026: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2027: md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2028: RRETURN(MATCH_ONCE);
1.1 misha 2029: }
1.6 misha 2030: ecode += 1 + LINK_SIZE; /* Carry on at this level */
2031: break;
1.1 misha 2032: }
2033:
1.6 misha 2034: /* The normal repeating kets try the rest of the pattern or restart from
2035: the preceding bracket, in the appropriate order. In the second case, we can
2036: use tail recursion to avoid using another stack frame, unless we have an
2037: atomic group or an unlimited repeat of a group that can match an empty
2038: string. */
1.1 misha 2039:
2040: if (*ecode == OP_KETRMIN)
2041: {
1.6 misha 2042: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1.1 misha 2043: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 2044: if (*prev == OP_ONCE)
1.1 misha 2045: {
1.6 misha 2046: RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
2047: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2048: md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
2049: RRETURN(MATCH_ONCE);
2050: }
2051: if (*prev >= OP_SBRA) /* Could match an empty string */
2052: {
2053: RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1.1 misha 2054: RRETURN(rrc);
2055: }
2056: ecode = prev;
2057: goto TAIL_RECURSE;
2058: }
2059: else /* OP_KETRMAX */
2060: {
1.6 misha 2061: RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
2062: if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1.1 misha 2063: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 2064: if (*prev == OP_ONCE)
2065: {
2066: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
2067: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2068: md->once_target = prev;
2069: RRETURN(MATCH_ONCE);
2070: }
1.1 misha 2071: ecode += 1 + LINK_SIZE;
2072: goto TAIL_RECURSE;
2073: }
2074: /* Control never gets here */
2075:
1.6 misha 2076: /* Not multiline mode: start of subject assertion, unless notbol. */
1.1 misha 2077:
2078: case OP_CIRC:
1.6 misha 2079: if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1.1 misha 2080:
2081: /* Start of subject assertion */
2082:
2083: case OP_SOD:
1.6 misha 2084: if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
2085: ecode++;
2086: break;
2087:
2088: /* Multiline mode: start of subject unless notbol, or after any newline. */
2089:
2090: case OP_CIRCM:
2091: if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
2092: if (eptr != md->start_subject &&
2093: (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
2094: RRETURN(MATCH_NOMATCH);
1.1 misha 2095: ecode++;
2096: break;
2097:
2098: /* Start of match assertion */
2099:
2100: case OP_SOM:
1.6 misha 2101: if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1.1 misha 2102: ecode++;
2103: break;
2104:
2105: /* Reset the start of match point */
2106:
2107: case OP_SET_SOM:
2108: mstart = eptr;
2109: ecode++;
2110: break;
2111:
1.6 misha 2112: /* Multiline mode: assert before any newline, or before end of subject
2113: unless noteol is set. */
1.1 misha 2114:
1.6 misha 2115: case OP_DOLLM:
2116: if (eptr < md->end_subject)
1.7 misha 2117: {
2118: if (!IS_NEWLINE(eptr))
2119: {
2120: if (md->partial != 0 &&
2121: eptr + 1 >= md->end_subject &&
2122: NLBLOCK->nltype == NLTYPE_FIXED &&
2123: NLBLOCK->nllen == 2 &&
1.8 moko 2124: UCHAR21TEST(eptr) == NLBLOCK->nl[0])
1.7 misha 2125: {
2126: md->hitend = TRUE;
2127: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2128: }
2129: RRETURN(MATCH_NOMATCH);
2130: }
2131: }
1.6 misha 2132: else
1.1 misha 2133: {
1.6 misha 2134: if (md->noteol) RRETURN(MATCH_NOMATCH);
2135: SCHECK_PARTIAL();
1.1 misha 2136: }
1.6 misha 2137: ecode++;
2138: break;
2139:
2140: /* Not multiline mode: assert before a terminating newline or before end of
2141: subject unless noteol is set. */
2142:
2143: case OP_DOLL:
2144: if (md->noteol) RRETURN(MATCH_NOMATCH);
2145: if (!md->endonly) goto ASSERT_NL_OR_EOS;
1.5 misha 2146:
1.1 misha 2147: /* ... else fall through for endonly */
2148:
2149: /* End of subject assertion (\z) */
2150:
2151: case OP_EOD:
1.6 misha 2152: if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1.5 misha 2153: SCHECK_PARTIAL();
1.1 misha 2154: ecode++;
2155: break;
2156:
2157: /* End of subject or ending \n assertion (\Z) */
2158:
2159: case OP_EODN:
1.5 misha 2160: ASSERT_NL_OR_EOS:
2161: if (eptr < md->end_subject &&
1.1 misha 2162: (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1.7 misha 2163: {
2164: if (md->partial != 0 &&
2165: eptr + 1 >= md->end_subject &&
2166: NLBLOCK->nltype == NLTYPE_FIXED &&
2167: NLBLOCK->nllen == 2 &&
1.8 moko 2168: UCHAR21TEST(eptr) == NLBLOCK->nl[0])
1.7 misha 2169: {
2170: md->hitend = TRUE;
2171: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2172: }
1.6 misha 2173: RRETURN(MATCH_NOMATCH);
1.7 misha 2174: }
1.5 misha 2175:
2176: /* Either at end of string or \n before end. */
2177:
2178: SCHECK_PARTIAL();
1.1 misha 2179: ecode++;
2180: break;
2181:
2182: /* Word boundary assertions */
2183:
2184: case OP_NOT_WORD_BOUNDARY:
2185: case OP_WORD_BOUNDARY:
2186: {
2187:
2188: /* Find out if the previous and current characters are "word" characters.
2189: It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1.4 misha 2190: be "non-word" characters. Remember the earliest consulted character for
2191: partial matching. */
1.1 misha 2192:
1.6 misha 2193: #ifdef SUPPORT_UTF
2194: if (utf)
1.1 misha 2195: {
1.4 misha 2196: /* Get status of previous character */
2197:
1.1 misha 2198: if (eptr == md->start_subject) prev_is_word = FALSE; else
2199: {
1.6 misha 2200: PCRE_PUCHAR lastptr = eptr - 1;
2201: BACKCHAR(lastptr);
1.4 misha 2202: if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1.1 misha 2203: GETCHAR(c, lastptr);
1.4 misha 2204: #ifdef SUPPORT_UCP
2205: if (md->use_ucp)
2206: {
2207: if (c == '_') prev_is_word = TRUE; else
2208: {
2209: int cat = UCD_CATEGORY(c);
2210: prev_is_word = (cat == ucp_L || cat == ucp_N);
2211: }
2212: }
2213: else
2214: #endif
1.1 misha 2215: prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2216: }
1.4 misha 2217:
2218: /* Get status of next character */
2219:
2220: if (eptr >= md->end_subject)
2221: {
2222: SCHECK_PARTIAL();
2223: cur_is_word = FALSE;
2224: }
2225: else
1.1 misha 2226: {
2227: GETCHAR(c, eptr);
1.4 misha 2228: #ifdef SUPPORT_UCP
2229: if (md->use_ucp)
2230: {
2231: if (c == '_') cur_is_word = TRUE; else
2232: {
2233: int cat = UCD_CATEGORY(c);
2234: cur_is_word = (cat == ucp_L || cat == ucp_N);
2235: }
2236: }
2237: else
2238: #endif
1.1 misha 2239: cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2240: }
2241: }
2242: else
2243: #endif
2244:
1.4 misha 2245: /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2246: consistency with the behaviour of \w we do use it in this case. */
1.1 misha 2247:
2248: {
1.4 misha 2249: /* Get status of previous character */
2250:
2251: if (eptr == md->start_subject) prev_is_word = FALSE; else
2252: {
2253: if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2254: #ifdef SUPPORT_UCP
2255: if (md->use_ucp)
2256: {
2257: c = eptr[-1];
2258: if (c == '_') prev_is_word = TRUE; else
2259: {
2260: int cat = UCD_CATEGORY(c);
2261: prev_is_word = (cat == ucp_L || cat == ucp_N);
2262: }
2263: }
2264: else
2265: #endif
1.6 misha 2266: prev_is_word = MAX_255(eptr[-1])
2267: && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1.4 misha 2268: }
2269:
2270: /* Get status of next character */
2271:
2272: if (eptr >= md->end_subject)
2273: {
2274: SCHECK_PARTIAL();
2275: cur_is_word = FALSE;
2276: }
2277: else
2278: #ifdef SUPPORT_UCP
2279: if (md->use_ucp)
2280: {
2281: c = *eptr;
2282: if (c == '_') cur_is_word = TRUE; else
2283: {
2284: int cat = UCD_CATEGORY(c);
2285: cur_is_word = (cat == ucp_L || cat == ucp_N);
2286: }
2287: }
2288: else
2289: #endif
1.6 misha 2290: cur_is_word = MAX_255(*eptr)
2291: && ((md->ctypes[*eptr] & ctype_word) != 0);
1.1 misha 2292: }
2293:
2294: /* Now see if the situation is what we want */
2295:
2296: if ((*ecode++ == OP_WORD_BOUNDARY)?
2297: cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1.6 misha 2298: RRETURN(MATCH_NOMATCH);
1.1 misha 2299: }
2300: break;
2301:
1.7 misha 2302: /* Match any single character type except newline; have to take care with
2303: CRLF newlines and partial matching. */
1.1 misha 2304:
2305: case OP_ANY:
1.6 misha 2306: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1.7 misha 2307: if (md->partial != 0 &&
1.9 ! moko 2308: eptr == md->end_subject - 1 &&
1.7 misha 2309: NLBLOCK->nltype == NLTYPE_FIXED &&
2310: NLBLOCK->nllen == 2 &&
1.8 moko 2311: UCHAR21TEST(eptr) == NLBLOCK->nl[0])
1.7 misha 2312: {
2313: md->hitend = TRUE;
2314: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2315: }
2316:
1.1 misha 2317: /* Fall through */
2318:
1.7 misha 2319: /* Match any single character whatsoever. */
2320:
1.1 misha 2321: case OP_ALLANY:
1.6 misha 2322: if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2323: { /* not be updated before SCHECK_PARTIAL. */
1.4 misha 2324: SCHECK_PARTIAL();
1.6 misha 2325: RRETURN(MATCH_NOMATCH);
1.4 misha 2326: }
1.6 misha 2327: eptr++;
2328: #ifdef SUPPORT_UTF
2329: if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
2330: #endif
1.1 misha 2331: ecode++;
2332: break;
2333:
2334: /* Match a single byte, even in UTF-8 mode. This opcode really does match
2335: any byte, even newline, independent of the setting of PCRE_DOTALL. */
2336:
2337: case OP_ANYBYTE:
1.6 misha 2338: if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
2339: { /* not be updated before SCHECK_PARTIAL. */
1.4 misha 2340: SCHECK_PARTIAL();
1.6 misha 2341: RRETURN(MATCH_NOMATCH);
1.4 misha 2342: }
1.6 misha 2343: eptr++;
1.1 misha 2344: ecode++;
2345: break;
2346:
2347: case OP_NOT_DIGIT:
1.4 misha 2348: if (eptr >= md->end_subject)
2349: {
2350: SCHECK_PARTIAL();
1.6 misha 2351: RRETURN(MATCH_NOMATCH);
1.4 misha 2352: }
1.1 misha 2353: GETCHARINCTEST(c, eptr);
2354: if (
1.6 misha 2355: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
1.1 misha 2356: c < 256 &&
2357: #endif
2358: (md->ctypes[c] & ctype_digit) != 0
2359: )
1.6 misha 2360: RRETURN(MATCH_NOMATCH);
1.1 misha 2361: ecode++;
2362: break;
2363:
2364: case OP_DIGIT:
1.4 misha 2365: if (eptr >= md->end_subject)
2366: {
2367: SCHECK_PARTIAL();
1.6 misha 2368: RRETURN(MATCH_NOMATCH);
1.4 misha 2369: }
1.1 misha 2370: GETCHARINCTEST(c, eptr);
2371: if (
1.6 misha 2372: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2373: c > 255 ||
1.1 misha 2374: #endif
2375: (md->ctypes[c] & ctype_digit) == 0
2376: )
1.6 misha 2377: RRETURN(MATCH_NOMATCH);
1.1 misha 2378: ecode++;
2379: break;
2380:
2381: case OP_NOT_WHITESPACE:
1.4 misha 2382: if (eptr >= md->end_subject)
2383: {
2384: SCHECK_PARTIAL();
1.6 misha 2385: RRETURN(MATCH_NOMATCH);
1.4 misha 2386: }
1.1 misha 2387: GETCHARINCTEST(c, eptr);
2388: if (
1.6 misha 2389: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
1.1 misha 2390: c < 256 &&
2391: #endif
2392: (md->ctypes[c] & ctype_space) != 0
2393: )
1.6 misha 2394: RRETURN(MATCH_NOMATCH);
1.1 misha 2395: ecode++;
2396: break;
2397:
2398: case OP_WHITESPACE:
1.4 misha 2399: if (eptr >= md->end_subject)
2400: {
2401: SCHECK_PARTIAL();
1.6 misha 2402: RRETURN(MATCH_NOMATCH);
1.4 misha 2403: }
1.1 misha 2404: GETCHARINCTEST(c, eptr);
2405: if (
1.6 misha 2406: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2407: c > 255 ||
1.1 misha 2408: #endif
2409: (md->ctypes[c] & ctype_space) == 0
2410: )
1.6 misha 2411: RRETURN(MATCH_NOMATCH);
1.1 misha 2412: ecode++;
2413: break;
2414:
2415: case OP_NOT_WORDCHAR:
1.4 misha 2416: if (eptr >= md->end_subject)
2417: {
2418: SCHECK_PARTIAL();
1.6 misha 2419: RRETURN(MATCH_NOMATCH);
1.4 misha 2420: }
1.1 misha 2421: GETCHARINCTEST(c, eptr);
2422: if (
1.6 misha 2423: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
1.1 misha 2424: c < 256 &&
2425: #endif
2426: (md->ctypes[c] & ctype_word) != 0
2427: )
1.6 misha 2428: RRETURN(MATCH_NOMATCH);
1.1 misha 2429: ecode++;
2430: break;
2431:
2432: case OP_WORDCHAR:
1.4 misha 2433: if (eptr >= md->end_subject)
2434: {
2435: SCHECK_PARTIAL();
1.6 misha 2436: RRETURN(MATCH_NOMATCH);
1.4 misha 2437: }
1.1 misha 2438: GETCHARINCTEST(c, eptr);
2439: if (
1.6 misha 2440: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
2441: c > 255 ||
1.1 misha 2442: #endif
2443: (md->ctypes[c] & ctype_word) == 0
2444: )
1.6 misha 2445: RRETURN(MATCH_NOMATCH);
1.1 misha 2446: ecode++;
2447: break;
2448:
2449: case OP_ANYNL:
1.4 misha 2450: if (eptr >= md->end_subject)
2451: {
2452: SCHECK_PARTIAL();
1.6 misha 2453: RRETURN(MATCH_NOMATCH);
1.4 misha 2454: }
1.1 misha 2455: GETCHARINCTEST(c, eptr);
2456: switch(c)
2457: {
1.6 misha 2458: default: RRETURN(MATCH_NOMATCH);
2459:
1.7 misha 2460: case CHAR_CR:
2461: if (eptr >= md->end_subject)
2462: {
2463: SCHECK_PARTIAL();
2464: }
1.8 moko 2465: else if (UCHAR21TEST(eptr) == CHAR_LF) eptr++;
1.1 misha 2466: break;
2467:
1.7 misha 2468: case CHAR_LF:
1.1 misha 2469: break;
2470:
1.7 misha 2471: case CHAR_VT:
2472: case CHAR_FF:
2473: case CHAR_NEL:
2474: #ifndef EBCDIC
1.1 misha 2475: case 0x2028:
2476: case 0x2029:
1.7 misha 2477: #endif /* Not EBCDIC */
1.6 misha 2478: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1.1 misha 2479: break;
2480: }
2481: ecode++;
2482: break;
2483:
2484: case OP_NOT_HSPACE:
1.4 misha 2485: if (eptr >= md->end_subject)
2486: {
2487: SCHECK_PARTIAL();
1.6 misha 2488: RRETURN(MATCH_NOMATCH);
1.4 misha 2489: }
1.1 misha 2490: GETCHARINCTEST(c, eptr);
2491: switch(c)
2492: {
1.7 misha 2493: HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
1.1 misha 2494: default: break;
2495: }
2496: ecode++;
2497: break;
2498:
2499: case OP_HSPACE:
1.4 misha 2500: if (eptr >= md->end_subject)
2501: {
2502: SCHECK_PARTIAL();
1.6 misha 2503: RRETURN(MATCH_NOMATCH);
1.4 misha 2504: }
1.1 misha 2505: GETCHARINCTEST(c, eptr);
2506: switch(c)
2507: {
1.7 misha 2508: HSPACE_CASES: break; /* Byte and multibyte cases */
1.6 misha 2509: default: RRETURN(MATCH_NOMATCH);
1.1 misha 2510: }
2511: ecode++;
2512: break;
2513:
2514: case OP_NOT_VSPACE:
1.4 misha 2515: if (eptr >= md->end_subject)
2516: {
2517: SCHECK_PARTIAL();
1.6 misha 2518: RRETURN(MATCH_NOMATCH);
1.4 misha 2519: }
1.1 misha 2520: GETCHARINCTEST(c, eptr);
2521: switch(c)
2522: {
1.7 misha 2523: VSPACE_CASES: RRETURN(MATCH_NOMATCH);
1.1 misha 2524: default: break;
2525: }
2526: ecode++;
2527: break;
2528:
2529: case OP_VSPACE:
1.4 misha 2530: if (eptr >= md->end_subject)
2531: {
2532: SCHECK_PARTIAL();
1.6 misha 2533: RRETURN(MATCH_NOMATCH);
1.4 misha 2534: }
1.1 misha 2535: GETCHARINCTEST(c, eptr);
2536: switch(c)
2537: {
1.7 misha 2538: VSPACE_CASES: break;
1.6 misha 2539: default: RRETURN(MATCH_NOMATCH);
1.1 misha 2540: }
2541: ecode++;
2542: break;
2543:
2544: #ifdef SUPPORT_UCP
2545: /* Check the next character by Unicode property. We will get here only
2546: if the support is in the binary; otherwise a compile-time error occurs. */
2547:
2548: case OP_PROP:
2549: case OP_NOTPROP:
1.4 misha 2550: if (eptr >= md->end_subject)
2551: {
2552: SCHECK_PARTIAL();
1.6 misha 2553: RRETURN(MATCH_NOMATCH);
1.4 misha 2554: }
1.1 misha 2555: GETCHARINCTEST(c, eptr);
2556: {
1.7 misha 2557: const pcre_uint32 *cp;
1.3 misha 2558: const ucd_record *prop = GET_UCD(c);
1.1 misha 2559:
2560: switch(ecode[1])
2561: {
2562: case PT_ANY:
1.6 misha 2563: if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1.1 misha 2564: break;
2565:
2566: case PT_LAMP:
1.2 misha 2567: if ((prop->chartype == ucp_Lu ||
2568: prop->chartype == ucp_Ll ||
2569: prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1.6 misha 2570: RRETURN(MATCH_NOMATCH);
1.4 misha 2571: break;
1.1 misha 2572:
2573: case PT_GC:
1.6 misha 2574: if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
2575: RRETURN(MATCH_NOMATCH);
1.1 misha 2576: break;
2577:
2578: case PT_PC:
1.2 misha 2579: if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1.6 misha 2580: RRETURN(MATCH_NOMATCH);
1.1 misha 2581: break;
2582:
2583: case PT_SC:
1.2 misha 2584: if ((ecode[2] != prop->script) == (op == OP_PROP))
1.6 misha 2585: RRETURN(MATCH_NOMATCH);
1.4 misha 2586: break;
2587:
2588: /* These are specials */
2589:
2590: case PT_ALNUM:
1.6 misha 2591: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2592: PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2593: RRETURN(MATCH_NOMATCH);
1.4 misha 2594: break;
2595:
1.8 moko 2596: /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2597: which means that Perl space and POSIX space are now identical. PCRE
2598: was changed at release 8.34. */
2599:
1.4 misha 2600: case PT_SPACE: /* Perl space */
1.8 moko 2601: case PT_PXSPACE: /* POSIX space */
2602: switch(c)
2603: {
2604: HSPACE_CASES:
2605: VSPACE_CASES:
2606: if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
2607: break;
1.4 misha 2608:
1.8 moko 2609: default:
2610: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z) ==
2611: (op == OP_NOTPROP)) RRETURN(MATCH_NOMATCH);
2612: break;
2613: }
1.4 misha 2614: break;
2615:
2616: case PT_WORD:
1.6 misha 2617: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
2618: PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.4 misha 2619: c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
1.6 misha 2620: RRETURN(MATCH_NOMATCH);
1.1 misha 2621: break;
2622:
1.7 misha 2623: case PT_CLIST:
2624: cp = PRIV(ucd_caseless_sets) + ecode[2];
2625: for (;;)
2626: {
2627: if (c < *cp)
2628: { if (op == OP_PROP) { RRETURN(MATCH_NOMATCH); } else break; }
2629: if (c == *cp++)
2630: { if (op == OP_PROP) break; else { RRETURN(MATCH_NOMATCH); } }
2631: }
2632: break;
2633:
2634: case PT_UCNC:
2635: if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2636: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2637: c >= 0xe000) == (op == OP_NOTPROP))
2638: RRETURN(MATCH_NOMATCH);
2639: break;
2640:
1.4 misha 2641: /* This should never occur */
2642:
1.1 misha 2643: default:
2644: RRETURN(PCRE_ERROR_INTERNAL);
2645: }
2646:
2647: ecode += 3;
2648: }
2649: break;
2650:
2651: /* Match an extended Unicode sequence. We will get here only if the support
2652: is in the binary; otherwise a compile-time error occurs. */
2653:
2654: case OP_EXTUNI:
1.4 misha 2655: if (eptr >= md->end_subject)
2656: {
2657: SCHECK_PARTIAL();
1.6 misha 2658: RRETURN(MATCH_NOMATCH);
1.4 misha 2659: }
1.7 misha 2660: else
1.1 misha 2661: {
1.7 misha 2662: int lgb, rgb;
2663: GETCHARINCTEST(c, eptr);
2664: lgb = UCD_GRAPHBREAK(c);
2665: while (eptr < md->end_subject)
2666: {
2667: int len = 1;
2668: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
2669: rgb = UCD_GRAPHBREAK(c);
2670: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
2671: lgb = rgb;
2672: eptr += len;
2673: }
1.1 misha 2674: }
1.7 misha 2675: CHECK_PARTIAL();
1.1 misha 2676: ecode++;
2677: break;
1.7 misha 2678: #endif /* SUPPORT_UCP */
1.1 misha 2679:
2680:
2681: /* Match a back reference, possibly repeatedly. Look past the end of the
2682: item to see if there is repeat information following. The code is similar
2683: to that for character classes, but repeated for efficiency. Then obey
2684: similar code to character type repeats - written out again for speed.
2685: However, if the referenced string is the empty string, always treat
2686: it as matched, any number of times (otherwise there could be infinite
1.8 moko 2687: loops). If the reference is unset, there are two possibilities:
1.1 misha 2688:
1.6 misha 2689: (a) In the default, Perl-compatible state, set the length negative;
2690: this ensures that every attempt at a match fails. We can't just fail
2691: here, because of the possibility of quantifiers with zero minima.
1.1 misha 2692:
1.6 misha 2693: (b) If the JavaScript compatibility flag is set, set the length to zero
2694: so that the back reference matches an empty string.
1.1 misha 2695:
1.6 misha 2696: Otherwise, set the length to the length of what was matched by the
1.8 moko 2697: referenced subpattern.
2698:
2699: The OP_REF and OP_REFI opcodes are used for a reference to a numbered group
2700: or to a non-duplicated named group. For a duplicated named group, OP_DNREF
2701: and OP_DNREFI are used. In this case we must scan the list of groups to
2702: which the name refers, and use the first one that is set. */
2703:
2704: case OP_DNREF:
2705: case OP_DNREFI:
2706: caseless = op == OP_DNREFI;
2707: {
2708: int count = GET2(ecode, 1+IMM2_SIZE);
2709: pcre_uchar *slot = md->name_table + GET2(ecode, 1) * md->name_entry_size;
2710: ecode += 1 + 2*IMM2_SIZE;
2711:
2712: /* Setting the default length first and initializing 'offset' avoids
2713: compiler warnings in the REF_REPEAT code. */
1.1 misha 2714:
1.8 moko 2715: length = (md->jscript_compat)? 0 : -1;
2716: offset = 0;
2717:
2718: while (count-- > 0)
2719: {
2720: offset = GET2(slot, 0) << 1;
2721: if (offset < offset_top && md->offset_vector[offset] >= 0)
2722: {
2723: length = md->offset_vector[offset+1] - md->offset_vector[offset];
2724: break;
2725: }
2726: slot += md->name_entry_size;
2727: }
2728: }
2729: goto REF_REPEAT;
2730:
2731: case OP_REF:
2732: case OP_REFI:
2733: caseless = op == OP_REFI;
2734: offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2735: ecode += 1 + IMM2_SIZE;
1.6 misha 2736: if (offset >= offset_top || md->offset_vector[offset] < 0)
2737: length = (md->jscript_compat)? 0 : -1;
2738: else
2739: length = md->offset_vector[offset+1] - md->offset_vector[offset];
1.1 misha 2740:
1.6 misha 2741: /* Set up for repetition, or handle the non-repeated case */
1.1 misha 2742:
1.8 moko 2743: REF_REPEAT:
1.6 misha 2744: switch (*ecode)
2745: {
2746: case OP_CRSTAR:
2747: case OP_CRMINSTAR:
2748: case OP_CRPLUS:
2749: case OP_CRMINPLUS:
2750: case OP_CRQUERY:
2751: case OP_CRMINQUERY:
2752: c = *ecode++ - OP_CRSTAR;
2753: minimize = (c & 1) != 0;
2754: min = rep_min[c]; /* Pick up values from tables; */
2755: max = rep_max[c]; /* zero for max => infinity */
2756: if (max == 0) max = INT_MAX;
2757: break;
1.1 misha 2758:
1.6 misha 2759: case OP_CRRANGE:
2760: case OP_CRMINRANGE:
2761: minimize = (*ecode == OP_CRMINRANGE);
2762: min = GET2(ecode, 1);
2763: max = GET2(ecode, 1 + IMM2_SIZE);
2764: if (max == 0) max = INT_MAX;
2765: ecode += 1 + 2 * IMM2_SIZE;
2766: break;
1.1 misha 2767:
1.6 misha 2768: default: /* No repeat follows */
2769: if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
2770: {
1.7 misha 2771: if (length == -2) eptr = md->end_subject; /* Partial match */
1.6 misha 2772: CHECK_PARTIAL();
2773: RRETURN(MATCH_NOMATCH);
1.1 misha 2774: }
1.6 misha 2775: eptr += length;
2776: continue; /* With the main loop */
2777: }
1.1 misha 2778:
1.6 misha 2779: /* Handle repeated back references. If the length of the reference is
2780: zero, just continue with the main loop. If the length is negative, it
2781: means the reference is unset in non-JavaScript-compatible mode. If the minimum is
2782: zero, we can continue at the same level without recursion. For any other
2783: minimum, carrying on will result in NOMATCH. */
1.1 misha 2784:
1.6 misha 2785: if (length == 0) continue;
2786: if (length < 0 && min == 0) continue;
1.1 misha 2787:
1.6 misha 2788: /* First, ensure the minimum number of matches are present. We get back
2789: the length of the reference string explicitly rather than passing the
2790: address of eptr, so that eptr can be a register variable. */
1.1 misha 2791:
1.6 misha 2792: for (i = 1; i <= min; i++)
2793: {
2794: int slength;
2795: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
1.1 misha 2796: {
1.7 misha 2797: if (slength == -2) eptr = md->end_subject; /* Partial match */
1.6 misha 2798: CHECK_PARTIAL();
2799: RRETURN(MATCH_NOMATCH);
1.1 misha 2800: }
1.6 misha 2801: eptr += slength;
2802: }
1.1 misha 2803:
1.6 misha 2804: /* If min = max, continue at the same level without recursion.
2805: They are not both allowed to be zero. */
1.1 misha 2806:
1.6 misha 2807: if (min == max) continue;
1.1 misha 2808:
1.6 misha 2809: /* If minimizing, keep trying and advancing the pointer */
1.1 misha 2810:
1.6 misha 2811: if (minimize)
2812: {
2813: for (fi = min;; fi++)
1.1 misha 2814: {
1.6 misha 2815: int slength;
2816: RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
2817: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2818: if (fi >= max) RRETURN(MATCH_NOMATCH);
2819: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
1.1 misha 2820: {
1.7 misha 2821: if (slength == -2) eptr = md->end_subject; /* Partial match */
1.6 misha 2822: CHECK_PARTIAL();
2823: RRETURN(MATCH_NOMATCH);
1.1 misha 2824: }
1.6 misha 2825: eptr += slength;
1.1 misha 2826: }
1.6 misha 2827: /* Control never gets here */
2828: }
1.1 misha 2829:
1.6 misha 2830: /* If maximizing, find the longest string and work backwards */
1.1 misha 2831:
1.6 misha 2832: else
2833: {
2834: pp = eptr;
2835: for (i = min; i < max; i++)
1.1 misha 2836: {
1.6 misha 2837: int slength;
2838: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
1.1 misha 2839: {
1.7 misha 2840: /* Can't use CHECK_PARTIAL because we don't want to update eptr in
2841: the soft partial matching case. */
2842:
2843: if (slength == -2 && md->partial != 0 &&
2844: md->end_subject > md->start_used_ptr)
2845: {
2846: md->hitend = TRUE;
2847: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
2848: }
1.6 misha 2849: break;
1.1 misha 2850: }
1.6 misha 2851: eptr += slength;
2852: }
1.7 misha 2853:
1.6 misha 2854: while (eptr >= pp)
2855: {
2856: RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
2857: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2858: eptr -= length;
1.1 misha 2859: }
1.6 misha 2860: RRETURN(MATCH_NOMATCH);
1.1 misha 2861: }
2862: /* Control never gets here */
2863:
2864: /* Match a bit-mapped character class, possibly repeatedly. This op code is
2865: used when all the characters in the class have values in the range 0-255,
2866: and either the matching is caseful, or the characters are in the range
2867: 0-127 when UTF-8 processing is enabled. The only difference between
2868: OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2869: encountered.
2870:
2871: First, look past the end of the item to see if there is repeat information
2872: following. Then obey similar code to character type repeats - written out
2873: again for speed. */
2874:
2875: case OP_NCLASS:
2876: case OP_CLASS:
2877: {
1.6 misha 2878: /* The data variable is saved across frames, so the byte map needs to
2879: be stored there. */
2880: #define BYTE_MAP ((pcre_uint8 *)data)
1.1 misha 2881: data = ecode + 1; /* Save for matching */
1.6 misha 2882: ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
1.1 misha 2883:
2884: switch (*ecode)
2885: {
2886: case OP_CRSTAR:
2887: case OP_CRMINSTAR:
2888: case OP_CRPLUS:
2889: case OP_CRMINPLUS:
2890: case OP_CRQUERY:
2891: case OP_CRMINQUERY:
1.8 moko 2892: case OP_CRPOSSTAR:
2893: case OP_CRPOSPLUS:
2894: case OP_CRPOSQUERY:
1.1 misha 2895: c = *ecode++ - OP_CRSTAR;
1.8 moko 2896: if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
2897: else possessive = TRUE;
1.1 misha 2898: min = rep_min[c]; /* Pick up values from tables; */
2899: max = rep_max[c]; /* zero for max => infinity */
2900: if (max == 0) max = INT_MAX;
2901: break;
2902:
2903: case OP_CRRANGE:
2904: case OP_CRMINRANGE:
1.8 moko 2905: case OP_CRPOSRANGE:
1.1 misha 2906: minimize = (*ecode == OP_CRMINRANGE);
1.8 moko 2907: possessive = (*ecode == OP_CRPOSRANGE);
1.1 misha 2908: min = GET2(ecode, 1);
1.6 misha 2909: max = GET2(ecode, 1 + IMM2_SIZE);
1.1 misha 2910: if (max == 0) max = INT_MAX;
1.6 misha 2911: ecode += 1 + 2 * IMM2_SIZE;
1.1 misha 2912: break;
2913:
2914: default: /* No repeat follows */
2915: min = max = 1;
2916: break;
2917: }
2918:
2919: /* First, ensure the minimum number of matches are present. */
2920:
1.6 misha 2921: #ifdef SUPPORT_UTF
2922: if (utf)
1.1 misha 2923: {
2924: for (i = 1; i <= min; i++)
2925: {
1.4 misha 2926: if (eptr >= md->end_subject)
2927: {
2928: SCHECK_PARTIAL();
1.6 misha 2929: RRETURN(MATCH_NOMATCH);
1.4 misha 2930: }
1.1 misha 2931: GETCHARINC(c, eptr);
2932: if (c > 255)
2933: {
1.6 misha 2934: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1.1 misha 2935: }
2936: else
1.6 misha 2937: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misha 2938: }
2939: }
2940: else
2941: #endif
1.6 misha 2942: /* Not UTF mode */
1.1 misha 2943: {
2944: for (i = 1; i <= min; i++)
2945: {
1.4 misha 2946: if (eptr >= md->end_subject)
2947: {
2948: SCHECK_PARTIAL();
1.6 misha 2949: RRETURN(MATCH_NOMATCH);
1.4 misha 2950: }
1.1 misha 2951: c = *eptr++;
1.6 misha 2952: #ifndef COMPILE_PCRE8
2953: if (c > 255)
2954: {
2955: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2956: }
2957: else
2958: #endif
2959: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misha 2960: }
2961: }
2962:
2963: /* If max == min we can continue with the main loop without the
2964: need to recurse. */
2965:
2966: if (min == max) continue;
2967:
2968: /* If minimizing, keep testing the rest of the expression and advancing
2969: the pointer while it matches the class. */
2970:
2971: if (minimize)
2972: {
1.6 misha 2973: #ifdef SUPPORT_UTF
2974: if (utf)
1.1 misha 2975: {
2976: for (fi = min;; fi++)
2977: {
1.6 misha 2978: RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
1.1 misha 2979: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 2980: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 2981: if (eptr >= md->end_subject)
2982: {
2983: SCHECK_PARTIAL();
1.6 misha 2984: RRETURN(MATCH_NOMATCH);
1.4 misha 2985: }
1.1 misha 2986: GETCHARINC(c, eptr);
2987: if (c > 255)
2988: {
1.6 misha 2989: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1.1 misha 2990: }
2991: else
1.6 misha 2992: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misha 2993: }
2994: }
2995: else
2996: #endif
1.6 misha 2997: /* Not UTF mode */
1.1 misha 2998: {
2999: for (fi = min;; fi++)
3000: {
1.6 misha 3001: RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
1.1 misha 3002: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 3003: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 3004: if (eptr >= md->end_subject)
3005: {
3006: SCHECK_PARTIAL();
1.6 misha 3007: RRETURN(MATCH_NOMATCH);
1.4 misha 3008: }
1.1 misha 3009: c = *eptr++;
1.6 misha 3010: #ifndef COMPILE_PCRE8
3011: if (c > 255)
3012: {
3013: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
3014: }
3015: else
3016: #endif
3017: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misha 3018: }
3019: }
3020: /* Control never gets here */
3021: }
3022:
3023: /* If maximizing, find the longest possible run, then work backwards. */
3024:
3025: else
3026: {
3027: pp = eptr;
3028:
1.6 misha 3029: #ifdef SUPPORT_UTF
3030: if (utf)
1.1 misha 3031: {
3032: for (i = min; i < max; i++)
3033: {
3034: int len = 1;
1.4 misha 3035: if (eptr >= md->end_subject)
3036: {
3037: SCHECK_PARTIAL();
3038: break;
3039: }
1.1 misha 3040: GETCHARLEN(c, eptr, len);
3041: if (c > 255)
3042: {
3043: if (op == OP_CLASS) break;
3044: }
3045: else
1.6 misha 3046: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
1.1 misha 3047: eptr += len;
3048: }
1.8 moko 3049:
3050: if (possessive) continue; /* No backtracking */
3051:
1.1 misha 3052: for (;;)
3053: {
1.6 misha 3054: RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
1.1 misha 3055: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.9 ! moko 3056: if (eptr-- <= pp) break; /* Stop if tried at original pos */
1.1 misha 3057: BACKCHAR(eptr);
3058: }
3059: }
3060: else
3061: #endif
1.6 misha 3062: /* Not UTF mode */
1.1 misha 3063: {
3064: for (i = min; i < max; i++)
3065: {
1.4 misha 3066: if (eptr >= md->end_subject)
3067: {
3068: SCHECK_PARTIAL();
3069: break;
3070: }
1.1 misha 3071: c = *eptr;
1.6 misha 3072: #ifndef COMPILE_PCRE8
3073: if (c > 255)
3074: {
3075: if (op == OP_CLASS) break;
3076: }
3077: else
3078: #endif
3079: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
1.1 misha 3080: eptr++;
3081: }
1.8 moko 3082:
3083: if (possessive) continue; /* No backtracking */
3084:
1.1 misha 3085: while (eptr >= pp)
3086: {
1.6 misha 3087: RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
1.1 misha 3088: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3089: eptr--;
3090: }
3091: }
3092:
1.6 misha 3093: RRETURN(MATCH_NOMATCH);
1.1 misha 3094: }
1.6 misha 3095: #undef BYTE_MAP
1.1 misha 3096: }
3097: /* Control never gets here */
3098:
3099:
1.8 moko 3100: /* Match an extended character class. In the 8-bit library, this opcode is
3101: encountered only when UTF-8 mode is supported. In the 16-bit and
3102: 32-bit libraries, codepoints greater than 255 may be encountered even when
3103: UTF is not supported. */
1.1 misha 3104:
1.6 misha 3105: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1.1 misha 3106: case OP_XCLASS:
3107: {
3108: data = ecode + 1 + LINK_SIZE; /* Save for matching */
3109: ecode += GET(ecode, 1); /* Advance past the item */
3110:
3111: switch (*ecode)
3112: {
3113: case OP_CRSTAR:
3114: case OP_CRMINSTAR:
3115: case OP_CRPLUS:
3116: case OP_CRMINPLUS:
3117: case OP_CRQUERY:
3118: case OP_CRMINQUERY:
1.8 moko 3119: case OP_CRPOSSTAR:
3120: case OP_CRPOSPLUS:
3121: case OP_CRPOSQUERY:
1.1 misha 3122: c = *ecode++ - OP_CRSTAR;
1.8 moko 3123: if (c < OP_CRPOSSTAR - OP_CRSTAR) minimize = (c & 1) != 0;
3124: else possessive = TRUE;
1.1 misha 3125: min = rep_min[c]; /* Pick up values from tables; */
3126: max = rep_max[c]; /* zero for max => infinity */
3127: if (max == 0) max = INT_MAX;
3128: break;
3129:
3130: case OP_CRRANGE:
3131: case OP_CRMINRANGE:
1.8 moko 3132: case OP_CRPOSRANGE:
1.1 misha 3133: minimize = (*ecode == OP_CRMINRANGE);
1.8 moko 3134: possessive = (*ecode == OP_CRPOSRANGE);
1.1 misha 3135: min = GET2(ecode, 1);
1.6 misha 3136: max = GET2(ecode, 1 + IMM2_SIZE);
1.1 misha 3137: if (max == 0) max = INT_MAX;
1.6 misha 3138: ecode += 1 + 2 * IMM2_SIZE;
1.1 misha 3139: break;
3140:
3141: default: /* No repeat follows */
3142: min = max = 1;
3143: break;
3144: }
3145:
3146: /* First, ensure the minimum number of matches are present. */
3147:
3148: for (i = 1; i <= min; i++)
3149: {
1.4 misha 3150: if (eptr >= md->end_subject)
3151: {
3152: SCHECK_PARTIAL();
1.6 misha 3153: RRETURN(MATCH_NOMATCH);
1.4 misha 3154: }
1.3 misha 3155: GETCHARINCTEST(c, eptr);
1.6 misha 3156: if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
1.1 misha 3157: }
3158:
3159: /* If max == min we can continue with the main loop without the
3160: need to recurse. */
3161:
3162: if (min == max) continue;
3163:
3164: /* If minimizing, keep testing the rest of the expression and advancing
3165: the pointer while it matches the class. */
3166:
3167: if (minimize)
3168: {
3169: for (fi = min;; fi++)
3170: {
1.6 misha 3171: RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
1.1 misha 3172: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 3173: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 3174: if (eptr >= md->end_subject)
3175: {
3176: SCHECK_PARTIAL();
1.6 misha 3177: RRETURN(MATCH_NOMATCH);
1.4 misha 3178: }
1.3 misha 3179: GETCHARINCTEST(c, eptr);
1.6 misha 3180: if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
1.1 misha 3181: }
3182: /* Control never gets here */
3183: }
3184:
3185: /* If maximizing, find the longest possible run, then work backwards. */
3186:
3187: else
3188: {
3189: pp = eptr;
3190: for (i = min; i < max; i++)
3191: {
3192: int len = 1;
1.4 misha 3193: if (eptr >= md->end_subject)
3194: {
3195: SCHECK_PARTIAL();
3196: break;
3197: }
1.6 misha 3198: #ifdef SUPPORT_UTF
1.3 misha 3199: GETCHARLENTEST(c, eptr, len);
1.6 misha 3200: #else
3201: c = *eptr;
3202: #endif
3203: if (!PRIV(xclass)(c, data, utf)) break;
1.1 misha 3204: eptr += len;
3205: }
1.8 moko 3206:
3207: if (possessive) continue; /* No backtracking */
3208:
1.1 misha 3209: for(;;)
3210: {
1.6 misha 3211: RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
1.1 misha 3212: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.9 ! moko 3213: if (eptr-- <= pp) break; /* Stop if tried at original pos */
1.6 misha 3214: #ifdef SUPPORT_UTF
3215: if (utf) BACKCHAR(eptr);
3216: #endif
1.1 misha 3217: }
1.6 misha 3218: RRETURN(MATCH_NOMATCH);
1.1 misha 3219: }
3220:
3221: /* Control never gets here */
3222: }
3223: #endif /* End of XCLASS */
3224:
3225: /* Match a single character, casefully */
3226:
3227: case OP_CHAR:
1.6 misha 3228: #ifdef SUPPORT_UTF
3229: if (utf)
1.1 misha 3230: {
3231: length = 1;
3232: ecode++;
3233: GETCHARLEN(fc, ecode, length);
1.4 misha 3234: if (length > md->end_subject - eptr)
3235: {
3236: CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
1.6 misha 3237: RRETURN(MATCH_NOMATCH);
1.4 misha 3238: }
1.8 moko 3239: while (length-- > 0) if (*ecode++ != UCHAR21INC(eptr)) RRETURN(MATCH_NOMATCH);
1.1 misha 3240: }
3241: else
3242: #endif
1.6 misha 3243: /* Not UTF mode */
1.1 misha 3244: {
1.4 misha 3245: if (md->end_subject - eptr < 1)
3246: {
3247: SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
1.6 misha 3248: RRETURN(MATCH_NOMATCH);
1.4 misha 3249: }
1.6 misha 3250: if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1.1 misha 3251: ecode += 2;
3252: }
3253: break;
3254:
1.6 misha 3255: /* Match a single character, caselessly. If we are at the end of the
3256: subject, give up immediately. */
1.1 misha 3257:
1.6 misha 3258: case OP_CHARI:
3259: if (eptr >= md->end_subject)
3260: {
3261: SCHECK_PARTIAL();
3262: RRETURN(MATCH_NOMATCH);
3263: }
3264:
3265: #ifdef SUPPORT_UTF
3266: if (utf)
1.1 misha 3267: {
3268: length = 1;
3269: ecode++;
3270: GETCHARLEN(fc, ecode, length);
3271:
3272: /* If the pattern character's value is < 128, we have only one byte, and
1.6 misha 3273: we know that its other case must also be one byte long, so we can use the
3274: fast lookup table. We know that there is at least one byte left in the
3275: subject. */
1.1 misha 3276:
3277: if (fc < 128)
3278: {
1.8 moko 3279: pcre_uint32 cc = UCHAR21(eptr);
1.7 misha 3280: if (md->lcc[fc] != TABLE_GET(cc, md->lcc, cc)) RRETURN(MATCH_NOMATCH);
1.6 misha 3281: ecode++;
3282: eptr++;
1.1 misha 3283: }
3284:
1.6 misha 3285: /* Otherwise we must pick up the subject character. Note that we cannot
3286: use the value of "length" to check for sufficient bytes left, because the
3287: other case of the character may have more or fewer bytes. */
1.1 misha 3288:
3289: else
3290: {
1.7 misha 3291: pcre_uint32 dc;
1.1 misha 3292: GETCHARINC(dc, eptr);
3293: ecode += length;
3294:
3295: /* If we have Unicode property support, we can use it to test the other
3296: case of the character, if there is one. */
3297:
3298: if (fc != dc)
3299: {
3300: #ifdef SUPPORT_UCP
1.2 misha 3301: if (dc != UCD_OTHERCASE(fc))
1.1 misha 3302: #endif
1.6 misha 3303: RRETURN(MATCH_NOMATCH);
1.1 misha 3304: }
3305: }
3306: }
3307: else
1.6 misha 3308: #endif /* SUPPORT_UTF */
1.1 misha 3309:
1.6 misha 3310: /* Not UTF mode */
1.1 misha 3311: {
1.6 misha 3312: if (TABLE_GET(ecode[1], md->lcc, ecode[1])
3313: != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
3314: eptr++;
1.1 misha 3315: ecode += 2;
3316: }
3317: break;
3318:
3319: /* Match a single character repeatedly. */
3320:
3321: case OP_EXACT:
1.6 misha 3322: case OP_EXACTI:
1.1 misha 3323: min = max = GET2(ecode, 1);
1.6 misha 3324: ecode += 1 + IMM2_SIZE;
1.1 misha 3325: goto REPEATCHAR;
3326:
3327: case OP_POSUPTO:
1.6 misha 3328: case OP_POSUPTOI:
1.1 misha 3329: possessive = TRUE;
3330: /* Fall through */
3331:
3332: case OP_UPTO:
1.6 misha 3333: case OP_UPTOI:
1.1 misha 3334: case OP_MINUPTO:
1.6 misha 3335: case OP_MINUPTOI:
1.1 misha 3336: min = 0;
3337: max = GET2(ecode, 1);
1.6 misha 3338: minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
3339: ecode += 1 + IMM2_SIZE;
1.1 misha 3340: goto REPEATCHAR;
3341:
3342: case OP_POSSTAR:
1.6 misha 3343: case OP_POSSTARI:
1.1 misha 3344: possessive = TRUE;
3345: min = 0;
3346: max = INT_MAX;
3347: ecode++;
3348: goto REPEATCHAR;
3349:
3350: case OP_POSPLUS:
1.6 misha 3351: case OP_POSPLUSI:
1.1 misha 3352: possessive = TRUE;
3353: min = 1;
3354: max = INT_MAX;
3355: ecode++;
3356: goto REPEATCHAR;
3357:
3358: case OP_POSQUERY:
1.6 misha 3359: case OP_POSQUERYI:
1.1 misha 3360: possessive = TRUE;
3361: min = 0;
3362: max = 1;
3363: ecode++;
3364: goto REPEATCHAR;
3365:
3366: case OP_STAR:
1.6 misha 3367: case OP_STARI:
1.1 misha 3368: case OP_MINSTAR:
1.6 misha 3369: case OP_MINSTARI:
1.1 misha 3370: case OP_PLUS:
1.6 misha 3371: case OP_PLUSI:
1.1 misha 3372: case OP_MINPLUS:
1.6 misha 3373: case OP_MINPLUSI:
1.1 misha 3374: case OP_QUERY:
1.6 misha 3375: case OP_QUERYI:
1.1 misha 3376: case OP_MINQUERY:
1.6 misha 3377: case OP_MINQUERYI:
3378: c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
1.1 misha 3379: minimize = (c & 1) != 0;
3380: min = rep_min[c]; /* Pick up values from tables; */
3381: max = rep_max[c]; /* zero for max => infinity */
3382: if (max == 0) max = INT_MAX;
3383:
1.7 misha 3384: /* Common code for all repeated single-character matches. We first check
3385: for the minimum number of characters. If the minimum equals the maximum, we
3386: are done. Otherwise, if minimizing, check the rest of the pattern for a
3387: match; if there isn't one, advance up to the maximum, one character at a
3388: time.
3389:
3390: If maximizing, advance up to the maximum number of matching characters,
3391: until eptr is past the end of the maximum run. If possessive, we are
3392: then done (no backing up). Otherwise, match at this position; anything
3393: other than no match is immediately returned. For nomatch, back up one
3394: character, unless we are matching \R and the last thing matched was
3395: \r\n, in which case, back up two bytes. When we reach the first optional
3396: character position, we can save stack by doing a tail recurse.
3397:
3398: The various UTF/non-UTF and caseful/caseless cases are handled separately,
3399: for speed. */
1.1 misha 3400:
3401: REPEATCHAR:
1.6 misha 3402: #ifdef SUPPORT_UTF
3403: if (utf)
1.1 misha 3404: {
3405: length = 1;
3406: charptr = ecode;
3407: GETCHARLEN(fc, ecode, length);
3408: ecode += length;
3409:
3410: /* Handle multibyte character matching specially here. There is
3411: support for caseless matching if UCP support is present. */
3412:
3413: if (length > 1)
3414: {
3415: #ifdef SUPPORT_UCP
1.7 misha 3416: pcre_uint32 othercase;
1.6 misha 3417: if (op >= OP_STARI && /* Caseless */
1.2 misha 3418: (othercase = UCD_OTHERCASE(fc)) != fc)
1.6 misha 3419: oclength = PRIV(ord2utf)(othercase, occhars);
1.1 misha 3420: else oclength = 0;
3421: #endif /* SUPPORT_UCP */
3422:
3423: for (i = 1; i <= min; i++)
3424: {
1.4 misha 3425: if (eptr <= md->end_subject - length &&
1.6 misha 3426: memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
1.1 misha 3427: #ifdef SUPPORT_UCP
1.4 misha 3428: else if (oclength > 0 &&
3429: eptr <= md->end_subject - oclength &&
1.6 misha 3430: memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
1.4 misha 3431: #endif /* SUPPORT_UCP */
1.1 misha 3432: else
3433: {
1.4 misha 3434: CHECK_PARTIAL();
1.6 misha 3435: RRETURN(MATCH_NOMATCH);
1.1 misha 3436: }
3437: }
3438:
3439: if (min == max) continue;
3440:
3441: if (minimize)
3442: {
3443: for (fi = min;; fi++)
3444: {
1.6 misha 3445: RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
1.1 misha 3446: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 3447: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 3448: if (eptr <= md->end_subject - length &&
1.6 misha 3449: memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
1.1 misha 3450: #ifdef SUPPORT_UCP
1.4 misha 3451: else if (oclength > 0 &&
3452: eptr <= md->end_subject - oclength &&
1.6 misha 3453: memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
1.4 misha 3454: #endif /* SUPPORT_UCP */
1.1 misha 3455: else
3456: {
1.4 misha 3457: CHECK_PARTIAL();
1.6 misha 3458: RRETURN(MATCH_NOMATCH);
1.1 misha 3459: }
3460: }
3461: /* Control never gets here */
3462: }
3463:
3464: else /* Maximize */
3465: {
3466: pp = eptr;
3467: for (i = min; i < max; i++)
3468: {
1.4 misha 3469: if (eptr <= md->end_subject - length &&
1.6 misha 3470: memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
1.1 misha 3471: #ifdef SUPPORT_UCP
1.4 misha 3472: else if (oclength > 0 &&
3473: eptr <= md->end_subject - oclength &&
1.6 misha 3474: memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
1.4 misha 3475: #endif /* SUPPORT_UCP */
1.1 misha 3476: else
3477: {
1.4 misha 3478: CHECK_PARTIAL();
3479: break;
1.1 misha 3480: }
3481: }
3482:
1.7 misha 3483: if (possessive) continue; /* No backtracking */
1.1 misha 3484: for(;;)
1.4 misha 3485: {
1.8 moko 3486: if (eptr <= pp) goto TAIL_RECURSE;
1.6 misha 3487: RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
1.4 misha 3488: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misha 3489: #ifdef SUPPORT_UCP
1.4 misha 3490: eptr--;
3491: BACKCHAR(eptr);
1.1 misha 3492: #else /* without SUPPORT_UCP */
1.4 misha 3493: eptr -= length;
1.1 misha 3494: #endif /* SUPPORT_UCP */
1.4 misha 3495: }
1.1 misha 3496: }
3497: /* Control never gets here */
3498: }
3499:
3500: /* If the length of a UTF-8 character is 1, we fall through here, and
3501: obey the code as for non-UTF-8 characters below, though in this case the
3502: value of fc will always be < 128. */
3503: }
3504: else
1.6 misha 3505: #endif /* SUPPORT_UTF */
3506: /* When not in UTF-8 mode, load a single-byte character. */
3507: fc = *ecode++;
1.1 misha 3508:
1.6 misha 3509: /* The value of fc at this point is always one character, though we may
3510: or may not be in UTF mode. The code is duplicated for the caseless and
1.1 misha 3511: caseful cases, for speed, since matching characters is likely to be quite
3512: common. First, ensure the minimum number of matches are present. If min =
3513: max, continue at the same level without recursing. Otherwise, if
3514: minimizing, keep trying the rest of the expression and advancing one
3515: matching character if failing, up to the maximum. Alternatively, if
3516: maximizing, find the maximum number of characters and work backwards. */
3517:
3518: DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
1.7 misha 3519: max, (char *)eptr));
1.1 misha 3520:
1.6 misha 3521: if (op >= OP_STARI) /* Caseless */
1.1 misha 3522: {
1.6 misha 3523: #ifdef COMPILE_PCRE8
3524: /* fc must be < 128 if UTF is enabled. */
3525: foc = md->fcc[fc];
3526: #else
3527: #ifdef SUPPORT_UTF
3528: #ifdef SUPPORT_UCP
3529: if (utf && fc > 127)
3530: foc = UCD_OTHERCASE(fc);
3531: #else
3532: if (utf && fc > 127)
3533: foc = fc;
3534: #endif /* SUPPORT_UCP */
3535: else
3536: #endif /* SUPPORT_UTF */
3537: foc = TABLE_GET(fc, md->fcc, fc);
3538: #endif /* COMPILE_PCRE8 */
3539:
1.1 misha 3540: for (i = 1; i <= min; i++)
1.4 misha 3541: {
1.7 misha 3542: pcre_uint32 cc; /* Faster than pcre_uchar */
1.4 misha 3543: if (eptr >= md->end_subject)
3544: {
3545: SCHECK_PARTIAL();
1.6 misha 3546: RRETURN(MATCH_NOMATCH);
1.4 misha 3547: }
1.8 moko 3548: cc = UCHAR21TEST(eptr);
1.7 misha 3549: if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
1.6 misha 3550: eptr++;
1.4 misha 3551: }
1.1 misha 3552: if (min == max) continue;
3553: if (minimize)
3554: {
3555: for (fi = min;; fi++)
3556: {
1.7 misha 3557: pcre_uint32 cc; /* Faster than pcre_uchar */
1.6 misha 3558: RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
1.1 misha 3559: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 3560: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 3561: if (eptr >= md->end_subject)
3562: {
3563: SCHECK_PARTIAL();
1.6 misha 3564: RRETURN(MATCH_NOMATCH);
1.4 misha 3565: }
1.8 moko 3566: cc = UCHAR21TEST(eptr);
1.7 misha 3567: if (fc != cc && foc != cc) RRETURN(MATCH_NOMATCH);
1.6 misha 3568: eptr++;
1.1 misha 3569: }
3570: /* Control never gets here */
3571: }
3572: else /* Maximize */
3573: {
3574: pp = eptr;
3575: for (i = min; i < max; i++)
3576: {
1.7 misha 3577: pcre_uint32 cc; /* Faster than pcre_uchar */
1.4 misha 3578: if (eptr >= md->end_subject)
3579: {
3580: SCHECK_PARTIAL();
3581: break;
3582: }
1.8 moko 3583: cc = UCHAR21TEST(eptr);
1.7 misha 3584: if (fc != cc && foc != cc) break;
1.1 misha 3585: eptr++;
3586: }
1.7 misha 3587: if (possessive) continue; /* No backtracking */
3588: for (;;)
1.1 misha 3589: {
1.7 misha 3590: if (eptr == pp) goto TAIL_RECURSE;
1.6 misha 3591: RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
1.1 misha 3592: eptr--;
3593: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3594: }
1.8 moko 3595: /* Control never gets here */
1.1 misha 3596: }
3597: }
3598:
3599: /* Caseful comparisons (includes all multi-byte characters) */
3600:
3601: else
3602: {
1.4 misha 3603: for (i = 1; i <= min; i++)
3604: {
3605: if (eptr >= md->end_subject)
3606: {
3607: SCHECK_PARTIAL();
1.6 misha 3608: RRETURN(MATCH_NOMATCH);
1.4 misha 3609: }
1.8 moko 3610: if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
1.4 misha 3611: }
3612:
1.1 misha 3613: if (min == max) continue;
1.4 misha 3614:
1.1 misha 3615: if (minimize)
3616: {
3617: for (fi = min;; fi++)
3618: {
1.6 misha 3619: RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
1.1 misha 3620: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 3621: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 3622: if (eptr >= md->end_subject)
3623: {
3624: SCHECK_PARTIAL();
1.6 misha 3625: RRETURN(MATCH_NOMATCH);
1.4 misha 3626: }
1.8 moko 3627: if (fc != UCHAR21INCTEST(eptr)) RRETURN(MATCH_NOMATCH);
1.1 misha 3628: }
3629: /* Control never gets here */
3630: }
3631: else /* Maximize */
3632: {
3633: pp = eptr;
3634: for (i = min; i < max; i++)
3635: {
1.4 misha 3636: if (eptr >= md->end_subject)
3637: {
3638: SCHECK_PARTIAL();
3639: break;
3640: }
1.8 moko 3641: if (fc != UCHAR21TEST(eptr)) break;
1.1 misha 3642: eptr++;
3643: }
1.7 misha 3644: if (possessive) continue; /* No backtracking */
3645: for (;;)
1.1 misha 3646: {
1.7 misha 3647: if (eptr == pp) goto TAIL_RECURSE;
1.6 misha 3648: RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
1.1 misha 3649: eptr--;
3650: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3651: }
1.8 moko 3652: /* Control never gets here */
1.1 misha 3653: }
3654: }
3655: /* Control never gets here */
3656:
3657: /* Match a negated single character. In UTF mode the pattern character and
3658: the character we are checking are both read with GETCHARINC, so either can be multibyte. */
3659:
3660: case OP_NOT:
1.6 misha 3661: case OP_NOTI:
1.4 misha 3662: if (eptr >= md->end_subject)
3663: {
3664: SCHECK_PARTIAL();
1.6 misha 3665: RRETURN(MATCH_NOMATCH);
1.4 misha 3666: }
1.7 misha 3667: #ifdef SUPPORT_UTF
3668: if (utf)
1.1 misha 3669: {
1.7 misha 3670: register pcre_uint32 ch, och;
3671:
3672: ecode++;
3673: GETCHARINC(ch, ecode);
3674: GETCHARINC(c, eptr);
3675:
3676: if (op == OP_NOT)
3677: {
3678: if (ch == c) RRETURN(MATCH_NOMATCH);
3679: }
3680: else
3681: {
1.6 misha 3682: #ifdef SUPPORT_UCP
1.7 misha 3683: if (ch > 127)
3684: och = UCD_OTHERCASE(ch);
1.6 misha 3685: #else
1.7 misha 3686: if (ch > 127)
3687: och = ch;
1.6 misha 3688: #endif /* SUPPORT_UCP */
1.7 misha 3689: else
3690: och = TABLE_GET(ch, md->fcc, ch);
3691: if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
3692: }
1.1 misha 3693: }
1.7 misha 3694: else
3695: #endif
1.1 misha 3696: {
1.7 misha 3697: register pcre_uint32 ch = ecode[1];
3698: c = *eptr++;
3699: if (ch == c || (op == OP_NOTI && TABLE_GET(ch, md->fcc, ch) == c))
3700: RRETURN(MATCH_NOMATCH);
3701: ecode += 2;
1.1 misha 3702: }
3703: break;
3704:
3705: /* Match a negated single character repeatedly. This is almost a
3706: repeat of the code for a repeated single character, but I haven't found a
3707: nice way of commoning these up that doesn't require a test of the
3708: positive/negative option for each character match. Maybe that wouldn't add
3709: very much to the time taken, but character matching *is* what this is all
3710: about... */
3711:
3712: case OP_NOTEXACT:
1.6 misha 3713: case OP_NOTEXACTI:
1.1 misha 3714: min = max = GET2(ecode, 1);
1.6 misha 3715: ecode += 1 + IMM2_SIZE;
1.1 misha 3716: goto REPEATNOTCHAR;
3717:
3718: case OP_NOTUPTO:
1.6 misha 3719: case OP_NOTUPTOI:
1.1 misha 3720: case OP_NOTMINUPTO:
1.6 misha 3721: case OP_NOTMINUPTOI:
1.1 misha 3722: min = 0;
3723: max = GET2(ecode, 1);
1.6 misha 3724: minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
3725: ecode += 1 + IMM2_SIZE;
1.1 misha 3726: goto REPEATNOTCHAR;
3727:
3728: case OP_NOTPOSSTAR:
1.6 misha 3729: case OP_NOTPOSSTARI:
1.1 misha 3730: possessive = TRUE;
3731: min = 0;
3732: max = INT_MAX;
3733: ecode++;
3734: goto REPEATNOTCHAR;
3735:
3736: case OP_NOTPOSPLUS:
1.6 misha 3737: case OP_NOTPOSPLUSI:
1.1 misha 3738: possessive = TRUE;
3739: min = 1;
3740: max = INT_MAX;
3741: ecode++;
3742: goto REPEATNOTCHAR;
3743:
3744: case OP_NOTPOSQUERY:
1.6 misha 3745: case OP_NOTPOSQUERYI:
1.1 misha 3746: possessive = TRUE;
3747: min = 0;
3748: max = 1;
3749: ecode++;
3750: goto REPEATNOTCHAR;
3751:
3752: case OP_NOTPOSUPTO:
1.6 misha 3753: case OP_NOTPOSUPTOI:
1.1 misha 3754: possessive = TRUE;
3755: min = 0;
3756: max = GET2(ecode, 1);
1.6 misha 3757: ecode += 1 + IMM2_SIZE;
1.1 misha 3758: goto REPEATNOTCHAR;
3759:
3760: case OP_NOTSTAR:
1.6 misha 3761: case OP_NOTSTARI:
1.1 misha 3762: case OP_NOTMINSTAR:
1.6 misha 3763: case OP_NOTMINSTARI:
1.1 misha 3764: case OP_NOTPLUS:
1.6 misha 3765: case OP_NOTPLUSI:
1.1 misha 3766: case OP_NOTMINPLUS:
1.6 misha 3767: case OP_NOTMINPLUSI:
1.1 misha 3768: case OP_NOTQUERY:
1.6 misha 3769: case OP_NOTQUERYI:
1.1 misha 3770: case OP_NOTMINQUERY:
1.6 misha 3771: case OP_NOTMINQUERYI:
3772: c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
1.1 misha 3773: minimize = (c & 1) != 0;
3774: min = rep_min[c]; /* Pick up values from tables; */
3775: max = rep_max[c]; /* zero for max => infinity */
3776: if (max == 0) max = INT_MAX;
3777:
1.4 misha 3778: /* Common code for all repeated negated single-character matches. */
1.1 misha 3779:
3780: REPEATNOTCHAR:
1.7 misha 3781: GETCHARINCTEST(fc, ecode);
1.1 misha 3782:
3783: /* The code is duplicated for the caseless and caseful cases, for speed,
3784: since matching characters is likely to be quite common. First, ensure the
3785: minimum number of matches are present. If min = max, continue at the same
3786: level without recursing. Otherwise, if minimizing, keep trying the rest of
3787: the expression and advancing one matching character if failing, up to the
3788: maximum. Alternatively, if maximizing, find the maximum number of
3789: characters and work backwards. */
3790:
3791: DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
1.7 misha 3792: max, (char *)eptr));
1.1 misha 3793:
1.6 misha 3794: if (op >= OP_NOTSTARI) /* Caseless */
1.1 misha 3795: {
1.6 misha 3796: #ifdef SUPPORT_UTF
3797: #ifdef SUPPORT_UCP
3798: if (utf && fc > 127)
3799: foc = UCD_OTHERCASE(fc);
3800: #else
3801: if (utf && fc > 127)
3802: foc = fc;
3803: #endif /* SUPPORT_UCP */
3804: else
3805: #endif /* SUPPORT_UTF */
3806: foc = TABLE_GET(fc, md->fcc, fc);
1.1 misha 3807:
1.6 misha 3808: #ifdef SUPPORT_UTF
3809: if (utf)
1.1 misha 3810: {
1.7 misha 3811: register pcre_uint32 d;
1.1 misha 3812: for (i = 1; i <= min; i++)
3813: {
1.4 misha 3814: if (eptr >= md->end_subject)
3815: {
3816: SCHECK_PARTIAL();
1.6 misha 3817: RRETURN(MATCH_NOMATCH);
1.4 misha 3818: }
1.1 misha 3819: GETCHARINC(d, eptr);
1.7 misha 3820: if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
1.1 misha 3821: }
3822: }
3823: else
1.7 misha 3824: #endif /* SUPPORT_UTF */
1.6 misha 3825: /* Not UTF mode */
1.1 misha 3826: {
3827: for (i = 1; i <= min; i++)
1.4 misha 3828: {
3829: if (eptr >= md->end_subject)
3830: {
3831: SCHECK_PARTIAL();
1.6 misha 3832: RRETURN(MATCH_NOMATCH);
1.4 misha 3833: }
1.6 misha 3834: if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3835: eptr++;
1.4 misha 3836: }
1.1 misha 3837: }
3838:
3839: if (min == max) continue;
3840:
3841: if (minimize)
3842: {
1.6 misha 3843: #ifdef SUPPORT_UTF
3844: if (utf)
1.1 misha 3845: {
1.7 misha 3846: register pcre_uint32 d;
1.1 misha 3847: for (fi = min;; fi++)
3848: {
1.6 misha 3849: RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
1.1 misha 3850: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 3851: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 3852: if (eptr >= md->end_subject)
3853: {
3854: SCHECK_PARTIAL();
1.6 misha 3855: RRETURN(MATCH_NOMATCH);
1.4 misha 3856: }
1.1 misha 3857: GETCHARINC(d, eptr);
1.6 misha 3858: if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
1.1 misha 3859: }
3860: }
3861: else
1.7 misha 3862: #endif /*SUPPORT_UTF */
1.6 misha 3863: /* Not UTF mode */
1.1 misha 3864: {
3865: for (fi = min;; fi++)
3866: {
1.6 misha 3867: RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
1.1 misha 3868: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 3869: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 3870: if (eptr >= md->end_subject)
3871: {
3872: SCHECK_PARTIAL();
1.6 misha 3873: RRETURN(MATCH_NOMATCH);
1.4 misha 3874: }
1.6 misha 3875: if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
3876: eptr++;
1.1 misha 3877: }
3878: }
3879: /* Control never gets here */
3880: }
3881:
3882: /* Maximize case */
3883:
3884: else
3885: {
3886: pp = eptr;
3887:
1.6 misha 3888: #ifdef SUPPORT_UTF
3889: if (utf)
1.1 misha 3890: {
1.7 misha 3891: register pcre_uint32 d;
1.1 misha 3892: for (i = min; i < max; i++)
3893: {
3894: int len = 1;
1.4 misha 3895: if (eptr >= md->end_subject)
3896: {
3897: SCHECK_PARTIAL();
3898: break;
3899: }
1.1 misha 3900: GETCHARLEN(d, eptr, len);
1.6 misha 3901: if (fc == d || (unsigned int)foc == d) break;
1.1 misha 3902: eptr += len;
3903: }
1.7 misha 3904: if (possessive) continue; /* No backtracking */
1.6 misha 3905: for(;;)
1.1 misha 3906: {
1.8 moko 3907: if (eptr <= pp) goto TAIL_RECURSE;
1.6 misha 3908: RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
1.1 misha 3909: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.7 misha 3910: eptr--;
1.1 misha 3911: BACKCHAR(eptr);
3912: }
3913: }
3914: else
1.7 misha 3915: #endif /* SUPPORT_UTF */
1.6 misha 3916: /* Not UTF mode */
1.1 misha 3917: {
3918: for (i = min; i < max; i++)
3919: {
1.4 misha 3920: if (eptr >= md->end_subject)
3921: {
3922: SCHECK_PARTIAL();
3923: break;
3924: }
1.6 misha 3925: if (fc == *eptr || foc == *eptr) break;
1.1 misha 3926: eptr++;
3927: }
1.7 misha 3928: if (possessive) continue; /* No backtracking */
3929: for (;;)
1.1 misha 3930: {
1.7 misha 3931: if (eptr == pp) goto TAIL_RECURSE;
1.6 misha 3932: RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
1.1 misha 3933: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3934: eptr--;
3935: }
3936: }
1.8 moko 3937: /* Control never gets here */
1.1 misha 3938: }
3939: }
3940:
3941: /* Caseful comparisons */
3942:
3943: else
3944: {
1.6 misha 3945: #ifdef SUPPORT_UTF
3946: if (utf)
1.1 misha 3947: {
1.7 misha 3948: register pcre_uint32 d;
1.1 misha 3949: for (i = 1; i <= min; i++)
3950: {
1.4 misha 3951: if (eptr >= md->end_subject)
3952: {
3953: SCHECK_PARTIAL();
1.6 misha 3954: RRETURN(MATCH_NOMATCH);
1.4 misha 3955: }
1.1 misha 3956: GETCHARINC(d, eptr);
1.6 misha 3957: if (fc == d) RRETURN(MATCH_NOMATCH);
1.1 misha 3958: }
3959: }
3960: else
3961: #endif
1.6 misha 3962: /* Not UTF mode */
1.1 misha 3963: {
3964: for (i = 1; i <= min; i++)
1.4 misha 3965: {
3966: if (eptr >= md->end_subject)
3967: {
3968: SCHECK_PARTIAL();
1.6 misha 3969: RRETURN(MATCH_NOMATCH);
1.4 misha 3970: }
1.6 misha 3971: if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
1.4 misha 3972: }
1.1 misha 3973: }
3974:
3975: if (min == max) continue;
3976:
3977: if (minimize)
3978: {
1.6 misha 3979: #ifdef SUPPORT_UTF
3980: if (utf)
1.1 misha 3981: {
1.7 misha 3982: register pcre_uint32 d;
1.1 misha 3983: for (fi = min;; fi++)
3984: {
1.6 misha 3985: RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
1.1 misha 3986: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 3987: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 3988: if (eptr >= md->end_subject)
3989: {
3990: SCHECK_PARTIAL();
1.6 misha 3991: RRETURN(MATCH_NOMATCH);
1.4 misha 3992: }
1.1 misha 3993: GETCHARINC(d, eptr);
1.6 misha 3994: if (fc == d) RRETURN(MATCH_NOMATCH);
1.1 misha 3995: }
3996: }
3997: else
3998: #endif
1.6 misha 3999: /* Not UTF mode */
1.1 misha 4000: {
4001: for (fi = min;; fi++)
4002: {
1.6 misha 4003: RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
1.1 misha 4004: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 4005: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 4006: if (eptr >= md->end_subject)
4007: {
4008: SCHECK_PARTIAL();
1.6 misha 4009: RRETURN(MATCH_NOMATCH);
1.4 misha 4010: }
1.6 misha 4011: if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
1.1 misha 4012: }
4013: }
4014: /* Control never gets here */
4015: }
4016:
4017: /* Maximize case */
4018:
4019: else
4020: {
4021: pp = eptr;
4022:
1.6 misha 4023: #ifdef SUPPORT_UTF
4024: if (utf)
1.1 misha 4025: {
1.7 misha 4026: register pcre_uint32 d;
1.1 misha 4027: for (i = min; i < max; i++)
4028: {
4029: int len = 1;
1.4 misha 4030: if (eptr >= md->end_subject)
4031: {
4032: SCHECK_PARTIAL();
4033: break;
4034: }
1.1 misha 4035: GETCHARLEN(d, eptr, len);
4036: if (fc == d) break;
4037: eptr += len;
4038: }
1.7 misha 4039: if (possessive) continue; /* No backtracking */
1.1 misha 4040: for(;;)
4041: {
1.8 moko 4042: if (eptr <= pp) goto TAIL_RECURSE;
1.6 misha 4043: RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
1.1 misha 4044: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.7 misha 4045: eptr--;
1.1 misha 4046: BACKCHAR(eptr);
4047: }
4048: }
4049: else
4050: #endif
1.6 misha 4051: /* Not UTF mode */
1.1 misha 4052: {
4053: for (i = min; i < max; i++)
4054: {
1.4 misha 4055: if (eptr >= md->end_subject)
4056: {
4057: SCHECK_PARTIAL();
4058: break;
4059: }
4060: if (fc == *eptr) break;
1.1 misha 4061: eptr++;
4062: }
1.7 misha 4063: if (possessive) continue; /* No backtracking */
4064: for (;;)
1.1 misha 4065: {
1.7 misha 4066: if (eptr == pp) goto TAIL_RECURSE;
1.6 misha 4067: RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
1.1 misha 4068: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4069: eptr--;
4070: }
4071: }
1.8 moko 4072: /* Control never gets here */
1.1 misha 4073: }
4074: }
4075: /* Control never gets here */
4076:
4077: /* Match a single character type repeatedly; several different opcodes
4078: share code. This is very similar to the code for single characters, but we
4079: repeat it in the interests of efficiency. */
4080:
4081: case OP_TYPEEXACT:
4082: min = max = GET2(ecode, 1);
4083: minimize = TRUE;
1.6 misha 4084: ecode += 1 + IMM2_SIZE;
1.1 misha 4085: goto REPEATTYPE;
4086:
4087: case OP_TYPEUPTO:
4088: case OP_TYPEMINUPTO:
4089: min = 0;
4090: max = GET2(ecode, 1);
4091: minimize = *ecode == OP_TYPEMINUPTO;
1.6 misha 4092: ecode += 1 + IMM2_SIZE;
1.1 misha 4093: goto REPEATTYPE;
4094:
4095: case OP_TYPEPOSSTAR:
4096: possessive = TRUE;
4097: min = 0;
4098: max = INT_MAX;
4099: ecode++;
4100: goto REPEATTYPE;
4101:
4102: case OP_TYPEPOSPLUS:
4103: possessive = TRUE;
4104: min = 1;
4105: max = INT_MAX;
4106: ecode++;
4107: goto REPEATTYPE;
4108:
4109: case OP_TYPEPOSQUERY:
4110: possessive = TRUE;
4111: min = 0;
4112: max = 1;
4113: ecode++;
4114: goto REPEATTYPE;
4115:
4116: case OP_TYPEPOSUPTO:
4117: possessive = TRUE;
4118: min = 0;
4119: max = GET2(ecode, 1);
1.6 misha 4120: ecode += 1 + IMM2_SIZE;
1.1 misha 4121: goto REPEATTYPE;
4122:
4123: case OP_TYPESTAR:
4124: case OP_TYPEMINSTAR:
4125: case OP_TYPEPLUS:
4126: case OP_TYPEMINPLUS:
4127: case OP_TYPEQUERY:
4128: case OP_TYPEMINQUERY:
4129: c = *ecode++ - OP_TYPESTAR;
4130: minimize = (c & 1) != 0;
4131: min = rep_min[c]; /* Pick up values from tables; */
4132: max = rep_max[c]; /* zero for max => infinity */
4133: if (max == 0) max = INT_MAX;
4134:
4135: /* Common code for all repeated single character type matches. Note that
4136: in UTF mode OP_DIGIT, OP_WHITESPACE and OP_WORDCHAR match only one-unit
4137: characters, whereas '.', the negated types and the space/newline types can match longer ones. */
4138:
4139: REPEATTYPE:
4140: ctype = *ecode++; /* Code for the character type */
4141:
4142: #ifdef SUPPORT_UCP
4143: if (ctype == OP_PROP || ctype == OP_NOTPROP)
4144: {
4145: prop_fail_result = ctype == OP_NOTPROP;
4146: prop_type = *ecode++;
4147: prop_value = *ecode++;
4148: }
4149: else prop_type = -1;
4150: #endif
4151:
4152: /* First, ensure the minimum number of matches are present. Use inline
4153: code for maximizing the speed, and do the type test once at the start
1.4 misha 4154: (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
1.1 misha 4155: is tidier. Also separate the UCP code, which can be the same for both UTF-8
4156: and single-bytes. */
4157:
4158: if (min > 0)
4159: {
4160: #ifdef SUPPORT_UCP
4161: if (prop_type >= 0)
4162: {
4163: switch(prop_type)
4164: {
4165: case PT_ANY:
1.6 misha 4166: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
1.1 misha 4167: for (i = 1; i <= min; i++)
4168: {
1.4 misha 4169: if (eptr >= md->end_subject)
4170: {
4171: SCHECK_PARTIAL();
1.6 misha 4172: RRETURN(MATCH_NOMATCH);
1.4 misha 4173: }
1.1 misha 4174: GETCHARINCTEST(c, eptr);
4175: }
4176: break;
4177:
4178: case PT_LAMP:
4179: for (i = 1; i <= min; i++)
4180: {
1.6 misha 4181: int chartype;
1.4 misha 4182: if (eptr >= md->end_subject)
4183: {
4184: SCHECK_PARTIAL();
1.6 misha 4185: RRETURN(MATCH_NOMATCH);
1.4 misha 4186: }
1.1 misha 4187: GETCHARINCTEST(c, eptr);
1.6 misha 4188: chartype = UCD_CHARTYPE(c);
4189: if ((chartype == ucp_Lu ||
4190: chartype == ucp_Ll ||
4191: chartype == ucp_Lt) == prop_fail_result)
4192: RRETURN(MATCH_NOMATCH);
1.1 misha 4193: }
4194: break;
4195:
4196: case PT_GC:
4197: for (i = 1; i <= min; i++)
4198: {
1.4 misha 4199: if (eptr >= md->end_subject)
4200: {
4201: SCHECK_PARTIAL();
1.6 misha 4202: RRETURN(MATCH_NOMATCH);
1.4 misha 4203: }
1.1 misha 4204: GETCHARINCTEST(c, eptr);
1.6 misha 4205: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4206: RRETURN(MATCH_NOMATCH);
1.1 misha 4207: }
4208: break;
4209:
4210: case PT_PC:
4211: for (i = 1; i <= min; i++)
4212: {
1.4 misha 4213: if (eptr >= md->end_subject)
4214: {
4215: SCHECK_PARTIAL();
1.6 misha 4216: RRETURN(MATCH_NOMATCH);
1.4 misha 4217: }
1.1 misha 4218: GETCHARINCTEST(c, eptr);
1.6 misha 4219: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4220: RRETURN(MATCH_NOMATCH);
1.1 misha 4221: }
4222: break;
4223:
4224: case PT_SC:
4225: for (i = 1; i <= min; i++)
4226: {
1.4 misha 4227: if (eptr >= md->end_subject)
4228: {
4229: SCHECK_PARTIAL();
1.6 misha 4230: RRETURN(MATCH_NOMATCH);
1.4 misha 4231: }
1.1 misha 4232: GETCHARINCTEST(c, eptr);
1.6 misha 4233: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4234: RRETURN(MATCH_NOMATCH);
1.4 misha 4235: }
4236: break;
4237:
4238: case PT_ALNUM:
4239: for (i = 1; i <= min; i++)
4240: {
1.6 misha 4241: int category;
1.4 misha 4242: if (eptr >= md->end_subject)
4243: {
4244: SCHECK_PARTIAL();
1.6 misha 4245: RRETURN(MATCH_NOMATCH);
1.4 misha 4246: }
4247: GETCHARINCTEST(c, eptr);
1.6 misha 4248: category = UCD_CATEGORY(c);
4249: if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4250: RRETURN(MATCH_NOMATCH);
1.4 misha 4251: }
4252: break;
4253:
1.8 moko 4254: /* Perl space used to exclude VT, but from Perl 5.18 it is included,
4255: which means that Perl space and POSIX space are now identical. PCRE
4256: was changed at release 8.34. */
4257:
1.4 misha 4258: case PT_SPACE: /* Perl space */
1.8 moko 4259: case PT_PXSPACE: /* POSIX space */
1.4 misha 4260: for (i = 1; i <= min; i++)
4261: {
4262: if (eptr >= md->end_subject)
4263: {
4264: SCHECK_PARTIAL();
1.6 misha 4265: RRETURN(MATCH_NOMATCH);
1.4 misha 4266: }
4267: GETCHARINCTEST(c, eptr);
1.8 moko 4268: switch(c)
4269: {
4270: HSPACE_CASES:
4271: VSPACE_CASES:
4272: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
4273: break;
1.1 misha 4274:
1.8 moko 4275: default:
4276: if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
4277: RRETURN(MATCH_NOMATCH);
4278: break;
1.4 misha 4279: }
4280: }
4281: break;
4282:
4283: case PT_WORD:
4284: for (i = 1; i <= min; i++)
4285: {
1.6 misha 4286: int category;
1.4 misha 4287: if (eptr >= md->end_subject)
4288: {
4289: SCHECK_PARTIAL();
1.6 misha 4290: RRETURN(MATCH_NOMATCH);
1.4 misha 4291: }
4292: GETCHARINCTEST(c, eptr);
1.6 misha 4293: category = UCD_CATEGORY(c);
4294: if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
1.4 misha 4295: == prop_fail_result)
1.6 misha 4296: RRETURN(MATCH_NOMATCH);
1.4 misha 4297: }
4298: break;
4299:
1.7 misha 4300: case PT_CLIST:
4301: for (i = 1; i <= min; i++)
4302: {
4303: const pcre_uint32 *cp;
4304: if (eptr >= md->end_subject)
4305: {
4306: SCHECK_PARTIAL();
4307: RRETURN(MATCH_NOMATCH);
4308: }
4309: GETCHARINCTEST(c, eptr);
4310: cp = PRIV(ucd_caseless_sets) + prop_value;
4311: for (;;)
4312: {
4313: if (c < *cp)
4314: { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
4315: if (c == *cp++)
4316: { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
4317: }
4318: }
4319: break;
4320:
4321: case PT_UCNC:
4322: for (i = 1; i <= min; i++)
4323: {
4324: if (eptr >= md->end_subject)
4325: {
4326: SCHECK_PARTIAL();
4327: RRETURN(MATCH_NOMATCH);
4328: }
4329: GETCHARINCTEST(c, eptr);
4330: if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
4331: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
4332: c >= 0xe000) == prop_fail_result)
4333: RRETURN(MATCH_NOMATCH);
4334: }
4335: break;
4336:
1.4 misha 4337: /* This should not occur */
4338:
1.1 misha 4339: default:
4340: RRETURN(PCRE_ERROR_INTERNAL);
4341: }
4342: }
4343:
4344: /* Match extended Unicode sequences. We will get here only if the
4345: support is in the binary; otherwise a compile-time error occurs. */
4346:
4347: else if (ctype == OP_EXTUNI)
4348: {
4349: for (i = 1; i <= min; i++)
4350: {
1.4 misha 4351: if (eptr >= md->end_subject)
4352: {
4353: SCHECK_PARTIAL();
1.6 misha 4354: RRETURN(MATCH_NOMATCH);
1.4 misha 4355: }
1.7 misha 4356: else
1.1 misha 4357: {
1.7 misha 4358: int lgb, rgb;
4359: GETCHARINCTEST(c, eptr);
4360: lgb = UCD_GRAPHBREAK(c);
4361: while (eptr < md->end_subject)
4362: {
4363: int len = 1;
4364: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
4365: rgb = UCD_GRAPHBREAK(c);
4366: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
4367: lgb = rgb;
4368: eptr += len;
4369: }
1.1 misha 4370: }
1.7 misha 4371: CHECK_PARTIAL();
1.1 misha 4372: }
4373: }
4374:
4375: else
4376: #endif /* SUPPORT_UCP */
4377:
4378: /* Handle all other cases when the coding is UTF-8 */
4379:
1.6 misha 4380: #ifdef SUPPORT_UTF
4381: if (utf) switch(ctype)
1.1 misha 4382: {
4383: case OP_ANY:
4384: for (i = 1; i <= min; i++)
4385: {
1.4 misha 4386: if (eptr >= md->end_subject)
4387: {
4388: SCHECK_PARTIAL();
1.6 misha 4389: RRETURN(MATCH_NOMATCH);
1.4 misha 4390: }
1.6 misha 4391: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1.7 misha 4392: if (md->partial != 0 &&
4393: eptr + 1 >= md->end_subject &&
4394: NLBLOCK->nltype == NLTYPE_FIXED &&
4395: NLBLOCK->nllen == 2 &&
1.8 moko 4396: UCHAR21(eptr) == NLBLOCK->nl[0])
1.7 misha 4397: {
4398: md->hitend = TRUE;
4399: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4400: }
1.1 misha 4401: eptr++;
1.6 misha 4402: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misha 4403: }
4404: break;
4405:
4406: case OP_ALLANY:
4407: for (i = 1; i <= min; i++)
4408: {
1.4 misha 4409: if (eptr >= md->end_subject)
4410: {
4411: SCHECK_PARTIAL();
1.6 misha 4412: RRETURN(MATCH_NOMATCH);
1.4 misha 4413: }
1.1 misha 4414: eptr++;
1.6 misha 4415: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misha 4416: }
4417: break;
4418:
4419: case OP_ANYBYTE:
1.6 misha 4420: if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
1.1 misha 4421: eptr += min;
4422: break;
4423:
4424: case OP_ANYNL:
4425: for (i = 1; i <= min; i++)
4426: {
1.4 misha 4427: if (eptr >= md->end_subject)
4428: {
4429: SCHECK_PARTIAL();
1.6 misha 4430: RRETURN(MATCH_NOMATCH);
1.4 misha 4431: }
1.1 misha 4432: GETCHARINC(c, eptr);
4433: switch(c)
4434: {
1.6 misha 4435: default: RRETURN(MATCH_NOMATCH);
4436:
1.7 misha 4437: case CHAR_CR:
1.8 moko 4438: if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
1.1 misha 4439: break;
4440:
1.7 misha 4441: case CHAR_LF:
1.1 misha 4442: break;
4443:
1.7 misha 4444: case CHAR_VT:
4445: case CHAR_FF:
4446: case CHAR_NEL:
4447: #ifndef EBCDIC
1.1 misha 4448: case 0x2028:
4449: case 0x2029:
1.7 misha 4450: #endif /* Not EBCDIC */
1.6 misha 4451: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1.1 misha 4452: break;
4453: }
4454: }
4455: break;
4456:
4457: case OP_NOT_HSPACE:
4458: for (i = 1; i <= min; i++)
4459: {
1.4 misha 4460: if (eptr >= md->end_subject)
4461: {
4462: SCHECK_PARTIAL();
1.6 misha 4463: RRETURN(MATCH_NOMATCH);
1.4 misha 4464: }
1.1 misha 4465: GETCHARINC(c, eptr);
4466: switch(c)
4467: {
1.7 misha 4468: HSPACE_CASES: RRETURN(MATCH_NOMATCH); /* Byte and multibyte cases */
1.1 misha 4469: default: break;
4470: }
4471: }
4472: break;
4473:
4474: case OP_HSPACE:
4475: for (i = 1; i <= min; i++)
4476: {
1.4 misha 4477: if (eptr >= md->end_subject)
4478: {
4479: SCHECK_PARTIAL();
1.6 misha 4480: RRETURN(MATCH_NOMATCH);
1.4 misha 4481: }
1.1 misha 4482: GETCHARINC(c, eptr);
4483: switch(c)
4484: {
1.7 misha 4485: HSPACE_CASES: break; /* Byte and multibyte cases */
1.6 misha 4486: default: RRETURN(MATCH_NOMATCH);
1.1 misha 4487: }
4488: }
4489: break;
4490:
4491: case OP_NOT_VSPACE:
4492: for (i = 1; i <= min; i++)
4493: {
1.4 misha 4494: if (eptr >= md->end_subject)
4495: {
4496: SCHECK_PARTIAL();
1.6 misha 4497: RRETURN(MATCH_NOMATCH);
1.4 misha 4498: }
1.1 misha 4499: GETCHARINC(c, eptr);
4500: switch(c)
4501: {
1.7 misha 4502: VSPACE_CASES: RRETURN(MATCH_NOMATCH);
1.1 misha 4503: default: break;
4504: }
4505: }
4506: break;
4507:
4508: case OP_VSPACE:
4509: for (i = 1; i <= min; i++)
4510: {
1.4 misha 4511: if (eptr >= md->end_subject)
4512: {
4513: SCHECK_PARTIAL();
1.6 misha 4514: RRETURN(MATCH_NOMATCH);
1.4 misha 4515: }
1.1 misha 4516: GETCHARINC(c, eptr);
4517: switch(c)
4518: {
1.7 misha 4519: VSPACE_CASES: break;
1.6 misha 4520: default: RRETURN(MATCH_NOMATCH);
1.1 misha 4521: }
4522: }
4523: break;
4524:
4525: case OP_NOT_DIGIT:
4526: for (i = 1; i <= min; i++)
4527: {
1.4 misha 4528: if (eptr >= md->end_subject)
4529: {
4530: SCHECK_PARTIAL();
1.6 misha 4531: RRETURN(MATCH_NOMATCH);
1.4 misha 4532: }
1.1 misha 4533: GETCHARINC(c, eptr);
4534: if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
1.6 misha 4535: RRETURN(MATCH_NOMATCH);
1.1 misha 4536: }
4537: break;
4538:
4539: case OP_DIGIT:
4540: for (i = 1; i <= min; i++)
4541: {
1.7 misha 4542: pcre_uint32 cc;
1.4 misha 4543: if (eptr >= md->end_subject)
4544: {
4545: SCHECK_PARTIAL();
1.6 misha 4546: RRETURN(MATCH_NOMATCH);
1.4 misha 4547: }
1.8 moko 4548: cc = UCHAR21(eptr);
1.7 misha 4549: if (cc >= 128 || (md->ctypes[cc] & ctype_digit) == 0)
1.6 misha 4550: RRETURN(MATCH_NOMATCH);
4551: eptr++;
1.1 misha 4552: /* No need to skip more bytes - we know it's a 1-byte character */
4553: }
4554: break;
4555:
4556: case OP_NOT_WHITESPACE:
4557: for (i = 1; i <= min; i++)
4558: {
1.7 misha 4559: pcre_uint32 cc;
1.4 misha 4560: if (eptr >= md->end_subject)
4561: {
4562: SCHECK_PARTIAL();
1.6 misha 4563: RRETURN(MATCH_NOMATCH);
1.4 misha 4564: }
1.8 moko 4565: cc = UCHAR21(eptr);
1.7 misha 4566: if (cc < 128 && (md->ctypes[cc] & ctype_space) != 0)
1.6 misha 4567: RRETURN(MATCH_NOMATCH);
4568: eptr++;
4569: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misha 4570: }
4571: break;
4572:
4573: case OP_WHITESPACE:
4574: for (i = 1; i <= min; i++)
4575: {
1.7 misha 4576: pcre_uint32 cc;
1.4 misha 4577: if (eptr >= md->end_subject)
4578: {
4579: SCHECK_PARTIAL();
1.6 misha 4580: RRETURN(MATCH_NOMATCH);
1.4 misha 4581: }
1.8 moko 4582: cc = UCHAR21(eptr);
1.7 misha 4583: if (cc >= 128 || (md->ctypes[cc] & ctype_space) == 0)
1.6 misha 4584: RRETURN(MATCH_NOMATCH);
4585: eptr++;
1.1 misha 4586: /* No need to skip more bytes - we know it's a 1-byte character */
4587: }
4588: break;
4589:
4590: case OP_NOT_WORDCHAR:
4591: for (i = 1; i <= min; i++)
4592: {
1.7 misha 4593: pcre_uint32 cc;
1.4 misha 4594: if (eptr >= md->end_subject)
4595: {
4596: SCHECK_PARTIAL();
1.6 misha 4597: RRETURN(MATCH_NOMATCH);
1.4 misha 4598: }
1.8 moko 4599: cc = UCHAR21(eptr);
1.7 misha 4600: if (cc < 128 && (md->ctypes[cc] & ctype_word) != 0)
1.6 misha 4601: RRETURN(MATCH_NOMATCH);
4602: eptr++;
4603: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misha 4604: }
4605: break;
4606:
4607: case OP_WORDCHAR:
4608: for (i = 1; i <= min; i++)
4609: {
1.7 misha 4610: pcre_uint32 cc;
1.4 misha 4611: if (eptr >= md->end_subject)
4612: {
4613: SCHECK_PARTIAL();
1.6 misha 4614: RRETURN(MATCH_NOMATCH);
1.4 misha 4615: }
1.8 moko 4616: cc = UCHAR21(eptr);
1.7 misha 4617: if (cc >= 128 || (md->ctypes[cc] & ctype_word) == 0)
1.6 misha 4618: RRETURN(MATCH_NOMATCH);
4619: eptr++;
1.1 misha 4620: /* No need to skip more bytes - we know it's a 1-byte character */
4621: }
4622: break;
4623:
4624: default:
4625: RRETURN(PCRE_ERROR_INTERNAL);
4626: } /* End switch(ctype) */
4627:
4628: else
1.6 misha 4629: #endif /* SUPPORT_UTF */
1.1 misha 4630:
4631: /* Code for the non-UTF-8 case for minimum matching of operators other
1.4 misha 4632: than OP_PROP and OP_NOTPROP. */
1.1 misha 4633:
4634: switch(ctype)
4635: {
4636: case OP_ANY:
4637: for (i = 1; i <= min; i++)
4638: {
1.4 misha 4639: if (eptr >= md->end_subject)
4640: {
4641: SCHECK_PARTIAL();
1.6 misha 4642: RRETURN(MATCH_NOMATCH);
1.4 misha 4643: }
1.6 misha 4644: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1.7 misha 4645: if (md->partial != 0 &&
4646: eptr + 1 >= md->end_subject &&
4647: NLBLOCK->nltype == NLTYPE_FIXED &&
4648: NLBLOCK->nllen == 2 &&
4649: *eptr == NLBLOCK->nl[0])
4650: {
4651: md->hitend = TRUE;
4652: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
4653: }
1.1 misha 4654: eptr++;
4655: }
4656: break;
4657:
4658: case OP_ALLANY:
1.4 misha 4659: if (eptr > md->end_subject - min)
4660: {
4661: SCHECK_PARTIAL();
1.6 misha 4662: RRETURN(MATCH_NOMATCH);
1.4 misha 4663: }
1.1 misha 4664: eptr += min;
4665: break;
4666:
4667: case OP_ANYBYTE:
1.4 misha 4668: if (eptr > md->end_subject - min)
4669: {
4670: SCHECK_PARTIAL();
1.6 misha 4671: RRETURN(MATCH_NOMATCH);
1.4 misha 4672: }
1.1 misha 4673: eptr += min;
4674: break;
4675:
4676: case OP_ANYNL:
4677: for (i = 1; i <= min; i++)
4678: {
1.4 misha 4679: if (eptr >= md->end_subject)
4680: {
4681: SCHECK_PARTIAL();
1.6 misha 4682: RRETURN(MATCH_NOMATCH);
1.4 misha 4683: }
1.1 misha 4684: switch(*eptr++)
4685: {
1.6 misha 4686: default: RRETURN(MATCH_NOMATCH);
4687:
1.7 misha 4688: case CHAR_CR:
4689: if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
1.1 misha 4690: break;
1.6 misha 4691:
1.7 misha 4692: case CHAR_LF:
1.1 misha 4693: break;
4694:
1.7 misha 4695: case CHAR_VT:
4696: case CHAR_FF:
4697: case CHAR_NEL:
4698: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1.6 misha 4699: case 0x2028:
4700: case 0x2029:
4701: #endif
4702: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1.1 misha 4703: break;
4704: }
4705: }
4706: break;
4707:
4708: case OP_NOT_HSPACE:
4709: for (i = 1; i <= min; i++)
4710: {
1.4 misha 4711: if (eptr >= md->end_subject)
4712: {
4713: SCHECK_PARTIAL();
1.6 misha 4714: RRETURN(MATCH_NOMATCH);
1.4 misha 4715: }
1.1 misha 4716: switch(*eptr++)
4717: {
4718: default: break;
1.7 misha 4719: HSPACE_BYTE_CASES:
4720: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4721: HSPACE_MULTIBYTE_CASES:
1.6 misha 4722: #endif
4723: RRETURN(MATCH_NOMATCH);
1.1 misha 4724: }
4725: }
4726: break;
4727:
4728: case OP_HSPACE:
4729: for (i = 1; i <= min; i++)
4730: {
1.4 misha 4731: if (eptr >= md->end_subject)
4732: {
4733: SCHECK_PARTIAL();
1.6 misha 4734: RRETURN(MATCH_NOMATCH);
1.4 misha 4735: }
1.1 misha 4736: switch(*eptr++)
4737: {
1.6 misha 4738: default: RRETURN(MATCH_NOMATCH);
1.7 misha 4739: HSPACE_BYTE_CASES:
4740: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4741: HSPACE_MULTIBYTE_CASES:
1.6 misha 4742: #endif
1.1 misha 4743: break;
4744: }
4745: }
4746: break;
4747:
4748: case OP_NOT_VSPACE:
4749: for (i = 1; i <= min; i++)
4750: {
1.4 misha 4751: if (eptr >= md->end_subject)
4752: {
4753: SCHECK_PARTIAL();
1.6 misha 4754: RRETURN(MATCH_NOMATCH);
1.4 misha 4755: }
1.1 misha 4756: switch(*eptr++)
4757: {
1.7 misha 4758: VSPACE_BYTE_CASES:
4759: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4760: VSPACE_MULTIBYTE_CASES:
1.6 misha 4761: #endif
4762: RRETURN(MATCH_NOMATCH);
1.7 misha 4763: default: break;
1.1 misha 4764: }
4765: }
4766: break;
4767:
4768: case OP_VSPACE:
4769: for (i = 1; i <= min; i++)
4770: {
1.4 misha 4771: if (eptr >= md->end_subject)
4772: {
4773: SCHECK_PARTIAL();
1.6 misha 4774: RRETURN(MATCH_NOMATCH);
1.4 misha 4775: }
1.1 misha 4776: switch(*eptr++)
4777: {
1.6 misha 4778: default: RRETURN(MATCH_NOMATCH);
1.7 misha 4779: VSPACE_BYTE_CASES:
4780: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
4781: VSPACE_MULTIBYTE_CASES:
1.6 misha 4782: #endif
1.1 misha 4783: break;
4784: }
4785: }
4786: break;
4787:
4788: case OP_NOT_DIGIT:
4789: for (i = 1; i <= min; i++)
1.4 misha 4790: {
4791: if (eptr >= md->end_subject)
4792: {
4793: SCHECK_PARTIAL();
1.6 misha 4794: RRETURN(MATCH_NOMATCH);
1.4 misha 4795: }
1.6 misha 4796: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
4797: RRETURN(MATCH_NOMATCH);
4798: eptr++;
1.4 misha 4799: }
1.1 misha 4800: break;
4801:
4802: case OP_DIGIT:
4803: for (i = 1; i <= min; i++)
1.4 misha 4804: {
4805: if (eptr >= md->end_subject)
4806: {
4807: SCHECK_PARTIAL();
1.6 misha 4808: RRETURN(MATCH_NOMATCH);
1.4 misha 4809: }
1.6 misha 4810: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
4811: RRETURN(MATCH_NOMATCH);
4812: eptr++;
1.4 misha 4813: }
1.1 misha 4814: break;
4815:
4816: case OP_NOT_WHITESPACE:
4817: for (i = 1; i <= min; i++)
1.4 misha 4818: {
4819: if (eptr >= md->end_subject)
4820: {
4821: SCHECK_PARTIAL();
1.6 misha 4822: RRETURN(MATCH_NOMATCH);
1.4 misha 4823: }
1.6 misha 4824: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
4825: RRETURN(MATCH_NOMATCH);
4826: eptr++;
1.4 misha 4827: }
1.1 misha 4828: break;
4829:
4830: case OP_WHITESPACE:
4831: for (i = 1; i <= min; i++)
1.4 misha 4832: {
4833: if (eptr >= md->end_subject)
4834: {
4835: SCHECK_PARTIAL();
1.6 misha 4836: RRETURN(MATCH_NOMATCH);
1.4 misha 4837: }
1.6 misha 4838: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
4839: RRETURN(MATCH_NOMATCH);
4840: eptr++;
1.4 misha 4841: }
1.1 misha 4842: break;
4843:
4844: case OP_NOT_WORDCHAR:
4845: for (i = 1; i <= min; i++)
1.4 misha 4846: {
4847: if (eptr >= md->end_subject)
4848: {
4849: SCHECK_PARTIAL();
1.6 misha 4850: RRETURN(MATCH_NOMATCH);
1.4 misha 4851: }
1.6 misha 4852: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
4853: RRETURN(MATCH_NOMATCH);
4854: eptr++;
1.4 misha 4855: }
1.1 misha 4856: break;
4857:
4858: case OP_WORDCHAR:
4859: for (i = 1; i <= min; i++)
1.4 misha 4860: {
4861: if (eptr >= md->end_subject)
4862: {
4863: SCHECK_PARTIAL();
1.6 misha 4864: RRETURN(MATCH_NOMATCH);
1.4 misha 4865: }
1.6 misha 4866: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
4867: RRETURN(MATCH_NOMATCH);
4868: eptr++;
1.4 misha 4869: }
1.1 misha 4870: break;
4871:
4872: default:
4873: RRETURN(PCRE_ERROR_INTERNAL);
4874: }
4875: }
4876:
4877: /* If min = max, continue at the same level without recursing */
4878:
4879: if (min == max) continue;
4880:
4881: /* If minimizing, we have to test the rest of the pattern before each
4882: subsequent match. Again, separate the UTF-8 case for speed, and also
4883: separate the UCP cases. */
4884:
4885: if (minimize)
4886: {
4887: #ifdef SUPPORT_UCP
4888: if (prop_type >= 0)
4889: {
4890: switch(prop_type)
4891: {
4892: case PT_ANY:
4893: for (fi = min;; fi++)
4894: {
1.6 misha 4895: RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
1.1 misha 4896: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 4897: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 4898: if (eptr >= md->end_subject)
4899: {
4900: SCHECK_PARTIAL();
1.6 misha 4901: RRETURN(MATCH_NOMATCH);
1.4 misha 4902: }
4903: GETCHARINCTEST(c, eptr);
1.6 misha 4904: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
1.1 misha 4905: }
4906: /* Control never gets here */
4907:
4908: case PT_LAMP:
4909: for (fi = min;; fi++)
4910: {
1.6 misha 4911: int chartype;
4912: RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
1.1 misha 4913: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 4914: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 4915: if (eptr >= md->end_subject)
4916: {
4917: SCHECK_PARTIAL();
1.6 misha 4918: RRETURN(MATCH_NOMATCH);
1.4 misha 4919: }
4920: GETCHARINCTEST(c, eptr);
1.6 misha 4921: chartype = UCD_CHARTYPE(c);
4922: if ((chartype == ucp_Lu ||
4923: chartype == ucp_Ll ||
4924: chartype == ucp_Lt) == prop_fail_result)
4925: RRETURN(MATCH_NOMATCH);
1.1 misha 4926: }
4927: /* Control never gets here */
4928:
4929: case PT_GC:
4930: for (fi = min;; fi++)
4931: {
1.6 misha 4932: RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
1.1 misha 4933: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 4934: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 4935: if (eptr >= md->end_subject)
4936: {
4937: SCHECK_PARTIAL();
1.6 misha 4938: RRETURN(MATCH_NOMATCH);
1.4 misha 4939: }
4940: GETCHARINCTEST(c, eptr);
1.6 misha 4941: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
4942: RRETURN(MATCH_NOMATCH);
1.1 misha 4943: }
4944: /* Control never gets here */
4945:
4946: case PT_PC:
4947: for (fi = min;; fi++)
4948: {
1.6 misha 4949: RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
1.1 misha 4950: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 4951: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 4952: if (eptr >= md->end_subject)
4953: {
4954: SCHECK_PARTIAL();
1.6 misha 4955: RRETURN(MATCH_NOMATCH);
1.4 misha 4956: }
4957: GETCHARINCTEST(c, eptr);
1.6 misha 4958: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
4959: RRETURN(MATCH_NOMATCH);
1.1 misha 4960: }
4961: /* Control never gets here */
4962:
4963: case PT_SC:
4964: for (fi = min;; fi++)
4965: {
1.6 misha 4966: RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
1.1 misha 4967: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 4968: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 4969: if (eptr >= md->end_subject)
4970: {
4971: SCHECK_PARTIAL();
1.6 misha 4972: RRETURN(MATCH_NOMATCH);
1.4 misha 4973: }
4974: GETCHARINCTEST(c, eptr);
1.6 misha 4975: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
4976: RRETURN(MATCH_NOMATCH);
1.4 misha 4977: }
4978: /* Control never gets here */
4979:
4980: case PT_ALNUM:
4981: for (fi = min;; fi++)
4982: {
1.6 misha 4983: int category;
4984: RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
1.4 misha 4985: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 4986: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 4987: if (eptr >= md->end_subject)
4988: {
4989: SCHECK_PARTIAL();
1.6 misha 4990: RRETURN(MATCH_NOMATCH);
1.4 misha 4991: }
4992: GETCHARINCTEST(c, eptr);
1.6 misha 4993: category = UCD_CATEGORY(c);
4994: if ((category == ucp_L || category == ucp_N) == prop_fail_result)
4995: RRETURN(MATCH_NOMATCH);
1.4 misha 4996: }
4997: /* Control never gets here */
4998:
1.8 moko 4999: /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5000: which means that Perl space and POSIX space are now identical. PCRE
5001: was changed at release 8.34. */
5002:
1.4 misha 5003: case PT_SPACE: /* Perl space */
1.8 moko 5004: case PT_PXSPACE: /* POSIX space */
1.4 misha 5005: for (fi = min;; fi++)
5006: {
1.8 moko 5007: RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
1.4 misha 5008: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 5009: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 5010: if (eptr >= md->end_subject)
5011: {
5012: SCHECK_PARTIAL();
1.6 misha 5013: RRETURN(MATCH_NOMATCH);
1.4 misha 5014: }
5015: GETCHARINCTEST(c, eptr);
1.8 moko 5016: switch(c)
5017: {
5018: HSPACE_CASES:
5019: VSPACE_CASES:
5020: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
5021: break;
1.4 misha 5022:
1.8 moko 5023: default:
5024: if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5025: RRETURN(MATCH_NOMATCH);
5026: break;
1.4 misha 5027: }
1.1 misha 5028: }
5029: /* Control never gets here */
5030:
1.4 misha 5031: case PT_WORD:
5032: for (fi = min;; fi++)
5033: {
1.6 misha 5034: int category;
5035: RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
1.4 misha 5036: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 5037: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 5038: if (eptr >= md->end_subject)
5039: {
5040: SCHECK_PARTIAL();
1.6 misha 5041: RRETURN(MATCH_NOMATCH);
1.4 misha 5042: }
5043: GETCHARINCTEST(c, eptr);
1.6 misha 5044: category = UCD_CATEGORY(c);
5045: if ((category == ucp_L ||
5046: category == ucp_N ||
1.4 misha 5047: c == CHAR_UNDERSCORE)
5048: == prop_fail_result)
1.6 misha 5049: RRETURN(MATCH_NOMATCH);
1.4 misha 5050: }
5051: /* Control never gets here */
5052:
1.7 misha 5053: case PT_CLIST:
5054: for (fi = min;; fi++)
5055: {
5056: const pcre_uint32 *cp;
5057: RMATCH(eptr, ecode, offset_top, md, eptrb, RM67);
5058: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5059: if (fi >= max) RRETURN(MATCH_NOMATCH);
5060: if (eptr >= md->end_subject)
5061: {
5062: SCHECK_PARTIAL();
5063: RRETURN(MATCH_NOMATCH);
5064: }
5065: GETCHARINCTEST(c, eptr);
5066: cp = PRIV(ucd_caseless_sets) + prop_value;
5067: for (;;)
5068: {
5069: if (c < *cp)
5070: { if (prop_fail_result) break; else { RRETURN(MATCH_NOMATCH); } }
5071: if (c == *cp++)
5072: { if (prop_fail_result) { RRETURN(MATCH_NOMATCH); } else break; }
5073: }
5074: }
5075: /* Control never gets here */
5076:
5077: case PT_UCNC:
5078: for (fi = min;; fi++)
5079: {
1.8 moko 5080: RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
1.7 misha 5081: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5082: if (fi >= max) RRETURN(MATCH_NOMATCH);
5083: if (eptr >= md->end_subject)
5084: {
5085: SCHECK_PARTIAL();
5086: RRETURN(MATCH_NOMATCH);
5087: }
5088: GETCHARINCTEST(c, eptr);
5089: if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5090: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5091: c >= 0xe000) == prop_fail_result)
5092: RRETURN(MATCH_NOMATCH);
5093: }
5094: /* Control never gets here */
5095:
1.4 misha 5096: /* This should never occur */
1.1 misha 5097: default:
5098: RRETURN(PCRE_ERROR_INTERNAL);
5099: }
5100: }
5101:
5102: /* Match extended Unicode sequences. We will get here only if the
5103: support is in the binary; otherwise a compile-time error occurs. */
5104:
5105: else if (ctype == OP_EXTUNI)
5106: {
5107: for (fi = min;; fi++)
5108: {
1.6 misha 5109: RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
1.1 misha 5110: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 5111: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 5112: if (eptr >= md->end_subject)
5113: {
5114: SCHECK_PARTIAL();
1.6 misha 5115: RRETURN(MATCH_NOMATCH);
1.4 misha 5116: }
1.7 misha 5117: else
1.1 misha 5118: {
1.7 misha 5119: int lgb, rgb;
5120: GETCHARINCTEST(c, eptr);
5121: lgb = UCD_GRAPHBREAK(c);
5122: while (eptr < md->end_subject)
5123: {
5124: int len = 1;
5125: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5126: rgb = UCD_GRAPHBREAK(c);
5127: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5128: lgb = rgb;
5129: eptr += len;
5130: }
1.1 misha 5131: }
1.7 misha 5132: CHECK_PARTIAL();
1.1 misha 5133: }
5134: }
5135: else
5136: #endif /* SUPPORT_UCP */
5137:
1.6 misha 5138: #ifdef SUPPORT_UTF
5139: if (utf)
1.1 misha 5140: {
5141: for (fi = min;; fi++)
5142: {
1.6 misha 5143: RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
1.1 misha 5144: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 5145: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 5146: if (eptr >= md->end_subject)
5147: {
5148: SCHECK_PARTIAL();
1.6 misha 5149: RRETURN(MATCH_NOMATCH);
1.4 misha 5150: }
5151: if (ctype == OP_ANY && IS_NEWLINE(eptr))
1.6 misha 5152: RRETURN(MATCH_NOMATCH);
1.1 misha 5153: GETCHARINC(c, eptr);
5154: switch(ctype)
5155: {
1.7 misha 5156: case OP_ANY: /* This is the non-NL case */
5157: if (md->partial != 0 && /* Take care with CRLF partial */
5158: eptr >= md->end_subject &&
5159: NLBLOCK->nltype == NLTYPE_FIXED &&
5160: NLBLOCK->nllen == 2 &&
5161: c == NLBLOCK->nl[0])
5162: {
5163: md->hitend = TRUE;
5164: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5165: }
5166: break;
5167:
1.1 misha 5168: case OP_ALLANY:
5169: case OP_ANYBYTE:
5170: break;
5171:
5172: case OP_ANYNL:
5173: switch(c)
5174: {
1.6 misha 5175: default: RRETURN(MATCH_NOMATCH);
1.7 misha 5176: case CHAR_CR:
1.8 moko 5177: if (eptr < md->end_subject && UCHAR21(eptr) == CHAR_LF) eptr++;
1.1 misha 5178: break;
1.7 misha 5179:
5180: case CHAR_LF:
1.1 misha 5181: break;
5182:
1.7 misha 5183: case CHAR_VT:
5184: case CHAR_FF:
5185: case CHAR_NEL:
5186: #ifndef EBCDIC
1.1 misha 5187: case 0x2028:
5188: case 0x2029:
1.7 misha 5189: #endif /* Not EBCDIC */
1.6 misha 5190: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1.1 misha 5191: break;
5192: }
5193: break;
5194:
5195: case OP_NOT_HSPACE:
5196: switch(c)
5197: {
1.7 misha 5198: HSPACE_CASES: RRETURN(MATCH_NOMATCH);
1.1 misha 5199: default: break;
5200: }
5201: break;
5202:
5203: case OP_HSPACE:
5204: switch(c)
5205: {
1.7 misha 5206: HSPACE_CASES: break;
1.6 misha 5207: default: RRETURN(MATCH_NOMATCH);
1.1 misha 5208: }
5209: break;
5210:
5211: case OP_NOT_VSPACE:
5212: switch(c)
5213: {
1.7 misha 5214: VSPACE_CASES: RRETURN(MATCH_NOMATCH);
1.1 misha 5215: default: break;
5216: }
5217: break;
5218:
5219: case OP_VSPACE:
5220: switch(c)
5221: {
1.7 misha 5222: VSPACE_CASES: break;
1.6 misha 5223: default: RRETURN(MATCH_NOMATCH);
1.1 misha 5224: }
5225: break;
5226:
5227: case OP_NOT_DIGIT:
5228: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
1.6 misha 5229: RRETURN(MATCH_NOMATCH);
1.1 misha 5230: break;
5231:
5232: case OP_DIGIT:
5233: if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
1.6 misha 5234: RRETURN(MATCH_NOMATCH);
1.1 misha 5235: break;
5236:
5237: case OP_NOT_WHITESPACE:
5238: if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
1.6 misha 5239: RRETURN(MATCH_NOMATCH);
1.1 misha 5240: break;
5241:
5242: case OP_WHITESPACE:
1.6 misha 5243: if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
5244: RRETURN(MATCH_NOMATCH);
1.1 misha 5245: break;
5246:
5247: case OP_NOT_WORDCHAR:
5248: if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
1.6 misha 5249: RRETURN(MATCH_NOMATCH);
1.1 misha 5250: break;
5251:
5252: case OP_WORDCHAR:
5253: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
1.6 misha 5254: RRETURN(MATCH_NOMATCH);
1.1 misha 5255: break;
5256:
5257: default:
5258: RRETURN(PCRE_ERROR_INTERNAL);
5259: }
5260: }
5261: }
5262: else
5263: #endif
1.6 misha 5264: /* Not UTF mode */
1.1 misha 5265: {
5266: for (fi = min;; fi++)
5267: {
1.6 misha 5268: RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
1.1 misha 5269: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 misha 5270: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 5271: if (eptr >= md->end_subject)
5272: {
5273: SCHECK_PARTIAL();
1.6 misha 5274: RRETURN(MATCH_NOMATCH);
1.4 misha 5275: }
5276: if (ctype == OP_ANY && IS_NEWLINE(eptr))
1.6 misha 5277: RRETURN(MATCH_NOMATCH);
1.1 misha 5278: c = *eptr++;
5279: switch(ctype)
5280: {
1.7 misha 5281: case OP_ANY: /* This is the non-NL case */
5282: if (md->partial != 0 && /* Take care with CRLF partial */
5283: eptr >= md->end_subject &&
5284: NLBLOCK->nltype == NLTYPE_FIXED &&
5285: NLBLOCK->nllen == 2 &&
5286: c == NLBLOCK->nl[0])
5287: {
5288: md->hitend = TRUE;
5289: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5290: }
5291: break;
5292:
1.1 misha 5293: case OP_ALLANY:
5294: case OP_ANYBYTE:
5295: break;
5296:
5297: case OP_ANYNL:
5298: switch(c)
5299: {
1.6 misha 5300: default: RRETURN(MATCH_NOMATCH);
1.7 misha 5301: case CHAR_CR:
5302: if (eptr < md->end_subject && *eptr == CHAR_LF) eptr++;
1.1 misha 5303: break;
5304:
1.7 misha 5305: case CHAR_LF:
1.1 misha 5306: break;
5307:
1.7 misha 5308: case CHAR_VT:
5309: case CHAR_FF:
5310: case CHAR_NEL:
5311: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
1.6 misha 5312: case 0x2028:
5313: case 0x2029:
5314: #endif
5315: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1.1 misha 5316: break;
5317: }
5318: break;
5319:
5320: case OP_NOT_HSPACE:
5321: switch(c)
5322: {
5323: default: break;
1.7 misha 5324: HSPACE_BYTE_CASES:
5325: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5326: HSPACE_MULTIBYTE_CASES:
1.6 misha 5327: #endif
5328: RRETURN(MATCH_NOMATCH);
1.1 misha 5329: }
5330: break;
5331:
5332: case OP_HSPACE:
5333: switch(c)
5334: {
1.6 misha 5335: default: RRETURN(MATCH_NOMATCH);
1.7 misha 5336: HSPACE_BYTE_CASES:
5337: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5338: HSPACE_MULTIBYTE_CASES:
1.6 misha 5339: #endif
1.1 misha 5340: break;
5341: }
5342: break;
5343:
5344: case OP_NOT_VSPACE:
5345: switch(c)
5346: {
5347: default: break;
1.7 misha 5348: VSPACE_BYTE_CASES:
5349: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5350: VSPACE_MULTIBYTE_CASES:
1.6 misha 5351: #endif
5352: RRETURN(MATCH_NOMATCH);
1.1 misha 5353: }
5354: break;
5355:
5356: case OP_VSPACE:
5357: switch(c)
5358: {
1.6 misha 5359: default: RRETURN(MATCH_NOMATCH);
1.7 misha 5360: VSPACE_BYTE_CASES:
5361: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
5362: VSPACE_MULTIBYTE_CASES:
1.6 misha 5363: #endif
1.1 misha 5364: break;
5365: }
5366: break;
5367:
5368: case OP_NOT_DIGIT:
1.6 misha 5369: if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
1.1 misha 5370: break;
5371:
5372: case OP_DIGIT:
1.6 misha 5373: if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
1.1 misha 5374: break;
5375:
5376: case OP_NOT_WHITESPACE:
1.6 misha 5377: if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
1.1 misha 5378: break;
5379:
5380: case OP_WHITESPACE:
1.6 misha 5381: if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
1.1 misha 5382: break;
5383:
5384: case OP_NOT_WORDCHAR:
1.6 misha 5385: if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
1.1 misha 5386: break;
5387:
5388: case OP_WORDCHAR:
1.6 misha 5389: if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
1.1 misha 5390: break;
5391:
5392: default:
5393: RRETURN(PCRE_ERROR_INTERNAL);
5394: }
5395: }
5396: }
5397: /* Control never gets here */
5398: }
5399:
5400: /* If maximizing, it is worth using inline code for speed, doing the type
5401: test once at the start (i.e. keep it out of the loop). Again, keep the
5402: UTF-8 and UCP stuff separate. */
5403:
5404: else
5405: {
5406: pp = eptr; /* Remember where we started */
5407:
5408: #ifdef SUPPORT_UCP
5409: if (prop_type >= 0)
5410: {
5411: switch(prop_type)
5412: {
5413: case PT_ANY:
5414: for (i = min; i < max; i++)
5415: {
5416: int len = 1;
1.4 misha 5417: if (eptr >= md->end_subject)
5418: {
5419: SCHECK_PARTIAL();
5420: break;
5421: }
5422: GETCHARLENTEST(c, eptr, len);
1.1 misha 5423: if (prop_fail_result) break;
5424: eptr+= len;
5425: }
5426: break;
5427:
5428: case PT_LAMP:
5429: for (i = min; i < max; i++)
5430: {
1.6 misha 5431: int chartype;
1.1 misha 5432: int len = 1;
1.4 misha 5433: if (eptr >= md->end_subject)
5434: {
5435: SCHECK_PARTIAL();
5436: break;
5437: }
5438: GETCHARLENTEST(c, eptr, len);
1.6 misha 5439: chartype = UCD_CHARTYPE(c);
5440: if ((chartype == ucp_Lu ||
5441: chartype == ucp_Ll ||
5442: chartype == ucp_Lt) == prop_fail_result)
1.1 misha 5443: break;
5444: eptr+= len;
5445: }
5446: break;
5447:
5448: case PT_GC:
5449: for (i = min; i < max; i++)
5450: {
5451: int len = 1;
1.4 misha 5452: if (eptr >= md->end_subject)
5453: {
5454: SCHECK_PARTIAL();
5455: break;
5456: }
5457: GETCHARLENTEST(c, eptr, len);
1.6 misha 5458: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
1.1 misha 5459: eptr+= len;
5460: }
5461: break;
5462:
5463: case PT_PC:
5464: for (i = min; i < max; i++)
5465: {
5466: int len = 1;
1.4 misha 5467: if (eptr >= md->end_subject)
5468: {
5469: SCHECK_PARTIAL();
5470: break;
5471: }
5472: GETCHARLENTEST(c, eptr, len);
1.6 misha 5473: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
1.1 misha 5474: eptr+= len;
5475: }
5476: break;
5477:
5478: case PT_SC:
5479: for (i = min; i < max; i++)
5480: {
5481: int len = 1;
1.4 misha 5482: if (eptr >= md->end_subject)
5483: {
5484: SCHECK_PARTIAL();
5485: break;
5486: }
5487: GETCHARLENTEST(c, eptr, len);
1.6 misha 5488: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
1.1 misha 5489: eptr+= len;
5490: }
5491: break;
1.4 misha 5492:
5493: case PT_ALNUM:
5494: for (i = min; i < max; i++)
5495: {
1.6 misha 5496: int category;
1.4 misha 5497: int len = 1;
5498: if (eptr >= md->end_subject)
5499: {
5500: SCHECK_PARTIAL();
5501: break;
5502: }
5503: GETCHARLENTEST(c, eptr, len);
1.6 misha 5504: category = UCD_CATEGORY(c);
5505: if ((category == ucp_L || category == ucp_N) == prop_fail_result)
1.4 misha 5506: break;
5507: eptr+= len;
5508: }
5509: break;
5510:
1.8 moko 5511: /* Perl space used to exclude VT, but from Perl 5.18 it is included,
5512: which means that Perl space and POSIX space are now identical. PCRE
5513: was changed at release 8.34. */
5514:
1.4 misha 5515: case PT_SPACE: /* Perl space */
1.8 moko 5516: case PT_PXSPACE: /* POSIX space */
1.4 misha 5517: for (i = min; i < max; i++)
5518: {
5519: int len = 1;
5520: if (eptr >= md->end_subject)
5521: {
5522: SCHECK_PARTIAL();
5523: break;
5524: }
5525: GETCHARLENTEST(c, eptr, len);
1.8 moko 5526: switch(c)
5527: {
5528: HSPACE_CASES:
5529: VSPACE_CASES:
5530: if (prop_fail_result) goto ENDLOOP99; /* Break the loop */
1.4 misha 5531: break;
5532:
1.8 moko 5533: default:
5534: if ((UCD_CATEGORY(c) == ucp_Z) == prop_fail_result)
5535: goto ENDLOOP99; /* Break the loop */
1.4 misha 5536: break;
5537: }
5538: eptr+= len;
5539: }
1.8 moko 5540: ENDLOOP99:
1.4 misha 5541: break;
5542:
5543: case PT_WORD:
5544: for (i = min; i < max; i++)
5545: {
1.6 misha 5546: int category;
1.4 misha 5547: int len = 1;
5548: if (eptr >= md->end_subject)
5549: {
5550: SCHECK_PARTIAL();
5551: break;
5552: }
5553: GETCHARLENTEST(c, eptr, len);
1.6 misha 5554: category = UCD_CATEGORY(c);
5555: if ((category == ucp_L || category == ucp_N ||
1.4 misha 5556: c == CHAR_UNDERSCORE) == prop_fail_result)
5557: break;
5558: eptr+= len;
5559: }
5560: break;
5561:
1.7 misha 5562: case PT_CLIST:
5563: for (i = min; i < max; i++)
5564: {
5565: const pcre_uint32 *cp;
5566: int len = 1;
5567: if (eptr >= md->end_subject)
5568: {
5569: SCHECK_PARTIAL();
5570: break;
5571: }
5572: GETCHARLENTEST(c, eptr, len);
5573: cp = PRIV(ucd_caseless_sets) + prop_value;
5574: for (;;)
5575: {
5576: if (c < *cp)
5577: { if (prop_fail_result) break; else goto GOT_MAX; }
5578: if (c == *cp++)
5579: { if (prop_fail_result) goto GOT_MAX; else break; }
5580: }
5581: eptr += len;
5582: }
5583: GOT_MAX:
5584: break;
5585:
5586: case PT_UCNC:
5587: for (i = min; i < max; i++)
5588: {
5589: int len = 1;
5590: if (eptr >= md->end_subject)
5591: {
5592: SCHECK_PARTIAL();
5593: break;
5594: }
5595: GETCHARLENTEST(c, eptr, len);
5596: if ((c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
5597: c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
5598: c >= 0xe000) == prop_fail_result)
5599: break;
5600: eptr += len;
5601: }
5602: break;
5603:
1.4 misha 5604: default:
5605: RRETURN(PCRE_ERROR_INTERNAL);
1.1 misha 5606: }
5607:
5608: /* eptr is now past the end of the maximum run */
5609:
1.7 misha 5610: if (possessive) continue; /* No backtracking */
1.1 misha 5611: for(;;)
5612: {
1.8 moko 5613: if (eptr <= pp) goto TAIL_RECURSE;
1.6 misha 5614: RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
1.1 misha 5615: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.7 misha 5616: eptr--;
1.6 misha 5617: if (utf) BACKCHAR(eptr);
1.1 misha 5618: }
5619: }
5620:
1.8 moko 5621: /* Match extended Unicode grapheme clusters. We will get here only if the
1.1 misha 5622: support is in the binary; otherwise a compile-time error occurs. */
5623:
5624: else if (ctype == OP_EXTUNI)
5625: {
5626: for (i = min; i < max; i++)
5627: {
1.4 misha 5628: if (eptr >= md->end_subject)
5629: {
5630: SCHECK_PARTIAL();
5631: break;
5632: }
1.7 misha 5633: else
1.1 misha 5634: {
1.7 misha 5635: int lgb, rgb;
5636: GETCHARINCTEST(c, eptr);
5637: lgb = UCD_GRAPHBREAK(c);
5638: while (eptr < md->end_subject)
5639: {
5640: int len = 1;
5641: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
5642: rgb = UCD_GRAPHBREAK(c);
5643: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5644: lgb = rgb;
5645: eptr += len;
5646: }
1.1 misha 5647: }
1.7 misha 5648: CHECK_PARTIAL();
1.1 misha 5649: }
5650:
5651: /* eptr is now past the end of the maximum run */
5652:
1.7 misha 5653: if (possessive) continue; /* No backtracking */
1.8 moko 5654:
5655: /* We use <= pp rather than == pp to detect the start of the run while
5656: backtracking because the use of \C in UTF mode can cause BACKCHAR to
5657: move back past pp. This is just palliative; the use of \C in UTF mode
5658: is fraught with danger. */
5659:
1.1 misha 5660: for(;;)
5661: {
1.8 moko 5662: int lgb, rgb;
5663: PCRE_PUCHAR fptr;
5664:
5665: if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
1.6 misha 5666: RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
1.1 misha 5667: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.8 moko 5668:
5669: /* Backtracking over an extended grapheme cluster involves inspecting
5670: the previous two characters (if present) to see if a break is
5671: permitted between them. */
5672:
1.7 misha 5673: eptr--;
1.8 moko 5674: if (!utf) c = *eptr; else
5675: {
5676: BACKCHAR(eptr);
5677: GETCHAR(c, eptr);
5678: }
5679: rgb = UCD_GRAPHBREAK(c);
5680:
5681: for (;;)
1.1 misha 5682: {
1.8 moko 5683: if (eptr <= pp) goto TAIL_RECURSE; /* At start of char run */
5684: fptr = eptr - 1;
5685: if (!utf) c = *fptr; else
1.1 misha 5686: {
1.8 moko 5687: BACKCHAR(fptr);
5688: GETCHAR(c, fptr);
1.1 misha 5689: }
1.8 moko 5690: lgb = UCD_GRAPHBREAK(c);
5691: if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break;
5692: eptr = fptr;
5693: rgb = lgb;
1.1 misha 5694: }
5695: }
5696: }
5697:
5698: else
5699: #endif /* SUPPORT_UCP */
5700:
1.6 misha 5701: #ifdef SUPPORT_UTF
5702: if (utf)
1.1 misha 5703: {
5704: switch(ctype)
5705: {
5706: case OP_ANY:
1.8 moko 5707: for (i = min; i < max; i++)
1.1 misha 5708: {
1.8 moko 5709: if (eptr >= md->end_subject)
1.1 misha 5710: {
1.8 moko 5711: SCHECK_PARTIAL();
5712: break;
1.1 misha 5713: }
1.8 moko 5714: if (IS_NEWLINE(eptr)) break;
5715: if (md->partial != 0 && /* Take care with CRLF partial */
5716: eptr + 1 >= md->end_subject &&
5717: NLBLOCK->nltype == NLTYPE_FIXED &&
5718: NLBLOCK->nllen == 2 &&
5719: UCHAR21(eptr) == NLBLOCK->nl[0])
1.1 misha 5720: {
1.8 moko 5721: md->hitend = TRUE;
5722: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
1.1 misha 5723: }
1.8 moko 5724: eptr++;
5725: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misha 5726: }
5727: break;
5728:
5729: case OP_ALLANY:
5730: if (max < INT_MAX)
5731: {
5732: for (i = min; i < max; i++)
5733: {
1.4 misha 5734: if (eptr >= md->end_subject)
5735: {
5736: SCHECK_PARTIAL();
5737: break;
5738: }
1.1 misha 5739: eptr++;
1.6 misha 5740: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misha 5741: }
5742: }
1.6 misha 5743: else
5744: {
5745: eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5746: SCHECK_PARTIAL();
5747: }
1.1 misha 5748: break;
5749:
5750: /* The byte case is the same as non-UTF8 */
5751:
5752: case OP_ANYBYTE:
5753: c = max - min;
5754: if (c > (unsigned int)(md->end_subject - eptr))
1.4 misha 5755: {
5756: eptr = md->end_subject;
5757: SCHECK_PARTIAL();
5758: }
5759: else eptr += c;
1.1 misha 5760: break;
5761:
5762: case OP_ANYNL:
5763: for (i = min; i < max; i++)
5764: {
5765: int len = 1;
1.4 misha 5766: if (eptr >= md->end_subject)
5767: {
5768: SCHECK_PARTIAL();
5769: break;
5770: }
1.1 misha 5771: GETCHARLEN(c, eptr, len);
1.7 misha 5772: if (c == CHAR_CR)
1.1 misha 5773: {
5774: if (++eptr >= md->end_subject) break;
1.8 moko 5775: if (UCHAR21(eptr) == CHAR_LF) eptr++;
1.1 misha 5776: }
5777: else
5778: {
1.7 misha 5779: if (c != CHAR_LF &&
1.1 misha 5780: (md->bsr_anycrlf ||
1.7 misha 5781: (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5782: #ifndef EBCDIC
5783: && c != 0x2028 && c != 0x2029
5784: #endif /* Not EBCDIC */
5785: )))
1.1 misha 5786: break;
5787: eptr += len;
5788: }
5789: }
5790: break;
5791:
5792: case OP_NOT_HSPACE:
5793: case OP_HSPACE:
5794: for (i = min; i < max; i++)
5795: {
5796: BOOL gotspace;
5797: int len = 1;
1.4 misha 5798: if (eptr >= md->end_subject)
5799: {
5800: SCHECK_PARTIAL();
5801: break;
5802: }
1.1 misha 5803: GETCHARLEN(c, eptr, len);
5804: switch(c)
5805: {
1.7 misha 5806: HSPACE_CASES: gotspace = TRUE; break;
1.1 misha 5807: default: gotspace = FALSE; break;
5808: }
5809: if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5810: eptr += len;
5811: }
5812: break;
5813:
5814: case OP_NOT_VSPACE:
5815: case OP_VSPACE:
5816: for (i = min; i < max; i++)
5817: {
5818: BOOL gotspace;
5819: int len = 1;
1.4 misha 5820: if (eptr >= md->end_subject)
5821: {
5822: SCHECK_PARTIAL();
5823: break;
5824: }
1.1 misha 5825: GETCHARLEN(c, eptr, len);
5826: switch(c)
5827: {
1.7 misha 5828: VSPACE_CASES: gotspace = TRUE; break;
1.1 misha 5829: default: gotspace = FALSE; break;
5830: }
5831: if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5832: eptr += len;
5833: }
5834: break;
5835:
5836: case OP_NOT_DIGIT:
5837: for (i = min; i < max; i++)
5838: {
5839: int len = 1;
1.4 misha 5840: if (eptr >= md->end_subject)
5841: {
5842: SCHECK_PARTIAL();
5843: break;
5844: }
1.1 misha 5845: GETCHARLEN(c, eptr, len);
5846: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5847: eptr+= len;
5848: }
5849: break;
5850:
5851: case OP_DIGIT:
5852: for (i = min; i < max; i++)
5853: {
5854: int len = 1;
1.4 misha 5855: if (eptr >= md->end_subject)
5856: {
5857: SCHECK_PARTIAL();
5858: break;
5859: }
1.1 misha 5860: GETCHARLEN(c, eptr, len);
5861: if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5862: eptr+= len;
5863: }
5864: break;
5865:
5866: case OP_NOT_WHITESPACE:
5867: for (i = min; i < max; i++)
5868: {
5869: int len = 1;
1.4 misha 5870: if (eptr >= md->end_subject)
5871: {
5872: SCHECK_PARTIAL();
5873: break;
5874: }
1.1 misha 5875: GETCHARLEN(c, eptr, len);
5876: if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5877: eptr+= len;
5878: }
5879: break;
5880:
5881: case OP_WHITESPACE:
5882: for (i = min; i < max; i++)
5883: {
5884: int len = 1;
1.4 misha 5885: if (eptr >= md->end_subject)
5886: {
5887: SCHECK_PARTIAL();
5888: break;
5889: }
1.1 misha 5890: GETCHARLEN(c, eptr, len);
5891: if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5892: eptr+= len;
5893: }
5894: break;
5895:
5896: case OP_NOT_WORDCHAR:
5897: for (i = min; i < max; i++)
5898: {
5899: int len = 1;
1.4 misha 5900: if (eptr >= md->end_subject)
5901: {
5902: SCHECK_PARTIAL();
5903: break;
5904: }
1.1 misha 5905: GETCHARLEN(c, eptr, len);
5906: if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5907: eptr+= len;
5908: }
5909: break;
5910:
5911: case OP_WORDCHAR:
5912: for (i = min; i < max; i++)
5913: {
5914: int len = 1;
1.4 misha 5915: if (eptr >= md->end_subject)
5916: {
5917: SCHECK_PARTIAL();
5918: break;
5919: }
1.1 misha 5920: GETCHARLEN(c, eptr, len);
5921: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5922: eptr+= len;
5923: }
5924: break;
5925:
5926: default:
5927: RRETURN(PCRE_ERROR_INTERNAL);
5928: }
5929:
1.7 misha 5930: if (possessive) continue; /* No backtracking */
1.1 misha 5931: for(;;)
5932: {
1.8 moko 5933: if (eptr <= pp) goto TAIL_RECURSE;
1.6 misha 5934: RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
1.1 misha 5935: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.7 misha 5936: eptr--;
1.1 misha 5937: BACKCHAR(eptr);
1.8 moko 5938: if (ctype == OP_ANYNL && eptr > pp && UCHAR21(eptr) == CHAR_NL &&
5939: UCHAR21(eptr - 1) == CHAR_CR) eptr--;
1.1 misha 5940: }
5941: }
5942: else
1.6 misha 5943: #endif /* SUPPORT_UTF */
5944: /* Not UTF mode */
1.1 misha 5945: {
5946: switch(ctype)
5947: {
5948: case OP_ANY:
5949: for (i = min; i < max; i++)
5950: {
1.4 misha 5951: if (eptr >= md->end_subject)
5952: {
5953: SCHECK_PARTIAL();
5954: break;
5955: }
5956: if (IS_NEWLINE(eptr)) break;
1.7 misha 5957: if (md->partial != 0 && /* Take care with CRLF partial */
5958: eptr + 1 >= md->end_subject &&
5959: NLBLOCK->nltype == NLTYPE_FIXED &&
5960: NLBLOCK->nllen == 2 &&
5961: *eptr == NLBLOCK->nl[0])
5962: {
5963: md->hitend = TRUE;
5964: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL);
5965: }
1.1 misha 5966: eptr++;
5967: }
5968: break;
5969:
5970: case OP_ALLANY:
5971: case OP_ANYBYTE:
5972: c = max - min;
5973: if (c > (unsigned int)(md->end_subject - eptr))
1.4 misha 5974: {
5975: eptr = md->end_subject;
5976: SCHECK_PARTIAL();
5977: }
5978: else eptr += c;
1.1 misha 5979: break;
5980:
5981: case OP_ANYNL:
5982: for (i = min; i < max; i++)
5983: {
1.4 misha 5984: if (eptr >= md->end_subject)
5985: {
5986: SCHECK_PARTIAL();
5987: break;
5988: }
1.1 misha 5989: c = *eptr;
1.7 misha 5990: if (c == CHAR_CR)
1.1 misha 5991: {
5992: if (++eptr >= md->end_subject) break;
1.7 misha 5993: if (*eptr == CHAR_LF) eptr++;
1.1 misha 5994: }
5995: else
5996: {
1.7 misha 5997: if (c != CHAR_LF && (md->bsr_anycrlf ||
5998: (c != CHAR_VT && c != CHAR_FF && c != CHAR_NEL
5999: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6000: && c != 0x2028 && c != 0x2029
1.6 misha 6001: #endif
1.7 misha 6002: ))) break;
1.1 misha 6003: eptr++;
6004: }
6005: }
6006: break;
6007:
6008: case OP_NOT_HSPACE:
6009: for (i = min; i < max; i++)
6010: {
1.4 misha 6011: if (eptr >= md->end_subject)
6012: {
6013: SCHECK_PARTIAL();
6014: break;
6015: }
1.7 misha 6016: switch(*eptr)
6017: {
6018: default: eptr++; break;
6019: HSPACE_BYTE_CASES:
6020: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6021: HSPACE_MULTIBYTE_CASES:
1.6 misha 6022: #endif
1.7 misha 6023: goto ENDLOOP00;
6024: }
1.1 misha 6025: }
1.7 misha 6026: ENDLOOP00:
1.1 misha 6027: break;
6028:
6029: case OP_HSPACE:
6030: for (i = min; i < max; i++)
6031: {
1.4 misha 6032: if (eptr >= md->end_subject)
6033: {
6034: SCHECK_PARTIAL();
6035: break;
6036: }
1.7 misha 6037: switch(*eptr)
6038: {
6039: default: goto ENDLOOP01;
6040: HSPACE_BYTE_CASES:
6041: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6042: HSPACE_MULTIBYTE_CASES:
1.6 misha 6043: #endif
1.7 misha 6044: eptr++; break;
6045: }
1.1 misha 6046: }
1.7 misha 6047: ENDLOOP01:
1.1 misha 6048: break;
6049:
6050: case OP_NOT_VSPACE:
6051: for (i = min; i < max; i++)
6052: {
1.4 misha 6053: if (eptr >= md->end_subject)
6054: {
6055: SCHECK_PARTIAL();
6056: break;
6057: }
1.7 misha 6058: switch(*eptr)
6059: {
6060: default: eptr++; break;
6061: VSPACE_BYTE_CASES:
6062: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6063: VSPACE_MULTIBYTE_CASES:
1.6 misha 6064: #endif
1.7 misha 6065: goto ENDLOOP02;
6066: }
1.1 misha 6067: }
1.7 misha 6068: ENDLOOP02:
1.1 misha 6069: break;
6070:
6071: case OP_VSPACE:
6072: for (i = min; i < max; i++)
6073: {
1.4 misha 6074: if (eptr >= md->end_subject)
6075: {
6076: SCHECK_PARTIAL();
6077: break;
6078: }
1.7 misha 6079: switch(*eptr)
6080: {
6081: default: goto ENDLOOP03;
6082: VSPACE_BYTE_CASES:
6083: #if defined COMPILE_PCRE16 || defined COMPILE_PCRE32
6084: VSPACE_MULTIBYTE_CASES:
1.6 misha 6085: #endif
1.7 misha 6086: eptr++; break;
6087: }
1.1 misha 6088: }
1.7 misha 6089: ENDLOOP03:
1.1 misha 6090: break;
6091:
6092: case OP_NOT_DIGIT:
6093: for (i = min; i < max; i++)
6094: {
1.4 misha 6095: if (eptr >= md->end_subject)
6096: {
6097: SCHECK_PARTIAL();
1.1 misha 6098: break;
1.4 misha 6099: }
1.6 misha 6100: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
1.1 misha 6101: eptr++;
6102: }
6103: break;
6104:
6105: case OP_DIGIT:
6106: for (i = min; i < max; i++)
6107: {
1.4 misha 6108: if (eptr >= md->end_subject)
6109: {
6110: SCHECK_PARTIAL();
1.1 misha 6111: break;
1.4 misha 6112: }
1.6 misha 6113: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
1.1 misha 6114: eptr++;
6115: }
6116: break;
6117:
6118: case OP_NOT_WHITESPACE:
6119: for (i = min; i < max; i++)
6120: {
1.4 misha 6121: if (eptr >= md->end_subject)
6122: {
6123: SCHECK_PARTIAL();
1.1 misha 6124: break;
1.4 misha 6125: }
1.6 misha 6126: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
1.1 misha 6127: eptr++;
6128: }
6129: break;
6130:
6131: case OP_WHITESPACE:
6132: for (i = min; i < max; i++)
6133: {
1.4 misha 6134: if (eptr >= md->end_subject)
6135: {
6136: SCHECK_PARTIAL();
1.1 misha 6137: break;
1.4 misha 6138: }
1.6 misha 6139: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
1.1 misha 6140: eptr++;
6141: }
6142: break;
6143:
6144: case OP_NOT_WORDCHAR:
6145: for (i = min; i < max; i++)
6146: {
1.4 misha 6147: if (eptr >= md->end_subject)
6148: {
6149: SCHECK_PARTIAL();
1.1 misha 6150: break;
1.4 misha 6151: }
1.6 misha 6152: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
1.1 misha 6153: eptr++;
6154: }
6155: break;
6156:
6157: case OP_WORDCHAR:
6158: for (i = min; i < max; i++)
6159: {
1.4 misha 6160: if (eptr >= md->end_subject)
6161: {
6162: SCHECK_PARTIAL();
1.1 misha 6163: break;
1.4 misha 6164: }
1.6 misha 6165: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
1.1 misha 6166: eptr++;
6167: }
6168: break;
6169:
6170: default:
6171: RRETURN(PCRE_ERROR_INTERNAL);
6172: }
6173:
1.7 misha 6174: if (possessive) continue; /* No backtracking */
6175: for (;;)
1.1 misha 6176: {
1.7 misha 6177: if (eptr == pp) goto TAIL_RECURSE;
1.6 misha 6178: RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
6179: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misha 6180: eptr--;
1.7 misha 6181: if (ctype == OP_ANYNL && eptr > pp && *eptr == CHAR_LF &&
6182: eptr[-1] == CHAR_CR) eptr--;
1.1 misha 6183: }
6184: }
6185:
1.8 moko 6186: /* Control never gets here */
1.1 misha 6187: }
6188:
6189: /* There's been some horrible disaster. Arrival here can only mean there is
6190: something seriously wrong in the code above or the OP_xxx definitions. */
6191:
6192: default:
6193: DPRINTF(("Unknown opcode %d\n", *ecode));
6194: RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6195: }
6196:
6197: /* Do not stick any code in here without much thought; it is assumed
6198: that "continue" in the code above comes out to here to repeat the main
6199: loop. */
6200:
6201: } /* End of main loop */
6202: /* Control never reaches here */
6203:
6204:
6205: /* When compiling to use the heap rather than the stack for recursive calls to
6206: match(), the RRETURN() macro jumps here. The number that is saved in
6207: frame->Xwhere indicates which label we actually want to return to. */
6208:
6209: #ifdef NO_RECURSE
6210: #define LBL(val) case val: goto L_RM##val;
6211: HEAP_RETURN:
6212: switch (frame->Xwhere)
6213: {
6214: LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6215: LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6216: LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6217: LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
1.6 misha 6218: LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
6219: LBL(65) LBL(66)
6220: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1.8 moko 6221: LBL(20) LBL(21)
1.6 misha 6222: #endif
6223: #ifdef SUPPORT_UTF
1.8 moko 6224: LBL(16) LBL(18)
1.6 misha 6225: LBL(22) LBL(23) LBL(28) LBL(30)
1.1 misha 6226: LBL(32) LBL(34) LBL(42) LBL(46)
6227: #ifdef SUPPORT_UCP
6228: LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
1.8 moko 6229: LBL(59) LBL(60) LBL(61) LBL(62) LBL(67)
1.1 misha 6230: #endif /* SUPPORT_UCP */
1.6 misha 6231: #endif /* SUPPORT_UTF */
1.1 misha 6232: default:
6233: DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
6234: return PCRE_ERROR_INTERNAL;
6235: }
6236: #undef LBL
6237: #endif /* NO_RECURSE */
6238: }
6239:
6240:
6241: /***************************************************************************
6242: ****************************************************************************
6243: RECURSION IN THE match() FUNCTION
6244:
6245: Undefine all the macros that were defined above to handle this. */
6246:
6247: #ifdef NO_RECURSE
/* In the NO_RECURSE (heap-frame) build the names below are macros that were
defined earlier for match(). They have to be removed before the rest of the
file: pcre_exec(), further down, uses several of them (for example length,
flags and pp) as ordinary parameter and variable names. */
6248: #undef eptr
6249: #undef ecode
6250: #undef mstart
6251: #undef offset_top
6252: #undef eptrb
6253: #undef flags
6254:
6255: #undef callpat
6256: #undef charptr
6257: #undef data
6258: #undef next
6259: #undef pp
6260: #undef prev
6261: #undef saved_eptr
6262:
6263: #undef new_recursive
6264:
6265: #undef cur_is_word
6266: #undef condition
6267: #undef prev_is_word
6268:
6269: #undef ctype
6270: #undef length
6271: #undef max
6272: #undef min
6273: #undef number
6274: #undef offset
6275: #undef op
6276: #undef save_capture_last
6277: #undef save_offset1
6278: #undef save_offset2
6279: #undef save_offset3
6280: #undef stacksave
6281:
6282: #undef newptrb
6283:
6284: #endif
6285:
6286: /* These two are defined as macros in both cases */
/* They are therefore removed unconditionally, outside the NO_RECURSE guard. */
6287:
6288: #undef fc
6289: #undef fi
6290:
6291: /***************************************************************************
6292: ***************************************************************************/
6293:
6294:
1.7 misha 6295: #ifdef NO_RECURSE
6296: /*************************************************
6297: * Release allocated heap frames *
6298: *************************************************/
6299:
6300: /* This function releases all the allocated frames. The base frame is on the
6301: machine stack, and so must not be freed.
6302:
6303: Argument: the address of the base frame
6304: Returns: nothing
6305: */
6306:
6307: static void
6308: release_match_heapframes (heapframe *frame_base)
6309: {
6310: heapframe *nextframe = frame_base->Xnextframe;
6311: while (nextframe != NULL)
6312: {
6313: heapframe *oldframe = nextframe;
6314: nextframe = nextframe->Xnextframe;
6315: (PUBL(stack_free))(oldframe);
6316: }
6317: }
6318: #endif
6319:
1.1 misha 6320:
6321: /*************************************************
6322: * Execute a Regular Expression *
6323: *************************************************/
6324:
6325: /* This function applies a compiled re to a subject string and picks out
6326: portions of the string if it matches. Two elements in the vector are set for
6327: each substring: the offsets to the start and end of the substring.
6328:
6329: Arguments:
6330: argument_re points to the compiled expression
6331: extra_data points to extra data or is NULL
6332: subject points to the subject string
6333: length length of subject string (may contain binary zeros)
6334: start_offset where to start in the subject string
6335: options option bits
6336: offsets points to a vector of ints to be filled in with offsets
6337: offsetcount the number of elements in the vector
6338:
6339: Returns: > 0 => success; value is the number of elements filled in
6340: = 0 => success, but offsets is not big enough
6341: -1 => failed to match
6342: < -1 => some kind of unexpected problem
6343: */
6344:
1.7 misha 6345: #if defined COMPILE_PCRE8
1.2 misha 6346: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
1.1 misha 6347: pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6348: PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6349: int offsetcount)
1.7 misha 6350: #elif defined COMPILE_PCRE16
1.6 misha 6351: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6352: pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
6353: PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
6354: int offsetcount)
1.7 misha 6355: #elif defined COMPILE_PCRE32
6356: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
6357: pcre32_exec(const pcre32 *argument_re, const pcre32_extra *extra_data,
6358: PCRE_SPTR32 subject, int length, int start_offset, int options, int *offsets,
6359: int offsetcount)
1.6 misha 6360: #endif
1.1 misha 6361: {
1.6 misha 6362: int rc, ocount, arg_offset_max;
1.1 misha 6363: int newline;
6364: BOOL using_temporary_offsets = FALSE;
6365: BOOL anchored;
6366: BOOL startline;
6367: BOOL firstline;
1.6 misha 6368: BOOL utf;
6369: BOOL has_first_char = FALSE;
6370: BOOL has_req_char = FALSE;
6371: pcre_uchar first_char = 0;
6372: pcre_uchar first_char2 = 0;
6373: pcre_uchar req_char = 0;
6374: pcre_uchar req_char2 = 0;
1.1 misha 6375: match_data match_block;
6376: match_data *md = &match_block;
1.6 misha 6377: const pcre_uint8 *tables;
6378: const pcre_uint8 *start_bits = NULL;
6379: PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
6380: PCRE_PUCHAR end_subject;
6381: PCRE_PUCHAR start_partial = NULL;
1.8 moko 6382: PCRE_PUCHAR match_partial = NULL;
1.6 misha 6383: PCRE_PUCHAR req_char_ptr = start_match - 1;
1.1 misha 6384:
6385: const pcre_study_data *study;
1.6 misha 6386: const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
6387:
1.7 misha 6388: #ifdef NO_RECURSE
6389: heapframe frame_zero;
6390: frame_zero.Xprevframe = NULL; /* Marks the top level */
6391: frame_zero.Xnextframe = NULL; /* None are allocated yet */
6392: md->match_frames_base = &frame_zero;
6393: #endif
6394:
1.6 misha 6395: /* Check for the special magic call that measures the size of the stack used
1.7 misha 6396: per recursive call of match(). Without the funny casting for sizeof, a Windows
6397: compiler gave this error: "unary minus operator applied to unsigned type,
6398: result still unsigned". Hopefully the cast fixes that. */
1.1 misha 6399:
1.6 misha 6400: if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
6401: start_offset == -999)
6402: #ifdef NO_RECURSE
1.7 misha 6403: return -((int)sizeof(heapframe));
1.6 misha 6404: #else
6405: return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
6406: #endif
1.1 misha 6407:
6408: /* Plausibility checks */
6409:
6410: if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
1.6 misha 6411: if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
6412: return PCRE_ERROR_NULL;
1.1 misha 6413: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
1.7 misha 6414: if (length < 0) return PCRE_ERROR_BADLENGTH;
1.5 misha 6415: if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
1.1 misha 6416:
1.6 misha 6417: /* Check that the first field in the block is the magic number. If it is not,
6418: return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
6419: REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
6420: means that the pattern is likely compiled with different endianness. */
6421:
6422: if (re->magic_number != MAGIC_NUMBER)
6423: return re->magic_number == REVERSED_MAGIC_NUMBER?
6424: PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
6425: if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
6426:
6427: /* These two settings are used in the code for checking a UTF-8 string that
6428: follows immediately afterwards. Other values in the md block are used only
6429: during "normal" pcre_exec() processing, not when the JIT support is in use,
6430: so they are set up later. */
6431:
6432: /* PCRE_UTF16 has the same value as PCRE_UTF8. */
6433: utf = md->utf = (re->options & PCRE_UTF8) != 0;
6434: md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
6435: ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
6436:
6437: /* Check a UTF-8 string if required. Pass back the character offset and error
6438: code for an invalid string if a results vector is available. */
6439:
6440: #ifdef SUPPORT_UTF
6441: if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
6442: {
6443: int erroroffset;
6444: int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
6445: if (errorcode != 0)
6446: {
6447: if (offsetcount >= 2)
6448: {
6449: offsets[0] = erroroffset;
6450: offsets[1] = errorcode;
6451: }
1.7 misha 6452: #if defined COMPILE_PCRE8
6453: return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
6454: PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
6455: #elif defined COMPILE_PCRE16
1.6 misha 6456: return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
6457: PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
1.7 misha 6458: #elif defined COMPILE_PCRE32
6459: return PCRE_ERROR_BADUTF32;
1.6 misha 6460: #endif
6461: }
1.7 misha 6462: #if defined COMPILE_PCRE8 || defined COMPILE_PCRE16
1.6 misha 6463: /* Check that a start_offset points to the start of a UTF character. */
6464: if (start_offset > 0 && start_offset < length &&
6465: NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
6466: return PCRE_ERROR_BADUTF8_OFFSET;
1.7 misha 6467: #endif
1.6 misha 6468: }
6469: #endif
6470:
6471: /* If the pattern was successfully studied with JIT support, run the JIT
6472: executable instead of the rest of this function. Most options must be set at
6473: compile time for the JIT code to be usable. Fall back to the normal code path if
1.7 misha 6474: an unsupported flag is set. */
1.6 misha 6475:
6476: #ifdef SUPPORT_JIT
6477: if (extra_data != NULL
1.7 misha 6478: && (extra_data->flags & (PCRE_EXTRA_EXECUTABLE_JIT |
6479: PCRE_EXTRA_TABLES)) == PCRE_EXTRA_EXECUTABLE_JIT
1.6 misha 6480: && extra_data->executable_jit != NULL
1.7 misha 6481: && (options & ~PUBLIC_JIT_EXEC_OPTIONS) == 0)
6482: {
6483: rc = PRIV(jit_exec)(extra_data, (const pcre_uchar *)subject, length,
6484: start_offset, options, offsets, offsetcount);
6485:
6486: /* PCRE_ERROR_JIT_BADOPTION means that the selected normal or partial matching
6487: mode is not compiled. In this case we simply fall back to the interpreter. */
6488:
6489: if (rc != PCRE_ERROR_JIT_BADOPTION) return rc;
6490: }
1.6 misha 6491: #endif
1.4 misha 6492:
1.6 misha 6493: /* Carry on with non-JIT matching. This information is for finding all the
6494: numbers associated with a given name, for condition testing. */
6495:
6496: md->name_table = (pcre_uchar *)re + re->name_table_offset;
1.4 misha 6497: md->name_count = re->name_count;
6498: md->name_entry_size = re->name_entry_size;
6499:
1.1 misha 6500: /* Fish out the optional data from the extra_data structure, first setting
6501: the default values. */
6502:
6503: study = NULL;
6504: md->match_limit = MATCH_LIMIT;
6505: md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6506: md->callout_data = NULL;
6507:
6508: /* The table pointer is always in native byte order. */
6509:
1.6 misha 6510: tables = re->tables;
1.1 misha 6511:
1.7 misha 6512: /* The two limit values override the defaults, whatever their value. */
6513:
1.1 misha 6514: if (extra_data != NULL)
6515: {
1.8 moko 6516: unsigned long int flags = extra_data->flags;
1.1 misha 6517: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6518: study = (const pcre_study_data *)extra_data->study_data;
6519: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6520: md->match_limit = extra_data->match_limit;
6521: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6522: md->match_limit_recursion = extra_data->match_limit_recursion;
6523: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6524: md->callout_data = extra_data->callout_data;
6525: if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6526: }
6527:
1.7 misha 6528: /* Limits in the regex override only if they are smaller. */
6529:
6530: if ((re->flags & PCRE_MLSET) != 0 && re->limit_match < md->match_limit)
6531: md->match_limit = re->limit_match;
6532:
6533: if ((re->flags & PCRE_RLSET) != 0 &&
6534: re->limit_recursion < md->match_limit_recursion)
6535: md->match_limit_recursion = re->limit_recursion;
6536:
1.1 misha 6537: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6538: is a feature that makes it possible to save compiled regex and re-use them
6539: in other programs later. */
6540:
1.6 misha 6541: if (tables == NULL) tables = PRIV(default_tables);
1.1 misha 6542:
6543: /* Set up other data */
6544:
6545: anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6546: startline = (re->flags & PCRE_STARTLINE) != 0;
6547: firstline = (re->options & PCRE_FIRSTLINE) != 0;
6548:
6549: /* The code starts after the real_pcre block and the capture name table. */
6550:
1.6 misha 6551: md->start_code = (const pcre_uchar *)re + re->name_table_offset +
1.1 misha 6552: re->name_count * re->name_entry_size;
6553:
1.6 misha 6554: md->start_subject = (PCRE_PUCHAR)subject;
1.1 misha 6555: md->start_offset = start_offset;
6556: md->end_subject = md->start_subject + length;
6557: end_subject = md->end_subject;
6558:
6559: md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
1.4 misha 6560: md->use_ucp = (re->options & PCRE_UCP) != 0;
1.1 misha 6561: md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
1.7 misha 6562: md->ignore_skip_arg = 0;
1.6 misha 6563:
6564: /* Some options are unpacked into BOOL variables in the hope that testing
6565: them will be faster than individual option bits. */
1.1 misha 6566:
6567: md->notbol = (options & PCRE_NOTBOL) != 0;
6568: md->noteol = (options & PCRE_NOTEOL) != 0;
6569: md->notempty = (options & PCRE_NOTEMPTY) != 0;
1.4 misha 6570: md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
1.6 misha 6571:
1.1 misha 6572: md->hitend = FALSE;
1.6 misha 6573: md->mark = md->nomatch_mark = NULL; /* In case never set */
1.1 misha 6574:
6575: md->recursive = NULL; /* No recursion at top level */
1.6 misha 6576: md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
1.1 misha 6577:
6578: md->lcc = tables + lcc_offset;
1.6 misha 6579: md->fcc = tables + fcc_offset;
1.1 misha 6580: md->ctypes = tables + ctypes_offset;
6581:
6582: /* Handle different \R options. */
6583:
6584: switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6585: {
6586: case 0:
6587: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6588: md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6589: else
6590: #ifdef BSR_ANYCRLF
6591: md->bsr_anycrlf = TRUE;
6592: #else
6593: md->bsr_anycrlf = FALSE;
6594: #endif
6595: break;
6596:
6597: case PCRE_BSR_ANYCRLF:
6598: md->bsr_anycrlf = TRUE;
6599: break;
6600:
6601: case PCRE_BSR_UNICODE:
6602: md->bsr_anycrlf = FALSE;
6603: break;
6604:
6605: default: return PCRE_ERROR_BADNEWLINE;
6606: }
6607:
6608: /* Handle different types of newline. The three bits give eight cases. If
6609: nothing is set at run time, whatever was used at compile time applies. */
6610:
6611: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6612: (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6613: {
6614: case 0: newline = NEWLINE; break; /* Compile-time default */
1.3 misha 6615: case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6616: case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
1.1 misha 6617: case PCRE_NEWLINE_CR+
1.3 misha 6618: PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
1.1 misha 6619: case PCRE_NEWLINE_ANY: newline = -1; break;
6620: case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6621: default: return PCRE_ERROR_BADNEWLINE;
6622: }
6623:
6624: if (newline == -2)
6625: {
6626: md->nltype = NLTYPE_ANYCRLF;
6627: }
6628: else if (newline < 0)
6629: {
6630: md->nltype = NLTYPE_ANY;
6631: }
6632: else
6633: {
6634: md->nltype = NLTYPE_FIXED;
6635: if (newline > 255)
6636: {
6637: md->nllen = 2;
6638: md->nl[0] = (newline >> 8) & 255;
6639: md->nl[1] = newline & 255;
6640: }
6641: else
6642: {
6643: md->nllen = 1;
6644: md->nl[0] = newline;
6645: }
6646: }
6647:
1.4 misha 6648: /* Partial matching was originally supported only for a restricted set of
6649: regexes; from release 8.00 there are no restrictions, but the bits are still
6650: defined (though never set). So there's no harm in leaving this code. */
1.1 misha 6651:
6652: if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6653: return PCRE_ERROR_BADPARTIAL;
6654:
6655: /* If the expression has got more back references than the offsets supplied can
6656: hold, we get a temporary chunk of working store to use during the matching.
6657: Otherwise, we can use the vector supplied, rounding down its size to a multiple
6658: of 3. */
6659:
6660: ocount = offsetcount - (offsetcount % 3);
1.6 misha 6661: arg_offset_max = (2*ocount)/3;
1.1 misha 6662:
6663: if (re->top_backref > 0 && re->top_backref >= ocount/3)
6664: {
6665: ocount = re->top_backref * 3 + 3;
1.6 misha 6666: md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
1.1 misha 6667: if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6668: using_temporary_offsets = TRUE;
6669: DPRINTF(("Got memory to hold back references\n"));
6670: }
6671: else md->offset_vector = offsets;
6672: md->offset_end = ocount;
6673: md->offset_max = (2*ocount)/3;
1.7 misha 6674: md->capture_last = 0;
1.1 misha 6675:
6676: /* Reset the working variable associated with each extraction. These should
6677: never be used unless previously set, but they get saved and restored, and so we
1.6 misha 6678: initialize them to avoid reading uninitialized locations. Also, unset the
6679: offsets for the matched string. This is really just for tidiness with callouts,
6680: in case they inspect these fields. */
1.1 misha 6681:
6682: if (md->offset_vector != NULL)
6683: {
6684: register int *iptr = md->offset_vector + ocount;
1.6 misha 6685: register int *iend = iptr - re->top_bracket;
6686: if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
1.1 misha 6687: while (--iptr >= iend) *iptr = -1;
1.9 ! moko 6688: if (offsetcount > 0) md->offset_vector[0] = -1;
! 6689: if (offsetcount > 1) md->offset_vector[1] = -1;
1.1 misha 6690: }
6691:
1.6 misha 6692: /* Set up the first character to match, if available. The first_char value is
1.1 misha 6693: never set for an anchored regular expression, but the anchoring may be forced
6694: at run time, so we have to test for anchoring. The first char may be unset for
6695: an unanchored pattern, of course. If there's no first char and the pattern was
6696: studied, there may be a bitmap of possible first characters. */
6697:
6698: if (!anchored)
6699: {
6700: if ((re->flags & PCRE_FIRSTSET) != 0)
6701: {
1.6 misha 6702: has_first_char = TRUE;
6703: first_char = first_char2 = (pcre_uchar)(re->first_char);
6704: if ((re->flags & PCRE_FCH_CASELESS) != 0)
6705: {
6706: first_char2 = TABLE_GET(first_char, md->fcc, first_char);
6707: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6708: if (utf && first_char > 127)
6709: first_char2 = UCD_OTHERCASE(first_char);
6710: #endif
6711: }
1.1 misha 6712: }
6713: else
6714: if (!startline && study != NULL &&
1.4 misha 6715: (study->flags & PCRE_STUDY_MAPPED) != 0)
1.1 misha 6716: start_bits = study->start_bits;
6717: }
6718:
6719: /* For anchored or unanchored matches, there may be a "last known required
6720: character" set. */
6721:
6722: if ((re->flags & PCRE_REQCHSET) != 0)
6723: {
1.6 misha 6724: has_req_char = TRUE;
6725: req_char = req_char2 = (pcre_uchar)(re->req_char);
6726: if ((re->flags & PCRE_RCH_CASELESS) != 0)
6727: {
6728: req_char2 = TABLE_GET(req_char, md->fcc, req_char);
6729: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
6730: if (utf && req_char > 127)
6731: req_char2 = UCD_OTHERCASE(req_char);
6732: #endif
6733: }
1.1 misha 6734: }
6735:
6736:
6737: /* ==========================================================================*/
6738:
6739: /* Loop for handling unanchored repeated matching attempts; for anchored regexs
6740: the loop runs just once. */
6741:
6742: for(;;)
6743: {
1.6 misha 6744: PCRE_PUCHAR save_end_subject = end_subject;
6745: PCRE_PUCHAR new_start_match;
1.1 misha 6746:
1.3 misha 6747: /* If firstline is TRUE, the start of the match is constrained to the first
6748: line of a multiline string. That is, the match must be before or at the first
6749: newline. Implement this by temporarily adjusting end_subject so that we stop
6750: scanning at a newline. If the match fails at the newline, later code breaks
6751: this loop. */
1.1 misha 6752:
6753: if (firstline)
6754: {
1.6 misha 6755: PCRE_PUCHAR t = start_match;
6756: #ifdef SUPPORT_UTF
6757: if (utf)
1.2 misha 6758: {
6759: while (t < md->end_subject && !IS_NEWLINE(t))
6760: {
6761: t++;
1.6 misha 6762: ACROSSCHAR(t < end_subject, *t, t++);
1.2 misha 6763: }
6764: }
6765: else
6766: #endif
1.1 misha 6767: while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6768: end_subject = t;
6769: }
6770:
1.3 misha 6771: /* There are some optimizations that avoid running the match if a known
6772: starting point is not found, or if a known later character is not present.
6773: However, there is an option that disables these, for testing and for ensuring
1.5 misha 6774: that all callouts do actually occur. The option can be set in the regex by
6775: (*NO_START_OPT) or passed in match-time options. */
1.1 misha 6776:
1.5 misha 6777: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
1.1 misha 6778: {
1.6 misha 6779: /* Advance to a unique first char if there is one. */
1.3 misha 6780:
1.6 misha 6781: if (has_first_char)
1.3 misha 6782: {
1.7 misha 6783: pcre_uchar smc;
6784:
1.6 misha 6785: if (first_char != first_char2)
6786: while (start_match < end_subject &&
1.8 moko 6787: (smc = UCHAR21TEST(start_match)) != first_char && smc != first_char2)
1.3 misha 6788: start_match++;
6789: else
1.8 moko 6790: while (start_match < end_subject && UCHAR21TEST(start_match) != first_char)
1.3 misha 6791: start_match++;
6792: }
1.1 misha 6793:
1.3 misha 6794: /* Or to just after a linebreak for a multiline match */
1.1 misha 6795:
1.3 misha 6796: else if (startline)
1.1 misha 6797: {
1.3 misha 6798: if (start_match > md->start_subject + start_offset)
6799: {
1.6 misha 6800: #ifdef SUPPORT_UTF
6801: if (utf)
1.2 misha 6802: {
1.3 misha 6803: while (start_match < end_subject && !WAS_NEWLINE(start_match))
6804: {
1.2 misha 6805: start_match++;
1.6 misha 6806: ACROSSCHAR(start_match < end_subject, *start_match,
6807: start_match++);
1.3 misha 6808: }
1.2 misha 6809: }
1.3 misha 6810: else
1.2 misha 6811: #endif
1.3 misha 6812: while (start_match < end_subject && !WAS_NEWLINE(start_match))
6813: start_match++;
1.1 misha 6814:
1.3 misha 6815: /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6816: and we are now at a LF, advance the match position by one more character.
6817: */
6818:
6819: if (start_match[-1] == CHAR_CR &&
6820: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6821: start_match < end_subject &&
1.8 moko 6822: UCHAR21TEST(start_match) == CHAR_NL)
1.3 misha 6823: start_match++;
6824: }
1.1 misha 6825: }
6826:
1.3 misha 6827: /* Or to a non-unique first byte after study */
1.1 misha 6828:
1.3 misha 6829: else if (start_bits != NULL)
1.1 misha 6830: {
1.3 misha 6831: while (start_match < end_subject)
6832: {
1.8 moko 6833: register pcre_uint32 c = UCHAR21TEST(start_match);
1.6 misha 6834: #ifndef COMPILE_PCRE8
6835: if (c > 255) c = 255;
6836: #endif
1.8 moko 6837: if ((start_bits[c/8] & (1 << (c&7))) != 0) break;
6838: start_match++;
1.3 misha 6839: }
1.1 misha 6840: }
1.3 misha 6841: } /* Starting optimizations */
1.1 misha 6842:
6843: /* Restore fudged end_subject */
6844:
6845: end_subject = save_end_subject;
6846:
1.4 misha 6847: /* The following two optimizations are disabled for partial matching or if
6848: disabling is explicitly requested. */
1.1 misha 6849:
1.6 misha 6850: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
1.4 misha 6851: {
6852: /* If the pattern was studied, a minimum subject length may be set. This is
6853: a lower bound; no actual string of that length may actually match the
6854: pattern. Although the value is, strictly, in characters, we treat it as
6855: bytes to avoid spending too much time in this optimization. */
1.1 misha 6856:
1.4 misha 6857: if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6858: (pcre_uint32)(end_subject - start_match) < study->minlength)
6859: {
6860: rc = MATCH_NOMATCH;
6861: break;
6862: }
1.1 misha 6863:
1.6 misha 6864: /* If req_char is set, we know that that character must appear in the
6865: subject for the match to succeed. If the first character is set, req_char
1.4 misha 6866: must be later in the subject; otherwise the test starts at the match point.
6867: This optimization can save a huge amount of backtracking in patterns with
6868: nested unlimited repeats that aren't going to match. Writing separate code
6869: for cased/caseless versions makes it go faster, as does using an
6870: autoincrement and backing off on a match.
1.1 misha 6871:
1.4 misha 6872: HOWEVER: when the subject string is very, very long, searching to its end
6873: can take a long time, and give bad performance on quite ordinary patterns.
6874: This showed up when somebody was matching something like /^\d+C/ on a
6875: 32-megabyte string... so we don't do this when the string is sufficiently
6876: long. */
1.1 misha 6877:
1.6 misha 6878: if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
1.1 misha 6879: {
1.6 misha 6880: register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
1.4 misha 6881:
6882: /* We don't need to repeat the search if we haven't yet reached the
6883: place we found it at last time. */
6884:
1.6 misha 6885: if (p > req_char_ptr)
1.1 misha 6886: {
1.6 misha 6887: if (req_char != req_char2)
1.1 misha 6888: {
1.4 misha 6889: while (p < end_subject)
6890: {
1.8 moko 6891: register pcre_uint32 pp = UCHAR21INCTEST(p);
1.6 misha 6892: if (pp == req_char || pp == req_char2) { p--; break; }
1.4 misha 6893: }
1.1 misha 6894: }
1.4 misha 6895: else
1.1 misha 6896: {
1.4 misha 6897: while (p < end_subject)
6898: {
1.8 moko 6899: if (UCHAR21INCTEST(p) == req_char) { p--; break; }
1.4 misha 6900: }
1.1 misha 6901: }
6902:
1.4 misha 6903: /* If we can't find the required character, break the matching loop,
6904: forcing a match failure. */
1.1 misha 6905:
1.4 misha 6906: if (p >= end_subject)
6907: {
6908: rc = MATCH_NOMATCH;
6909: break;
6910: }
1.1 misha 6911:
1.4 misha 6912: /* If we have found the required character, save the point where we
6913: found it, so that we don't search again next time round the loop if
6914: the start hasn't passed this character yet. */
1.1 misha 6915:
1.6 misha 6916: req_char_ptr = p;
1.4 misha 6917: }
1.1 misha 6918: }
6919: }
6920:
1.4 misha 6921: #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6922: printf(">>>> Match against: ");
6923: pchars(start_match, end_subject - start_match, TRUE, md);
6924: printf("\n");
6925: #endif
6926:
6927: /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6928: first starting point for which a partial match was found. */
1.1 misha 6929:
6930: md->start_match_ptr = start_match;
1.4 misha 6931: md->start_used_ptr = start_match;
1.1 misha 6932: md->match_call_count = 0;
1.6 misha 6933: md->match_function_type = 0;
6934: md->end_offset_top = 0;
1.7 misha 6935: md->skip_arg_count = 0;
1.6 misha 6936: rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
1.7 misha 6937: if (md->hitend && start_partial == NULL)
6938: {
6939: start_partial = md->start_used_ptr;
6940: match_partial = start_match;
6941: }
1.1 misha 6942:
6943: switch(rc)
6944: {
1.6 misha 6945: /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6946: the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
6947: entirely. The only way we can do that is to re-do the match at the same
6948: point, with a flag to force SKIP with an argument to be ignored. Just
6949: treating this case as NOMATCH does not work because it does not check other
6950: alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
6951:
6952: case MATCH_SKIP_ARG:
6953: new_start_match = start_match;
1.7 misha 6954: md->ignore_skip_arg = md->skip_arg_count;
1.6 misha 6955: break;
6956:
1.7 misha 6957: /* SKIP passes back the next starting point explicitly, but if it is no
6958: greater than the match we have just done, treat it as NOMATCH. */
1.4 misha 6959:
6960: case MATCH_SKIP:
1.7 misha 6961: if (md->start_match_ptr > start_match)
1.4 misha 6962: {
6963: new_start_match = md->start_match_ptr;
6964: break;
6965: }
6966: /* Fall through */
6967:
1.1 misha 6968: /* NOMATCH and PRUNE advance by one character. THEN at this level acts
1.7 misha 6969: exactly like PRUNE. Unset ignore SKIP-with-argument. */
1.1 misha 6970:
6971: case MATCH_NOMATCH:
6972: case MATCH_PRUNE:
6973: case MATCH_THEN:
1.7 misha 6974: md->ignore_skip_arg = 0;
1.1 misha 6975: new_start_match = start_match + 1;
1.6 misha 6976: #ifdef SUPPORT_UTF
6977: if (utf)
6978: ACROSSCHAR(new_start_match < end_subject, *new_start_match,
6979: new_start_match++);
1.1 misha 6980: #endif
6981: break;
6982:
6983: /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6984:
6985: case MATCH_COMMIT:
6986: rc = MATCH_NOMATCH;
6987: goto ENDLOOP;
6988:
1.4 misha 6989: /* Any other return is either a match, or some kind of error. */
1.1 misha 6990:
6991: default:
6992: goto ENDLOOP;
6993: }
6994:
6995: /* Control reaches here for the various types of "no match at this point"
6996: result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6997:
6998: rc = MATCH_NOMATCH;
6999:
7000: /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
7001: newline in the subject (though it may continue over the newline). Therefore,
7002: if we have just failed to match, starting at a newline, do not continue. */
7003:
7004: if (firstline && IS_NEWLINE(start_match)) break;
7005:
7006: /* Advance to new matching position */
7007:
7008: start_match = new_start_match;
7009:
7010: /* Break the loop if the pattern is anchored or if we have passed the end of
7011: the subject. */
7012:
7013: if (anchored || start_match > end_subject) break;
7014:
7015: /* If we have just passed a CR and we are now at a LF, and the pattern does
7016: not contain any explicit matches for \r or \n, and the newline option is CRLF
1.6 misha 7017: or ANY or ANYCRLF, advance the match position by one more character. In
7018: normal matching start_match will always be greater than the first position at
7019: this stage, but a failed *SKIP can cause a return at the same point, which is
7020: why the first test exists. */
1.1 misha 7021:
1.6 misha 7022: if (start_match > (PCRE_PUCHAR)subject + start_offset &&
7023: start_match[-1] == CHAR_CR &&
1.1 misha 7024: start_match < end_subject &&
1.3 misha 7025: *start_match == CHAR_NL &&
1.1 misha 7026: (re->flags & PCRE_HASCRORLF) == 0 &&
7027: (md->nltype == NLTYPE_ANY ||
7028: md->nltype == NLTYPE_ANYCRLF ||
7029: md->nllen == 2))
7030: start_match++;
7031:
1.4 misha 7032: md->mark = NULL; /* Reset for start of next match attempt */
7033: } /* End of for(;;) "bumpalong" loop */
1.1 misha 7034:
7035: /* ==========================================================================*/
7036:
7037: /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
7038: conditions is true:
7039:
7040: (1) The pattern is anchored or the match was failed by (*COMMIT);
7041:
7042: (2) We are past the end of the subject;
7043:
7044: (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
7045: this option requests that a match occur at or before the first newline in
7046: the subject.
7047:
7048: When we have a match and the offset vector is big enough to deal with any
7049: backreferences, captured substring offsets will already be set up. In the case
7050: where we had to get some local store to hold offsets for backreference
7051: processing, copy those that we can. In this case there need not be overflow if
7052: certain parts of the pattern were not used, even though there are more
7053: capturing parentheses than vector slots. */
7054:
7055: ENDLOOP:
7056:
1.4 misha 7057: if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
1.1 misha 7058: {
7059: if (using_temporary_offsets)
7060: {
1.6 misha 7061: if (arg_offset_max >= 4)
1.1 misha 7062: {
7063: memcpy(offsets + 2, md->offset_vector + 2,
1.6 misha 7064: (arg_offset_max - 2) * sizeof(int));
1.1 misha 7065: DPRINTF(("Copied offsets from temporary memory\n"));
7066: }
1.7 misha 7067: if (md->end_offset_top > arg_offset_max) md->capture_last |= OVFLBIT;
1.1 misha 7068: DPRINTF(("Freeing temporary memory\n"));
1.6 misha 7069: (PUBL(free))(md->offset_vector);
1.1 misha 7070: }
7071:
1.6 misha 7072: /* Set the return code to the number of captured strings, or 0 if there were
1.1 misha 7073: too many to fit into the vector. */
7074:
1.7 misha 7075: rc = ((md->capture_last & OVFLBIT) != 0 &&
7076: md->end_offset_top >= arg_offset_max)?
1.6 misha 7077: 0 : md->end_offset_top/2;
7078:
7079: /* If there is space in the offset vector, set any unused pairs at the end of
7080: the pattern to -1 for backwards compatibility. It is documented that this
7081: happens. In earlier versions, the whole set of potential capturing offsets
7082: was set to -1 each time round the loop, but this is handled differently now.
7083: "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
7084: those at the end that need unsetting here. We can't just unset them all at
7085: the start of the whole thing because they may get set in one branch that is
7086: not the final matching branch. */
7087:
7088: if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
7089: {
7090: register int *iptr, *iend;
7091: int resetcount = 2 + re->top_bracket * 2;
1.7 misha 7092: if (resetcount > offsetcount) resetcount = offsetcount;
1.6 misha 7093: iptr = offsets + md->end_offset_top;
7094: iend = offsets + resetcount;
7095: while (iptr < iend) *iptr++ = -1;
7096: }
1.1 misha 7097:
7098: /* If there is space, set up the whole thing as substring 0. The value of
7099: md->start_match_ptr might be modified if \K was encountered on the successful
7100: matching path. */
7101:
7102: if (offsetcount < 2) rc = 0; else
7103: {
1.4 misha 7104: offsets[0] = (int)(md->start_match_ptr - md->start_subject);
7105: offsets[1] = (int)(md->end_match_ptr - md->start_subject);
1.1 misha 7106: }
7107:
1.6 misha 7108: /* Return MARK data if requested */
7109:
7110: if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
7111: *(extra_data->mark) = (pcre_uchar *)md->mark;
1.1 misha 7112: DPRINTF((">>>> returning %d\n", rc));
1.7 misha 7113: #ifdef NO_RECURSE
7114: release_match_heapframes(&frame_zero);
7115: #endif
1.6 misha 7116: return rc;
1.1 misha 7117: }
7118:
7119: /* Control gets here if there has been an error, or if the overall match
7120: attempt has failed at all permitted starting positions. */
7121:
7122: if (using_temporary_offsets)
7123: {
7124: DPRINTF(("Freeing temporary memory\n"));
1.6 misha 7125: (PUBL(free))(md->offset_vector);
1.1 misha 7126: }
7127:
1.4 misha 7128: /* For anything other than nomatch or partial match, just return the code. */
7129:
7130: if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
1.1 misha 7131: {
7132: DPRINTF((">>>> error: returning %d\n", rc));
1.7 misha 7133: #ifdef NO_RECURSE
7134: release_match_heapframes(&frame_zero);
7135: #endif
1.1 misha 7136: return rc;
7137: }
1.4 misha 7138:
7139: /* Handle partial matches - disable any mark data */
7140:
1.8 moko 7141: if (match_partial != NULL)
1.1 misha 7142: {
7143: DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
1.4 misha 7144: md->mark = NULL;
7145: if (offsetcount > 1)
7146: {
1.6 misha 7147: offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
7148: offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
1.7 misha 7149: if (offsetcount > 2)
7150: offsets[2] = (int)(match_partial - (PCRE_PUCHAR)subject);
1.4 misha 7151: }
7152: rc = PCRE_ERROR_PARTIAL;
1.1 misha 7153: }
1.4 misha 7154:
7155: /* This is the classic nomatch case */
7156:
1.1 misha 7157: else
7158: {
7159: DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
1.4 misha 7160: rc = PCRE_ERROR_NOMATCH;
1.1 misha 7161: }
1.4 misha 7162:
7163: /* Return the MARK data if it has been requested. */
7164:
7165: if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
1.6 misha 7166: *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark;
1.7 misha 7167: #ifdef NO_RECURSE
7168: release_match_heapframes(&frame_zero);
7169: #endif
1.4 misha 7170: return rc;
1.1 misha 7171: }
7172:
7173: /* End of pcre_exec.c */
E-mail: