Annotation of win32/pcre/pcre_exec.c, revision 1.5
1.1 misha 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
1.4 misha 9: Copyright (c) 1997-2010 University of Cambridge
1.1 misha 10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40:
41: /* This module contains pcre_exec(), the externally visible function that does
42: pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43: possible. There are also some static supporting functions. */
44:
45: #ifdef HAVE_CONFIG_H
46: #include "config.h"
47: #endif
48:
49: #define NLBLOCK md /* Block containing newline information */
50: #define PSSTART start_subject /* Field containing processed string start */
51: #define PSEND end_subject /* Field containing processed string end */
52:
53: #include "pcre_internal.h"
54:
55: /* Undefine some potentially clashing cpp symbols */
56:
57: #undef min
58: #undef max
59:
60: /* Flag bits for the match() function */
61:
62: #define match_condassert 0x01 /* Called to check a condition assertion */
63: #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64:
65: /* Non-error returns from the match() function. Error returns are externally
66: defined PCRE_ERROR_xxx codes, which are all negative. */
67:
68: #define MATCH_MATCH 1
69: #define MATCH_NOMATCH 0
70:
71: /* Special internal returns from the match() function. Make them sufficiently
72: negative to avoid the external error codes. */
73:
1.4 misha 74: #define MATCH_ACCEPT (-999) /* NOTE(review): producer is not in this part of the file; presumably (*ACCEPT) - confirm */
75: #define MATCH_COMMIT (-998) /* From OP_COMMIT when the continuation gives NOMATCH, PRUNE, SKIP, SKIP_ARG or THEN */
76: #define MATCH_PRUNE (-997) /* From OP_PRUNE and OP_PRUNE_ARG when the continuation gives NOMATCH or THEN */
77: #define MATCH_SKIP (-996) /* From OP_SKIP (and from OP_MARK on a matching SKIP_ARG name); md->start_match_ptr holds the subject position */
78: #define MATCH_SKIP_ARG (-995) /* From OP_SKIP_ARG; md->start_match_ptr is overloaded to point at the skip name */
79: #define MATCH_THEN (-994) /* From OP_THEN and OP_THEN_ARG; md->start_match_ptr holds ecode - GET(ecode, 1) */
80:
81: /* This is a convenience macro for code that occurs many times. It copies the
current markptr into md->mark and then leaves match() through RRETURN(ra), so
it can only be used where md and markptr are in scope. The expansion is a brace
block, not a single expression. */
82:
83: #define MRRETURN(ra) \
84: { \
85: md->mark = markptr; \
86: RRETURN(ra); \
87: }
1.1 misha 88:
89: /* Maximum number of ints of offset to save on the stack for recursive calls.
90: If the offset vector is bigger, malloc is used. This should be a multiple of 3,
91: because the offset vector is always a multiple of 3 long. This constant is the
dimension of the stacksave array declared in match() and of the Xstacksave
member of the heapframe structure. */
92:
93: #define REC_STACK_SAVE_MAX 30
94:
95: /* Min and max values for the common repeats; for the maxima, 0 => infinity.
Both tables have six entries.
NOTE(review): the code that indexes these tables is not in this part of the
file; presumably the index is an opcode offset from the first repeat opcode -
confirm before changing the order of the entries. */
96:
97: static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
98: static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
99:
100:
101:
1.4 misha 102: #ifdef PCRE_DEBUG
1.1 misha 103: /*************************************************
104: * Debugging function to print chars *
105: *************************************************/
106:
107: /* Print a sequence of chars in printable format, stopping at the end of the
108: subject if requested. Characters for which isprint() is true are written
as themselves; all others are written as a backslash-x escape with two hex
digits. Output is produced with printf(). This function is compiled only when
PCRE_DEBUG is defined.
109:
110: Arguments:
111: p points to characters
112: length number to print
113: is_subject TRUE if printing from within md->start_subject
114: md pointer to matching data block, if is_subject is TRUE
(md is not dereferenced when is_subject is FALSE)
115:
116: Returns: nothing
117: */
118:
119: static void
120: pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
121: {
122: unsigned int c;
/* When printing subject text, clamp the count so that nothing at or beyond
md->end_subject is read. */
123: if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
124: while (length-- > 0)
125: if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
126: }
127: #endif
128:
129:
130:
131: /*************************************************
132: * Match a back-reference *
133: *************************************************/
134:
135: /* Compare the text at eptr with the previously captured text that starts at
md->start_subject + md->offset_vector[offset], over "length" code units.

If a back reference hasn't been set, the length that is passed is greater
136: than the number of characters left in the string, so the match fails.

When PCRE_CASELESS is set in ims, the comparison is caseless: in UTF-8 mode
with Unicode property support it decodes whole characters and accepts either an
exact match or the UCD_OTHERCASE() of the referenced character; otherwise it
compares through the md->lcc table one unit at a time. Without PCRE_CASELESS
the units are compared directly. Neither pointer is passed back to the caller;
only the TRUE/FALSE result is returned.
137:
138: Arguments:
139: offset index into the offset vector
140: eptr points into the subject
141: length length to be matched
142: md points to match data block
143: ims the ims flags
144:
145: Returns: TRUE if matched
146: */
147:
148: static BOOL
149: match_ref(int offset, register USPTR eptr, int length, match_data *md,
150: unsigned long int ims)
151: {
/* p walks over the previously captured text. */
152: USPTR p = md->start_subject + md->offset_vector[offset];
153:
1.4 misha 154: #ifdef PCRE_DEBUG
1.1 misha 155: if (eptr >= md->end_subject)
156: printf("matching subject <null>");
157: else
158: {
159: printf("matching subject ");
160: pchars(eptr, length, TRUE, md);
161: }
162: printf(" against backref ");
/* NOTE(review): this call passes is_subject FALSE, so pchars() does not clamp
length here - confirm that p + length is always readable at this point. */
163: pchars(p, length, FALSE, md);
164: printf("\n");
165: #endif
166:
167: /* Always fail if not enough characters left */
168:
169: if (length > md->end_subject - eptr) return FALSE;
170:
1.2 misha 171: /* Separate the caseless case for speed. In UTF-8 mode we can only do this
172: properly if Unicode properties are supported. Otherwise, we can check only
173: ASCII characters. */
1.1 misha 174:
175: if ((ims & PCRE_CASELESS) != 0)
176: {
1.2 misha 177: #ifdef SUPPORT_UTF8
178: #ifdef SUPPORT_UCP
179: if (md->utf8)
180: {
/* NOTE(review): only eptr is bounded (by eptr + length); p advances by
whatever GETCHARINC consumes. This appears to assume that a character and its
other-case form occupy the same number of units - confirm. */
181: USPTR endptr = eptr + length;
182: while (eptr < endptr)
183: {
184: int c, d;
185: GETCHARINC(c, eptr);
186: GETCHARINC(d, p);
187: if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
188: }
189: }
190: else
191: #endif
192: #endif
193:
194: /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
195: is no UCP support. When both SUPPORT_UTF8 and SUPPORT_UCP are defined, the
loop below is the body of the "else" above. */
196:
1.1 misha 197: while (length-- > 0)
1.2 misha 198: { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
1.1 misha 199: }
1.2 misha 200:
201: /* In the caseful case, we can just compare the bytes, whether or not we
202: are in UTF-8 mode. */
203:
1.1 misha 204: else
205: { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
206:
207: return TRUE;
208: }
209:
210:
211:
212: /***************************************************************************
213: ****************************************************************************
214: RECURSION IN THE match() FUNCTION
215:
216: The match() function is highly recursive, though not every recursive call
217: increases the recursive depth. Nevertheless, some regular expressions can cause
218: it to recurse to a great depth. I was writing for Unix, so I just let it call
219: itself recursively. This uses the stack for saving everything that has to be
220: saved for a recursive call. On Unix, the stack can be large, and this works
221: fine.
222:
223: It turns out that on some non-Unix-like systems there are problems with
224: programs that use a lot of stack. (This despite the fact that every last chip
225: has oodles of memory these days, and techniques for extending the stack have
226: been known for decades.) So....
227:
228: There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
229: calls by keeping local variables that need to be preserved in blocks of memory
230: obtained from malloc() instead of on the stack. Macros are used to
231: achieve this so that the actual code doesn't look very different to what it
232: always used to.
233:
234: The original heap-recursive code used longjmp(). However, it seems that this
235: can be very slow on some operating systems. Following a suggestion from Stan
236: Switzer, the use of longjmp() has been abolished, at the cost of having to
237: provide a unique number for each call to RMATCH. There is no way of generating
238: a sequence of numbers at compile time in C. I have given them names, to make
239: them stand out more clearly.
240:
241: Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
242: FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
243: tests. Furthermore, not using longjmp() means that local dynamic variables
244: don't have indeterminate values; this has meant that the frame size can be
245: reduced because the result can be "passed back" by straight setting of the
246: variable instead of being passed in the frame.
247: ****************************************************************************
248: ***************************************************************************/
249:
250: /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
251: below must be updated in sync. Each enumerator is passed as the last ("rw")
argument of exactly one RMATCH call. In the NO_RECURSE build RMATCH stores it
in the current frame's Xwhere member and also pastes it into a label of the
form L_RMn, which is why every call needs its own value. Numbering starts at
1. */
252:
253: enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
254: RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
255: RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
256: RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
257: RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
1.4 misha 258: RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
259: RM61, RM62 };
1.1 misha 260:
261: /* These versions of the macros use the stack, as normal. There are debugging
262: versions and production versions. Note that the "rw" argument of RMATCH isn't
1.4 misha 263: actually used in this definition. */
/* RMATCH(ra,rb,rc,rd,re,rf,rg,rw) calls match() with ra and rb as the new
eptr and ecode, the caller's current mstart and markptr, rc..rg as
offset_top, md, ims, eptrb and flags, and rdepth+1; the result is left in the
caller's rrc variable. RRETURN(ra) is a plain "return ra".

The PCRE_DEBUG versions additionally print trace lines with printf() and expand
to brace blocks, whereas the production RMATCH expands to a bare assignment
expression and the production RRETURN to a bare return statement. */
1.1 misha 264:
265: #ifndef NO_RECURSE
266: #define REGISTER register
267:
1.4 misha 268: #ifdef PCRE_DEBUG
1.1 misha 269: #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
270: { \
271: printf("match() called in line %d\n", __LINE__); \
1.4 misha 272: rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); \
1.1 misha 273: printf("to line %d\n", __LINE__); \
274: }
275: #define RRETURN(ra) \
276: { \
277: printf("match() returned %d from line %d ", ra, __LINE__); \
278: return ra; \
279: }
280: #else
281: #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
1.4 misha 282: rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1)
1.1 misha 283: #define RRETURN(ra) return ra
284: #endif
285:
286: #else
287:
288:
289: /* These versions of the macros manage a private stack on the heap. Note that
290: the "rd" argument of RMATCH isn't actually used in this definition. It's the md
291: argument of match(), which never changes.

In this build REGISTER expands to nothing.

RMATCH obtains a new heapframe from pcre_stack_malloc; if that fails it leaves
through RRETURN(PCRE_ERROR_NOMEMORY). Otherwise it records the call-site number
rw in the current frame's Xwhere, fills in the argument fields of the new frame
(Xmstart and Xmarkptr are copied from the current frame, Xrdepth is the current
depth plus one), links the new frame to the current one through Xprevframe,
makes it the current frame and jumps to HEAP_RECURSE. The label L_<rw> that
follows is where execution resumes for this call site. */
292:
293: #define REGISTER
294:
295: #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
296: {\
1.5 ! misha 297: heapframe *newframe = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));\
1.4 misha 298: if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
1.1 misha 299: frame->Xwhere = rw; \
300: newframe->Xeptr = ra;\
301: newframe->Xecode = rb;\
302: newframe->Xmstart = mstart;\
1.4 misha 303: newframe->Xmarkptr = markptr;\
1.1 misha 304: newframe->Xoffset_top = rc;\
305: newframe->Xims = re;\
306: newframe->Xeptrb = rf;\
307: newframe->Xflags = rg;\
308: newframe->Xrdepth = frame->Xrdepth + 1;\
309: newframe->Xprevframe = frame;\
310: frame = newframe;\
311: DPRINTF(("restarting from line %d\n", __LINE__));\
312: goto HEAP_RECURSE;\
313: L_##rw:\
314: DPRINTF(("jumped back to line %d\n", __LINE__));\
315: }
316:
/* RRETURN pops the current frame: it makes the previous frame current and
releases the old one with pcre_stack_free. If a previous frame exists, the
value ra is placed in rrc and control jumps to HEAP_RETURN; otherwise the frame
just released was the top-level one and ra is returned from match() itself.
Note that ra is evaluated after the old frame has been freed.

NOTE(review): this macro dereferences frame unconditionally, so it must not be
invoked while frame is NULL. The allocation-failure check for the top-level
frame at the start of match() invokes it in exactly that state - confirm and
replace that use with a plain return. */
317: #define RRETURN(ra)\
318: {\
1.4 misha 319: heapframe *oldframe = frame;\
320: frame = oldframe->Xprevframe;\
321: (pcre_stack_free)(oldframe);\
1.1 misha 322: if (frame != NULL)\
323: {\
324: rrc = ra;\
325: goto HEAP_RETURN;\
326: }\
327: return ra;\
328: }
329:
330:
331: /* Structure for remembering the local variables in a private frame. One of
these is allocated for the top-level call of match() and one more for every
RMATCH "recursion" when NO_RECURSE is defined. Inside match() each member is
reached through a #define that maps the plain variable name onto the X-prefixed
member of the current frame (for example eptr becomes frame->Xeptr). */
332:
333: typedef struct heapframe {
334: struct heapframe *Xprevframe; /* Frame of the "caller"; NULL marks the top level */
335:
336: /* Function arguments that may change */
337:
1.3 misha 338: USPTR Xeptr;
1.1 misha 339: const uschar *Xecode;
1.3 misha 340: USPTR Xmstart;
1.4 misha 341: USPTR Xmarkptr;
1.1 misha 342: int Xoffset_top;
343: long int Xims; /* NOTE(review): match() declares ims as unsigned long int; this member is signed - confirm */
344: eptrblock *Xeptrb;
345: int Xflags;
346: unsigned int Xrdepth;
347:
348: /* Function local variables */
349:
1.3 misha 350: USPTR Xcallpat;
351: #ifdef SUPPORT_UTF8
352: USPTR Xcharptr;
353: #endif
354: USPTR Xdata;
355: USPTR Xnext;
356: USPTR Xpp;
357: USPTR Xprev;
358: USPTR Xsaved_eptr;
1.1 misha 359:
360: recursion_info Xnew_recursive;
361:
362: BOOL Xcur_is_word;
363: BOOL Xcondition;
364: BOOL Xprev_is_word;
365:
366: unsigned long int Xoriginal_ims; /* Copy of ims taken on entry to match() */
367:
368: #ifdef SUPPORT_UCP
369: int Xprop_type;
370: int Xprop_value;
371: int Xprop_fail_result;
372: int Xprop_category;
373: int Xprop_chartype;
374: int Xprop_script;
375: int Xoclength;
376: uschar Xocchars[8];
377: #endif
378:
1.3 misha 379: int Xcodelink;
1.1 misha 380: int Xctype;
381: unsigned int Xfc; /* Separate from c here; in the recursive build fc is an alias for c */
382: int Xfi; /* Separate from i here; in the recursive build fi is an alias for i */
383: int Xlength;
384: int Xmax;
385: int Xmin;
386: int Xnumber;
387: int Xoffset;
388: int Xop;
389: int Xsave_capture_last;
390: int Xsave_offset1, Xsave_offset2, Xsave_offset3;
391: int Xstacksave[REC_STACK_SAVE_MAX];
392:
393: eptrblock Xnewptrb; /* Linked into the eptrb chain when match_cbegroup is set */
394:
395: /* Where to jump back to: the RMn number stored by RMATCH just before it
switches to a new frame */
396:
397: int Xwhere;
398:
399: } heapframe;
400:
401: #endif
402:
403:
404: /***************************************************************************
405: ***************************************************************************/
406:
407:
408:
409: /*************************************************
410: * Match from current position *
411: *************************************************/
412:
413: /* This function is called recursively in many circumstances. Whenever it
414: returns a negative (error) response, the outer incarnation must also return the
1.4 misha 415: same response. */
416:
417: /* These macros pack up tests that are used for partial matching, and which
418: appear several times in the code. We set the "hit end" flag (md->hitend) if
the pointer is
419: at the end of the subject and also past md->start_used_ptr (i.e.
420: something has been matched). For hard partial matching (md->partial greater
than 1), we then return PCRE_ERROR_PARTIAL
421: immediately via MRRETURN. The second one is used when we already know we are
past the end of
422: the subject, so it omits the comparison with md->end_subject.

Both macros expand to a bare "if" statement that refers to md and eptr, and,
through MRRETURN, to markptr, so they can only be used inside match(). Because
the expansion is not wrapped in a block, take care when using one directly
before an "else". */
423:
424: #define CHECK_PARTIAL()\
1.5 ! misha 425: if (md->partial != 0 && eptr >= md->end_subject && \
! 426: eptr > md->start_used_ptr) \
! 427: { \
! 428: md->hitend = TRUE; \
! 429: if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
1.4 misha 430: }
1.1 misha 431:
1.4 misha 432: #define SCHECK_PARTIAL()\
1.5 ! misha 433: if (md->partial != 0 && eptr > md->start_used_ptr) \
! 434: { \
! 435: md->hitend = TRUE; \
! 436: if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL); \
1.4 misha 437: }
438:
439:
440: /* Performance note: It might be tempting to extract commonly used fields from
441: the md structure (e.g. utf8, end_subject) into individual variables to improve
1.1 misha 442: performance. Tests using gcc on a SPARC disproved this; in the first case, it
443: made performance worse.
444:
445: Arguments:
446: eptr pointer to current character in subject
447: ecode pointer to current position in compiled code
448: mstart pointer to the current match start position (can be modified
449: by encountering \K)
1.4 misha 450: markptr pointer to the most recent MARK name, or NULL
1.1 misha 451: offset_top current top pointer
452: md pointer to "static" info for the match
453: ims current /i, /m, and /s options
454: eptrb pointer to chain of blocks containing eptr at start of
455: brackets - for testing for empty matches
456: flags can contain
457: match_condassert - this is an assertion condition
458: match_cbegroup - this is the start of an unlimited repeat
459: group that can match an empty string
460: rdepth the recursion depth
461:
462: Returns: MATCH_MATCH if matched ) these values are >= 0
463: MATCH_NOMATCH if failed to match )
1.4 misha 464: a negative MATCH_xxx value for PRUNE, SKIP, etc
1.1 misha 465: a negative PCRE_ERROR_xxx value if aborted by an error condition
466: (e.g. stopped by repeated call or recursion limit)
467: */
468:
469: static int
1.3 misha 470: match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
1.4 misha 471: const uschar *markptr, int offset_top, match_data *md, unsigned long int ims,
472: eptrblock *eptrb, int flags, unsigned int rdepth)
1.1 misha 473: {
474: /* These variables do not need to be preserved over recursion in this function,
475: so they can be ordinary variables in all cases. Mark some of them with
476: "register" because they are used a lot in loops. */
477:
478: register int rrc; /* Returns from recursive calls */
479: register int i; /* Used for loops not involving calls to RMATCH() */
480: register unsigned int c; /* Character values not kept over RMATCH() calls */
481: register BOOL utf8; /* Local copy of UTF-8 flag for speed */
482:
483: BOOL minimize, possessive; /* Quantifier options */
1.3 misha 484: int condcode;
1.1 misha 485:
486: /* When recursion is not being used, all "local" variables that have to be
487: preserved over calls to RMATCH() are part of a "frame" which is obtained from
488: heap storage. Set up the top-level frame here; others are obtained from the
489: heap whenever RMATCH() does a "recursion". See the macro definitions above. */
490:
491: #ifdef NO_RECURSE
1.5 ! misha 492: heapframe *frame = (heapframe *)(pcre_stack_malloc)(sizeof(heapframe));
1.4 misha 493: if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1.1 misha 494: frame->Xprevframe = NULL; /* Marks the top level */
495:
496: /* Copy in the original argument variables */
497:
498: frame->Xeptr = eptr;
499: frame->Xecode = ecode;
500: frame->Xmstart = mstart;
1.4 misha 501: frame->Xmarkptr = markptr;
1.1 misha 502: frame->Xoffset_top = offset_top;
503: frame->Xims = ims;
504: frame->Xeptrb = eptrb;
505: frame->Xflags = flags;
506: frame->Xrdepth = rdepth;
507:
508: /* This is where control jumps back to in order to effect "recursion" */
509:
510: HEAP_RECURSE:
511:
512: /* Macros make the argument variables come from the current frame */
513:
514: #define eptr frame->Xeptr
515: #define ecode frame->Xecode
516: #define mstart frame->Xmstart
1.4 misha 517: #define markptr frame->Xmarkptr
1.1 misha 518: #define offset_top frame->Xoffset_top
519: #define ims frame->Xims
520: #define eptrb frame->Xeptrb
521: #define flags frame->Xflags
522: #define rdepth frame->Xrdepth
523:
524: /* Ditto for the local variables */
525:
526: #ifdef SUPPORT_UTF8
527: #define charptr frame->Xcharptr
528: #endif
529: #define callpat frame->Xcallpat
1.3 misha 530: #define codelink frame->Xcodelink
1.1 misha 531: #define data frame->Xdata
532: #define next frame->Xnext
533: #define pp frame->Xpp
534: #define prev frame->Xprev
535: #define saved_eptr frame->Xsaved_eptr
536:
537: #define new_recursive frame->Xnew_recursive
538:
539: #define cur_is_word frame->Xcur_is_word
540: #define condition frame->Xcondition
541: #define prev_is_word frame->Xprev_is_word
542:
543: #define original_ims frame->Xoriginal_ims
544:
545: #ifdef SUPPORT_UCP
546: #define prop_type frame->Xprop_type
547: #define prop_value frame->Xprop_value
548: #define prop_fail_result frame->Xprop_fail_result
549: #define prop_category frame->Xprop_category
550: #define prop_chartype frame->Xprop_chartype
551: #define prop_script frame->Xprop_script
552: #define oclength frame->Xoclength
553: #define occhars frame->Xocchars
554: #endif
555:
556: #define ctype frame->Xctype
557: #define fc frame->Xfc
558: #define fi frame->Xfi
559: #define length frame->Xlength
560: #define max frame->Xmax
561: #define min frame->Xmin
562: #define number frame->Xnumber
563: #define offset frame->Xoffset
564: #define op frame->Xop
565: #define save_capture_last frame->Xsave_capture_last
566: #define save_offset1 frame->Xsave_offset1
567: #define save_offset2 frame->Xsave_offset2
568: #define save_offset3 frame->Xsave_offset3
569: #define stacksave frame->Xstacksave
570:
571: #define newptrb frame->Xnewptrb
572:
573: /* When recursion is being used, local variables are allocated on the stack and
574: get preserved during recursion in the normal way. In this environment, fi and
575: i, and fc and c, can be the same variables. */
576:
577: #else /* NO_RECURSE not defined */
578: #define fi i
579: #define fc c
580:
581:
582: #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
583: const uschar *charptr; /* in small blocks of the code. My normal */
584: #endif /* style of coding would have declared */
585: const uschar *callpat; /* them within each of those blocks. */
586: const uschar *data; /* However, in order to accommodate the */
587: const uschar *next; /* version of this code that uses an */
588: USPTR pp; /* external "stack" implemented on the */
589: const uschar *prev; /* heap, it is easier to declare them all */
590: USPTR saved_eptr; /* here, so the declarations can be cut */
591: /* out in a block. The only declarations */
592: recursion_info new_recursive; /* within blocks below are for variables */
593: /* that do not have to be preserved over */
594: BOOL cur_is_word; /* a recursive call to RMATCH(). */
595: BOOL condition;
596: BOOL prev_is_word;
597:
598: unsigned long int original_ims;
599:
600: #ifdef SUPPORT_UCP
601: int prop_type;
602: int prop_value;
603: int prop_fail_result;
604: int prop_category;
605: int prop_chartype;
606: int prop_script;
607: int oclength;
608: uschar occhars[8];
609: #endif
610:
1.3 misha 611: int codelink;
1.1 misha 612: int ctype;
613: int length;
614: int max;
615: int min;
616: int number;
617: int offset;
618: int op;
619: int save_capture_last;
620: int save_offset1, save_offset2, save_offset3;
621: int stacksave[REC_STACK_SAVE_MAX];
622:
623: eptrblock newptrb;
624: #endif /* NO_RECURSE */
625:
626: /* These statements are here to stop the compiler complaining about uninitialized
627: variables. */
628:
629: #ifdef SUPPORT_UCP
630: prop_value = 0;
631: prop_fail_result = 0;
632: #endif
633:
634:
635: /* This label is used for tail recursion, which is used in a few cases even
636: when NO_RECURSE is not defined, in order to reduce the amount of stack that is
637: used. Thanks to Ian Taylor for noticing this possibility and sending the
638: original patch. */
639:
640: TAIL_RECURSE:
641:
642: /* OK, now we can get on with the real code of the function. Recursive calls
643: are specified by the macro RMATCH and RRETURN is used to return. When
644: NO_RECURSE is *not* defined, these just turn into a recursive call to match()
1.4 misha 645: and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
1.1 misha 646: defined). However, RMATCH isn't like a function call because it's quite a
647: complicated macro. It has to be used in one particular way. This shouldn't,
648: however, impact performance when true recursion is being used. */
649:
650: #ifdef SUPPORT_UTF8
651: utf8 = md->utf8; /* Local copy of the flag */
652: #else
653: utf8 = FALSE;
654: #endif
655:
656: /* First check that we haven't called match() too many times, or that we
657: haven't exceeded the recursive call limit. */
658:
659: if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
660: if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
661:
662: original_ims = ims; /* Save for resetting on ')' */
663:
664: /* At the start of a group with an unlimited repeat that may match an empty
665: string, the match_cbegroup flag is set. When this is the case, add the current
666: subject pointer to the chain of such remembered pointers, to be checked when we
667: hit the closing ket, in order to break infinite loops that match no characters.
668: When match() is called in other circumstances, don't add to the chain. The
669: match_cbegroup flag must NOT be used with tail recursion, because the memory
670: block that is used is on the stack, so a new one may be required for each
671: match(). */
672:
673: if ((flags & match_cbegroup) != 0)
674: {
675: newptrb.epb_saved_eptr = eptr;
676: newptrb.epb_prev = eptrb;
677: eptrb = &newptrb;
678: }
679:
680: /* Now start processing the opcodes. */
681:
682: for (;;)
683: {
684: minimize = possessive = FALSE;
685: op = *ecode;
686:
1.4 misha 687: switch(op)
688: {
689: case OP_MARK:
690: markptr = ecode + 2;
691: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
692: ims, eptrb, flags, RM55);
693:
694: /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
695: argument, and we must check whether that argument matches this MARK's
696: argument. It is passed back in md->start_match_ptr (an overloading of that
697: variable). If it does match, we reset that variable to the current subject
698: position and return MATCH_SKIP. Otherwise, pass back the return code
699: unaltered. */
700:
701: if (rrc == MATCH_SKIP_ARG &&
702: strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
703: {
704: md->start_match_ptr = eptr;
705: RRETURN(MATCH_SKIP);
706: }
1.1 misha 707:
1.4 misha 708: if (md->mark == NULL) md->mark = markptr;
709: RRETURN(rrc);
1.1 misha 710:
711: case OP_FAIL:
1.4 misha 712: MRRETURN(MATCH_NOMATCH);
713:
1.5 ! misha 714: /* COMMIT overrides PRUNE, SKIP, and THEN */
! 715:
1.4 misha 716: case OP_COMMIT:
717: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
718: ims, eptrb, flags, RM52);
1.5 ! misha 719: if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
! 720: rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
! 721: rrc != MATCH_THEN)
! 722: RRETURN(rrc);
1.4 misha 723: MRRETURN(MATCH_COMMIT);
1.1 misha 724:
1.5 ! misha 725: /* PRUNE overrides THEN */
! 726:
1.1 misha 727: case OP_PRUNE:
728: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
729: ims, eptrb, flags, RM51);
1.5 ! misha 730: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1.4 misha 731: MRRETURN(MATCH_PRUNE);
1.1 misha 732:
1.4 misha 733: case OP_PRUNE_ARG:
734: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
735: ims, eptrb, flags, RM56);
1.5 ! misha 736: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1.4 misha 737: md->mark = ecode + 2;
738: RRETURN(MATCH_PRUNE);
1.1 misha 739:
1.5 ! misha 740: /* SKIP overrides PRUNE and THEN */
! 741:
1.1 misha 742: case OP_SKIP:
743: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
744: ims, eptrb, flags, RM53);
1.5 ! misha 745: if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
! 746: RRETURN(rrc);
1.1 misha 747: md->start_match_ptr = eptr; /* Pass back current position */
1.4 misha 748: MRRETURN(MATCH_SKIP);
749:
750: case OP_SKIP_ARG:
751: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
752: ims, eptrb, flags, RM57);
1.5 ! misha 753: if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
! 754: RRETURN(rrc);
1.4 misha 755:
756: /* Pass back the current skip name by overloading md->start_match_ptr and
757: returning the special MATCH_SKIP_ARG return code. This will either be
758: caught by a matching MARK, or get to the top, where it is treated the same
759: as PRUNE. */
760:
761: md->start_match_ptr = ecode + 2;
762: RRETURN(MATCH_SKIP_ARG);
1.1 misha 763:
1.5 ! misha 764: /* For THEN (and THEN_ARG) we pass back the address of the bracket or
! 765: the alt that is at the start of the current branch. This makes it possible
! 766: to skip back past alternatives that precede the THEN within the current
! 767: branch. */
! 768:
1.1 misha 769: case OP_THEN:
770: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
771: ims, eptrb, flags, RM54);
772: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.5 ! misha 773: md->start_match_ptr = ecode - GET(ecode, 1);
1.4 misha 774: MRRETURN(MATCH_THEN);
775:
776: case OP_THEN_ARG:
1.5 ! misha 777: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1+LINK_SIZE],
! 778: offset_top, md, ims, eptrb, flags, RM58);
1.4 misha 779: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.5 ! misha 780: md->start_match_ptr = ecode - GET(ecode, 1);
! 781: md->mark = ecode + LINK_SIZE + 2;
1.1 misha 782: RRETURN(MATCH_THEN);
783:
784: /* Handle a capturing bracket. If there is space in the offset vector, save
785: the current subject position in the working slot at the top of the vector.
786: We mustn't change the current values of the data slot, because they may be
787: set from a previous iteration of this group, and be referred to by a
788: reference inside the group.
789:
790: If the bracket fails to match, we need to restore this value and also the
791: values of the final offsets, in case they were set by a previous iteration
792: of the same bracket.
793:
794: If there isn't enough space in the offset vector, treat this as if it were
795: a non-capturing bracket. Don't worry about setting the flag for the error
796: case here; that is handled in the code for KET. */
797:
798: case OP_CBRA:
799: case OP_SCBRA:
800: number = GET2(ecode, 1+LINK_SIZE);
801: offset = number << 1;
802:
1.4 misha 803: #ifdef PCRE_DEBUG
1.1 misha 804: printf("start bracket %d\n", number);
805: printf("subject=");
806: pchars(eptr, 16, TRUE, md);
807: printf("\n");
808: #endif
809:
810: if (offset < md->offset_max)
811: {
812: save_offset1 = md->offset_vector[offset];
813: save_offset2 = md->offset_vector[offset+1];
814: save_offset3 = md->offset_vector[md->offset_end - number];
815: save_capture_last = md->capture_last;
816:
817: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1.4 misha 818: md->offset_vector[md->offset_end - number] =
819: (int)(eptr - md->start_subject);
1.1 misha 820:
821: flags = (op == OP_SCBRA)? match_cbegroup : 0;
822: do
823: {
824: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
825: ims, eptrb, flags, RM1);
1.5 ! misha 826: if (rrc != MATCH_NOMATCH &&
! 827: (rrc != MATCH_THEN || md->start_match_ptr != ecode))
! 828: RRETURN(rrc);
1.1 misha 829: md->capture_last = save_capture_last;
830: ecode += GET(ecode, 1);
831: }
832: while (*ecode == OP_ALT);
833:
834: DPRINTF(("bracket %d failed\n", number));
835:
836: md->offset_vector[offset] = save_offset1;
837: md->offset_vector[offset+1] = save_offset2;
838: md->offset_vector[md->offset_end - number] = save_offset3;
839:
1.4 misha 840: if (rrc != MATCH_THEN) md->mark = markptr;
1.1 misha 841: RRETURN(MATCH_NOMATCH);
842: }
843:
844: /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
845: as a non-capturing bracket. */
846:
847: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
848: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
849:
850: DPRINTF(("insufficient capture room: treat as non-capturing\n"));
851:
852: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
853: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
854:
855: /* Non-capturing bracket. Loop for all the alternatives. When we get to the
856: final alternative within the brackets, we would return the result of a
857: recursive call to match() whatever happened. We can reduce stack usage by
858: turning this into a tail recursion, except in the case when match_cbegroup
859: is set.*/
860:
861: case OP_BRA:
862: case OP_SBRA:
863: DPRINTF(("start non-capturing bracket\n"));
864: flags = (op >= OP_SBRA)? match_cbegroup : 0;
865: for (;;)
866: {
867: if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
868: {
869: if (flags == 0) /* Not a possibly empty group */
870: {
871: ecode += _pcre_OP_lengths[*ecode];
872: DPRINTF(("bracket 0 tail recursion\n"));
873: goto TAIL_RECURSE;
874: }
875:
876: /* Possibly empty group; can't use tail recursion. */
877:
878: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
879: eptrb, flags, RM48);
1.4 misha 880: if (rrc == MATCH_NOMATCH) md->mark = markptr;
1.1 misha 881: RRETURN(rrc);
882: }
883:
884: /* For non-final alternatives, continue the loop for a NOMATCH result;
885: otherwise return. */
886:
887: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
888: eptrb, flags, RM2);
1.5 ! misha 889: if (rrc != MATCH_NOMATCH &&
! 890: (rrc != MATCH_THEN || md->start_match_ptr != ecode))
! 891: RRETURN(rrc);
1.1 misha 892: ecode += GET(ecode, 1);
893: }
894: /* Control never reaches here. */
895:
896: /* Conditional group: compilation checked that there are no more than
897: two branches. If the condition is false, skipping the first branch takes us
898: past the end if there is only one branch, but that's OK because that is
899: exactly what going to the ket would do. As there is only one branch to be
900: obeyed, we can use tail recursion to avoid using another stack frame. */
901:
902: case OP_COND:
903: case OP_SCOND:
1.3 misha 904: codelink= GET(ecode, 1);
905:
906: /* Because of the way auto-callout works during compile, a callout item is
907: inserted between OP_COND and an assertion condition. */
908:
909: if (ecode[LINK_SIZE+1] == OP_CALLOUT)
910: {
911: if (pcre_callout != NULL)
912: {
913: pcre_callout_block cb;
914: cb.version = 1; /* Version 1 of the callout block */
915: cb.callout_number = ecode[LINK_SIZE+2];
916: cb.offset_vector = md->offset_vector;
917: cb.subject = (PCRE_SPTR)md->start_subject;
1.4 misha 918: cb.subject_length = (int)(md->end_subject - md->start_subject);
919: cb.start_match = (int)(mstart - md->start_subject);
920: cb.current_position = (int)(eptr - md->start_subject);
1.3 misha 921: cb.pattern_position = GET(ecode, LINK_SIZE + 3);
922: cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
923: cb.capture_top = offset_top/2;
924: cb.capture_last = md->capture_last;
925: cb.callout_data = md->callout_data;
1.4 misha 926: if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1.3 misha 927: if (rrc < 0) RRETURN(rrc);
928: }
929: ecode += _pcre_OP_lengths[OP_CALLOUT];
930: }
931:
932: condcode = ecode[LINK_SIZE+1];
933:
934: /* Now see what the actual condition is */
935:
1.4 misha 936: if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1.1 misha 937: {
1.4 misha 938: if (md->recursive == NULL) /* Not recursing => FALSE */
939: {
940: condition = FALSE;
941: ecode += GET(ecode, 1);
942: }
943: else
944: {
945: int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
946: condition = (recno == RREF_ANY || recno == md->recursive->group_num);
947:
948: /* If the test is for recursion into a specific subpattern, and it is
949: false, but the test was set up by name, scan the table to see if the
950: name refers to any other numbers, and test them. The condition is true
951: if any one is set. */
952:
953: if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
954: {
955: uschar *slotA = md->name_table;
956: for (i = 0; i < md->name_count; i++)
957: {
958: if (GET2(slotA, 0) == recno) break;
959: slotA += md->name_entry_size;
960: }
961:
962: /* Found a name for the number - there can be only one; duplicate
963: names for different numbers are allowed, but not vice versa. First
964: scan down for duplicates. */
965:
966: if (i < md->name_count)
967: {
968: uschar *slotB = slotA;
969: while (slotB > md->name_table)
970: {
971: slotB -= md->name_entry_size;
972: if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
973: {
974: condition = GET2(slotB, 0) == md->recursive->group_num;
975: if (condition) break;
976: }
977: else break;
978: }
979:
980: /* Scan up for duplicates */
981:
982: if (!condition)
983: {
984: slotB = slotA;
985: for (i++; i < md->name_count; i++)
986: {
987: slotB += md->name_entry_size;
988: if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
989: {
990: condition = GET2(slotB, 0) == md->recursive->group_num;
991: if (condition) break;
992: }
993: else break;
994: }
995: }
996: }
997: }
998:
999: /* Choose branch according to the condition */
1000:
1001: ecode += condition? 3 : GET(ecode, 1);
1002: }
1.1 misha 1003: }
1004:
1.4 misha 1005: else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1.1 misha 1006: {
1007: offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1008: condition = offset < offset_top && md->offset_vector[offset] >= 0;
1.4 misha 1009:
1010: /* If the numbered capture is unset, but the reference was by name,
1011: scan the table to see if the name refers to any other numbers, and test
1012: them. The condition is true if any one is set. This is tediously similar
1013: to the code above, but not close enough to try to amalgamate. */
1014:
1015: if (!condition && condcode == OP_NCREF)
1016: {
1017: int refno = offset >> 1;
1018: uschar *slotA = md->name_table;
1019:
1020: for (i = 0; i < md->name_count; i++)
1021: {
1022: if (GET2(slotA, 0) == refno) break;
1023: slotA += md->name_entry_size;
1024: }
1025:
1026: /* Found a name for the number - there can be only one; duplicate names
1027: for different numbers are allowed, but not vice versa. First scan down
1028: for duplicates. */
1029:
1030: if (i < md->name_count)
1031: {
1032: uschar *slotB = slotA;
1033: while (slotB > md->name_table)
1034: {
1035: slotB -= md->name_entry_size;
1036: if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1037: {
1038: offset = GET2(slotB, 0) << 1;
1039: condition = offset < offset_top &&
1040: md->offset_vector[offset] >= 0;
1041: if (condition) break;
1042: }
1043: else break;
1044: }
1045:
1046: /* Scan up for duplicates */
1047:
1048: if (!condition)
1049: {
1050: slotB = slotA;
1051: for (i++; i < md->name_count; i++)
1052: {
1053: slotB += md->name_entry_size;
1054: if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
1055: {
1056: offset = GET2(slotB, 0) << 1;
1057: condition = offset < offset_top &&
1058: md->offset_vector[offset] >= 0;
1059: if (condition) break;
1060: }
1061: else break;
1062: }
1063: }
1064: }
1065: }
1066:
1067: /* Choose branch according to the condition */
1068:
1.1 misha 1069: ecode += condition? 3 : GET(ecode, 1);
1070: }
1071:
1.3 misha 1072: else if (condcode == OP_DEF) /* DEFINE - always false */
1.1 misha 1073: {
1074: condition = FALSE;
1075: ecode += GET(ecode, 1);
1076: }
1077:
1078: /* The condition is an assertion. Call match() to evaluate it - setting
1079: the final argument match_condassert causes it to stop at the end of an
1080: assertion. */
1081:
1082: else
1083: {
1084: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
1085: match_condassert, RM3);
1086: if (rrc == MATCH_MATCH)
1087: {
1088: condition = TRUE;
1089: ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1090: while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1091: }
1.5 ! misha 1092: else if (rrc != MATCH_NOMATCH &&
! 1093: (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1.1 misha 1094: {
1095: RRETURN(rrc); /* Need braces because of following else */
1096: }
1097: else
1098: {
1099: condition = FALSE;
1.3 misha 1100: ecode += codelink;
1.1 misha 1101: }
1102: }
1103:
1104: /* We are now at the branch that is to be obeyed. As there is only one,
1105: we can use tail recursion to avoid using another stack frame, except when
1106: match_cbegroup is required for an unlimited repeat of a possibly empty
1107: group. If the second alternative doesn't exist, we can just plough on. */
1108:
1109: if (condition || *ecode == OP_ALT)
1110: {
1111: ecode += 1 + LINK_SIZE;
1112: if (op == OP_SCOND) /* Possibly empty group */
1113: {
1114: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1115: RRETURN(rrc);
1116: }
1117: else /* Group must match something */
1118: {
1119: flags = 0;
1120: goto TAIL_RECURSE;
1121: }
1122: }
1.3 misha 1123: else /* Condition false & no alternative */
1.1 misha 1124: {
1125: ecode += 1 + LINK_SIZE;
1126: }
1127: break;
1128:
1129:
1.4 misha 1130: /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1131: to close any currently open capturing brackets. */
1132:
1133: case OP_CLOSE:
1134: number = GET2(ecode, 1);
1135: offset = number << 1;
1136:
1137: #ifdef PCRE_DEBUG
1138: printf("end bracket %d at *ACCEPT", number);
1139: printf("\n");
1140: #endif
1141:
1142: md->capture_last = number;
1143: if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1144: {
1145: md->offset_vector[offset] =
1146: md->offset_vector[md->offset_end - number];
1147: md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1148: if (offset_top <= offset) offset_top = offset + 2;
1149: }
1150: ecode += 3;
1151: break;
1152:
1153:
1.1 misha 1154: /* End of the pattern, either real or forced. If we are in a top-level
1155: recursion, we should restore the offsets appropriately and continue from
1156: after the call. */
1157:
1158: case OP_ACCEPT:
1159: case OP_END:
1160: if (md->recursive != NULL && md->recursive->group_num == 0)
1161: {
1162: recursion_info *rec = md->recursive;
1163: DPRINTF(("End of pattern in a (?0) recursion\n"));
1164: md->recursive = rec->prevrec;
1165: memmove(md->offset_vector, rec->offset_save,
1166: rec->saved_max * sizeof(int));
1.4 misha 1167: offset_top = rec->save_offset_top;
1.1 misha 1168: ims = original_ims;
1169: ecode = rec->after_call;
1170: break;
1171: }
1172:
1.4 misha 1173: /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
1174: set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
1175: the subject. In both cases, backtracking will then try other alternatives,
1176: if any. */
1177:
1178: if (eptr == mstart &&
1179: (md->notempty ||
1180: (md->notempty_atstart &&
1181: mstart == md->start_subject + md->start_offset)))
1182: MRRETURN(MATCH_NOMATCH);
1183:
1184: /* Otherwise, we have a match. */
1.1 misha 1185:
1186: md->end_match_ptr = eptr; /* Record where we ended */
1187: md->end_offset_top = offset_top; /* and how many extracts were taken */
1188: md->start_match_ptr = mstart; /* and the start (\K can modify) */
1.4 misha 1189:
1190: /* For some reason, the macros don't work properly if an expression is
1191: given as the argument to MRRETURN when the heap is in use. */
1192:
1193: rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1194: MRRETURN(rrc);
1.1 misha 1195:
1196: /* Change option settings */
1197:
1198: case OP_OPT:
1199: ims = ecode[1];
1200: ecode += 2;
1201: DPRINTF(("ims set to %02lx\n", ims));
1202: break;
1203:
1204: /* Assertion brackets. Check the alternative branches in turn - the
1205: matching won't pass the KET for an assertion. If any one branch matches,
1206: the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1207: start of each branch to move the current point backwards, so the code at
1208: this level is identical to the lookahead case. */
1209:
1210: case OP_ASSERT:
1211: case OP_ASSERTBACK:
1212: do
1213: {
1214: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1215: RM4);
1.4 misha 1216: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1217: {
1218: mstart = md->start_match_ptr; /* In case \K reset it */
1219: break;
1220: }
1.5 ! misha 1221: if (rrc != MATCH_NOMATCH &&
! 1222: (rrc != MATCH_THEN || md->start_match_ptr != ecode))
! 1223: RRETURN(rrc);
1.1 misha 1224: ecode += GET(ecode, 1);
1225: }
1226: while (*ecode == OP_ALT);
1.4 misha 1227: if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1.1 misha 1228:
1229: /* If checking an assertion for a condition, return MATCH_MATCH. */
1230:
1231: if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1232:
1233: /* Continue from after the assertion, updating the offsets high water
1234: mark, since extracts may have been taken during the assertion. */
1235:
1236: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1237: ecode += 1 + LINK_SIZE;
1238: offset_top = md->end_offset_top;
1239: continue;
1240:
1.4 misha 1241: /* Negative assertion: all branches must fail to match. Encountering SKIP,
1242: PRUNE, or COMMIT means we must assume failure without checking subsequent
1243: branches. */
1.1 misha 1244:
1245: case OP_ASSERT_NOT:
1246: case OP_ASSERTBACK_NOT:
1247: do
1248: {
1249: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1250: RM5);
1.4 misha 1251: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
1252: if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1253: {
1254: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1255: break;
1256: }
1.5 ! misha 1257: if (rrc != MATCH_NOMATCH &&
! 1258: (rrc != MATCH_THEN || md->start_match_ptr != ecode))
! 1259: RRETURN(rrc);
1.1 misha 1260: ecode += GET(ecode,1);
1261: }
1262: while (*ecode == OP_ALT);
1263:
1264: if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1265:
1266: ecode += 1 + LINK_SIZE;
1267: continue;
1268:
1269: /* Move the subject pointer back. This occurs only at the start of
1270: each branch of a lookbehind assertion. If we are too close to the start to
1271: move back, this match function fails. When working with UTF-8 we move
1272: back a number of characters, not bytes. */
1273:
1274: case OP_REVERSE:
1275: #ifdef SUPPORT_UTF8
1276: if (utf8)
1277: {
1278: i = GET(ecode, 1);
1279: while (i-- > 0)
1280: {
1281: eptr--;
1.4 misha 1282: if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1.1 misha 1283: BACKCHAR(eptr);
1284: }
1285: }
1286: else
1287: #endif
1288:
1289: /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1290:
1291: {
1292: eptr -= GET(ecode, 1);
1.4 misha 1293: if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1.1 misha 1294: }
1295:
1.4 misha 1296: /* Save the earliest consulted character, then skip to next op code */
1.1 misha 1297:
1.4 misha 1298: if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1.1 misha 1299: ecode += 1 + LINK_SIZE;
1300: break;
1301:
1302: /* The callout item calls an external function, if one is provided, passing
1303: details of the match so far. This is mainly for debugging, though the
1304: function is able to force a failure. */
1305:
1306: case OP_CALLOUT:
1307: if (pcre_callout != NULL)
1308: {
1309: pcre_callout_block cb;
1310: cb.version = 1; /* Version 1 of the callout block */
1311: cb.callout_number = ecode[1];
1312: cb.offset_vector = md->offset_vector;
1313: cb.subject = (PCRE_SPTR)md->start_subject;
1.4 misha 1314: cb.subject_length = (int)(md->end_subject - md->start_subject);
1315: cb.start_match = (int)(mstart - md->start_subject);
1316: cb.current_position = (int)(eptr - md->start_subject);
1.1 misha 1317: cb.pattern_position = GET(ecode, 2);
1318: cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1319: cb.capture_top = offset_top/2;
1320: cb.capture_last = md->capture_last;
1321: cb.callout_data = md->callout_data;
1.4 misha 1322: if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 1323: if (rrc < 0) RRETURN(rrc);
1324: }
1325: ecode += 2 + 2*LINK_SIZE;
1326: break;
1327:
1328: /* Recursion either matches the current regex, or some subexpression. The
1329: offset data is the offset to the starting bracket from the start of the
1330: whole pattern. (This is so that it works from duplicated subpatterns.)
1331:
1332: If there are any capturing brackets started but not finished, we have to
1333: save their starting points and reinstate them after the recursion. However,
1334: we don't know how many such there are (offset_top records the completed
1335: total) so we just have to save all the potential data. There may be up to
1336: 65535 such values, which is too large to put on the stack, but using malloc
1337: for small numbers seems expensive. As a compromise, the stack is used when
1338: there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1339: is used. If the malloc fails, the recursion is not attempted: this call of
1340: match() gives up immediately and returns PCRE_ERROR_NOMEMORY instead of
1341: continuing with incompletely saved offset data.
1342:
1343: There are also other values that have to be saved. We use a chained
1344: sequence of blocks that actually live on the stack. Thanks to Robin Houston
1345: for the original version of this logic. */
1346:
1347: case OP_RECURSE:
1348: {
1349: callpat = md->start_code + GET(ecode, 1);
1350: new_recursive.group_num = (callpat == md->start_code)? 0 :
1351: GET2(callpat, 1 + LINK_SIZE);
1352:
1353: /* Add to "recursing stack" */
1354:
1355: new_recursive.prevrec = md->recursive;
1356: md->recursive = &new_recursive;
1357:
1358: /* Find where to continue from afterwards */
1359:
1360: ecode += 1 + LINK_SIZE;
1361: new_recursive.after_call = ecode;
1362:
1363: /* Now save the offset data. */
1364:
1365: new_recursive.saved_max = md->offset_end;
1366: if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1367: new_recursive.offset_save = stacksave;
1368: else
1369: {
1370: new_recursive.offset_save =
1371: (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1372: if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1373: }
1374:
1375: memcpy(new_recursive.offset_save, md->offset_vector,
1376: new_recursive.saved_max * sizeof(int));
1.4 misha 1377: new_recursive.save_offset_top = offset_top;
1.1 misha 1378:
1379: /* OK, now we can do the recursion. For each top-level alternative we
1380: restore the offset and recursion data. */
1381:
1382: DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1383: flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1384: do
1385: {
1386: RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1387: md, ims, eptrb, flags, RM6);
1.4 misha 1388: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1.1 misha 1389: {
1390: DPRINTF(("Recursion matched\n"));
1391: md->recursive = new_recursive.prevrec;
1392: if (new_recursive.offset_save != stacksave)
1393: (pcre_free)(new_recursive.offset_save);
1.4 misha 1394: MRRETURN(MATCH_MATCH);
1.1 misha 1395: }
1.5 ! misha 1396: else if (rrc != MATCH_NOMATCH &&
! 1397: (rrc != MATCH_THEN || md->start_match_ptr != ecode))
1.1 misha 1398: {
1399: DPRINTF(("Recursion gave error %d\n", rrc));
1.3 misha 1400: if (new_recursive.offset_save != stacksave)
1401: (pcre_free)(new_recursive.offset_save);
1.1 misha 1402: RRETURN(rrc);
1403: }
1404:
1405: md->recursive = &new_recursive;
1406: memcpy(md->offset_vector, new_recursive.offset_save,
1407: new_recursive.saved_max * sizeof(int));
1408: callpat += GET(callpat, 1);
1409: }
1410: while (*callpat == OP_ALT);
1411:
1412: DPRINTF(("Recursion didn't match\n"));
1413: md->recursive = new_recursive.prevrec;
1414: if (new_recursive.offset_save != stacksave)
1415: (pcre_free)(new_recursive.offset_save);
1.4 misha 1416: MRRETURN(MATCH_NOMATCH);
1.1 misha 1417: }
1418: /* Control never reaches here */
1419:
1420: /* "Once" brackets are like assertion brackets except that after a match,
1421: the point in the subject string is not moved back. Thus there can never be
1422: a move back into the brackets. Friedl calls these "atomic" subpatterns.
1423: Check the alternative branches in turn - the matching won't pass the KET
1424: for this kind of subpattern. If any one branch matches, we carry on as at
1.4 misha 1425: the end of a normal bracket, leaving the subject pointer, but resetting
1426: the start-of-match value in case it was changed by \K. */
1.1 misha 1427:
1428: case OP_ONCE:
1429: prev = ecode;
1430: saved_eptr = eptr;
1431:
1432: do
1433: {
1434: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1.4 misha 1435: if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
1436: {
1437: mstart = md->start_match_ptr;
1438: break;
1439: }
1.5 ! misha 1440: if (rrc != MATCH_NOMATCH &&
! 1441: (rrc != MATCH_THEN || md->start_match_ptr != ecode))
! 1442: RRETURN(rrc);
1.1 misha 1443: ecode += GET(ecode,1);
1444: }
1445: while (*ecode == OP_ALT);
1446:
1447: /* If we hit the end of the group (which could be repeated), fail */
1448:
1449: if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1450:
1451: /* Continue as from after the assertion, updating the offsets high water
1452: mark, since extracts may have been taken. */
1453:
1454: do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1455:
1456: offset_top = md->end_offset_top;
1457: eptr = md->end_match_ptr;
1458:
1459: /* For a non-repeating ket, just continue at this level. This also
1460: happens for a repeating ket if no characters were matched in the group.
1461: This is the forcible breaking of infinite loops as implemented in Perl
1462: 5.005. If there is an options reset, it will get obeyed in the normal
1463: course of events. */
1464:
1465: if (*ecode == OP_KET || eptr == saved_eptr)
1466: {
1467: ecode += 1+LINK_SIZE;
1468: break;
1469: }
1470:
1471: /* The repeating kets try the rest of the pattern or restart from the
1472: preceding bracket, in the appropriate order. The second "call" of match()
1473: uses tail recursion, to avoid using another stack frame. We need to reset
1474: any options that changed within the bracket before re-running it, so
1475: check the next opcode. */
1476:
1477: if (ecode[1+LINK_SIZE] == OP_OPT)
1478: {
1479: ims = (ims & ~PCRE_IMS) | ecode[4];
1480: DPRINTF(("ims set to %02lx at group repeat\n", ims));
1481: }
1482:
1483: if (*ecode == OP_KETRMIN)
1484: {
1485: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1486: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1487: ecode = prev;
1488: flags = 0;
1489: goto TAIL_RECURSE;
1490: }
1491: else /* OP_KETRMAX */
1492: {
1493: RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1494: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1495: ecode += 1 + LINK_SIZE;
1496: flags = 0;
1497: goto TAIL_RECURSE;
1498: }
1499: /* Control never gets here */
1500:
1501: /* An alternation is the end of a branch; scan along to find the end of the
1502: bracketed group and go to there. */
1503:
1504: case OP_ALT:
1505: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1506: break;
1507:
1508: /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1509: indicating that it may occur zero times. It may repeat infinitely, or not
1510: at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1511: with fixed upper repeat limits are compiled as a number of copies, with the
1512: optional ones preceded by BRAZERO or BRAMINZERO. */
1513:
1514: case OP_BRAZERO:
1515: {
1516: next = ecode+1;
1517: RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1518: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1519: do next += GET(next,1); while (*next == OP_ALT);
1520: ecode = next + 1 + LINK_SIZE;
1521: }
1522: break;
1523:
1524: case OP_BRAMINZERO:
1525: {
1526: next = ecode+1;
1527: do next += GET(next, 1); while (*next == OP_ALT);
1528: RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1529: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1530: ecode++;
1531: }
1532: break;
1533:
1534: case OP_SKIPZERO:
1535: {
1536: next = ecode+1;
1537: do next += GET(next,1); while (*next == OP_ALT);
1538: ecode = next + 1 + LINK_SIZE;
1539: }
1540: break;
1541:
1542: /* End of a group, repeated or non-repeating. */
1543:
1544: case OP_KET:
1545: case OP_KETRMIN:
1546: case OP_KETRMAX:
1547: prev = ecode - GET(ecode, 1);
1548:
1549: /* If this was a group that remembered the subject start, in order to break
1550: infinite repeats of empty string matches, retrieve the subject start from
1551: the chain. Otherwise, set it NULL. */
1552:
1553: if (*prev >= OP_SBRA)
1554: {
1555: saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1556: eptrb = eptrb->epb_prev; /* Backup to previous group */
1557: }
1558: else saved_eptr = NULL;
1559:
1.4 misha 1560: /* If we are at the end of an assertion group or an atomic group, stop
1561: matching and return MATCH_MATCH, but record the current high water mark for
1562: use by positive assertions. We also need to record the match start in case
1563: it was changed by \K. */
1.1 misha 1564:
1565: if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1566: *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1567: *prev == OP_ONCE)
1568: {
1569: md->end_match_ptr = eptr; /* For ONCE */
1570: md->end_offset_top = offset_top;
1.4 misha 1571: md->start_match_ptr = mstart;
1572: MRRETURN(MATCH_MATCH);
1.1 misha 1573: }
1574:
1575: /* For capturing groups we have to check the group number back at the start
1576: and if necessary complete handling an extraction by setting the offsets and
1577: bumping the high water mark. Note that whole-pattern recursion is coded as
1578: a recurse into group 0, so it won't be picked up here. Instead, we catch it
1579: when the OP_END is reached. Other recursion is handled here. */
1580:
1581: if (*prev == OP_CBRA || *prev == OP_SCBRA)
1582: {
1583: number = GET2(prev, 1+LINK_SIZE);
1584: offset = number << 1;
1585:
1.4 misha 1586: #ifdef PCRE_DEBUG
1.1 misha 1587: printf("end bracket %d", number);
1588: printf("\n");
1589: #endif
1590:
1591: md->capture_last = number;
1592: if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1593: {
1594: md->offset_vector[offset] =
1595: md->offset_vector[md->offset_end - number];
1.4 misha 1596: md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1.1 misha 1597: if (offset_top <= offset) offset_top = offset + 2;
1598: }
1599:
1600: /* Handle a recursively called group. Restore the offsets
1601: appropriately and continue from after the call. */
1602:
1603: if (md->recursive != NULL && md->recursive->group_num == number)
1604: {
1605: recursion_info *rec = md->recursive;
1606: DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1607: md->recursive = rec->prevrec;
1608: memcpy(md->offset_vector, rec->offset_save,
1609: rec->saved_max * sizeof(int));
1.4 misha 1610: offset_top = rec->save_offset_top;
1.1 misha 1611: ecode = rec->after_call;
1612: ims = original_ims;
1613: break;
1614: }
1615: }
1616:
1617: /* For both capturing and non-capturing groups, reset the value of the ims
1618: flags, in case they got changed during the group. */
1619:
1620: ims = original_ims;
1621: DPRINTF(("ims reset to %02lx\n", ims));
1622:
1623: /* For a non-repeating ket, just continue at this level. This also
1624: happens for a repeating ket if no characters were matched in the group.
1625: This is the forcible breaking of infinite loops as implemented in Perl
1626: 5.005. If there is an options reset, it will get obeyed in the normal
1627: course of events. */
1628:
1629: if (*ecode == OP_KET || eptr == saved_eptr)
1630: {
1631: ecode += 1 + LINK_SIZE;
1632: break;
1633: }
1634:
1635: /* The repeating kets try the rest of the pattern or restart from the
1636: preceding bracket, in the appropriate order. In the second case, we can use
1637: tail recursion to avoid using another stack frame, unless we have an
1638: unlimited repeat of a group that can match an empty string. */
1639:
1640: flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1641:
1642: if (*ecode == OP_KETRMIN)
1643: {
1644: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1645: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1646: if (flags != 0) /* Could match an empty string */
1647: {
1648: RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1649: RRETURN(rrc);
1650: }
1651: ecode = prev;
1652: goto TAIL_RECURSE;
1653: }
1654: else /* OP_KETRMAX */
1655: {
1656: RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1657: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1658: ecode += 1 + LINK_SIZE;
1659: flags = 0;
1660: goto TAIL_RECURSE;
1661: }
1662: /* Control never gets here */
1663:
1664: /* Start of subject unless notbol, or after internal newline if multiline */
1665:
1666: case OP_CIRC:
1.4 misha 1667: if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1.1 misha 1668: if ((ims & PCRE_MULTILINE) != 0)
1669: {
1670: if (eptr != md->start_subject &&
1671: (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1.4 misha 1672: MRRETURN(MATCH_NOMATCH);
1.1 misha 1673: ecode++;
1674: break;
1675: }
1676: /* ... else fall through */
1677:
1678: /* Start of subject assertion */
1679:
1680: case OP_SOD:
1.4 misha 1681: if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1.1 misha 1682: ecode++;
1683: break;
1684:
1685: /* Start of match assertion */
1686:
1687: case OP_SOM:
1.4 misha 1688: if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1.1 misha 1689: ecode++;
1690: break;
1691:
1692: /* Reset the start of match point */
1693:
1694: case OP_SET_SOM:
1695: mstart = eptr;
1696: ecode++;
1697: break;
1698:
1699: /* Assert before internal newline if multiline, or before a terminating
1700: newline unless endonly is set, else end of subject unless noteol is set. */
1701:
1702: case OP_DOLL:
1703: if ((ims & PCRE_MULTILINE) != 0)
1704: {
1705: if (eptr < md->end_subject)
1.4 misha 1706: { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1.1 misha 1707: else
1.5 ! misha 1708: {
! 1709: if (md->noteol) MRRETURN(MATCH_NOMATCH);
! 1710: SCHECK_PARTIAL();
! 1711: }
1.1 misha 1712: ecode++;
1713: break;
1714: }
1.5 ! misha 1715: else /* Not multiline */
1.1 misha 1716: {
1.4 misha 1717: if (md->noteol) MRRETURN(MATCH_NOMATCH);
1.5 ! misha 1718: if (!md->endonly) goto ASSERT_NL_OR_EOS;
1.1 misha 1719: }
1.5 ! misha 1720:
1.1 misha 1721: /* ... else fall through for endonly */
1722:
1723: /* End of subject assertion (\z) */
1724:
1725: case OP_EOD:
1.4 misha 1726: if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1.5 ! misha 1727: SCHECK_PARTIAL();
1.1 misha 1728: ecode++;
1729: break;
1730:
1731: /* End of subject or ending \n assertion (\Z) */
1732:
1733: case OP_EODN:
1.5 ! misha 1734: ASSERT_NL_OR_EOS:
! 1735: if (eptr < md->end_subject &&
1.1 misha 1736: (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1.4 misha 1737: MRRETURN(MATCH_NOMATCH);
1.5 ! misha 1738:
! 1739: /* Either at end of string or \n before end. */
! 1740:
! 1741: SCHECK_PARTIAL();
1.1 misha 1742: ecode++;
1743: break;
1744:
1745: /* Word boundary assertions */
1746:
1747: case OP_NOT_WORD_BOUNDARY:
1748: case OP_WORD_BOUNDARY:
1749: {
1750:
1751: /* Find out if the previous and current characters are "word" characters.
1752: It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1.4 misha 1753: be "non-word" characters. Remember the earliest consulted character for
1754: partial matching. */
1.1 misha 1755:
1756: #ifdef SUPPORT_UTF8
1757: if (utf8)
1758: {
1.4 misha 1759: /* Get status of previous character */
1760:
1.1 misha 1761: if (eptr == md->start_subject) prev_is_word = FALSE; else
1762: {
1.3 misha 1763: USPTR lastptr = eptr - 1;
1.1 misha 1764: while((*lastptr & 0xc0) == 0x80) lastptr--;
1.4 misha 1765: if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1.1 misha 1766: GETCHAR(c, lastptr);
1.4 misha 1767: #ifdef SUPPORT_UCP
1768: if (md->use_ucp)
1769: {
1770: if (c == '_') prev_is_word = TRUE; else
1771: {
1772: int cat = UCD_CATEGORY(c);
1773: prev_is_word = (cat == ucp_L || cat == ucp_N);
1774: }
1775: }
1776: else
1777: #endif
1.1 misha 1778: prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1779: }
1.4 misha 1780:
1781: /* Get status of next character */
1782:
1783: if (eptr >= md->end_subject)
1784: {
1785: SCHECK_PARTIAL();
1786: cur_is_word = FALSE;
1787: }
1788: else
1.1 misha 1789: {
1790: GETCHAR(c, eptr);
1.4 misha 1791: #ifdef SUPPORT_UCP
1792: if (md->use_ucp)
1793: {
1794: if (c == '_') cur_is_word = TRUE; else
1795: {
1796: int cat = UCD_CATEGORY(c);
1797: cur_is_word = (cat == ucp_L || cat == ucp_N);
1798: }
1799: }
1800: else
1801: #endif
1.1 misha 1802: cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1803: }
1804: }
1805: else
1806: #endif
1807:
1.4 misha 1808: /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
1809: consistency with the behaviour of \w we do use it in this case. */
1.1 misha 1810:
1811: {
1.4 misha 1812: /* Get status of previous character */
1813:
1814: if (eptr == md->start_subject) prev_is_word = FALSE; else
1815: {
1816: if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
1817: #ifdef SUPPORT_UCP
1818: if (md->use_ucp)
1819: {
1820: c = eptr[-1];
1821: if (c == '_') prev_is_word = TRUE; else
1822: {
1823: int cat = UCD_CATEGORY(c);
1824: prev_is_word = (cat == ucp_L || cat == ucp_N);
1825: }
1826: }
1827: else
1828: #endif
1829: prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1830: }
1831:
1832: /* Get status of next character */
1833:
1834: if (eptr >= md->end_subject)
1835: {
1836: SCHECK_PARTIAL();
1837: cur_is_word = FALSE;
1838: }
1839: else
1840: #ifdef SUPPORT_UCP
1841: if (md->use_ucp)
1842: {
1843: c = *eptr;
1844: if (c == '_') cur_is_word = TRUE; else
1845: {
1846: int cat = UCD_CATEGORY(c);
1847: cur_is_word = (cat == ucp_L || cat == ucp_N);
1848: }
1849: }
1850: else
1851: #endif
1852: cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1.1 misha 1853: }
1854:
1855: /* Now see if the situation is what we want */
1856:
1857: if ((*ecode++ == OP_WORD_BOUNDARY)?
1858: cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1.4 misha 1859: MRRETURN(MATCH_NOMATCH);
1.1 misha 1860: }
1861: break;
1862:
1863: /* Match a single character type; inline for speed */
1864:
1865: case OP_ANY:
1.4 misha 1866: if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1.1 misha 1867: /* Fall through */
1868:
1869: case OP_ALLANY:
1.4 misha 1870: if (eptr++ >= md->end_subject)
1871: {
1872: SCHECK_PARTIAL();
1873: MRRETURN(MATCH_NOMATCH);
1874: }
1.1 misha 1875: if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1876: ecode++;
1877: break;
1878:
1879: /* Match a single byte, even in UTF-8 mode. This opcode really does match
1880: any byte, even newline, independent of the setting of PCRE_DOTALL. */
1881:
1882: case OP_ANYBYTE:
1.4 misha 1883: if (eptr++ >= md->end_subject)
1884: {
1885: SCHECK_PARTIAL();
1886: MRRETURN(MATCH_NOMATCH);
1887: }
1.1 misha 1888: ecode++;
1889: break;
1890:
1891: case OP_NOT_DIGIT:
1.4 misha 1892: if (eptr >= md->end_subject)
1893: {
1894: SCHECK_PARTIAL();
1895: MRRETURN(MATCH_NOMATCH);
1896: }
1.1 misha 1897: GETCHARINCTEST(c, eptr);
1898: if (
1899: #ifdef SUPPORT_UTF8
1900: c < 256 &&
1901: #endif
1902: (md->ctypes[c] & ctype_digit) != 0
1903: )
1.4 misha 1904: MRRETURN(MATCH_NOMATCH);
1.1 misha 1905: ecode++;
1906: break;
1907:
1908: case OP_DIGIT:
1.4 misha 1909: if (eptr >= md->end_subject)
1910: {
1911: SCHECK_PARTIAL();
1912: MRRETURN(MATCH_NOMATCH);
1913: }
1.1 misha 1914: GETCHARINCTEST(c, eptr);
1915: if (
1916: #ifdef SUPPORT_UTF8
1917: c >= 256 ||
1918: #endif
1919: (md->ctypes[c] & ctype_digit) == 0
1920: )
1.4 misha 1921: MRRETURN(MATCH_NOMATCH);
1.1 misha 1922: ecode++;
1923: break;
1924:
1925: case OP_NOT_WHITESPACE:
1.4 misha 1926: if (eptr >= md->end_subject)
1927: {
1928: SCHECK_PARTIAL();
1929: MRRETURN(MATCH_NOMATCH);
1930: }
1.1 misha 1931: GETCHARINCTEST(c, eptr);
1932: if (
1933: #ifdef SUPPORT_UTF8
1934: c < 256 &&
1935: #endif
1936: (md->ctypes[c] & ctype_space) != 0
1937: )
1.4 misha 1938: MRRETURN(MATCH_NOMATCH);
1.1 misha 1939: ecode++;
1940: break;
1941:
1942: case OP_WHITESPACE:
1.4 misha 1943: if (eptr >= md->end_subject)
1944: {
1945: SCHECK_PARTIAL();
1946: MRRETURN(MATCH_NOMATCH);
1947: }
1.1 misha 1948: GETCHARINCTEST(c, eptr);
1949: if (
1950: #ifdef SUPPORT_UTF8
1951: c >= 256 ||
1952: #endif
1953: (md->ctypes[c] & ctype_space) == 0
1954: )
1.4 misha 1955: MRRETURN(MATCH_NOMATCH);
1.1 misha 1956: ecode++;
1957: break;
1958:
1959: case OP_NOT_WORDCHAR:
1.4 misha 1960: if (eptr >= md->end_subject)
1961: {
1962: SCHECK_PARTIAL();
1963: MRRETURN(MATCH_NOMATCH);
1964: }
1.1 misha 1965: GETCHARINCTEST(c, eptr);
1966: if (
1967: #ifdef SUPPORT_UTF8
1968: c < 256 &&
1969: #endif
1970: (md->ctypes[c] & ctype_word) != 0
1971: )
1.4 misha 1972: MRRETURN(MATCH_NOMATCH);
1.1 misha 1973: ecode++;
1974: break;
1975:
1976: case OP_WORDCHAR:
1.4 misha 1977: if (eptr >= md->end_subject)
1978: {
1979: SCHECK_PARTIAL();
1980: MRRETURN(MATCH_NOMATCH);
1981: }
1.1 misha 1982: GETCHARINCTEST(c, eptr);
1983: if (
1984: #ifdef SUPPORT_UTF8
1985: c >= 256 ||
1986: #endif
1987: (md->ctypes[c] & ctype_word) == 0
1988: )
1.4 misha 1989: MRRETURN(MATCH_NOMATCH);
1.1 misha 1990: ecode++;
1991: break;
1992:
1993: case OP_ANYNL:
1.4 misha 1994: if (eptr >= md->end_subject)
1995: {
1996: SCHECK_PARTIAL();
1997: MRRETURN(MATCH_NOMATCH);
1998: }
1.1 misha 1999: GETCHARINCTEST(c, eptr);
2000: switch(c)
2001: {
1.4 misha 2002: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 2003: case 0x000d:
2004: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2005: break;
2006:
2007: case 0x000a:
2008: break;
2009:
2010: case 0x000b:
2011: case 0x000c:
2012: case 0x0085:
2013: case 0x2028:
2014: case 0x2029:
1.4 misha 2015: if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
1.1 misha 2016: break;
2017: }
2018: ecode++;
2019: break;
2020:
2021: case OP_NOT_HSPACE:
1.4 misha 2022: if (eptr >= md->end_subject)
2023: {
2024: SCHECK_PARTIAL();
2025: MRRETURN(MATCH_NOMATCH);
2026: }
1.1 misha 2027: GETCHARINCTEST(c, eptr);
2028: switch(c)
2029: {
2030: default: break;
2031: case 0x09: /* HT */
2032: case 0x20: /* SPACE */
2033: case 0xa0: /* NBSP */
2034: case 0x1680: /* OGHAM SPACE MARK */
2035: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2036: case 0x2000: /* EN QUAD */
2037: case 0x2001: /* EM QUAD */
2038: case 0x2002: /* EN SPACE */
2039: case 0x2003: /* EM SPACE */
2040: case 0x2004: /* THREE-PER-EM SPACE */
2041: case 0x2005: /* FOUR-PER-EM SPACE */
2042: case 0x2006: /* SIX-PER-EM SPACE */
2043: case 0x2007: /* FIGURE SPACE */
2044: case 0x2008: /* PUNCTUATION SPACE */
2045: case 0x2009: /* THIN SPACE */
2046: case 0x200A: /* HAIR SPACE */
2047: case 0x202f: /* NARROW NO-BREAK SPACE */
2048: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2049: case 0x3000: /* IDEOGRAPHIC SPACE */
1.4 misha 2050: MRRETURN(MATCH_NOMATCH);
1.1 misha 2051: }
2052: ecode++;
2053: break;
2054:
2055: case OP_HSPACE:
1.4 misha 2056: if (eptr >= md->end_subject)
2057: {
2058: SCHECK_PARTIAL();
2059: MRRETURN(MATCH_NOMATCH);
2060: }
1.1 misha 2061: GETCHARINCTEST(c, eptr);
2062: switch(c)
2063: {
1.4 misha 2064: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 2065: case 0x09: /* HT */
2066: case 0x20: /* SPACE */
2067: case 0xa0: /* NBSP */
2068: case 0x1680: /* OGHAM SPACE MARK */
2069: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2070: case 0x2000: /* EN QUAD */
2071: case 0x2001: /* EM QUAD */
2072: case 0x2002: /* EN SPACE */
2073: case 0x2003: /* EM SPACE */
2074: case 0x2004: /* THREE-PER-EM SPACE */
2075: case 0x2005: /* FOUR-PER-EM SPACE */
2076: case 0x2006: /* SIX-PER-EM SPACE */
2077: case 0x2007: /* FIGURE SPACE */
2078: case 0x2008: /* PUNCTUATION SPACE */
2079: case 0x2009: /* THIN SPACE */
2080: case 0x200A: /* HAIR SPACE */
2081: case 0x202f: /* NARROW NO-BREAK SPACE */
2082: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2083: case 0x3000: /* IDEOGRAPHIC SPACE */
2084: break;
2085: }
2086: ecode++;
2087: break;
2088:
2089: case OP_NOT_VSPACE:
1.4 misha 2090: if (eptr >= md->end_subject)
2091: {
2092: SCHECK_PARTIAL();
2093: MRRETURN(MATCH_NOMATCH);
2094: }
1.1 misha 2095: GETCHARINCTEST(c, eptr);
2096: switch(c)
2097: {
2098: default: break;
2099: case 0x0a: /* LF */
2100: case 0x0b: /* VT */
2101: case 0x0c: /* FF */
2102: case 0x0d: /* CR */
2103: case 0x85: /* NEL */
2104: case 0x2028: /* LINE SEPARATOR */
2105: case 0x2029: /* PARAGRAPH SEPARATOR */
1.4 misha 2106: MRRETURN(MATCH_NOMATCH);
1.1 misha 2107: }
2108: ecode++;
2109: break;
2110:
2111: case OP_VSPACE:
1.4 misha 2112: if (eptr >= md->end_subject)
2113: {
2114: SCHECK_PARTIAL();
2115: MRRETURN(MATCH_NOMATCH);
2116: }
1.1 misha 2117: GETCHARINCTEST(c, eptr);
2118: switch(c)
2119: {
1.4 misha 2120: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 2121: case 0x0a: /* LF */
2122: case 0x0b: /* VT */
2123: case 0x0c: /* FF */
2124: case 0x0d: /* CR */
2125: case 0x85: /* NEL */
2126: case 0x2028: /* LINE SEPARATOR */
2127: case 0x2029: /* PARAGRAPH SEPARATOR */
2128: break;
2129: }
2130: ecode++;
2131: break;
2132:
2133: #ifdef SUPPORT_UCP
2134: /* Check the next character by Unicode property. We will get here only
2135: if the support is in the binary; otherwise a compile-time error occurs. */
2136:
2137: case OP_PROP:
2138: case OP_NOTPROP:
1.4 misha 2139: if (eptr >= md->end_subject)
2140: {
2141: SCHECK_PARTIAL();
2142: MRRETURN(MATCH_NOMATCH);
2143: }
1.1 misha 2144: GETCHARINCTEST(c, eptr);
2145: {
1.3 misha 2146: const ucd_record *prop = GET_UCD(c);
1.1 misha 2147:
2148: switch(ecode[1])
2149: {
2150: case PT_ANY:
1.4 misha 2151: if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
1.1 misha 2152: break;
2153:
2154: case PT_LAMP:
1.2 misha 2155: if ((prop->chartype == ucp_Lu ||
2156: prop->chartype == ucp_Ll ||
2157: prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1.4 misha 2158: MRRETURN(MATCH_NOMATCH);
2159: break;
1.1 misha 2160:
2161: case PT_GC:
1.2 misha 2162: if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1.4 misha 2163: MRRETURN(MATCH_NOMATCH);
1.1 misha 2164: break;
2165:
2166: case PT_PC:
1.2 misha 2167: if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1.4 misha 2168: MRRETURN(MATCH_NOMATCH);
1.1 misha 2169: break;
2170:
2171: case PT_SC:
1.2 misha 2172: if ((ecode[2] != prop->script) == (op == OP_PROP))
1.4 misha 2173: MRRETURN(MATCH_NOMATCH);
2174: break;
2175:
2176: /* These are specials */
2177:
2178: case PT_ALNUM:
2179: if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2180: _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
2181: MRRETURN(MATCH_NOMATCH);
2182: break;
2183:
2184: case PT_SPACE: /* Perl space */
2185: if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2186: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2187: == (op == OP_NOTPROP))
2188: MRRETURN(MATCH_NOMATCH);
2189: break;
2190:
2191: case PT_PXSPACE: /* POSIX space */
2192: if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
2193: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2194: c == CHAR_FF || c == CHAR_CR)
2195: == (op == OP_NOTPROP))
2196: MRRETURN(MATCH_NOMATCH);
2197: break;
2198:
2199: case PT_WORD:
2200: if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
2201: _pcre_ucp_gentype[prop->chartype] == ucp_N ||
2202: c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
2203: MRRETURN(MATCH_NOMATCH);
1.1 misha 2204: break;
2205:
1.4 misha 2206: /* This should never occur */
2207:
1.1 misha 2208: default:
2209: RRETURN(PCRE_ERROR_INTERNAL);
2210: }
2211:
2212: ecode += 3;
2213: }
2214: break;
2215:
2216: /* Match an extended Unicode sequence. We will get here only if the support
2217: is in the binary; otherwise a compile-time error occurs. */
2218:
2219: case OP_EXTUNI:
1.4 misha 2220: if (eptr >= md->end_subject)
2221: {
2222: SCHECK_PARTIAL();
2223: MRRETURN(MATCH_NOMATCH);
2224: }
1.1 misha 2225: GETCHARINCTEST(c, eptr);
2226: {
1.2 misha 2227: int category = UCD_CATEGORY(c);
1.4 misha 2228: if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
1.1 misha 2229: while (eptr < md->end_subject)
2230: {
2231: int len = 1;
2232: if (!utf8) c = *eptr; else
2233: {
2234: GETCHARLEN(c, eptr, len);
2235: }
1.2 misha 2236: category = UCD_CATEGORY(c);
1.1 misha 2237: if (category != ucp_M) break;
2238: eptr += len;
2239: }
2240: }
2241: ecode++;
2242: break;
2243: #endif
2244:
2245:
2246: /* Match a back reference, possibly repeatedly. Look past the end of the
2247: item to see if there is repeat information following. The code is similar
2248: to that for character classes, but repeated for efficiency. Then obey
2249: similar code to character type repeats - written out again for speed.
2250: However, if the referenced string is the empty string, always treat
2251: it as matched, any number of times (otherwise there could be infinite
2252: loops). */
2253:
2254: case OP_REF:
2255: {
2256: offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2257: ecode += 3;
2258:
2259: /* If the reference is unset, there are two possibilities:
2260:
2261: (a) In the default, Perl-compatible state, set the length to be longer
2262: than the amount of subject left; this ensures that every attempt at a
2263: match fails. We can't just fail here, because of the possibility of
2264: quantifiers with zero minima.
2265:
2266: (b) If the JavaScript compatibility flag is set, set the length to zero
2267: so that the back reference matches an empty string.
2268:
2269: Otherwise, set the length to the length of what was matched by the
2270: referenced subpattern. */
2271:
2272: if (offset >= offset_top || md->offset_vector[offset] < 0)
1.4 misha 2273: length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1);
1.1 misha 2274: else
2275: length = md->offset_vector[offset+1] - md->offset_vector[offset];
2276:
2277: /* Set up for repetition, or handle the non-repeated case */
2278:
2279: switch (*ecode)
2280: {
2281: case OP_CRSTAR:
2282: case OP_CRMINSTAR:
2283: case OP_CRPLUS:
2284: case OP_CRMINPLUS:
2285: case OP_CRQUERY:
2286: case OP_CRMINQUERY:
2287: c = *ecode++ - OP_CRSTAR;
2288: minimize = (c & 1) != 0;
2289: min = rep_min[c]; /* Pick up values from tables; */
2290: max = rep_max[c]; /* zero for max => infinity */
2291: if (max == 0) max = INT_MAX;
2292: break;
2293:
2294: case OP_CRRANGE:
2295: case OP_CRMINRANGE:
2296: minimize = (*ecode == OP_CRMINRANGE);
2297: min = GET2(ecode, 1);
2298: max = GET2(ecode, 3);
2299: if (max == 0) max = INT_MAX;
2300: ecode += 5;
2301: break;
2302:
2303: default: /* No repeat follows */
1.4 misha 2304: if (!match_ref(offset, eptr, length, md, ims))
2305: {
2306: CHECK_PARTIAL();
2307: MRRETURN(MATCH_NOMATCH);
2308: }
1.1 misha 2309: eptr += length;
2310: continue; /* With the main loop */
2311: }
2312:
2313: /* If the length of the reference is zero, just continue with the
2314: main loop. */
2315:
2316: if (length == 0) continue;
2317:
2318: /* First, ensure the minimum number of matches are present. We get back
2319: the length of the reference string explicitly rather than passing the
2320: address of eptr, so that eptr can be a register variable. */
2321:
2322: for (i = 1; i <= min; i++)
2323: {
1.4 misha 2324: if (!match_ref(offset, eptr, length, md, ims))
2325: {
2326: CHECK_PARTIAL();
2327: MRRETURN(MATCH_NOMATCH);
2328: }
1.1 misha 2329: eptr += length;
2330: }
2331:
2332: /* If min = max, continue at the same level without recursion.
2333: They are not both allowed to be zero. */
2334:
2335: if (min == max) continue;
2336:
2337: /* If minimizing, keep trying and advancing the pointer */
2338:
2339: if (minimize)
2340: {
2341: for (fi = min;; fi++)
2342: {
2343: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2344: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 2345: if (fi >= max) MRRETURN(MATCH_NOMATCH);
2346: if (!match_ref(offset, eptr, length, md, ims))
2347: {
2348: CHECK_PARTIAL();
2349: MRRETURN(MATCH_NOMATCH);
2350: }
1.1 misha 2351: eptr += length;
2352: }
2353: /* Control never gets here */
2354: }
2355:
2356: /* If maximizing, find the longest string and work backwards */
2357:
2358: else
2359: {
2360: pp = eptr;
2361: for (i = min; i < max; i++)
2362: {
1.4 misha 2363: if (!match_ref(offset, eptr, length, md, ims))
2364: {
2365: CHECK_PARTIAL();
2366: break;
2367: }
1.1 misha 2368: eptr += length;
2369: }
2370: while (eptr >= pp)
2371: {
2372: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2373: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2374: eptr -= length;
2375: }
1.4 misha 2376: MRRETURN(MATCH_NOMATCH);
1.1 misha 2377: }
2378: }
2379: /* Control never gets here */
2380:
2381: /* Match a bit-mapped character class, possibly repeatedly. This op code is
2382: used when all the characters in the class have values in the range 0-255,
2383: and either the matching is caseful, or the characters are in the range
2384: 0-127 when UTF-8 processing is enabled. The only difference between
2385: OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2386: encountered.
2387:
2388: First, look past the end of the item to see if there is repeat information
2389: following. Then obey similar code to character type repeats - written out
2390: again for speed. */
2391:
2392: case OP_NCLASS:
2393: case OP_CLASS:
2394: {
2395: data = ecode + 1; /* Save for matching */
2396: ecode += 33; /* Advance past the item */
2397:
2398: switch (*ecode)
2399: {
2400: case OP_CRSTAR:
2401: case OP_CRMINSTAR:
2402: case OP_CRPLUS:
2403: case OP_CRMINPLUS:
2404: case OP_CRQUERY:
2405: case OP_CRMINQUERY:
2406: c = *ecode++ - OP_CRSTAR;
2407: minimize = (c & 1) != 0;
2408: min = rep_min[c]; /* Pick up values from tables; */
2409: max = rep_max[c]; /* zero for max => infinity */
2410: if (max == 0) max = INT_MAX;
2411: break;
2412:
2413: case OP_CRRANGE:
2414: case OP_CRMINRANGE:
2415: minimize = (*ecode == OP_CRMINRANGE);
2416: min = GET2(ecode, 1);
2417: max = GET2(ecode, 3);
2418: if (max == 0) max = INT_MAX;
2419: ecode += 5;
2420: break;
2421:
2422: default: /* No repeat follows */
2423: min = max = 1;
2424: break;
2425: }
2426:
2427: /* First, ensure the minimum number of matches are present. */
2428:
2429: #ifdef SUPPORT_UTF8
2430: /* UTF-8 mode */
2431: if (utf8)
2432: {
2433: for (i = 1; i <= min; i++)
2434: {
1.4 misha 2435: if (eptr >= md->end_subject)
2436: {
2437: SCHECK_PARTIAL();
2438: MRRETURN(MATCH_NOMATCH);
2439: }
1.1 misha 2440: GETCHARINC(c, eptr);
2441: if (c > 255)
2442: {
1.4 misha 2443: if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
1.1 misha 2444: }
2445: else
2446: {
1.4 misha 2447: if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 2448: }
2449: }
2450: }
2451: else
2452: #endif
2453: /* Not UTF-8 mode */
2454: {
2455: for (i = 1; i <= min; i++)
2456: {
1.4 misha 2457: if (eptr >= md->end_subject)
2458: {
2459: SCHECK_PARTIAL();
2460: MRRETURN(MATCH_NOMATCH);
2461: }
1.1 misha 2462: c = *eptr++;
1.4 misha 2463: if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 2464: }
2465: }
2466:
2467: /* If max == min we can continue with the main loop without the
2468: need to recurse. */
2469:
2470: if (min == max) continue;
2471:
2472: /* If minimizing, keep testing the rest of the expression and advancing
2473: the pointer while it matches the class. */
2474:
2475: if (minimize)
2476: {
2477: #ifdef SUPPORT_UTF8
2478: /* UTF-8 mode */
2479: if (utf8)
2480: {
2481: for (fi = min;; fi++)
2482: {
2483: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2484: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 2485: if (fi >= max) MRRETURN(MATCH_NOMATCH);
2486: if (eptr >= md->end_subject)
2487: {
2488: SCHECK_PARTIAL();
2489: MRRETURN(MATCH_NOMATCH);
2490: }
1.1 misha 2491: GETCHARINC(c, eptr);
2492: if (c > 255)
2493: {
1.4 misha 2494: if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
1.1 misha 2495: }
2496: else
2497: {
1.4 misha 2498: if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 2499: }
2500: }
2501: }
2502: else
2503: #endif
2504: /* Not UTF-8 mode */
2505: {
2506: for (fi = min;; fi++)
2507: {
2508: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2509: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 2510: if (fi >= max) MRRETURN(MATCH_NOMATCH);
2511: if (eptr >= md->end_subject)
2512: {
2513: SCHECK_PARTIAL();
2514: MRRETURN(MATCH_NOMATCH);
2515: }
1.1 misha 2516: c = *eptr++;
1.4 misha 2517: if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 2518: }
2519: }
2520: /* Control never gets here */
2521: }
2522:
2523: /* If maximizing, find the longest possible run, then work backwards. */
2524:
2525: else
2526: {
2527: pp = eptr;
2528:
2529: #ifdef SUPPORT_UTF8
2530: /* UTF-8 mode */
2531: if (utf8)
2532: {
2533: for (i = min; i < max; i++)
2534: {
2535: int len = 1;
1.4 misha 2536: if (eptr >= md->end_subject)
2537: {
2538: SCHECK_PARTIAL();
2539: break;
2540: }
1.1 misha 2541: GETCHARLEN(c, eptr, len);
2542: if (c > 255)
2543: {
2544: if (op == OP_CLASS) break;
2545: }
2546: else
2547: {
2548: if ((data[c/8] & (1 << (c&7))) == 0) break;
2549: }
2550: eptr += len;
2551: }
2552: for (;;)
2553: {
2554: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2555: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2556: if (eptr-- == pp) break; /* Stop if tried at original pos */
2557: BACKCHAR(eptr);
2558: }
2559: }
2560: else
2561: #endif
2562: /* Not UTF-8 mode */
2563: {
2564: for (i = min; i < max; i++)
2565: {
1.4 misha 2566: if (eptr >= md->end_subject)
2567: {
2568: SCHECK_PARTIAL();
2569: break;
2570: }
1.1 misha 2571: c = *eptr;
2572: if ((data[c/8] & (1 << (c&7))) == 0) break;
2573: eptr++;
2574: }
2575: while (eptr >= pp)
2576: {
2577: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2578: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2579: eptr--;
2580: }
2581: }
2582:
1.4 misha 2583: MRRETURN(MATCH_NOMATCH);
1.1 misha 2584: }
2585: }
2586: /* Control never gets here */
2587:
2588:
2589: /* Match an extended character class. This opcode is encountered only
1.3 misha 2590: when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2591: mode, because Unicode properties are supported in non-UTF-8 mode. */
1.1 misha 2592:
2593: #ifdef SUPPORT_UTF8
2594: case OP_XCLASS:
2595: {
2596: data = ecode + 1 + LINK_SIZE; /* Save for matching */
2597: ecode += GET(ecode, 1); /* Advance past the item */
2598:
2599: switch (*ecode)
2600: {
2601: case OP_CRSTAR:
2602: case OP_CRMINSTAR:
2603: case OP_CRPLUS:
2604: case OP_CRMINPLUS:
2605: case OP_CRQUERY:
2606: case OP_CRMINQUERY:
2607: c = *ecode++ - OP_CRSTAR;
2608: minimize = (c & 1) != 0;
2609: min = rep_min[c]; /* Pick up values from tables; */
2610: max = rep_max[c]; /* zero for max => infinity */
2611: if (max == 0) max = INT_MAX;
2612: break;
2613:
2614: case OP_CRRANGE:
2615: case OP_CRMINRANGE:
2616: minimize = (*ecode == OP_CRMINRANGE);
2617: min = GET2(ecode, 1);
2618: max = GET2(ecode, 3);
2619: if (max == 0) max = INT_MAX;
2620: ecode += 5;
2621: break;
2622:
2623: default: /* No repeat follows */
2624: min = max = 1;
2625: break;
2626: }
2627:
2628: /* First, ensure the minimum number of matches are present. */
2629:
2630: for (i = 1; i <= min; i++)
2631: {
1.4 misha 2632: if (eptr >= md->end_subject)
2633: {
2634: SCHECK_PARTIAL();
2635: MRRETURN(MATCH_NOMATCH);
2636: }
1.3 misha 2637: GETCHARINCTEST(c, eptr);
1.4 misha 2638: if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
1.1 misha 2639: }
2640:
2641: /* If max == min we can continue with the main loop without the
2642: need to recurse. */
2643:
2644: if (min == max) continue;
2645:
2646: /* If minimizing, keep testing the rest of the expression and advancing
2647: the pointer while it matches the class. */
2648:
2649: if (minimize)
2650: {
2651: for (fi = min;; fi++)
2652: {
2653: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2654: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 2655: if (fi >= max) MRRETURN(MATCH_NOMATCH);
2656: if (eptr >= md->end_subject)
2657: {
2658: SCHECK_PARTIAL();
2659: MRRETURN(MATCH_NOMATCH);
2660: }
1.3 misha 2661: GETCHARINCTEST(c, eptr);
1.4 misha 2662: if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
1.1 misha 2663: }
2664: /* Control never gets here */
2665: }
2666:
2667: /* If maximizing, find the longest possible run, then work backwards. */
2668:
2669: else
2670: {
2671: pp = eptr;
2672: for (i = min; i < max; i++)
2673: {
2674: int len = 1;
1.4 misha 2675: if (eptr >= md->end_subject)
2676: {
2677: SCHECK_PARTIAL();
2678: break;
2679: }
1.3 misha 2680: GETCHARLENTEST(c, eptr, len);
1.1 misha 2681: if (!_pcre_xclass(c, data)) break;
2682: eptr += len;
2683: }
2684: for(;;)
2685: {
2686: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2687: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2688: if (eptr-- == pp) break; /* Stop if tried at original pos */
2689: if (utf8) BACKCHAR(eptr);
2690: }
1.4 misha 2691: MRRETURN(MATCH_NOMATCH);
1.1 misha 2692: }
2693:
2694: /* Control never gets here */
2695: }
2696: #endif /* End of XCLASS */
2697:
2698: /* Match a single character, casefully */
2699:
2700: case OP_CHAR:
2701: #ifdef SUPPORT_UTF8
2702: if (utf8)
2703: {
2704: length = 1;
2705: ecode++;
2706: GETCHARLEN(fc, ecode, length);
1.4 misha 2707: if (length > md->end_subject - eptr)
2708: {
2709: CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2710: MRRETURN(MATCH_NOMATCH);
2711: }
2712: while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
1.1 misha 2713: }
2714: else
2715: #endif
2716:
2717: /* Non-UTF-8 mode */
2718: {
1.4 misha 2719: if (md->end_subject - eptr < 1)
2720: {
2721: SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2722: MRRETURN(MATCH_NOMATCH);
2723: }
2724: if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
1.1 misha 2725: ecode += 2;
2726: }
2727: break;
2728:
2729: /* Match a single character, caselessly */
2730:
2731: case OP_CHARNC:
2732: #ifdef SUPPORT_UTF8
2733: if (utf8)
2734: {
2735: length = 1;
2736: ecode++;
2737: GETCHARLEN(fc, ecode, length);
2738:
1.4 misha 2739: if (length > md->end_subject - eptr)
2740: {
2741: CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
2742: MRRETURN(MATCH_NOMATCH);
2743: }
1.1 misha 2744:
2745: /* If the pattern character's value is < 128, we have only one byte, and
2746: can use the fast lookup table. */
2747:
2748: if (fc < 128)
2749: {
1.4 misha 2750: if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
1.1 misha 2751: }
2752:
2753: /* Otherwise we must pick up the subject character */
2754:
2755: else
2756: {
2757: unsigned int dc;
2758: GETCHARINC(dc, eptr);
2759: ecode += length;
2760:
2761: /* If we have Unicode property support, we can use it to test the other
2762: case of the character, if there is one. */
2763:
2764: if (fc != dc)
2765: {
2766: #ifdef SUPPORT_UCP
1.2 misha 2767: if (dc != UCD_OTHERCASE(fc))
1.1 misha 2768: #endif
1.4 misha 2769: MRRETURN(MATCH_NOMATCH);
1.1 misha 2770: }
2771: }
2772: }
2773: else
2774: #endif /* SUPPORT_UTF8 */
2775:
2776: /* Non-UTF-8 mode */
2777: {
1.4 misha 2778: if (md->end_subject - eptr < 1)
2779: {
2780: SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
2781: MRRETURN(MATCH_NOMATCH);
2782: }
2783: if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
1.1 misha 2784: ecode += 2;
2785: }
2786: break;
2787:
2788: /* Match a single character repeatedly. */
2789:
2790: case OP_EXACT:
2791: min = max = GET2(ecode, 1);
2792: ecode += 3;
2793: goto REPEATCHAR;
2794:
2795: case OP_POSUPTO:
2796: possessive = TRUE;
2797: /* Fall through */
2798:
2799: case OP_UPTO:
2800: case OP_MINUPTO:
2801: min = 0;
2802: max = GET2(ecode, 1);
2803: minimize = *ecode == OP_MINUPTO;
2804: ecode += 3;
2805: goto REPEATCHAR;
2806:
2807: case OP_POSSTAR:
2808: possessive = TRUE;
2809: min = 0;
2810: max = INT_MAX;
2811: ecode++;
2812: goto REPEATCHAR;
2813:
2814: case OP_POSPLUS:
2815: possessive = TRUE;
2816: min = 1;
2817: max = INT_MAX;
2818: ecode++;
2819: goto REPEATCHAR;
2820:
2821: case OP_POSQUERY:
2822: possessive = TRUE;
2823: min = 0;
2824: max = 1;
2825: ecode++;
2826: goto REPEATCHAR;
2827:
2828: case OP_STAR:
2829: case OP_MINSTAR:
2830: case OP_PLUS:
2831: case OP_MINPLUS:
2832: case OP_QUERY:
2833: case OP_MINQUERY:
2834: c = *ecode++ - OP_STAR;
2835: minimize = (c & 1) != 0;
1.4 misha 2836:
1.1 misha 2837: min = rep_min[c]; /* Pick up values from tables; */
2838: max = rep_max[c]; /* zero for max => infinity */
2839: if (max == 0) max = INT_MAX;
2840:
1.4 misha 2841: /* Common code for all repeated single-character matches. */
1.1 misha 2842:
2843: REPEATCHAR:
2844: #ifdef SUPPORT_UTF8
2845: if (utf8)
2846: {
2847: length = 1;
2848: charptr = ecode;
2849: GETCHARLEN(fc, ecode, length);
2850: ecode += length;
2851:
2852: /* Handle multibyte character matching specially here. There is
2853: support for caseless matching if UCP support is present. */
2854:
2855: if (length > 1)
2856: {
2857: #ifdef SUPPORT_UCP
2858: unsigned int othercase;
2859: if ((ims & PCRE_CASELESS) != 0 &&
1.2 misha 2860: (othercase = UCD_OTHERCASE(fc)) != fc)
1.1 misha 2861: oclength = _pcre_ord2utf8(othercase, occhars);
2862: else oclength = 0;
2863: #endif /* SUPPORT_UCP */
2864:
2865: for (i = 1; i <= min; i++)
2866: {
1.4 misha 2867: if (eptr <= md->end_subject - length &&
2868: memcmp(eptr, charptr, length) == 0) eptr += length;
1.1 misha 2869: #ifdef SUPPORT_UCP
1.4 misha 2870: else if (oclength > 0 &&
2871: eptr <= md->end_subject - oclength &&
2872: memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2873: #endif /* SUPPORT_UCP */
1.1 misha 2874: else
2875: {
1.4 misha 2876: CHECK_PARTIAL();
2877: MRRETURN(MATCH_NOMATCH);
1.1 misha 2878: }
2879: }
2880:
2881: if (min == max) continue;
2882:
2883: if (minimize)
2884: {
2885: for (fi = min;; fi++)
2886: {
2887: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2888: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 2889: if (fi >= max) MRRETURN(MATCH_NOMATCH);
2890: if (eptr <= md->end_subject - length &&
2891: memcmp(eptr, charptr, length) == 0) eptr += length;
1.1 misha 2892: #ifdef SUPPORT_UCP
1.4 misha 2893: else if (oclength > 0 &&
2894: eptr <= md->end_subject - oclength &&
2895: memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2896: #endif /* SUPPORT_UCP */
1.1 misha 2897: else
2898: {
1.4 misha 2899: CHECK_PARTIAL();
2900: MRRETURN(MATCH_NOMATCH);
1.1 misha 2901: }
2902: }
2903: /* Control never gets here */
2904: }
2905:
2906: else /* Maximize */
2907: {
2908: pp = eptr;
2909: for (i = min; i < max; i++)
2910: {
1.4 misha 2911: if (eptr <= md->end_subject - length &&
2912: memcmp(eptr, charptr, length) == 0) eptr += length;
1.1 misha 2913: #ifdef SUPPORT_UCP
1.4 misha 2914: else if (oclength > 0 &&
2915: eptr <= md->end_subject - oclength &&
2916: memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
2917: #endif /* SUPPORT_UCP */
1.1 misha 2918: else
2919: {
1.4 misha 2920: CHECK_PARTIAL();
2921: break;
1.1 misha 2922: }
2923: }
2924:
2925: if (possessive) continue;
1.4 misha 2926:
1.1 misha 2927: for(;;)
1.4 misha 2928: {
2929: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2930: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2931: if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
1.1 misha 2932: #ifdef SUPPORT_UCP
1.4 misha 2933: eptr--;
2934: BACKCHAR(eptr);
1.1 misha 2935: #else /* without SUPPORT_UCP */
1.4 misha 2936: eptr -= length;
1.1 misha 2937: #endif /* SUPPORT_UCP */
1.4 misha 2938: }
1.1 misha 2939: }
2940: /* Control never gets here */
2941: }
2942:
2943: /* If the length of a UTF-8 character is 1, we fall through here, and
2944: obey the code as for non-UTF-8 characters below, though in this case the
2945: value of fc will always be < 128. */
2946: }
2947: else
2948: #endif /* SUPPORT_UTF8 */
2949:
2950: /* When not in UTF-8 mode, load a single-byte character. */
1.4 misha 2951:
2952: fc = *ecode++;
1.1 misha 2953:
2954: /* The value of fc at this point is always less than 256, though we may or
2955: may not be in UTF-8 mode. The code is duplicated for the caseless and
2956: caseful cases, for speed, since matching characters is likely to be quite
2957: common. First, ensure the minimum number of matches are present. If min =
2958: max, continue at the same level without recursing. Otherwise, if
2959: minimizing, keep trying the rest of the expression and advancing one
2960: matching character if failing, up to the maximum. Alternatively, if
2961: maximizing, find the maximum number of characters and work backwards. */
2962:
2963: DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2964: max, eptr));
2965:
2966: if ((ims & PCRE_CASELESS) != 0)
2967: {
2968: fc = md->lcc[fc];
2969: for (i = 1; i <= min; i++)
1.4 misha 2970: {
2971: if (eptr >= md->end_subject)
2972: {
2973: SCHECK_PARTIAL();
2974: MRRETURN(MATCH_NOMATCH);
2975: }
2976: if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
2977: }
1.1 misha 2978: if (min == max) continue;
2979: if (minimize)
2980: {
2981: for (fi = min;; fi++)
2982: {
2983: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2984: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 2985: if (fi >= max) MRRETURN(MATCH_NOMATCH);
2986: if (eptr >= md->end_subject)
2987: {
2988: SCHECK_PARTIAL();
2989: MRRETURN(MATCH_NOMATCH);
2990: }
2991: if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
1.1 misha 2992: }
2993: /* Control never gets here */
2994: }
2995: else /* Maximize */
2996: {
2997: pp = eptr;
2998: for (i = min; i < max; i++)
2999: {
1.4 misha 3000: if (eptr >= md->end_subject)
3001: {
3002: SCHECK_PARTIAL();
3003: break;
3004: }
3005: if (fc != md->lcc[*eptr]) break;
1.1 misha 3006: eptr++;
3007: }
1.4 misha 3008:
1.1 misha 3009: if (possessive) continue;
1.4 misha 3010:
1.1 misha 3011: while (eptr >= pp)
3012: {
3013: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
3014: eptr--;
3015: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3016: }
1.4 misha 3017: MRRETURN(MATCH_NOMATCH);
1.1 misha 3018: }
3019: /* Control never gets here */
3020: }
3021:
3022: /* Caseful comparisons (includes all multi-byte characters) */
3023:
3024: else
3025: {
1.4 misha 3026: for (i = 1; i <= min; i++)
3027: {
3028: if (eptr >= md->end_subject)
3029: {
3030: SCHECK_PARTIAL();
3031: MRRETURN(MATCH_NOMATCH);
3032: }
3033: if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
3034: }
3035:
1.1 misha 3036: if (min == max) continue;
1.4 misha 3037:
1.1 misha 3038: if (minimize)
3039: {
3040: for (fi = min;; fi++)
3041: {
3042: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
3043: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 3044: if (fi >= max) MRRETURN(MATCH_NOMATCH);
3045: if (eptr >= md->end_subject)
3046: {
3047: SCHECK_PARTIAL();
3048: MRRETURN(MATCH_NOMATCH);
3049: }
3050: if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
1.1 misha 3051: }
3052: /* Control never gets here */
3053: }
3054: else /* Maximize */
3055: {
3056: pp = eptr;
3057: for (i = min; i < max; i++)
3058: {
1.4 misha 3059: if (eptr >= md->end_subject)
3060: {
3061: SCHECK_PARTIAL();
3062: break;
3063: }
3064: if (fc != *eptr) break;
1.1 misha 3065: eptr++;
3066: }
3067: if (possessive) continue;
1.4 misha 3068:
1.1 misha 3069: while (eptr >= pp)
3070: {
3071: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
3072: eptr--;
3073: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3074: }
1.4 misha 3075: MRRETURN(MATCH_NOMATCH);
1.1 misha 3076: }
3077: }
3078: /* Control never gets here */
3079:
3080: /* Match a negated single one-byte character. The character we are
3081: checking can be multibyte. */
3082:
3083: case OP_NOT:
1.4 misha 3084: if (eptr >= md->end_subject)
3085: {
3086: SCHECK_PARTIAL();
3087: MRRETURN(MATCH_NOMATCH);
3088: }
1.1 misha 3089: ecode++;
3090: GETCHARINCTEST(c, eptr);
3091: if ((ims & PCRE_CASELESS) != 0)
3092: {
3093: #ifdef SUPPORT_UTF8
3094: if (c < 256)
3095: #endif
3096: c = md->lcc[c];
1.4 misha 3097: if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
1.1 misha 3098: }
3099: else
3100: {
1.4 misha 3101: if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
1.1 misha 3102: }
3103: break;
3104:
3105: /* Match a negated single one-byte character repeatedly. This is almost a
3106: repeat of the code for a repeated single character, but I haven't found a
3107: nice way of commoning these up that doesn't require a test of the
3108: positive/negative option for each character match. Maybe that wouldn't add
3109: very much to the time taken, but character matching *is* what this is all
3110: about... */
3111:
3112: case OP_NOTEXACT:
3113: min = max = GET2(ecode, 1);
3114: ecode += 3;
3115: goto REPEATNOTCHAR;
3116:
3117: case OP_NOTUPTO:
3118: case OP_NOTMINUPTO:
3119: min = 0;
3120: max = GET2(ecode, 1);
3121: minimize = *ecode == OP_NOTMINUPTO;
3122: ecode += 3;
3123: goto REPEATNOTCHAR;
3124:
3125: case OP_NOTPOSSTAR:
3126: possessive = TRUE;
3127: min = 0;
3128: max = INT_MAX;
3129: ecode++;
3130: goto REPEATNOTCHAR;
3131:
3132: case OP_NOTPOSPLUS:
3133: possessive = TRUE;
3134: min = 1;
3135: max = INT_MAX;
3136: ecode++;
3137: goto REPEATNOTCHAR;
3138:
3139: case OP_NOTPOSQUERY:
3140: possessive = TRUE;
3141: min = 0;
3142: max = 1;
3143: ecode++;
3144: goto REPEATNOTCHAR;
3145:
3146: case OP_NOTPOSUPTO:
3147: possessive = TRUE;
3148: min = 0;
3149: max = GET2(ecode, 1);
3150: ecode += 3;
3151: goto REPEATNOTCHAR;
3152:
3153: case OP_NOTSTAR:
3154: case OP_NOTMINSTAR:
3155: case OP_NOTPLUS:
3156: case OP_NOTMINPLUS:
3157: case OP_NOTQUERY:
3158: case OP_NOTMINQUERY:
3159: c = *ecode++ - OP_NOTSTAR;
3160: minimize = (c & 1) != 0;
3161: min = rep_min[c]; /* Pick up values from tables; */
3162: max = rep_max[c]; /* zero for max => infinity */
3163: if (max == 0) max = INT_MAX;
3164:
1.4 misha 3165: /* Common code for all repeated single-byte matches. */
1.1 misha 3166:
3167: REPEATNOTCHAR:
3168: fc = *ecode++;
3169:
3170: /* The code is duplicated for the caseless and caseful cases, for speed,
3171: since matching characters is likely to be quite common. First, ensure the
3172: minimum number of matches are present. If min = max, continue at the same
3173: level without recursing. Otherwise, if minimizing, keep trying the rest of
3174: the expression and advancing one matching character if failing, up to the
3175: maximum. Alternatively, if maximizing, find the maximum number of
3176: characters and work backwards. */
3177:
3178: DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3179: max, eptr));
3180:
3181: if ((ims & PCRE_CASELESS) != 0)
3182: {
3183: fc = md->lcc[fc];
3184:
3185: #ifdef SUPPORT_UTF8
3186: /* UTF-8 mode */
3187: if (utf8)
3188: {
3189: register unsigned int d;
3190: for (i = 1; i <= min; i++)
3191: {
1.4 misha 3192: if (eptr >= md->end_subject)
3193: {
3194: SCHECK_PARTIAL();
3195: MRRETURN(MATCH_NOMATCH);
3196: }
1.1 misha 3197: GETCHARINC(d, eptr);
3198: if (d < 256) d = md->lcc[d];
1.4 misha 3199: if (fc == d) MRRETURN(MATCH_NOMATCH);
1.1 misha 3200: }
3201: }
3202: else
3203: #endif
3204:
3205: /* Not UTF-8 mode */
3206: {
3207: for (i = 1; i <= min; i++)
1.4 misha 3208: {
3209: if (eptr >= md->end_subject)
3210: {
3211: SCHECK_PARTIAL();
3212: MRRETURN(MATCH_NOMATCH);
3213: }
3214: if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
3215: }
1.1 misha 3216: }
3217:
3218: if (min == max) continue;
3219:
3220: if (minimize)
3221: {
3222: #ifdef SUPPORT_UTF8
3223: /* UTF-8 mode */
3224: if (utf8)
3225: {
3226: register unsigned int d;
3227: for (fi = min;; fi++)
3228: {
3229: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3230: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 3231: if (fi >= max) MRRETURN(MATCH_NOMATCH);
3232: if (eptr >= md->end_subject)
3233: {
3234: SCHECK_PARTIAL();
3235: MRRETURN(MATCH_NOMATCH);
3236: }
1.1 misha 3237: GETCHARINC(d, eptr);
3238: if (d < 256) d = md->lcc[d];
1.4 misha 3239: if (fc == d) MRRETURN(MATCH_NOMATCH);
1.1 misha 3240: }
3241: }
3242: else
3243: #endif
3244: /* Not UTF-8 mode */
3245: {
3246: for (fi = min;; fi++)
3247: {
3248: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3249: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 3250: if (fi >= max) MRRETURN(MATCH_NOMATCH);
3251: if (eptr >= md->end_subject)
3252: {
3253: SCHECK_PARTIAL();
3254: MRRETURN(MATCH_NOMATCH);
3255: }
3256: if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
1.1 misha 3257: }
3258: }
3259: /* Control never gets here */
3260: }
3261:
3262: /* Maximize case */
3263:
3264: else
3265: {
3266: pp = eptr;
3267:
3268: #ifdef SUPPORT_UTF8
3269: /* UTF-8 mode */
3270: if (utf8)
3271: {
3272: register unsigned int d;
3273: for (i = min; i < max; i++)
3274: {
3275: int len = 1;
1.4 misha 3276: if (eptr >= md->end_subject)
3277: {
3278: SCHECK_PARTIAL();
3279: break;
3280: }
1.1 misha 3281: GETCHARLEN(d, eptr, len);
3282: if (d < 256) d = md->lcc[d];
3283: if (fc == d) break;
3284: eptr += len;
3285: }
3286: if (possessive) continue;
3287: for(;;)
3288: {
3289: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3290: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3291: if (eptr-- == pp) break; /* Stop if tried at original pos */
3292: BACKCHAR(eptr);
3293: }
3294: }
3295: else
3296: #endif
3297: /* Not UTF-8 mode */
3298: {
3299: for (i = min; i < max; i++)
3300: {
1.4 misha 3301: if (eptr >= md->end_subject)
3302: {
3303: SCHECK_PARTIAL();
3304: break;
3305: }
3306: if (fc == md->lcc[*eptr]) break;
1.1 misha 3307: eptr++;
3308: }
3309: if (possessive) continue;
3310: while (eptr >= pp)
3311: {
3312: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3313: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3314: eptr--;
3315: }
3316: }
3317:
1.4 misha 3318: MRRETURN(MATCH_NOMATCH);
1.1 misha 3319: }
3320: /* Control never gets here */
3321: }
3322:
3323: /* Caseful comparisons */
3324:
3325: else
3326: {
3327: #ifdef SUPPORT_UTF8
3328: /* UTF-8 mode */
3329: if (utf8)
3330: {
3331: register unsigned int d;
3332: for (i = 1; i <= min; i++)
3333: {
1.4 misha 3334: if (eptr >= md->end_subject)
3335: {
3336: SCHECK_PARTIAL();
3337: MRRETURN(MATCH_NOMATCH);
3338: }
1.1 misha 3339: GETCHARINC(d, eptr);
1.4 misha 3340: if (fc == d) MRRETURN(MATCH_NOMATCH);
1.1 misha 3341: }
3342: }
3343: else
3344: #endif
3345: /* Not UTF-8 mode */
3346: {
3347: for (i = 1; i <= min; i++)
1.4 misha 3348: {
3349: if (eptr >= md->end_subject)
3350: {
3351: SCHECK_PARTIAL();
3352: MRRETURN(MATCH_NOMATCH);
3353: }
3354: if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
3355: }
1.1 misha 3356: }
3357:
3358: if (min == max) continue;
3359:
3360: if (minimize)
3361: {
3362: #ifdef SUPPORT_UTF8
3363: /* UTF-8 mode */
3364: if (utf8)
3365: {
3366: register unsigned int d;
3367: for (fi = min;; fi++)
3368: {
3369: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3370: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 3371: if (fi >= max) MRRETURN(MATCH_NOMATCH);
3372: if (eptr >= md->end_subject)
3373: {
3374: SCHECK_PARTIAL();
3375: MRRETURN(MATCH_NOMATCH);
3376: }
1.1 misha 3377: GETCHARINC(d, eptr);
1.4 misha 3378: if (fc == d) MRRETURN(MATCH_NOMATCH);
1.1 misha 3379: }
3380: }
3381: else
3382: #endif
3383: /* Not UTF-8 mode */
3384: {
3385: for (fi = min;; fi++)
3386: {
3387: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3388: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 3389: if (fi >= max) MRRETURN(MATCH_NOMATCH);
3390: if (eptr >= md->end_subject)
3391: {
3392: SCHECK_PARTIAL();
3393: MRRETURN(MATCH_NOMATCH);
3394: }
3395: if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
1.1 misha 3396: }
3397: }
3398: /* Control never gets here */
3399: }
3400:
3401: /* Maximize case */
3402:
3403: else
3404: {
3405: pp = eptr;
3406:
3407: #ifdef SUPPORT_UTF8
3408: /* UTF-8 mode */
3409: if (utf8)
3410: {
3411: register unsigned int d;
3412: for (i = min; i < max; i++)
3413: {
3414: int len = 1;
1.4 misha 3415: if (eptr >= md->end_subject)
3416: {
3417: SCHECK_PARTIAL();
3418: break;
3419: }
1.1 misha 3420: GETCHARLEN(d, eptr, len);
3421: if (fc == d) break;
3422: eptr += len;
3423: }
3424: if (possessive) continue;
3425: for(;;)
3426: {
3427: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3428: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3429: if (eptr-- == pp) break; /* Stop if tried at original pos */
3430: BACKCHAR(eptr);
3431: }
3432: }
3433: else
3434: #endif
3435: /* Not UTF-8 mode */
3436: {
3437: for (i = min; i < max; i++)
3438: {
1.4 misha 3439: if (eptr >= md->end_subject)
3440: {
3441: SCHECK_PARTIAL();
3442: break;
3443: }
3444: if (fc == *eptr) break;
1.1 misha 3445: eptr++;
3446: }
3447: if (possessive) continue;
3448: while (eptr >= pp)
3449: {
3450: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3451: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3452: eptr--;
3453: }
3454: }
3455:
1.4 misha 3456: MRRETURN(MATCH_NOMATCH);
1.1 misha 3457: }
3458: }
3459: /* Control never gets here */
3460:
3461: /* Match a single character type repeatedly; several different opcodes
3462: share code. This is very similar to the code for single characters, but we
3463: repeat it in the interests of efficiency. */
3464:
3465: case OP_TYPEEXACT:
3466: min = max = GET2(ecode, 1);
3467: minimize = TRUE;
3468: ecode += 3;
3469: goto REPEATTYPE;
3470:
3471: case OP_TYPEUPTO:
3472: case OP_TYPEMINUPTO:
3473: min = 0;
3474: max = GET2(ecode, 1);
3475: minimize = *ecode == OP_TYPEMINUPTO;
3476: ecode += 3;
3477: goto REPEATTYPE;
3478:
3479: case OP_TYPEPOSSTAR:
3480: possessive = TRUE;
3481: min = 0;
3482: max = INT_MAX;
3483: ecode++;
3484: goto REPEATTYPE;
3485:
3486: case OP_TYPEPOSPLUS:
3487: possessive = TRUE;
3488: min = 1;
3489: max = INT_MAX;
3490: ecode++;
3491: goto REPEATTYPE;
3492:
3493: case OP_TYPEPOSQUERY:
3494: possessive = TRUE;
3495: min = 0;
3496: max = 1;
3497: ecode++;
3498: goto REPEATTYPE;
3499:
3500: case OP_TYPEPOSUPTO:
3501: possessive = TRUE;
3502: min = 0;
3503: max = GET2(ecode, 1);
3504: ecode += 3;
3505: goto REPEATTYPE;
3506:
3507: case OP_TYPESTAR:
3508: case OP_TYPEMINSTAR:
3509: case OP_TYPEPLUS:
3510: case OP_TYPEMINPLUS:
3511: case OP_TYPEQUERY:
3512: case OP_TYPEMINQUERY:
3513: c = *ecode++ - OP_TYPESTAR;
3514: minimize = (c & 1) != 0;
3515: min = rep_min[c]; /* Pick up values from tables; */
3516: max = rep_max[c]; /* zero for max => infinity */
3517: if (max == 0) max = INT_MAX;
3518:
3519: /* Common code for all repeated single character type matches. Note that
3520: in UTF-8 mode, '.' matches a character of any length, but for the other
3521: character types, the valid characters are all one-byte long. */
3522:
3523: REPEATTYPE:
3524: ctype = *ecode++; /* Code for the character type */
3525:
3526: #ifdef SUPPORT_UCP
3527: if (ctype == OP_PROP || ctype == OP_NOTPROP)
3528: {
3529: prop_fail_result = ctype == OP_NOTPROP;
3530: prop_type = *ecode++;
3531: prop_value = *ecode++;
3532: }
3533: else prop_type = -1;
3534: #endif
3535:
3536: /* First, ensure the minimum number of matches are present. Use inline
3537: code for maximizing the speed, and do the type test once at the start
1.4 misha 3538: (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
1.1 misha 3539: is tidier. Also separate the UCP code, which can be the same for both UTF-8
3540: and single-bytes. */
3541:
3542: if (min > 0)
3543: {
3544: #ifdef SUPPORT_UCP
3545: if (prop_type >= 0)
3546: {
3547: switch(prop_type)
3548: {
3549: case PT_ANY:
1.4 misha 3550: if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
1.1 misha 3551: for (i = 1; i <= min; i++)
3552: {
1.4 misha 3553: if (eptr >= md->end_subject)
3554: {
3555: SCHECK_PARTIAL();
3556: MRRETURN(MATCH_NOMATCH);
3557: }
1.1 misha 3558: GETCHARINCTEST(c, eptr);
3559: }
3560: break;
3561:
3562: case PT_LAMP:
3563: for (i = 1; i <= min; i++)
3564: {
1.4 misha 3565: if (eptr >= md->end_subject)
3566: {
3567: SCHECK_PARTIAL();
3568: MRRETURN(MATCH_NOMATCH);
3569: }
1.1 misha 3570: GETCHARINCTEST(c, eptr);
1.2 misha 3571: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 3572: if ((prop_chartype == ucp_Lu ||
3573: prop_chartype == ucp_Ll ||
3574: prop_chartype == ucp_Lt) == prop_fail_result)
1.4 misha 3575: MRRETURN(MATCH_NOMATCH);
1.1 misha 3576: }
3577: break;
3578:
3579: case PT_GC:
3580: for (i = 1; i <= min; i++)
3581: {
1.4 misha 3582: if (eptr >= md->end_subject)
3583: {
3584: SCHECK_PARTIAL();
3585: MRRETURN(MATCH_NOMATCH);
3586: }
1.1 misha 3587: GETCHARINCTEST(c, eptr);
1.2 misha 3588: prop_category = UCD_CATEGORY(c);
1.1 misha 3589: if ((prop_category == prop_value) == prop_fail_result)
1.4 misha 3590: MRRETURN(MATCH_NOMATCH);
1.1 misha 3591: }
3592: break;
3593:
3594: case PT_PC:
3595: for (i = 1; i <= min; i++)
3596: {
1.4 misha 3597: if (eptr >= md->end_subject)
3598: {
3599: SCHECK_PARTIAL();
3600: MRRETURN(MATCH_NOMATCH);
3601: }
1.1 misha 3602: GETCHARINCTEST(c, eptr);
1.2 misha 3603: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 3604: if ((prop_chartype == prop_value) == prop_fail_result)
1.4 misha 3605: MRRETURN(MATCH_NOMATCH);
1.1 misha 3606: }
3607: break;
3608:
3609: case PT_SC:
3610: for (i = 1; i <= min; i++)
3611: {
1.4 misha 3612: if (eptr >= md->end_subject)
3613: {
3614: SCHECK_PARTIAL();
3615: MRRETURN(MATCH_NOMATCH);
3616: }
1.1 misha 3617: GETCHARINCTEST(c, eptr);
1.2 misha 3618: prop_script = UCD_SCRIPT(c);
1.1 misha 3619: if ((prop_script == prop_value) == prop_fail_result)
1.4 misha 3620: MRRETURN(MATCH_NOMATCH);
3621: }
3622: break;
3623:
3624: case PT_ALNUM:
3625: for (i = 1; i <= min; i++)
3626: {
3627: if (eptr >= md->end_subject)
3628: {
3629: SCHECK_PARTIAL();
3630: MRRETURN(MATCH_NOMATCH);
3631: }
3632: GETCHARINCTEST(c, eptr);
3633: prop_category = UCD_CATEGORY(c);
3634: if ((prop_category == ucp_L || prop_category == ucp_N)
3635: == prop_fail_result)
3636: MRRETURN(MATCH_NOMATCH);
3637: }
3638: break;
3639:
3640: case PT_SPACE: /* Perl space */
3641: for (i = 1; i <= min; i++)
3642: {
3643: if (eptr >= md->end_subject)
3644: {
3645: SCHECK_PARTIAL();
3646: MRRETURN(MATCH_NOMATCH);
3647: }
3648: GETCHARINCTEST(c, eptr);
3649: prop_category = UCD_CATEGORY(c);
3650: if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3651: c == CHAR_FF || c == CHAR_CR)
3652: == prop_fail_result)
3653: MRRETURN(MATCH_NOMATCH);
1.1 misha 3654: }
3655: break;
3656:
1.4 misha 3657: case PT_PXSPACE: /* POSIX space */
3658: for (i = 1; i <= min; i++)
3659: {
3660: if (eptr >= md->end_subject)
3661: {
3662: SCHECK_PARTIAL();
3663: MRRETURN(MATCH_NOMATCH);
3664: }
3665: GETCHARINCTEST(c, eptr);
3666: prop_category = UCD_CATEGORY(c);
3667: if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
3668: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
3669: == prop_fail_result)
3670: MRRETURN(MATCH_NOMATCH);
3671: }
3672: break;
3673:
3674: case PT_WORD:
3675: for (i = 1; i <= min; i++)
3676: {
3677: if (eptr >= md->end_subject)
3678: {
3679: SCHECK_PARTIAL();
3680: MRRETURN(MATCH_NOMATCH);
3681: }
3682: GETCHARINCTEST(c, eptr);
3683: prop_category = UCD_CATEGORY(c);
3684: if ((prop_category == ucp_L || prop_category == ucp_N ||
3685: c == CHAR_UNDERSCORE)
3686: == prop_fail_result)
3687: MRRETURN(MATCH_NOMATCH);
3688: }
3689: break;
3690:
3691: /* This should not occur */
3692:
1.1 misha 3693: default:
3694: RRETURN(PCRE_ERROR_INTERNAL);
3695: }
3696: }
3697:
3698: /* Match extended Unicode sequences. We will get here only if the
3699: support is in the binary; otherwise a compile-time error occurs. */
3700:
3701: else if (ctype == OP_EXTUNI)
3702: {
3703: for (i = 1; i <= min; i++)
3704: {
1.4 misha 3705: if (eptr >= md->end_subject)
3706: {
3707: SCHECK_PARTIAL();
3708: MRRETURN(MATCH_NOMATCH);
3709: }
1.1 misha 3710: GETCHARINCTEST(c, eptr);
1.2 misha 3711: prop_category = UCD_CATEGORY(c);
1.4 misha 3712: if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
1.1 misha 3713: while (eptr < md->end_subject)
3714: {
3715: int len = 1;
1.4 misha 3716: if (!utf8) c = *eptr;
3717: else { GETCHARLEN(c, eptr, len); }
1.2 misha 3718: prop_category = UCD_CATEGORY(c);
1.1 misha 3719: if (prop_category != ucp_M) break;
3720: eptr += len;
3721: }
3722: }
3723: }
3724:
3725: else
3726: #endif /* SUPPORT_UCP */
3727:
3728: /* Handle all other cases when the coding is UTF-8 */
3729:
3730: #ifdef SUPPORT_UTF8
3731: if (utf8) switch(ctype)
3732: {
3733: case OP_ANY:
3734: for (i = 1; i <= min; i++)
3735: {
1.4 misha 3736: if (eptr >= md->end_subject)
3737: {
3738: SCHECK_PARTIAL();
3739: MRRETURN(MATCH_NOMATCH);
3740: }
3741: if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1.1 misha 3742: eptr++;
3743: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3744: }
3745: break;
3746:
3747: case OP_ALLANY:
3748: for (i = 1; i <= min; i++)
3749: {
1.4 misha 3750: if (eptr >= md->end_subject)
3751: {
3752: SCHECK_PARTIAL();
3753: MRRETURN(MATCH_NOMATCH);
3754: }
1.1 misha 3755: eptr++;
3756: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3757: }
3758: break;
3759:
3760: case OP_ANYBYTE:
1.4 misha 3761: if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
1.1 misha 3762: eptr += min;
3763: break;
3764:
3765: case OP_ANYNL:
3766: for (i = 1; i <= min; i++)
3767: {
1.4 misha 3768: if (eptr >= md->end_subject)
3769: {
3770: SCHECK_PARTIAL();
3771: MRRETURN(MATCH_NOMATCH);
3772: }
1.1 misha 3773: GETCHARINC(c, eptr);
3774: switch(c)
3775: {
1.4 misha 3776: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 3777: case 0x000d:
3778: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3779: break;
3780:
3781: case 0x000a:
3782: break;
3783:
3784: case 0x000b:
3785: case 0x000c:
3786: case 0x0085:
3787: case 0x2028:
3788: case 0x2029:
1.4 misha 3789: if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
1.1 misha 3790: break;
3791: }
3792: }
3793: break;
3794:
3795: case OP_NOT_HSPACE:
3796: for (i = 1; i <= min; i++)
3797: {
1.4 misha 3798: if (eptr >= md->end_subject)
3799: {
3800: SCHECK_PARTIAL();
3801: MRRETURN(MATCH_NOMATCH);
3802: }
1.1 misha 3803: GETCHARINC(c, eptr);
3804: switch(c)
3805: {
3806: default: break;
3807: case 0x09: /* HT */
3808: case 0x20: /* SPACE */
3809: case 0xa0: /* NBSP */
3810: case 0x1680: /* OGHAM SPACE MARK */
3811: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3812: case 0x2000: /* EN QUAD */
3813: case 0x2001: /* EM QUAD */
3814: case 0x2002: /* EN SPACE */
3815: case 0x2003: /* EM SPACE */
3816: case 0x2004: /* THREE-PER-EM SPACE */
3817: case 0x2005: /* FOUR-PER-EM SPACE */
3818: case 0x2006: /* SIX-PER-EM SPACE */
3819: case 0x2007: /* FIGURE SPACE */
3820: case 0x2008: /* PUNCTUATION SPACE */
3821: case 0x2009: /* THIN SPACE */
3822: case 0x200A: /* HAIR SPACE */
3823: case 0x202f: /* NARROW NO-BREAK SPACE */
3824: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3825: case 0x3000: /* IDEOGRAPHIC SPACE */
1.4 misha 3826: MRRETURN(MATCH_NOMATCH);
1.1 misha 3827: }
3828: }
3829: break;
3830:
3831: case OP_HSPACE:
3832: for (i = 1; i <= min; i++)
3833: {
1.4 misha 3834: if (eptr >= md->end_subject)
3835: {
3836: SCHECK_PARTIAL();
3837: MRRETURN(MATCH_NOMATCH);
3838: }
1.1 misha 3839: GETCHARINC(c, eptr);
3840: switch(c)
3841: {
1.4 misha 3842: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 3843: case 0x09: /* HT */
3844: case 0x20: /* SPACE */
3845: case 0xa0: /* NBSP */
3846: case 0x1680: /* OGHAM SPACE MARK */
3847: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3848: case 0x2000: /* EN QUAD */
3849: case 0x2001: /* EM QUAD */
3850: case 0x2002: /* EN SPACE */
3851: case 0x2003: /* EM SPACE */
3852: case 0x2004: /* THREE-PER-EM SPACE */
3853: case 0x2005: /* FOUR-PER-EM SPACE */
3854: case 0x2006: /* SIX-PER-EM SPACE */
3855: case 0x2007: /* FIGURE SPACE */
3856: case 0x2008: /* PUNCTUATION SPACE */
3857: case 0x2009: /* THIN SPACE */
3858: case 0x200A: /* HAIR SPACE */
3859: case 0x202f: /* NARROW NO-BREAK SPACE */
3860: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3861: case 0x3000: /* IDEOGRAPHIC SPACE */
3862: break;
3863: }
3864: }
3865: break;
3866:
3867: case OP_NOT_VSPACE:
3868: for (i = 1; i <= min; i++)
3869: {
1.4 misha 3870: if (eptr >= md->end_subject)
3871: {
3872: SCHECK_PARTIAL();
3873: MRRETURN(MATCH_NOMATCH);
3874: }
1.1 misha 3875: GETCHARINC(c, eptr);
3876: switch(c)
3877: {
3878: default: break;
3879: case 0x0a: /* LF */
3880: case 0x0b: /* VT */
3881: case 0x0c: /* FF */
3882: case 0x0d: /* CR */
3883: case 0x85: /* NEL */
3884: case 0x2028: /* LINE SEPARATOR */
3885: case 0x2029: /* PARAGRAPH SEPARATOR */
1.4 misha 3886: MRRETURN(MATCH_NOMATCH);
1.1 misha 3887: }
3888: }
3889: break;
3890:
3891: case OP_VSPACE:
3892: for (i = 1; i <= min; i++)
3893: {
1.4 misha 3894: if (eptr >= md->end_subject)
3895: {
3896: SCHECK_PARTIAL();
3897: MRRETURN(MATCH_NOMATCH);
3898: }
1.1 misha 3899: GETCHARINC(c, eptr);
3900: switch(c)
3901: {
1.4 misha 3902: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 3903: case 0x0a: /* LF */
3904: case 0x0b: /* VT */
3905: case 0x0c: /* FF */
3906: case 0x0d: /* CR */
3907: case 0x85: /* NEL */
3908: case 0x2028: /* LINE SEPARATOR */
3909: case 0x2029: /* PARAGRAPH SEPARATOR */
3910: break;
3911: }
3912: }
3913: break;
3914:
3915: case OP_NOT_DIGIT:
3916: for (i = 1; i <= min; i++)
3917: {
1.4 misha 3918: if (eptr >= md->end_subject)
3919: {
3920: SCHECK_PARTIAL();
3921: MRRETURN(MATCH_NOMATCH);
3922: }
1.1 misha 3923: GETCHARINC(c, eptr);
3924: if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
1.4 misha 3925: MRRETURN(MATCH_NOMATCH);
1.1 misha 3926: }
3927: break;
3928:
3929: case OP_DIGIT:
3930: for (i = 1; i <= min; i++)
3931: {
1.4 misha 3932: if (eptr >= md->end_subject)
3933: {
3934: SCHECK_PARTIAL();
3935: MRRETURN(MATCH_NOMATCH);
3936: }
3937: if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3938: MRRETURN(MATCH_NOMATCH);
1.1 misha 3939: /* No need to skip more bytes - we know it's a 1-byte character */
3940: }
3941: break;
3942:
3943: case OP_NOT_WHITESPACE:
3944: for (i = 1; i <= min; i++)
3945: {
1.4 misha 3946: if (eptr >= md->end_subject)
3947: {
3948: SCHECK_PARTIAL();
3949: MRRETURN(MATCH_NOMATCH);
3950: }
3951: if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
3952: MRRETURN(MATCH_NOMATCH);
1.1 misha 3953: while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3954: }
3955: break;
3956:
3957: case OP_WHITESPACE:
3958: for (i = 1; i <= min; i++)
3959: {
1.4 misha 3960: if (eptr >= md->end_subject)
3961: {
3962: SCHECK_PARTIAL();
3963: MRRETURN(MATCH_NOMATCH);
3964: }
3965: if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3966: MRRETURN(MATCH_NOMATCH);
1.1 misha 3967: /* No need to skip more bytes - we know it's a 1-byte character */
3968: }
3969: break;
3970:
3971: case OP_NOT_WORDCHAR:
3972: for (i = 1; i <= min; i++)
3973: {
1.4 misha 3974: if (eptr >= md->end_subject)
3975: {
3976: SCHECK_PARTIAL();
3977: MRRETURN(MATCH_NOMATCH);
3978: }
3979: if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
3980: MRRETURN(MATCH_NOMATCH);
1.1 misha 3981: while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3982: }
3983: break;
3984:
3985: case OP_WORDCHAR:
3986: for (i = 1; i <= min; i++)
3987: {
1.4 misha 3988: if (eptr >= md->end_subject)
3989: {
3990: SCHECK_PARTIAL();
3991: MRRETURN(MATCH_NOMATCH);
3992: }
3993: if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3994: MRRETURN(MATCH_NOMATCH);
1.1 misha 3995: /* No need to skip more bytes - we know it's a 1-byte character */
3996: }
3997: break;
3998:
3999: default:
4000: RRETURN(PCRE_ERROR_INTERNAL);
4001: } /* End switch(ctype) */
4002:
4003: else
4004: #endif /* SUPPORT_UTF8 */
4005:
4006: /* Code for the non-UTF-8 case for minimum matching of operators other
1.4 misha 4007: than OP_PROP and OP_NOTPROP. */
1.1 misha 4008:
4009: switch(ctype)
4010: {
4011: case OP_ANY:
4012: for (i = 1; i <= min; i++)
4013: {
1.4 misha 4014: if (eptr >= md->end_subject)
4015: {
4016: SCHECK_PARTIAL();
4017: MRRETURN(MATCH_NOMATCH);
4018: }
4019: if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1.1 misha 4020: eptr++;
4021: }
4022: break;
4023:
4024: case OP_ALLANY:
1.4 misha 4025: if (eptr > md->end_subject - min)
4026: {
4027: SCHECK_PARTIAL();
4028: MRRETURN(MATCH_NOMATCH);
4029: }
1.1 misha 4030: eptr += min;
4031: break;
4032:
4033: case OP_ANYBYTE:
1.4 misha 4034: if (eptr > md->end_subject - min)
4035: {
4036: SCHECK_PARTIAL();
4037: MRRETURN(MATCH_NOMATCH);
4038: }
1.1 misha 4039: eptr += min;
4040: break;
4041:
4042: case OP_ANYNL:
4043: for (i = 1; i <= min; i++)
4044: {
1.4 misha 4045: if (eptr >= md->end_subject)
4046: {
4047: SCHECK_PARTIAL();
4048: MRRETURN(MATCH_NOMATCH);
4049: }
1.1 misha 4050: switch(*eptr++)
4051: {
1.4 misha 4052: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 4053: case 0x000d:
4054: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4055: break;
4056: case 0x000a:
4057: break;
4058:
4059: case 0x000b:
4060: case 0x000c:
4061: case 0x0085:
1.4 misha 4062: if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
1.1 misha 4063: break;
4064: }
4065: }
4066: break;
4067:
4068: case OP_NOT_HSPACE:
4069: for (i = 1; i <= min; i++)
4070: {
1.4 misha 4071: if (eptr >= md->end_subject)
4072: {
4073: SCHECK_PARTIAL();
4074: MRRETURN(MATCH_NOMATCH);
4075: }
1.1 misha 4076: switch(*eptr++)
4077: {
4078: default: break;
4079: case 0x09: /* HT */
4080: case 0x20: /* SPACE */
4081: case 0xa0: /* NBSP */
1.4 misha 4082: MRRETURN(MATCH_NOMATCH);
1.1 misha 4083: }
4084: }
4085: break;
4086:
4087: case OP_HSPACE:
4088: for (i = 1; i <= min; i++)
4089: {
1.4 misha 4090: if (eptr >= md->end_subject)
4091: {
4092: SCHECK_PARTIAL();
4093: MRRETURN(MATCH_NOMATCH);
4094: }
1.1 misha 4095: switch(*eptr++)
4096: {
1.4 misha 4097: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 4098: case 0x09: /* HT */
4099: case 0x20: /* SPACE */
4100: case 0xa0: /* NBSP */
4101: break;
4102: }
4103: }
4104: break;
4105:
4106: case OP_NOT_VSPACE:
4107: for (i = 1; i <= min; i++)
4108: {
1.4 misha 4109: if (eptr >= md->end_subject)
4110: {
4111: SCHECK_PARTIAL();
4112: MRRETURN(MATCH_NOMATCH);
4113: }
1.1 misha 4114: switch(*eptr++)
4115: {
4116: default: break;
4117: case 0x0a: /* LF */
4118: case 0x0b: /* VT */
4119: case 0x0c: /* FF */
4120: case 0x0d: /* CR */
4121: case 0x85: /* NEL */
1.4 misha 4122: MRRETURN(MATCH_NOMATCH);
1.1 misha 4123: }
4124: }
4125: break;
4126:
4127: case OP_VSPACE:
4128: for (i = 1; i <= min; i++)
4129: {
1.4 misha 4130: if (eptr >= md->end_subject)
4131: {
4132: SCHECK_PARTIAL();
4133: MRRETURN(MATCH_NOMATCH);
4134: }
1.1 misha 4135: switch(*eptr++)
4136: {
1.4 misha 4137: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 4138: case 0x0a: /* LF */
4139: case 0x0b: /* VT */
4140: case 0x0c: /* FF */
4141: case 0x0d: /* CR */
4142: case 0x85: /* NEL */
4143: break;
4144: }
4145: }
4146: break;
4147:
4148: case OP_NOT_DIGIT:
4149: for (i = 1; i <= min; i++)
1.4 misha 4150: {
4151: if (eptr >= md->end_subject)
4152: {
4153: SCHECK_PARTIAL();
4154: MRRETURN(MATCH_NOMATCH);
4155: }
4156: if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
4157: }
1.1 misha 4158: break;
4159:
4160: case OP_DIGIT:
4161: for (i = 1; i <= min; i++)
1.4 misha 4162: {
4163: if (eptr >= md->end_subject)
4164: {
4165: SCHECK_PARTIAL();
4166: MRRETURN(MATCH_NOMATCH);
4167: }
4168: if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
4169: }
1.1 misha 4170: break;
4171:
4172: case OP_NOT_WHITESPACE:
4173: for (i = 1; i <= min; i++)
1.4 misha 4174: {
4175: if (eptr >= md->end_subject)
4176: {
4177: SCHECK_PARTIAL();
4178: MRRETURN(MATCH_NOMATCH);
4179: }
4180: if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
4181: }
1.1 misha 4182: break;
4183:
4184: case OP_WHITESPACE:
4185: for (i = 1; i <= min; i++)
1.4 misha 4186: {
4187: if (eptr >= md->end_subject)
4188: {
4189: SCHECK_PARTIAL();
4190: MRRETURN(MATCH_NOMATCH);
4191: }
4192: if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
4193: }
1.1 misha 4194: break;
4195:
4196: case OP_NOT_WORDCHAR:
4197: for (i = 1; i <= min; i++)
1.4 misha 4198: {
4199: if (eptr >= md->end_subject)
4200: {
4201: SCHECK_PARTIAL();
4202: MRRETURN(MATCH_NOMATCH);
4203: }
1.1 misha 4204: if ((md->ctypes[*eptr++] & ctype_word) != 0)
1.4 misha 4205: MRRETURN(MATCH_NOMATCH);
4206: }
1.1 misha 4207: break;
4208:
4209: case OP_WORDCHAR:
4210: for (i = 1; i <= min; i++)
1.4 misha 4211: {
4212: if (eptr >= md->end_subject)
4213: {
4214: SCHECK_PARTIAL();
4215: MRRETURN(MATCH_NOMATCH);
4216: }
1.1 misha 4217: if ((md->ctypes[*eptr++] & ctype_word) == 0)
1.4 misha 4218: MRRETURN(MATCH_NOMATCH);
4219: }
1.1 misha 4220: break;
4221:
4222: default:
4223: RRETURN(PCRE_ERROR_INTERNAL);
4224: }
4225: }
4226:
4227: /* If min = max, continue at the same level without recursing */
4228:
4229: if (min == max) continue;
4230:
4231: /* If minimizing, we have to test the rest of the pattern before each
4232: subsequent match. Again, separate the UTF-8 case for speed, and also
4233: separate the UCP cases. */
4234:
4235: if (minimize)
4236: {
4237: #ifdef SUPPORT_UCP
4238: if (prop_type >= 0)
4239: {
4240: switch(prop_type)
4241: {
4242: case PT_ANY:
4243: for (fi = min;; fi++)
4244: {
4245: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
4246: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 4247: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4248: if (eptr >= md->end_subject)
4249: {
4250: SCHECK_PARTIAL();
4251: MRRETURN(MATCH_NOMATCH);
4252: }
4253: GETCHARINCTEST(c, eptr);
4254: if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
1.1 misha 4255: }
4256: /* Control never gets here */
4257:
4258: case PT_LAMP:
4259: for (fi = min;; fi++)
4260: {
4261: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
4262: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 4263: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4264: if (eptr >= md->end_subject)
4265: {
4266: SCHECK_PARTIAL();
4267: MRRETURN(MATCH_NOMATCH);
4268: }
4269: GETCHARINCTEST(c, eptr);
1.2 misha 4270: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 4271: if ((prop_chartype == ucp_Lu ||
4272: prop_chartype == ucp_Ll ||
4273: prop_chartype == ucp_Lt) == prop_fail_result)
1.4 misha 4274: MRRETURN(MATCH_NOMATCH);
1.1 misha 4275: }
4276: /* Control never gets here */
4277:
4278: case PT_GC:
4279: for (fi = min;; fi++)
4280: {
4281: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
4282: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 4283: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4284: if (eptr >= md->end_subject)
4285: {
4286: SCHECK_PARTIAL();
4287: MRRETURN(MATCH_NOMATCH);
4288: }
4289: GETCHARINCTEST(c, eptr);
1.2 misha 4290: prop_category = UCD_CATEGORY(c);
1.1 misha 4291: if ((prop_category == prop_value) == prop_fail_result)
1.4 misha 4292: MRRETURN(MATCH_NOMATCH);
1.1 misha 4293: }
4294: /* Control never gets here */
4295:
4296: case PT_PC:
4297: for (fi = min;; fi++)
4298: {
4299: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4300: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 4301: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4302: if (eptr >= md->end_subject)
4303: {
4304: SCHECK_PARTIAL();
4305: MRRETURN(MATCH_NOMATCH);
4306: }
4307: GETCHARINCTEST(c, eptr);
1.2 misha 4308: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 4309: if ((prop_chartype == prop_value) == prop_fail_result)
1.4 misha 4310: MRRETURN(MATCH_NOMATCH);
1.1 misha 4311: }
4312: /* Control never gets here */
4313:
4314: case PT_SC:
4315: for (fi = min;; fi++)
4316: {
4317: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
4318: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 4319: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4320: if (eptr >= md->end_subject)
4321: {
4322: SCHECK_PARTIAL();
4323: MRRETURN(MATCH_NOMATCH);
4324: }
4325: GETCHARINCTEST(c, eptr);
1.2 misha 4326: prop_script = UCD_SCRIPT(c);
1.1 misha 4327: if ((prop_script == prop_value) == prop_fail_result)
1.4 misha 4328: MRRETURN(MATCH_NOMATCH);
4329: }
4330: /* Control never gets here */
4331:
4332: case PT_ALNUM:
4333: for (fi = min;; fi++)
4334: {
4335: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM59);
4336: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4337: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4338: if (eptr >= md->end_subject)
4339: {
4340: SCHECK_PARTIAL();
4341: MRRETURN(MATCH_NOMATCH);
4342: }
4343: GETCHARINCTEST(c, eptr);
4344: prop_category = UCD_CATEGORY(c);
4345: if ((prop_category == ucp_L || prop_category == ucp_N)
4346: == prop_fail_result)
4347: MRRETURN(MATCH_NOMATCH);
4348: }
4349: /* Control never gets here */
4350:
4351: case PT_SPACE: /* Perl space */
4352: for (fi = min;; fi++)
4353: {
4354: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM60);
4355: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4356: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4357: if (eptr >= md->end_subject)
4358: {
4359: SCHECK_PARTIAL();
4360: MRRETURN(MATCH_NOMATCH);
4361: }
4362: GETCHARINCTEST(c, eptr);
4363: prop_category = UCD_CATEGORY(c);
4364: if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4365: c == CHAR_FF || c == CHAR_CR)
4366: == prop_fail_result)
4367: MRRETURN(MATCH_NOMATCH);
4368: }
4369: /* Control never gets here */
4370:
4371: case PT_PXSPACE: /* POSIX space */
4372: for (fi = min;; fi++)
4373: {
4374: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM61);
4375: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4376: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4377: if (eptr >= md->end_subject)
4378: {
4379: SCHECK_PARTIAL();
4380: MRRETURN(MATCH_NOMATCH);
4381: }
4382: GETCHARINCTEST(c, eptr);
4383: prop_category = UCD_CATEGORY(c);
4384: if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4385: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4386: == prop_fail_result)
4387: MRRETURN(MATCH_NOMATCH);
1.1 misha 4388: }
4389: /* Control never gets here */
4390:
1.4 misha 4391: case PT_WORD:
4392: for (fi = min;; fi++)
4393: {
4394: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM62);
4395: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4396: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4397: if (eptr >= md->end_subject)
4398: {
4399: SCHECK_PARTIAL();
4400: MRRETURN(MATCH_NOMATCH);
4401: }
4402: GETCHARINCTEST(c, eptr);
4403: prop_category = UCD_CATEGORY(c);
4404: if ((prop_category == ucp_L ||
4405: prop_category == ucp_N ||
4406: c == CHAR_UNDERSCORE)
4407: == prop_fail_result)
4408: MRRETURN(MATCH_NOMATCH);
4409: }
4410: /* Control never gets here */
4411:
4412: /* This should never occur */
4413:
1.1 misha 4414: default:
4415: RRETURN(PCRE_ERROR_INTERNAL);
4416: }
4417: }
4418:
4419: /* Match extended Unicode sequences. We will get here only if the
4420: support is in the binary; otherwise a compile-time error occurs. */
4421:
4422: else if (ctype == OP_EXTUNI)
4423: {
4424: for (fi = min;; fi++)
4425: {
4426: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
4427: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 4428: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4429: if (eptr >= md->end_subject)
4430: {
4431: SCHECK_PARTIAL();
4432: MRRETURN(MATCH_NOMATCH);
4433: }
1.1 misha 4434: GETCHARINCTEST(c, eptr);
1.2 misha 4435: prop_category = UCD_CATEGORY(c);
1.4 misha 4436: if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
1.1 misha 4437: while (eptr < md->end_subject)
4438: {
4439: int len = 1;
1.4 misha 4440: if (!utf8) c = *eptr;
4441: else { GETCHARLEN(c, eptr, len); }
1.2 misha 4442: prop_category = UCD_CATEGORY(c);
1.1 misha 4443: if (prop_category != ucp_M) break;
4444: eptr += len;
4445: }
4446: }
4447: }
4448:
4449: else
4450: #endif /* SUPPORT_UCP */
4451:
4452: #ifdef SUPPORT_UTF8
4453: /* UTF-8 mode */
4454: if (utf8)
4455: {
4456: for (fi = min;; fi++)
4457: {
4458: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
4459: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 4460: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4461: if (eptr >= md->end_subject)
4462: {
4463: SCHECK_PARTIAL();
4464: MRRETURN(MATCH_NOMATCH);
4465: }
4466: if (ctype == OP_ANY && IS_NEWLINE(eptr))
4467: MRRETURN(MATCH_NOMATCH);
1.1 misha 4468: GETCHARINC(c, eptr);
4469: switch(ctype)
4470: {
4471: case OP_ANY: /* This is the non-NL case */
4472: case OP_ALLANY:
4473: case OP_ANYBYTE:
4474: break;
4475:
4476: case OP_ANYNL:
4477: switch(c)
4478: {
1.4 misha 4479: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 4480: case 0x000d:
4481: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4482: break;
4483: case 0x000a:
4484: break;
4485:
4486: case 0x000b:
4487: case 0x000c:
4488: case 0x0085:
4489: case 0x2028:
4490: case 0x2029:
1.4 misha 4491: if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
1.1 misha 4492: break;
4493: }
4494: break;
4495:
4496: case OP_NOT_HSPACE:
4497: switch(c)
4498: {
4499: default: break;
4500: case 0x09: /* HT */
4501: case 0x20: /* SPACE */
4502: case 0xa0: /* NBSP */
4503: case 0x1680: /* OGHAM SPACE MARK */
4504: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4505: case 0x2000: /* EN QUAD */
4506: case 0x2001: /* EM QUAD */
4507: case 0x2002: /* EN SPACE */
4508: case 0x2003: /* EM SPACE */
4509: case 0x2004: /* THREE-PER-EM SPACE */
4510: case 0x2005: /* FOUR-PER-EM SPACE */
4511: case 0x2006: /* SIX-PER-EM SPACE */
4512: case 0x2007: /* FIGURE SPACE */
4513: case 0x2008: /* PUNCTUATION SPACE */
4514: case 0x2009: /* THIN SPACE */
4515: case 0x200A: /* HAIR SPACE */
4516: case 0x202f: /* NARROW NO-BREAK SPACE */
4517: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4518: case 0x3000: /* IDEOGRAPHIC SPACE */
1.4 misha 4519: MRRETURN(MATCH_NOMATCH);
1.1 misha 4520: }
4521: break;
4522:
4523: case OP_HSPACE:
4524: switch(c)
4525: {
1.4 misha 4526: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 4527: case 0x09: /* HT */
4528: case 0x20: /* SPACE */
4529: case 0xa0: /* NBSP */
4530: case 0x1680: /* OGHAM SPACE MARK */
4531: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4532: case 0x2000: /* EN QUAD */
4533: case 0x2001: /* EM QUAD */
4534: case 0x2002: /* EN SPACE */
4535: case 0x2003: /* EM SPACE */
4536: case 0x2004: /* THREE-PER-EM SPACE */
4537: case 0x2005: /* FOUR-PER-EM SPACE */
4538: case 0x2006: /* SIX-PER-EM SPACE */
4539: case 0x2007: /* FIGURE SPACE */
4540: case 0x2008: /* PUNCTUATION SPACE */
4541: case 0x2009: /* THIN SPACE */
4542: case 0x200A: /* HAIR SPACE */
4543: case 0x202f: /* NARROW NO-BREAK SPACE */
4544: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4545: case 0x3000: /* IDEOGRAPHIC SPACE */
4546: break;
4547: }
4548: break;
4549:
4550: case OP_NOT_VSPACE:
4551: switch(c)
4552: {
4553: default: break;
4554: case 0x0a: /* LF */
4555: case 0x0b: /* VT */
4556: case 0x0c: /* FF */
4557: case 0x0d: /* CR */
4558: case 0x85: /* NEL */
4559: case 0x2028: /* LINE SEPARATOR */
4560: case 0x2029: /* PARAGRAPH SEPARATOR */
1.4 misha 4561: MRRETURN(MATCH_NOMATCH);
1.1 misha 4562: }
4563: break;
4564:
4565: case OP_VSPACE:
4566: switch(c)
4567: {
1.4 misha 4568: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 4569: case 0x0a: /* LF */
4570: case 0x0b: /* VT */
4571: case 0x0c: /* FF */
4572: case 0x0d: /* CR */
4573: case 0x85: /* NEL */
4574: case 0x2028: /* LINE SEPARATOR */
4575: case 0x2029: /* PARAGRAPH SEPARATOR */
4576: break;
4577: }
4578: break;
4579:
4580: case OP_NOT_DIGIT:
4581: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
1.4 misha 4582: MRRETURN(MATCH_NOMATCH);
1.1 misha 4583: break;
4584:
4585: case OP_DIGIT:
4586: if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
1.4 misha 4587: MRRETURN(MATCH_NOMATCH);
1.1 misha 4588: break;
4589:
4590: case OP_NOT_WHITESPACE:
4591: if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
1.4 misha 4592: MRRETURN(MATCH_NOMATCH);
1.1 misha 4593: break;
4594:
4595: case OP_WHITESPACE:
4596: if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
1.4 misha 4597: MRRETURN(MATCH_NOMATCH);
1.1 misha 4598: break;
4599:
4600: case OP_NOT_WORDCHAR:
4601: if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
1.4 misha 4602: MRRETURN(MATCH_NOMATCH);
1.1 misha 4603: break;
4604:
4605: case OP_WORDCHAR:
4606: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
1.4 misha 4607: MRRETURN(MATCH_NOMATCH);
1.1 misha 4608: break;
4609:
4610: default:
4611: RRETURN(PCRE_ERROR_INTERNAL);
4612: }
4613: }
4614: }
4615: else
4616: #endif
4617: /* Not UTF-8 mode */
4618: {
4619: for (fi = min;; fi++)
4620: {
4621: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4622: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 misha 4623: if (fi >= max) MRRETURN(MATCH_NOMATCH);
4624: if (eptr >= md->end_subject)
4625: {
4626: SCHECK_PARTIAL();
4627: MRRETURN(MATCH_NOMATCH);
4628: }
4629: if (ctype == OP_ANY && IS_NEWLINE(eptr))
4630: MRRETURN(MATCH_NOMATCH);
1.1 misha 4631: c = *eptr++;
4632: switch(ctype)
4633: {
4634: case OP_ANY: /* This is the non-NL case */
4635: case OP_ALLANY:
4636: case OP_ANYBYTE:
4637: break;
4638:
4639: case OP_ANYNL:
4640: switch(c)
4641: {
1.4 misha 4642: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 4643: case 0x000d:
4644: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4645: break;
4646:
4647: case 0x000a:
4648: break;
4649:
4650: case 0x000b:
4651: case 0x000c:
4652: case 0x0085:
1.4 misha 4653: if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
1.1 misha 4654: break;
4655: }
4656: break;
4657:
4658: case OP_NOT_HSPACE:
4659: switch(c)
4660: {
4661: default: break;
4662: case 0x09: /* HT */
4663: case 0x20: /* SPACE */
4664: case 0xa0: /* NBSP */
1.4 misha 4665: MRRETURN(MATCH_NOMATCH);
1.1 misha 4666: }
4667: break;
4668:
4669: case OP_HSPACE:
4670: switch(c)
4671: {
1.4 misha 4672: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 4673: case 0x09: /* HT */
4674: case 0x20: /* SPACE */
4675: case 0xa0: /* NBSP */
4676: break;
4677: }
4678: break;
4679:
4680: case OP_NOT_VSPACE:
4681: switch(c)
4682: {
4683: default: break;
4684: case 0x0a: /* LF */
4685: case 0x0b: /* VT */
4686: case 0x0c: /* FF */
4687: case 0x0d: /* CR */
4688: case 0x85: /* NEL */
1.4 misha 4689: MRRETURN(MATCH_NOMATCH);
1.1 misha 4690: }
4691: break;
4692:
4693: case OP_VSPACE:
4694: switch(c)
4695: {
1.4 misha 4696: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 4697: case 0x0a: /* LF */
4698: case 0x0b: /* VT */
4699: case 0x0c: /* FF */
4700: case 0x0d: /* CR */
4701: case 0x85: /* NEL */
4702: break;
4703: }
4704: break;
4705:
4706: case OP_NOT_DIGIT:
1.4 misha 4707: if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 4708: break;
4709:
4710: case OP_DIGIT:
1.4 misha 4711: if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 4712: break;
4713:
4714: case OP_NOT_WHITESPACE:
1.4 misha 4715: if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 4716: break;
4717:
4718: case OP_WHITESPACE:
1.4 misha 4719: if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 4720: break;
4721:
4722: case OP_NOT_WORDCHAR:
1.4 misha 4723: if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 4724: break;
4725:
4726: case OP_WORDCHAR:
1.4 misha 4727: if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 4728: break;
4729:
4730: default:
4731: RRETURN(PCRE_ERROR_INTERNAL);
4732: }
4733: }
4734: }
4735: /* Control never gets here */
4736: }
4737:
4738: /* If maximizing, it is worth using inline code for speed, doing the type
4739: test once at the start (i.e. keep it out of the loop). Again, keep the
4740: UTF-8 and UCP stuff separate. */
4741:
4742: else
4743: {
4744: pp = eptr; /* Remember where we started */
4745:
4746: #ifdef SUPPORT_UCP
4747: if (prop_type >= 0)
4748: {
4749: switch(prop_type)
4750: {
4751: case PT_ANY:
4752: for (i = min; i < max; i++)
4753: {
4754: int len = 1;
1.4 misha 4755: if (eptr >= md->end_subject)
4756: {
4757: SCHECK_PARTIAL();
4758: break;
4759: }
4760: GETCHARLENTEST(c, eptr, len);
1.1 misha 4761: if (prop_fail_result) break;
4762: eptr+= len;
4763: }
4764: break;
4765:
4766: case PT_LAMP:
4767: for (i = min; i < max; i++)
4768: {
4769: int len = 1;
1.4 misha 4770: if (eptr >= md->end_subject)
4771: {
4772: SCHECK_PARTIAL();
4773: break;
4774: }
4775: GETCHARLENTEST(c, eptr, len);
1.2 misha 4776: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 4777: if ((prop_chartype == ucp_Lu ||
4778: prop_chartype == ucp_Ll ||
4779: prop_chartype == ucp_Lt) == prop_fail_result)
4780: break;
4781: eptr+= len;
4782: }
4783: break;
4784:
4785: case PT_GC:
4786: for (i = min; i < max; i++)
4787: {
4788: int len = 1;
1.4 misha 4789: if (eptr >= md->end_subject)
4790: {
4791: SCHECK_PARTIAL();
4792: break;
4793: }
4794: GETCHARLENTEST(c, eptr, len);
1.2 misha 4795: prop_category = UCD_CATEGORY(c);
1.1 misha 4796: if ((prop_category == prop_value) == prop_fail_result)
4797: break;
4798: eptr+= len;
4799: }
4800: break;
4801:
4802: case PT_PC:
4803: for (i = min; i < max; i++)
4804: {
4805: int len = 1;
1.4 misha 4806: if (eptr >= md->end_subject)
4807: {
4808: SCHECK_PARTIAL();
4809: break;
4810: }
4811: GETCHARLENTEST(c, eptr, len);
1.2 misha 4812: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 4813: if ((prop_chartype == prop_value) == prop_fail_result)
4814: break;
4815: eptr+= len;
4816: }
4817: break;
4818:
4819: case PT_SC:
4820: for (i = min; i < max; i++)
4821: {
4822: int len = 1;
1.4 misha 4823: if (eptr >= md->end_subject)
4824: {
4825: SCHECK_PARTIAL();
4826: break;
4827: }
4828: GETCHARLENTEST(c, eptr, len);
1.2 misha 4829: prop_script = UCD_SCRIPT(c);
1.1 misha 4830: if ((prop_script == prop_value) == prop_fail_result)
4831: break;
4832: eptr+= len;
4833: }
4834: break;
1.4 misha 4835:
4836: case PT_ALNUM:
4837: for (i = min; i < max; i++)
4838: {
4839: int len = 1;
4840: if (eptr >= md->end_subject)
4841: {
4842: SCHECK_PARTIAL();
4843: break;
4844: }
4845: GETCHARLENTEST(c, eptr, len);
4846: prop_category = UCD_CATEGORY(c);
4847: if ((prop_category == ucp_L || prop_category == ucp_N)
4848: == prop_fail_result)
4849: break;
4850: eptr+= len;
4851: }
4852: break;
4853:
4854: case PT_SPACE: /* Perl space */
4855: for (i = min; i < max; i++)
4856: {
4857: int len = 1;
4858: if (eptr >= md->end_subject)
4859: {
4860: SCHECK_PARTIAL();
4861: break;
4862: }
4863: GETCHARLENTEST(c, eptr, len);
4864: prop_category = UCD_CATEGORY(c);
4865: if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4866: c == CHAR_FF || c == CHAR_CR)
4867: == prop_fail_result)
4868: break;
4869: eptr+= len;
4870: }
4871: break;
4872:
4873: case PT_PXSPACE: /* POSIX space */
4874: for (i = min; i < max; i++)
4875: {
4876: int len = 1;
4877: if (eptr >= md->end_subject)
4878: {
4879: SCHECK_PARTIAL();
4880: break;
4881: }
4882: GETCHARLENTEST(c, eptr, len);
4883: prop_category = UCD_CATEGORY(c);
4884: if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
4885: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4886: == prop_fail_result)
4887: break;
4888: eptr+= len;
4889: }
4890: break;
4891:
4892: case PT_WORD:
4893: for (i = min; i < max; i++)
4894: {
4895: int len = 1;
4896: if (eptr >= md->end_subject)
4897: {
4898: SCHECK_PARTIAL();
4899: break;
4900: }
4901: GETCHARLENTEST(c, eptr, len);
4902: prop_category = UCD_CATEGORY(c);
4903: if ((prop_category == ucp_L || prop_category == ucp_N ||
4904: c == CHAR_UNDERSCORE) == prop_fail_result)
4905: break;
4906: eptr+= len;
4907: }
4908: break;
4909:
4910: default:
4911: RRETURN(PCRE_ERROR_INTERNAL);
1.1 misha 4912: }
4913:
4914: /* eptr is now past the end of the maximum run */
4915:
4916: if (possessive) continue;
4917: for(;;)
4918: {
4919: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4920: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4921: if (eptr-- == pp) break; /* Stop if tried at original pos */
4922: if (utf8) BACKCHAR(eptr);
4923: }
4924: }
4925:
4926: /* Match extended Unicode sequences. We will get here only if the
4927: support is in the binary; otherwise a compile-time error occurs. */
4928:
4929: else if (ctype == OP_EXTUNI)
4930: {
4931: for (i = min; i < max; i++)
4932: {
1.4 misha 4933: if (eptr >= md->end_subject)
4934: {
4935: SCHECK_PARTIAL();
4936: break;
4937: }
1.1 misha 4938: GETCHARINCTEST(c, eptr);
1.2 misha 4939: prop_category = UCD_CATEGORY(c);
1.1 misha 4940: if (prop_category == ucp_M) break;
4941: while (eptr < md->end_subject)
4942: {
4943: int len = 1;
4944: if (!utf8) c = *eptr; else
4945: {
4946: GETCHARLEN(c, eptr, len);
4947: }
1.2 misha 4948: prop_category = UCD_CATEGORY(c);
1.1 misha 4949: if (prop_category != ucp_M) break;
4950: eptr += len;
4951: }
4952: }
4953:
4954: /* eptr is now past the end of the maximum run */
4955:
4956: if (possessive) continue;
1.4 misha 4957:
1.1 misha 4958: for(;;)
4959: {
4960: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4961: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4962: if (eptr-- == pp) break; /* Stop if tried at original pos */
4963: for (;;) /* Move back over one extended */
4964: {
4965: int len = 1;
4966: if (!utf8) c = *eptr; else
4967: {
4968: BACKCHAR(eptr);
4969: GETCHARLEN(c, eptr, len);
4970: }
1.2 misha 4971: prop_category = UCD_CATEGORY(c);
1.1 misha 4972: if (prop_category != ucp_M) break;
4973: eptr--;
4974: }
4975: }
4976: }
4977:
4978: else
4979: #endif /* SUPPORT_UCP */
4980:
4981: #ifdef SUPPORT_UTF8
4982: /* UTF-8 mode */
4983:
4984: if (utf8)
4985: {
4986: switch(ctype)
4987: {
4988: case OP_ANY:
4989: if (max < INT_MAX)
4990: {
4991: for (i = min; i < max; i++)
4992: {
1.4 misha 4993: if (eptr >= md->end_subject)
4994: {
4995: SCHECK_PARTIAL();
4996: break;
4997: }
4998: if (IS_NEWLINE(eptr)) break;
1.1 misha 4999: eptr++;
5000: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5001: }
5002: }
5003:
5004: /* Handle unlimited UTF-8 repeat */
5005:
5006: else
5007: {
5008: for (i = min; i < max; i++)
5009: {
1.4 misha 5010: if (eptr >= md->end_subject)
5011: {
5012: SCHECK_PARTIAL();
5013: break;
5014: }
5015: if (IS_NEWLINE(eptr)) break;
1.1 misha 5016: eptr++;
5017: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5018: }
5019: }
5020: break;
5021:
5022: case OP_ALLANY:
5023: if (max < INT_MAX)
5024: {
5025: for (i = min; i < max; i++)
5026: {
1.4 misha 5027: if (eptr >= md->end_subject)
5028: {
5029: SCHECK_PARTIAL();
5030: break;
5031: }
1.1 misha 5032: eptr++;
5033: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5034: }
5035: }
5036: else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5037: break;
5038:
5039: /* The byte case is the same as non-UTF8 */
5040:
5041: case OP_ANYBYTE:
5042: c = max - min;
5043: if (c > (unsigned int)(md->end_subject - eptr))
1.4 misha 5044: {
5045: eptr = md->end_subject;
5046: SCHECK_PARTIAL();
5047: }
5048: else eptr += c;
1.1 misha 5049: break;
5050:
5051: case OP_ANYNL:
5052: for (i = min; i < max; i++)
5053: {
5054: int len = 1;
1.4 misha 5055: if (eptr >= md->end_subject)
5056: {
5057: SCHECK_PARTIAL();
5058: break;
5059: }
1.1 misha 5060: GETCHARLEN(c, eptr, len);
5061: if (c == 0x000d)
5062: {
5063: if (++eptr >= md->end_subject) break;
5064: if (*eptr == 0x000a) eptr++;
5065: }
5066: else
5067: {
5068: if (c != 0x000a &&
5069: (md->bsr_anycrlf ||
5070: (c != 0x000b && c != 0x000c &&
5071: c != 0x0085 && c != 0x2028 && c != 0x2029)))
5072: break;
5073: eptr += len;
5074: }
5075: }
5076: break;
5077:
5078: case OP_NOT_HSPACE:
5079: case OP_HSPACE:
5080: for (i = min; i < max; i++)
5081: {
5082: BOOL gotspace;
5083: int len = 1;
1.4 misha 5084: if (eptr >= md->end_subject)
5085: {
5086: SCHECK_PARTIAL();
5087: break;
5088: }
1.1 misha 5089: GETCHARLEN(c, eptr, len);
5090: switch(c)
5091: {
5092: default: gotspace = FALSE; break;
5093: case 0x09: /* HT */
5094: case 0x20: /* SPACE */
5095: case 0xa0: /* NBSP */
5096: case 0x1680: /* OGHAM SPACE MARK */
5097: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5098: case 0x2000: /* EN QUAD */
5099: case 0x2001: /* EM QUAD */
5100: case 0x2002: /* EN SPACE */
5101: case 0x2003: /* EM SPACE */
5102: case 0x2004: /* THREE-PER-EM SPACE */
5103: case 0x2005: /* FOUR-PER-EM SPACE */
5104: case 0x2006: /* SIX-PER-EM SPACE */
5105: case 0x2007: /* FIGURE SPACE */
5106: case 0x2008: /* PUNCTUATION SPACE */
5107: case 0x2009: /* THIN SPACE */
5108: case 0x200A: /* HAIR SPACE */
5109: case 0x202f: /* NARROW NO-BREAK SPACE */
5110: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5111: case 0x3000: /* IDEOGRAPHIC SPACE */
5112: gotspace = TRUE;
5113: break;
5114: }
5115: if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5116: eptr += len;
5117: }
5118: break;
5119:
5120: case OP_NOT_VSPACE:
5121: case OP_VSPACE:
5122: for (i = min; i < max; i++)
5123: {
5124: BOOL gotspace;
5125: int len = 1;
1.4 misha 5126: if (eptr >= md->end_subject)
5127: {
5128: SCHECK_PARTIAL();
5129: break;
5130: }
1.1 misha 5131: GETCHARLEN(c, eptr, len);
5132: switch(c)
5133: {
5134: default: gotspace = FALSE; break;
5135: case 0x0a: /* LF */
5136: case 0x0b: /* VT */
5137: case 0x0c: /* FF */
5138: case 0x0d: /* CR */
5139: case 0x85: /* NEL */
5140: case 0x2028: /* LINE SEPARATOR */
5141: case 0x2029: /* PARAGRAPH SEPARATOR */
5142: gotspace = TRUE;
5143: break;
5144: }
5145: if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5146: eptr += len;
5147: }
5148: break;
5149:
5150: case OP_NOT_DIGIT:
5151: for (i = min; i < max; i++)
5152: {
5153: int len = 1;
1.4 misha 5154: if (eptr >= md->end_subject)
5155: {
5156: SCHECK_PARTIAL();
5157: break;
5158: }
1.1 misha 5159: GETCHARLEN(c, eptr, len);
5160: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5161: eptr+= len;
5162: }
5163: break;
5164:
5165: case OP_DIGIT:
5166: for (i = min; i < max; i++)
5167: {
5168: int len = 1;
1.4 misha 5169: if (eptr >= md->end_subject)
5170: {
5171: SCHECK_PARTIAL();
5172: break;
5173: }
1.1 misha 5174: GETCHARLEN(c, eptr, len);
5175: if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5176: eptr+= len;
5177: }
5178: break;
5179:
5180: case OP_NOT_WHITESPACE:
5181: for (i = min; i < max; i++)
5182: {
5183: int len = 1;
1.4 misha 5184: if (eptr >= md->end_subject)
5185: {
5186: SCHECK_PARTIAL();
5187: break;
5188: }
1.1 misha 5189: GETCHARLEN(c, eptr, len);
5190: if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5191: eptr+= len;
5192: }
5193: break;
5194:
5195: case OP_WHITESPACE:
5196: for (i = min; i < max; i++)
5197: {
5198: int len = 1;
1.4 misha 5199: if (eptr >= md->end_subject)
5200: {
5201: SCHECK_PARTIAL();
5202: break;
5203: }
1.1 misha 5204: GETCHARLEN(c, eptr, len);
5205: if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5206: eptr+= len;
5207: }
5208: break;
5209:
5210: case OP_NOT_WORDCHAR:
5211: for (i = min; i < max; i++)
5212: {
5213: int len = 1;
1.4 misha 5214: if (eptr >= md->end_subject)
5215: {
5216: SCHECK_PARTIAL();
5217: break;
5218: }
1.1 misha 5219: GETCHARLEN(c, eptr, len);
5220: if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5221: eptr+= len;
5222: }
5223: break;
5224:
5225: case OP_WORDCHAR:
5226: for (i = min; i < max; i++)
5227: {
5228: int len = 1;
1.4 misha 5229: if (eptr >= md->end_subject)
5230: {
5231: SCHECK_PARTIAL();
5232: break;
5233: }
1.1 misha 5234: GETCHARLEN(c, eptr, len);
5235: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5236: eptr+= len;
5237: }
5238: break;
5239:
5240: default:
5241: RRETURN(PCRE_ERROR_INTERNAL);
5242: }
5243:
5244: /* eptr is now past the end of the maximum run */
5245:
5246: if (possessive) continue;
5247: for(;;)
5248: {
5249: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
5250: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5251: if (eptr-- == pp) break; /* Stop if tried at original pos */
5252: BACKCHAR(eptr);
5253: }
5254: }
5255: else
5256: #endif /* SUPPORT_UTF8 */
5257:
5258: /* Not UTF-8 mode */
5259: {
5260: switch(ctype)
5261: {
5262: case OP_ANY:
5263: for (i = min; i < max; i++)
5264: {
1.4 misha 5265: if (eptr >= md->end_subject)
5266: {
5267: SCHECK_PARTIAL();
5268: break;
5269: }
5270: if (IS_NEWLINE(eptr)) break;
1.1 misha 5271: eptr++;
5272: }
5273: break;
5274:
5275: case OP_ALLANY:
5276: case OP_ANYBYTE:
5277: c = max - min;
5278: if (c > (unsigned int)(md->end_subject - eptr))
1.4 misha 5279: {
5280: eptr = md->end_subject;
5281: SCHECK_PARTIAL();
5282: }
5283: else eptr += c;
1.1 misha 5284: break;
5285:
5286: case OP_ANYNL:
5287: for (i = min; i < max; i++)
5288: {
1.4 misha 5289: if (eptr >= md->end_subject)
5290: {
5291: SCHECK_PARTIAL();
5292: break;
5293: }
1.1 misha 5294: c = *eptr;
5295: if (c == 0x000d)
5296: {
5297: if (++eptr >= md->end_subject) break;
5298: if (*eptr == 0x000a) eptr++;
5299: }
5300: else
5301: {
5302: if (c != 0x000a &&
5303: (md->bsr_anycrlf ||
5304: (c != 0x000b && c != 0x000c && c != 0x0085)))
5305: break;
5306: eptr++;
5307: }
5308: }
5309: break;
5310:
5311: case OP_NOT_HSPACE:
5312: for (i = min; i < max; i++)
5313: {
1.4 misha 5314: if (eptr >= md->end_subject)
5315: {
5316: SCHECK_PARTIAL();
5317: break;
5318: }
1.1 misha 5319: c = *eptr;
5320: if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5321: eptr++;
5322: }
5323: break;
5324:
5325: case OP_HSPACE:
5326: for (i = min; i < max; i++)
5327: {
1.4 misha 5328: if (eptr >= md->end_subject)
5329: {
5330: SCHECK_PARTIAL();
5331: break;
5332: }
1.1 misha 5333: c = *eptr;
5334: if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5335: eptr++;
5336: }
5337: break;
5338:
5339: case OP_NOT_VSPACE:
5340: for (i = min; i < max; i++)
5341: {
1.4 misha 5342: if (eptr >= md->end_subject)
5343: {
5344: SCHECK_PARTIAL();
5345: break;
5346: }
1.1 misha 5347: c = *eptr;
5348: if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5349: break;
5350: eptr++;
5351: }
5352: break;
5353:
5354: case OP_VSPACE:
5355: for (i = min; i < max; i++)
5356: {
1.4 misha 5357: if (eptr >= md->end_subject)
5358: {
5359: SCHECK_PARTIAL();
5360: break;
5361: }
1.1 misha 5362: c = *eptr;
5363: if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5364: break;
5365: eptr++;
5366: }
5367: break;
5368:
5369: case OP_NOT_DIGIT:
5370: for (i = min; i < max; i++)
5371: {
1.4 misha 5372: if (eptr >= md->end_subject)
5373: {
5374: SCHECK_PARTIAL();
1.1 misha 5375: break;
1.4 misha 5376: }
5377: if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
1.1 misha 5378: eptr++;
5379: }
5380: break;
5381:
5382: case OP_DIGIT:
5383: for (i = min; i < max; i++)
5384: {
1.4 misha 5385: if (eptr >= md->end_subject)
5386: {
5387: SCHECK_PARTIAL();
1.1 misha 5388: break;
1.4 misha 5389: }
5390: if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
1.1 misha 5391: eptr++;
5392: }
5393: break;
5394:
5395: case OP_NOT_WHITESPACE:
5396: for (i = min; i < max; i++)
5397: {
1.4 misha 5398: if (eptr >= md->end_subject)
5399: {
5400: SCHECK_PARTIAL();
1.1 misha 5401: break;
1.4 misha 5402: }
5403: if ((md->ctypes[*eptr] & ctype_space) != 0) break;
1.1 misha 5404: eptr++;
5405: }
5406: break;
5407:
5408: case OP_WHITESPACE:
5409: for (i = min; i < max; i++)
5410: {
1.4 misha 5411: if (eptr >= md->end_subject)
5412: {
5413: SCHECK_PARTIAL();
1.1 misha 5414: break;
1.4 misha 5415: }
5416: if ((md->ctypes[*eptr] & ctype_space) == 0) break;
1.1 misha 5417: eptr++;
5418: }
5419: break;
5420:
5421: case OP_NOT_WORDCHAR:
5422: for (i = min; i < max; i++)
5423: {
1.4 misha 5424: if (eptr >= md->end_subject)
5425: {
5426: SCHECK_PARTIAL();
1.1 misha 5427: break;
1.4 misha 5428: }
5429: if ((md->ctypes[*eptr] & ctype_word) != 0) break;
1.1 misha 5430: eptr++;
5431: }
5432: break;
5433:
5434: case OP_WORDCHAR:
5435: for (i = min; i < max; i++)
5436: {
1.4 misha 5437: if (eptr >= md->end_subject)
5438: {
5439: SCHECK_PARTIAL();
1.1 misha 5440: break;
1.4 misha 5441: }
5442: if ((md->ctypes[*eptr] & ctype_word) == 0) break;
1.1 misha 5443: eptr++;
5444: }
5445: break;
5446:
5447: default:
5448: RRETURN(PCRE_ERROR_INTERNAL);
5449: }
5450:
5451: /* eptr is now past the end of the maximum run */
5452:
5453: if (possessive) continue;
5454: while (eptr >= pp)
5455: {
5456: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
5457: eptr--;
5458: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5459: }
5460: }
5461:
5462: /* Get here if we can't make it match with any permitted repetitions */
5463:
1.4 misha 5464: MRRETURN(MATCH_NOMATCH);
1.1 misha 5465: }
5466: /* Control never gets here */
5467:
5468: /* There's been some horrible disaster. Arrival here can only mean there is
5469: something seriously wrong in the code above or the OP_xxx definitions. */
5470:
5471: default:
5472: DPRINTF(("Unknown opcode %d\n", *ecode));
5473: RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5474: }
5475:
5476: /* Do not stick any code in here without much thought; it is assumed
5477: that "continue" in the code above comes out to here to repeat the main
5478: loop. */
5479:
5480: } /* End of main loop */
5481: /* Control never reaches here */
5482:
5483:
5484: /* When compiling to use the heap rather than the stack for recursive calls to
5485: match(), the RRETURN() macro jumps here. The number that is saved in
5486: frame->Xwhere indicates which label we actually want to return to. */
5487:
5488: #ifdef NO_RECURSE
5489: #define LBL(val) case val: goto L_RM##val;
5490: HEAP_RETURN:
5491: switch (frame->Xwhere)
5492: {
5493: LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5494: LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5495: LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5496: LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
1.4 misha 5497: LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58)
1.1 misha 5498: #ifdef SUPPORT_UTF8
5499: LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5500: LBL(32) LBL(34) LBL(42) LBL(46)
5501: #ifdef SUPPORT_UCP
5502: LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
1.4 misha 5503: LBL(59) LBL(60) LBL(61) LBL(62)
1.1 misha 5504: #endif /* SUPPORT_UCP */
5505: #endif /* SUPPORT_UTF8 */
5506: default:
5507: DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5508: return PCRE_ERROR_INTERNAL;
5509: }
5510: #undef LBL
5511: #endif /* NO_RECURSE */
5512: }
5513:
5514:
5515: /***************************************************************************
5516: ****************************************************************************
5517: RECURSION IN THE match() FUNCTION
5518:
5519: Undefine all the macros that were defined above to handle this. */
5520:
5521: #ifdef NO_RECURSE
5522: #undef eptr
5523: #undef ecode
5524: #undef mstart
5525: #undef offset_top
5526: #undef ims
5527: #undef eptrb
5528: #undef flags
5529:
5530: #undef callpat
5531: #undef charptr
5532: #undef data
5533: #undef next
5534: #undef pp
5535: #undef prev
5536: #undef saved_eptr
5537:
5538: #undef new_recursive
5539:
5540: #undef cur_is_word
5541: #undef condition
5542: #undef prev_is_word
5543:
5544: #undef original_ims
5545:
5546: #undef ctype
5547: #undef length
5548: #undef max
5549: #undef min
5550: #undef number
5551: #undef offset
5552: #undef op
5553: #undef save_capture_last
5554: #undef save_offset1
5555: #undef save_offset2
5556: #undef save_offset3
5557: #undef stacksave
5558:
5559: #undef newptrb
5560:
5561: #endif
5562:
5563: /* These two are defined as macros in both cases */
5564:
5565: #undef fc
5566: #undef fi
5567:
5568: /***************************************************************************
5569: ***************************************************************************/
5570:
5571:
5572:
5573: /*************************************************
5574: * Execute a Regular Expression *
5575: *************************************************/
5576:
5577: /* This function applies a compiled re to a subject string and picks out
5578: portions of the string if it matches. Two elements in the vector are set for
5579: each substring: the offsets to the start and end of the substring.
5580:
5581: Arguments:
5582: argument_re points to the compiled expression
5583: extra_data points to extra data or is NULL
5584: subject points to the subject string
5585: length length of subject string (may contain binary zeros)
5586: start_offset where to start in the subject string
5587: options option bits
5588: offsets points to a vector of ints to be filled in with offsets
5589: offsetcount the number of elements in the vector
5590:
5591: Returns: > 0 => success; value is the number of elements filled in
5592: = 0 => success, but offsets is not big enough
5593: -1 => failed to match
5594: < -1 => some kind of unexpected problem
5595: */
5596:
1.2 misha 5597: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
1.1 misha 5598: pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5599: PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5600: int offsetcount)
5601: {
5602: int rc, resetcount, ocount;
5603: int first_byte = -1;
5604: int req_byte = -1;
5605: int req_byte2 = -1;
5606: int newline;
5607: unsigned long int ims;
5608: BOOL using_temporary_offsets = FALSE;
5609: BOOL anchored;
5610: BOOL startline;
5611: BOOL firstline;
5612: BOOL first_byte_caseless = FALSE;
5613: BOOL req_byte_caseless = FALSE;
5614: BOOL utf8;
5615: match_data match_block;
5616: match_data *md = &match_block;
5617: const uschar *tables;
5618: const uschar *start_bits = NULL;
5619: USPTR start_match = (USPTR)subject + start_offset;
5620: USPTR end_subject;
1.4 misha 5621: USPTR start_partial = NULL;
1.1 misha 5622: USPTR req_byte_ptr = start_match - 1;
5623:
5624: pcre_study_data internal_study;
5625: const pcre_study_data *study;
5626:
5627: real_pcre internal_re;
5628: const real_pcre *external_re = (const real_pcre *)argument_re;
5629: const real_pcre *re = external_re;
5630:
5631: /* Plausibility checks */
5632:
5633: if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5634: if (re == NULL || subject == NULL ||
5635: (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5636: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
1.5 ! misha 5637: if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
1.1 misha 5638:
1.4 misha 5639: /* This information is for finding all the numbers associated with a given
5640: name, for condition testing. */
5641:
5642: md->name_table = (uschar *)re + re->name_table_offset;
5643: md->name_count = re->name_count;
5644: md->name_entry_size = re->name_entry_size;
5645:
1.1 misha 5646: /* Fish out the optional data from the extra_data structure, first setting
5647: the default values. */
5648:
5649: study = NULL;
5650: md->match_limit = MATCH_LIMIT;
5651: md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5652: md->callout_data = NULL;
5653:
5654: /* The table pointer is always in native byte order. */
5655:
5656: tables = external_re->tables;
5657:
5658: if (extra_data != NULL)
5659: {
5660: register unsigned int flags = extra_data->flags;
5661: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5662: study = (const pcre_study_data *)extra_data->study_data;
5663: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5664: md->match_limit = extra_data->match_limit;
5665: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5666: md->match_limit_recursion = extra_data->match_limit_recursion;
5667: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5668: md->callout_data = extra_data->callout_data;
5669: if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5670: }
5671:
5672: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5673: is a feature that makes it possible to save compiled regex and re-use them
5674: in other programs later. */
5675:
5676: if (tables == NULL) tables = _pcre_default_tables;
5677:
5678: /* Check that the first field in the block is the magic number. If it is not,
5679: test for a regex that was compiled on a host of opposite endianness. If this is
5680: the case, flipped values are put in internal_re and internal_study if there was
5681: study data too. */
5682:
5683: if (re->magic_number != MAGIC_NUMBER)
5684: {
5685: re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5686: if (re == NULL) return PCRE_ERROR_BADMAGIC;
5687: if (study != NULL) study = &internal_study;
5688: }
5689:
5690: /* Set up other data */
5691:
5692: anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5693: startline = (re->flags & PCRE_STARTLINE) != 0;
5694: firstline = (re->options & PCRE_FIRSTLINE) != 0;
5695:
5696: /* The code starts after the real_pcre block and the capture name table. */
5697:
5698: md->start_code = (const uschar *)external_re + re->name_table_offset +
5699: re->name_count * re->name_entry_size;
5700:
5701: md->start_subject = (USPTR)subject;
5702: md->start_offset = start_offset;
5703: md->end_subject = md->start_subject + length;
5704: end_subject = md->end_subject;
5705:
5706: md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5707: utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
1.4 misha 5708: md->use_ucp = (re->options & PCRE_UCP) != 0;
1.1 misha 5709: md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5710:
5711: md->notbol = (options & PCRE_NOTBOL) != 0;
5712: md->noteol = (options & PCRE_NOTEOL) != 0;
5713: md->notempty = (options & PCRE_NOTEMPTY) != 0;
1.4 misha 5714: md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
5715: md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
5716: ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
1.1 misha 5717: md->hitend = FALSE;
1.4 misha 5718: md->mark = NULL; /* In case never set */
1.1 misha 5719:
5720: md->recursive = NULL; /* No recursion at top level */
5721:
5722: md->lcc = tables + lcc_offset;
5723: md->ctypes = tables + ctypes_offset;
5724:
5725: /* Handle different \R options. */
5726:
5727: switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5728: {
5729: case 0:
5730: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5731: md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5732: else
5733: #ifdef BSR_ANYCRLF
5734: md->bsr_anycrlf = TRUE;
5735: #else
5736: md->bsr_anycrlf = FALSE;
5737: #endif
5738: break;
5739:
5740: case PCRE_BSR_ANYCRLF:
5741: md->bsr_anycrlf = TRUE;
5742: break;
5743:
5744: case PCRE_BSR_UNICODE:
5745: md->bsr_anycrlf = FALSE;
5746: break;
5747:
5748: default: return PCRE_ERROR_BADNEWLINE;
5749: }
5750:
5751: /* Handle different types of newline. The three bits give eight cases. If
5752: nothing is set at run time, whatever was used at compile time applies. */
5753:
5754: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5755: (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5756: {
5757: case 0: newline = NEWLINE; break; /* Compile-time default */
1.3 misha 5758: case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5759: case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
1.1 misha 5760: case PCRE_NEWLINE_CR+
1.3 misha 5761: PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
1.1 misha 5762: case PCRE_NEWLINE_ANY: newline = -1; break;
5763: case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5764: default: return PCRE_ERROR_BADNEWLINE;
5765: }
5766:
5767: if (newline == -2)
5768: {
5769: md->nltype = NLTYPE_ANYCRLF;
5770: }
5771: else if (newline < 0)
5772: {
5773: md->nltype = NLTYPE_ANY;
5774: }
5775: else
5776: {
5777: md->nltype = NLTYPE_FIXED;
5778: if (newline > 255)
5779: {
5780: md->nllen = 2;
5781: md->nl[0] = (newline >> 8) & 255;
5782: md->nl[1] = newline & 255;
5783: }
5784: else
5785: {
5786: md->nllen = 1;
5787: md->nl[0] = newline;
5788: }
5789: }
5790:
1.4 misha 5791: /* Partial matching was originally supported only for a restricted set of
5792: regexes; from release 8.00 there are no restrictions, but the bits are still
5793: defined (though never set). So there's no harm in leaving this code. */
1.1 misha 5794:
5795: if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5796: return PCRE_ERROR_BADPARTIAL;
5797:
5798: /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5799: back the character offset. */
5800:
5801: #ifdef SUPPORT_UTF8
5802: if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5803: {
1.5 ! misha 5804: int tb;
! 5805: if ((tb = _pcre_valid_utf8((USPTR)subject, length)) >= 0)
! 5806: return (tb == length && md->partial > 1)?
! 5807: PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
1.1 misha 5808: if (start_offset > 0 && start_offset < length)
5809: {
1.5 ! misha 5810: tb = ((USPTR)subject)[start_offset] & 0xc0;
! 5811: if (tb == 0x80) return PCRE_ERROR_BADUTF8_OFFSET;
1.1 misha 5812: }
5813: }
5814: #endif
5815:
5816: /* The ims options can vary during the matching as a result of the presence
5817: of (?ims) items in the pattern. They are kept in a local variable so that
5818: restoring at the exit of a group is easy. */
5819:
5820: ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5821:
5822: /* If the expression has got more back references than the offsets supplied can
5823: hold, we get a temporary chunk of working store to use during the matching.
5824: Otherwise, we can use the vector supplied, rounding down its size to a multiple
5825: of 3. */
5826:
5827: ocount = offsetcount - (offsetcount % 3);
5828:
5829: if (re->top_backref > 0 && re->top_backref >= ocount/3)
5830: {
5831: ocount = re->top_backref * 3 + 3;
5832: md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5833: if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5834: using_temporary_offsets = TRUE;
5835: DPRINTF(("Got memory to hold back references\n"));
5836: }
5837: else md->offset_vector = offsets;
5838:
5839: md->offset_end = ocount;
5840: md->offset_max = (2*ocount)/3;
5841: md->offset_overflow = FALSE;
5842: md->capture_last = -1;
5843:
5844: /* Compute the minimum number of offsets that we need to reset each time. Doing
5845: this makes a huge difference to execution time when there aren't many brackets
5846: in the pattern. */
5847:
5848: resetcount = 2 + re->top_bracket * 2;
5849: if (resetcount > offsetcount) resetcount = ocount;
5850:
5851: /* Reset the working variable associated with each extraction. These should
5852: never be used unless previously set, but they get saved and restored, and so we
5853: initialize them to avoid reading uninitialized locations. */
5854:
5855: if (md->offset_vector != NULL)
5856: {
5857: register int *iptr = md->offset_vector + ocount;
5858: register int *iend = iptr - resetcount/2 + 1;
5859: while (--iptr >= iend) *iptr = -1;
5860: }
5861:
5862: /* Set up the first character to match, if available. The first_byte value is
5863: never set for an anchored regular expression, but the anchoring may be forced
5864: at run time, so we have to test for anchoring. The first char may be unset for
5865: an unanchored pattern, of course. If there's no first char and the pattern was
5866: studied, there may be a bitmap of possible first characters. */
5867:
5868: if (!anchored)
5869: {
5870: if ((re->flags & PCRE_FIRSTSET) != 0)
5871: {
5872: first_byte = re->first_byte & 255;
5873: if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5874: first_byte = md->lcc[first_byte];
5875: }
5876: else
5877: if (!startline && study != NULL &&
1.4 misha 5878: (study->flags & PCRE_STUDY_MAPPED) != 0)
1.1 misha 5879: start_bits = study->start_bits;
5880: }
5881:
5882: /* For anchored or unanchored matches, there may be a "last known required
5883: character" set. */
5884:
5885: if ((re->flags & PCRE_REQCHSET) != 0)
5886: {
5887: req_byte = re->req_byte & 255;
5888: req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5889: req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5890: }
5891:
5892:
5893: /* ==========================================================================*/
5894:
5895: /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5896: the loop runs just once. */
5897:
5898: for(;;)
5899: {
5900: USPTR save_end_subject = end_subject;
5901: USPTR new_start_match;
5902:
5903: /* Reset the maximum number of extractions we might see. */
5904:
5905: if (md->offset_vector != NULL)
5906: {
5907: register int *iptr = md->offset_vector;
5908: register int *iend = iptr + resetcount;
5909: while (iptr < iend) *iptr++ = -1;
5910: }
5911:
1.3 misha 5912: /* If firstline is TRUE, the start of the match is constrained to the first
5913: line of a multiline string. That is, the match must be before or at the first
5914: newline. Implement this by temporarily adjusting end_subject so that we stop
5915: scanning at a newline. If the match fails at the newline, later code breaks
5916: this loop. */
1.1 misha 5917:
5918: if (firstline)
5919: {
5920: USPTR t = start_match;
1.2 misha 5921: #ifdef SUPPORT_UTF8
5922: if (utf8)
5923: {
5924: while (t < md->end_subject && !IS_NEWLINE(t))
5925: {
5926: t++;
5927: while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5928: }
5929: }
5930: else
5931: #endif
1.1 misha 5932: while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5933: end_subject = t;
5934: }
5935:
1.3 misha 5936: /* There are some optimizations that avoid running the match if a known
5937: starting point is not found, or if a known later character is not present.
5938: However, there is an option that disables these, for testing and for ensuring
1.5 ! misha 5939: that all callouts do actually occur. The option can be set in the regex by
! 5940: (*NO_START_OPT) or passed in match-time options. */
1.1 misha 5941:
1.5 ! misha 5942: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
1.1 misha 5943: {
1.3 misha 5944: /* Advance to a unique first byte if there is one. */
5945:
5946: if (first_byte >= 0)
5947: {
5948: if (first_byte_caseless)
5949: while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5950: start_match++;
5951: else
5952: while (start_match < end_subject && *start_match != first_byte)
5953: start_match++;
5954: }
1.1 misha 5955:
1.3 misha 5956: /* Or to just after a linebreak for a multiline match */
1.1 misha 5957:
1.3 misha 5958: else if (startline)
1.1 misha 5959: {
1.3 misha 5960: if (start_match > md->start_subject + start_offset)
5961: {
1.2 misha 5962: #ifdef SUPPORT_UTF8
1.3 misha 5963: if (utf8)
1.2 misha 5964: {
1.3 misha 5965: while (start_match < end_subject && !WAS_NEWLINE(start_match))
5966: {
1.2 misha 5967: start_match++;
1.3 misha 5968: while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5969: start_match++;
5970: }
1.2 misha 5971: }
1.3 misha 5972: else
1.2 misha 5973: #endif
1.3 misha 5974: while (start_match < end_subject && !WAS_NEWLINE(start_match))
5975: start_match++;
1.1 misha 5976:
1.3 misha 5977: /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5978: and we are now at a LF, advance the match position by one more character.
5979: */
5980:
5981: if (start_match[-1] == CHAR_CR &&
5982: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5983: start_match < end_subject &&
5984: *start_match == CHAR_NL)
5985: start_match++;
5986: }
1.1 misha 5987: }
5988:
1.3 misha 5989: /* Or to a non-unique first byte after study */
1.1 misha 5990:
1.3 misha 5991: else if (start_bits != NULL)
1.1 misha 5992: {
1.3 misha 5993: while (start_match < end_subject)
5994: {
5995: register unsigned int c = *start_match;
1.4 misha 5996: if ((start_bits[c/8] & (1 << (c&7))) == 0)
5997: {
5998: start_match++;
5999: #ifdef SUPPORT_UTF8
6000: if (utf8)
6001: while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
6002: start_match++;
6003: #endif
6004: }
6005: else break;
1.3 misha 6006: }
1.1 misha 6007: }
1.3 misha 6008: } /* Starting optimizations */
1.1 misha 6009:
6010: /* Restore fudged end_subject */
6011:
6012: end_subject = save_end_subject;
6013:
1.4 misha 6014: /* The following two optimizations are disabled for partial matching or if
6015: disabling is explicitly requested. */
1.1 misha 6016:
1.4 misha 6017: if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
6018: {
6019: /* If the pattern was studied, a minimum subject length may be set. This is
6020: a lower bound; no actual string of that length may actually match the
6021: pattern. Although the value is, strictly, in characters, we treat it as
6022: bytes to avoid spending too much time in this optimization. */
1.1 misha 6023:
1.4 misha 6024: if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6025: (pcre_uint32)(end_subject - start_match) < study->minlength)
6026: {
6027: rc = MATCH_NOMATCH;
6028: break;
6029: }
1.1 misha 6030:
1.4 misha 6031: /* If req_byte is set, we know that that character must appear in the
6032: subject for the match to succeed. If the first character is set, req_byte
6033: must be later in the subject; otherwise the test starts at the match point.
6034: This optimization can save a huge amount of backtracking in patterns with
6035: nested unlimited repeats that aren't going to match. Writing separate code
6036: for cased/caseless versions makes it go faster, as does using an
6037: autoincrement and backing off on a match.
1.1 misha 6038:
1.4 misha 6039: HOWEVER: when the subject string is very, very long, searching to its end
6040: can take a long time, and give bad performance on quite ordinary patterns.
6041: This showed up when somebody was matching something like /^\d+C/ on a
6042: 32-megabyte string... so we don't do this when the string is sufficiently
6043: long. */
1.1 misha 6044:
1.4 misha 6045: if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
1.1 misha 6046: {
1.4 misha 6047: register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
6048:
6049: /* We don't need to repeat the search if we haven't yet reached the
6050: place we found it at last time. */
6051:
6052: if (p > req_byte_ptr)
1.1 misha 6053: {
1.4 misha 6054: if (req_byte_caseless)
1.1 misha 6055: {
1.4 misha 6056: while (p < end_subject)
6057: {
6058: register int pp = *p++;
6059: if (pp == req_byte || pp == req_byte2) { p--; break; }
6060: }
1.1 misha 6061: }
1.4 misha 6062: else
1.1 misha 6063: {
1.4 misha 6064: while (p < end_subject)
6065: {
6066: if (*p++ == req_byte) { p--; break; }
6067: }
1.1 misha 6068: }
6069:
1.4 misha 6070: /* If we can't find the required character, break the matching loop,
6071: forcing a match failure. */
1.1 misha 6072:
1.4 misha 6073: if (p >= end_subject)
6074: {
6075: rc = MATCH_NOMATCH;
6076: break;
6077: }
1.1 misha 6078:
1.4 misha 6079: /* If we have found the required character, save the point where we
6080: found it, so that we don't search again next time round the loop if
6081: the start hasn't passed this character yet. */
1.1 misha 6082:
1.4 misha 6083: req_byte_ptr = p;
6084: }
1.1 misha 6085: }
6086: }
6087:
1.4 misha 6088: #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6089: printf(">>>> Match against: ");
6090: pchars(start_match, end_subject - start_match, TRUE, md);
6091: printf("\n");
6092: #endif
6093:
6094: /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6095: first starting point for which a partial match was found. */
1.1 misha 6096:
6097: md->start_match_ptr = start_match;
1.4 misha 6098: md->start_used_ptr = start_match;
1.1 misha 6099: md->match_call_count = 0;
1.4 misha 6100: rc = match(start_match, md->start_code, start_match, NULL, 2, md, ims, NULL,
6101: 0, 0);
6102: if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
1.1 misha 6103:
6104: switch(rc)
6105: {
1.4 misha 6106: /* SKIP passes back the next starting point explicitly, but if it is the
6107: same as the match we have just done, treat it as NOMATCH. */
6108:
6109: case MATCH_SKIP:
6110: if (md->start_match_ptr != start_match)
6111: {
6112: new_start_match = md->start_match_ptr;
6113: break;
6114: }
6115: /* Fall through */
6116:
6117: /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
6118: the SKIP's arg was not found. We also treat this as NOMATCH. */
6119:
6120: case MATCH_SKIP_ARG:
6121: /* Fall through */
6122:
1.1 misha 6123: /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6124: exactly like PRUNE. */
6125:
6126: case MATCH_NOMATCH:
6127: case MATCH_PRUNE:
6128: case MATCH_THEN:
6129: new_start_match = start_match + 1;
6130: #ifdef SUPPORT_UTF8
6131: if (utf8)
6132: while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6133: new_start_match++;
6134: #endif
6135: break;
6136:
6137: /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6138:
6139: case MATCH_COMMIT:
6140: rc = MATCH_NOMATCH;
6141: goto ENDLOOP;
6142:
1.4 misha 6143: /* Any other return is either a match, or some kind of error. */
1.1 misha 6144:
6145: default:
6146: goto ENDLOOP;
6147: }
6148:
6149: /* Control reaches here for the various types of "no match at this point"
6150: result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6151:
6152: rc = MATCH_NOMATCH;
6153:
6154: /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6155: newline in the subject (though it may continue over the newline). Therefore,
6156: if we have just failed to match, starting at a newline, do not continue. */
6157:
6158: if (firstline && IS_NEWLINE(start_match)) break;
6159:
6160: /* Advance to new matching position */
6161:
6162: start_match = new_start_match;
6163:
6164: /* Break the loop if the pattern is anchored or if we have passed the end of
6165: the subject. */
6166:
6167: if (anchored || start_match > end_subject) break;
6168:
6169: /* If we have just passed a CR and we are now at a LF, and the pattern does
6170: not contain any explicit matches for \r or \n, and the newline option is CRLF
6171: or ANY or ANYCRLF, advance the match position by one more character. */
6172:
1.3 misha 6173: if (start_match[-1] == CHAR_CR &&
1.1 misha 6174: start_match < end_subject &&
1.3 misha 6175: *start_match == CHAR_NL &&
1.1 misha 6176: (re->flags & PCRE_HASCRORLF) == 0 &&
6177: (md->nltype == NLTYPE_ANY ||
6178: md->nltype == NLTYPE_ANYCRLF ||
6179: md->nllen == 2))
6180: start_match++;
6181:
1.4 misha 6182: md->mark = NULL; /* Reset for start of next match attempt */
6183: } /* End of for(;;) "bumpalong" loop */
1.1 misha 6184:
6185: /* ==========================================================================*/
6186:
6187: /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6188: conditions is true:
6189:
6190: (1) The pattern is anchored or the match was failed by (*COMMIT);
6191:
6192: (2) We are past the end of the subject;
6193:
6194: (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6195: this option requests that a match occur at or before the first newline in
6196: the subject.
6197:
6198: When we have a match and the offset vector is big enough to deal with any
6199: backreferences, captured substring offsets will already be set up. In the case
6200: where we had to get some local store to hold offsets for backreference
6201: processing, copy those that we can. In this case there need not be overflow if
6202: certain parts of the pattern were not used, even though there are more
6203: capturing parentheses than vector slots. */
6204:
6205: ENDLOOP:
6206:
1.4 misha 6207: if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
1.1 misha 6208: {
6209: if (using_temporary_offsets)
6210: {
6211: if (offsetcount >= 4)
6212: {
6213: memcpy(offsets + 2, md->offset_vector + 2,
6214: (offsetcount - 2) * sizeof(int));
6215: DPRINTF(("Copied offsets from temporary memory\n"));
6216: }
6217: if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6218: DPRINTF(("Freeing temporary memory\n"));
6219: (pcre_free)(md->offset_vector);
6220: }
6221:
6222: /* Set the return code to the number of captured strings, or 0 if there are
6223: too many to fit into the vector. */
6224:
6225: rc = md->offset_overflow? 0 : md->end_offset_top/2;
6226:
6227: /* If there is space, set up the whole thing as substring 0. The value of
6228: md->start_match_ptr might be modified if \K was encountered on the success
6229: matching path. */
6230:
6231: if (offsetcount < 2) rc = 0; else
6232: {
1.4 misha 6233: offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6234: offsets[1] = (int)(md->end_match_ptr - md->start_subject);
1.1 misha 6235: }
6236:
6237: DPRINTF((">>>> returning %d\n", rc));
1.4 misha 6238: goto RETURN_MARK;
1.1 misha 6239: }
6240:
6241: /* Control gets here if there has been an error, or if the overall match
6242: attempt has failed at all permitted starting positions. */
6243:
6244: if (using_temporary_offsets)
6245: {
6246: DPRINTF(("Freeing temporary memory\n"));
6247: (pcre_free)(md->offset_vector);
6248: }
6249:
1.4 misha 6250: /* For anything other than nomatch or partial match, just return the code. */
6251:
6252: if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
1.1 misha 6253: {
6254: DPRINTF((">>>> error: returning %d\n", rc));
6255: return rc;
6256: }
1.4 misha 6257:
6258: /* Handle partial matches - disable any mark data */
6259:
6260: if (start_partial != NULL)
1.1 misha 6261: {
6262: DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
1.4 misha 6263: md->mark = NULL;
6264: if (offsetcount > 1)
6265: {
6266: offsets[0] = (int)(start_partial - (USPTR)subject);
6267: offsets[1] = (int)(end_subject - (USPTR)subject);
6268: }
6269: rc = PCRE_ERROR_PARTIAL;
1.1 misha 6270: }
1.4 misha 6271:
6272: /* This is the classic nomatch case */
6273:
1.1 misha 6274: else
6275: {
6276: DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
1.4 misha 6277: rc = PCRE_ERROR_NOMATCH;
1.1 misha 6278: }
1.4 misha 6279:
6280: /* Return the MARK data if it has been requested. */
6281:
6282: RETURN_MARK:
6283:
6284: if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
6285: *(extra_data->mark) = (unsigned char *)(md->mark);
6286: return rc;
1.1 misha 6287: }
6288:
6289: /* End of pcre_exec.c */
E-mail: