Annotation of win32/pcre/pcre_exec.c, revision 1.3
1.1 misha 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
1.3 ! misha 9: Copyright (c) 1997-2009 University of Cambridge
1.1 misha 10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40:
41: /* This module contains pcre_exec(), the externally visible function that does
42: pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43: possible. There are also some static supporting functions. */
44:
45: #ifdef HAVE_CONFIG_H
46: #include "config.h"
47: #endif
48:
49: #define NLBLOCK md /* Block containing newline information */
50: #define PSSTART start_subject /* Field containing processed string start */
51: #define PSEND end_subject /* Field containing processed string end */
52:
53: #include "pcre_internal.h"
54:
55: /* Undefine some potentially clashing cpp symbols */
56:
57: #undef min
58: #undef max
59:
60: /* Flag bits for the match() function */
61:
62: #define match_condassert 0x01 /* Evaluating the assertion that forms a group's condition: the assertion code returns MATCH_MATCH as soon as the assertion holds */
63: #define match_cbegroup 0x02 /* Start of an unlimited repeat group that could match empty: match() chains the current eptr onto eptrb */
64:
65: /* Non-error returns from the match() function. Error returns are externally
66: defined PCRE_ERROR_xxx codes, which are all negative. */
67:
68: #define MATCH_MATCH 1 /* The rest of the pattern matched from this point */
69: #define MATCH_NOMATCH 0 /* No match from this point; callers may go on to another alternative */
70:
71: /* Special internal returns from the match() function. Make them sufficiently
72: negative to avoid the external error codes. */
73:
74: #define MATCH_COMMIT (-999) /* Returned by OP_COMMIT when what follows it fails to match */
75: #define MATCH_PRUNE (-998) /* Returned by OP_PRUNE when what follows it fails to match */
76: #define MATCH_SKIP (-997) /* Returned by OP_SKIP when what follows it fails; md->start_match_ptr then holds the subject position reached */
77: #define MATCH_THEN (-996) /* Returned by OP_THEN when what follows it fails; bracket alternation loops treat it like MATCH_NOMATCH */
78:
79: /* Maximum number of ints of offset to save on the stack for recursive calls.
80: If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81: because the offset vector is always a multiple of 3 long. */
82:
83: #define REC_STACK_SAVE_MAX 30 /* Number of ints in the stacksave[] array kept in every match() frame */
84:
/* Lower and upper repeat counts for the common single-character quantifiers.
The two tables are parallel: entry n of rep_min and entry n of rep_max describe
the same quantifier. An upper count of 0 stands for "no limit".
NOTE(review): the entries look like the *, + and ? quantifiers, each in two
variants - confirm against the code that indexes these tables. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
89:
90:
91:
92: #ifdef DEBUG
93: /*************************************************
94: * Debugging function to print chars *
95: *************************************************/
96:
97: /* Print a sequence of chars in printable format, stopping at the end of the
98: subject if the requested.
99:
100: Arguments:
101: p points to characters
102: length number to print
103: is_subject TRUE if printing from within md->start_subject
104: md pointer to matching data block, if is_subject is TRUE
105:
106: Returns: nothing
107: */
108:
109: static void
110: pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111: {
112: unsigned int c;
113: if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114: while (length-- > 0)
115: if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116: }
117: #endif
118:
119:
120:
121: /*************************************************
122: * Match a back-reference *
123: *************************************************/
124:
125: /* If a back reference hasn't been set, the length that is passed is greater
126: than the number of characters left in the string, so the match fails.
127:
128: Arguments:
129: offset index into the offset vector
130: eptr points into the subject
131: length length to be matched
132: md points to match data block
133: ims the ims flags
134:
135: Returns: TRUE if matched
136: */
137:
138: static BOOL
139: match_ref(int offset, register USPTR eptr, int length, match_data *md,
140: unsigned long int ims)
141: {
142: USPTR p = md->start_subject + md->offset_vector[offset];
143:
144: #ifdef DEBUG
145: if (eptr >= md->end_subject)
146: printf("matching subject <null>");
147: else
148: {
149: printf("matching subject ");
150: pchars(eptr, length, TRUE, md);
151: }
152: printf(" against backref ");
153: pchars(p, length, FALSE, md);
154: printf("\n");
155: #endif
156:
157: /* Always fail if not enough characters left */
158:
159: if (length > md->end_subject - eptr) return FALSE;
160:
1.2 misha 161: /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162: properly if Unicode properties are supported. Otherwise, we can check only
163: ASCII characters. */
1.1 misha 164:
165: if ((ims & PCRE_CASELESS) != 0)
166: {
1.2 misha 167: #ifdef SUPPORT_UTF8
168: #ifdef SUPPORT_UCP
169: if (md->utf8)
170: {
171: USPTR endptr = eptr + length;
172: while (eptr < endptr)
173: {
174: int c, d;
175: GETCHARINC(c, eptr);
176: GETCHARINC(d, p);
177: if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178: }
179: }
180: else
181: #endif
182: #endif
183:
184: /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185: is no UCP support. */
186:
1.1 misha 187: while (length-- > 0)
1.2 misha 188: { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
1.1 misha 189: }
1.2 misha 190:
191: /* In the caseful case, we can just compare the bytes, whether or not we
192: are in UTF-8 mode. */
193:
1.1 misha 194: else
195: { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196:
197: return TRUE;
198: }
199:
200:
201:
202: /***************************************************************************
203: ****************************************************************************
204: RECURSION IN THE match() FUNCTION
205:
206: The match() function is highly recursive, though not every recursive call
207: increases the recursive depth. Nevertheless, some regular expressions can cause
208: it to recurse to a great depth. I was writing for Unix, so I just let it call
209: itself recursively. This uses the stack for saving everything that has to be
210: saved for a recursive call. On Unix, the stack can be large, and this works
211: fine.
212:
213: It turns out that on some non-Unix-like systems there are problems with
214: programs that use a lot of stack. (This despite the fact that every last chip
215: has oodles of memory these days, and techniques for extending the stack have
216: been known for decades.) So....
217:
218: There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219: calls by keeping local variables that need to be preserved in blocks of memory
220: obtained from malloc() instead of on the stack. Macros are used to
221: achieve this so that the actual code doesn't look very different to what it
222: always used to.
223:
224: The original heap-recursive code used longjmp(). However, it seems that this
225: can be very slow on some operating systems. Following a suggestion from Stan
226: Switzer, the use of longjmp() has been abolished, at the cost of having to
227: provide a unique number for each call to RMATCH. There is no way of generating
228: a sequence of numbers at compile time in C. I have given them names, to make
229: them stand out more clearly.
230:
231: Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232: FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233: tests. Furthermore, not using longjmp() means that local dynamic variables
234: don't have indeterminate values; this has meant that the frame size can be
235: reduced because the result can be "passed back" by straight setting of the
236: variable instead of being passed in the frame.
237: ****************************************************************************
238: ***************************************************************************/
239:
240: /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241: below must be updated in sync. */
242:
/* One distinct number per RMATCH call site, counting up from 1. The explicit
value at the start of each row is only a reading aid; it is the value the
enumerator would receive anyway. */

enum {
  RM1  =  1, RM2,  RM3,  RM4,  RM5,  RM6,
  RM7  =  7, RM8,  RM9,  RM10, RM11, RM12,
  RM13 = 13, RM14, RM15, RM16, RM17, RM18,
  RM19 = 19, RM20, RM21, RM22, RM23, RM24,
  RM25 = 25, RM26, RM27, RM28, RM29, RM30,
  RM31 = 31, RM32, RM33, RM34, RM35, RM36,
  RM37 = 37, RM38, RM39, RM40, RM41, RM42,
  RM43 = 43, RM44, RM45, RM46, RM47, RM48,
  RM49 = 49, RM50, RM51, RM52, RM53, RM54
};
249:
250: /* These versions of the macros use the stack, as normal. There are debugging
251: versions and production versions. Note that the "rw" argument of RMATCH isn't
252: actually used in this definition. */
253:
254: #ifndef NO_RECURSE
255: #define REGISTER register
256:
#ifdef DEBUG

/* Tracing forms. RMATCH makes an ordinary recursive call to match(), leaving
the result in rrc, and prints the source line before and after the call.
RRETURN prints the value being returned and then returns it. Both are wrapped
in do { } while (0) so that, like the production forms below, each one is a
single statement that takes a trailing semicolon and can be the body of an
"if" that has an "else". The "rw" argument is not used. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  do \
    { \
    printf("match() called in line %d\n", __LINE__); \
    rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
    printf("to line %d\n", __LINE__); \
    } \
  while (0)
#define RRETURN(ra) \
  do \
    { \
    printf("match() returned %d from line %d ", ra, __LINE__); \
    return ra; \
    } \
  while (0)

#else

/* Production forms: a plain recursive call, one level deeper, whose result
is left in rrc, and a plain return. The "rw" argument is not used. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
#define RRETURN(ra) return ra

#endif
274:
275: #else
276:
277:
278: /* These versions of the macros manage a private stack on the heap. Note that
279: the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280: argument of match(), which never changes. */
281:
282: #define REGISTER
283:
/* Heap-frame form of a recursive call to match(). A new frame is obtained
from pcre_stack_malloc, the return point "rw" is recorded in the current
frame, the new frame is loaded with the argument values and linked back to the
current one, and control jumps to HEAP_RECURSE. When the "call" returns,
execution resumes at the label L_<rw> inside this macro. The "rd" argument is
not used.

If no memory can be obtained for the new frame, the current frame gives up
with PCRE_ERROR_NOMEMORY via RRETURN instead of writing through a NULL
pointer. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }
303:
/* Heap-frame form of "return". The current frame is unlinked and handed to
pcre_stack_free. If that was the top-level frame (no previous frame), the value
is returned from match() itself; otherwise the value is placed in rrc and
control jumps to HEAP_RETURN to resume the frame that is now current. */

#define RRETURN(ra)\
  {\
  heapframe *oldframe = frame;\
  frame = oldframe->Xprevframe;\
  (pcre_stack_free)(oldframe);\
  if (frame == NULL) return ra;\
  rrc = ra;\
  goto HEAP_RETURN;\
  }
316:
317:
318: /* Structure for remembering the local variables in a private frame */
319:
320: typedef struct heapframe {
321: struct heapframe *Xprevframe;
322:
323: /* Function arguments that may change */
324:
1.3 ! misha 325: USPTR Xeptr;
1.1 misha 326: const uschar *Xecode;
1.3 ! misha 327: USPTR Xmstart;
1.1 misha 328: int Xoffset_top;
329: long int Xims;
330: eptrblock *Xeptrb;
331: int Xflags;
332: unsigned int Xrdepth;
333:
334: /* Function local variables */
335:
1.3 ! misha 336: USPTR Xcallpat;
! 337: #ifdef SUPPORT_UTF8
! 338: USPTR Xcharptr;
! 339: #endif
! 340: USPTR Xdata;
! 341: USPTR Xnext;
! 342: USPTR Xpp;
! 343: USPTR Xprev;
! 344: USPTR Xsaved_eptr;
1.1 misha 345:
346: recursion_info Xnew_recursive;
347:
348: BOOL Xcur_is_word;
349: BOOL Xcondition;
350: BOOL Xprev_is_word;
351:
352: unsigned long int Xoriginal_ims;
353:
354: #ifdef SUPPORT_UCP
355: int Xprop_type;
356: int Xprop_value;
357: int Xprop_fail_result;
358: int Xprop_category;
359: int Xprop_chartype;
360: int Xprop_script;
361: int Xoclength;
362: uschar Xocchars[8];
363: #endif
364:
1.3 ! misha 365: int Xcodelink;
1.1 misha 366: int Xctype;
367: unsigned int Xfc;
368: int Xfi;
369: int Xlength;
370: int Xmax;
371: int Xmin;
372: int Xnumber;
373: int Xoffset;
374: int Xop;
375: int Xsave_capture_last;
376: int Xsave_offset1, Xsave_offset2, Xsave_offset3;
377: int Xstacksave[REC_STACK_SAVE_MAX];
378:
379: eptrblock Xnewptrb;
380:
381: /* Where to jump back to */
382:
383: int Xwhere;
384:
385: } heapframe;
386:
387: #endif
388:
389:
390: /***************************************************************************
391: ***************************************************************************/
392:
393:
394:
395: /*************************************************
396: * Match from current position *
397: *************************************************/
398:
399: /* This function is called recursively in many circumstances. Whenever it
400: returns a negative (error) response, the outer incarnation must also return the
401: same response.
402:
403: Performance note: It might be tempting to extract commonly used fields from the
404: md structure (e.g. utf8, end_subject) into individual variables to improve
405: performance. Tests using gcc on a SPARC disproved this; in the first case, it
406: made performance worse.
407:
408: Arguments:
409: eptr pointer to current character in subject
410: ecode pointer to current position in compiled code
411: mstart pointer to the current match start position (can be modified
412: by encountering \K)
413: offset_top current top pointer
414: md pointer to "static" info for the match
415: ims current /i, /m, and /s options
416: eptrb pointer to chain of blocks containing eptr at start of
417: brackets - for testing for empty matches
418: flags can contain
419: match_condassert - this is an assertion condition
420: match_cbegroup - this is the start of an unlimited repeat
421: group that can match an empty string
422: rdepth the recursion depth
423:
424: Returns: MATCH_MATCH if matched ) these values are >= 0
425: MATCH_NOMATCH if failed to match )
426: a negative PCRE_ERROR_xxx value if aborted by an error condition
427: (e.g. stopped by repeated call or recursion limit)
428: */
429:
430: static int
1.3 ! misha 431: match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
1.1 misha 432: int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
433: int flags, unsigned int rdepth)
434: {
435: /* These variables do not need to be preserved over recursion in this function,
436: so they can be ordinary variables in all cases. Mark some of them with
437: "register" because they are used a lot in loops. */
438:
439: register int rrc; /* Returns from recursive calls */
440: register int i; /* Used for loops not involving calls to RMATCH() */
441: register unsigned int c; /* Character values not kept over RMATCH() calls */
442: register BOOL utf8; /* Local copy of UTF-8 flag for speed */
443:
444: BOOL minimize, possessive; /* Quantifier options */
1.3 ! misha 445: int condcode;
1.1 misha 446:
447: /* When recursion is not being used, all "local" variables that have to be
448: preserved over calls to RMATCH() are part of a "frame" which is obtained from
449: heap storage. Set up the top-level frame here; others are obtained from the
450: heap whenever RMATCH() does a "recursion". See the macro definitions above. */
451:
452: #ifdef NO_RECURSE
453: heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
454: frame->Xprevframe = NULL; /* Marks the top level */
455:
456: /* Copy in the original argument variables */
457:
458: frame->Xeptr = eptr;
459: frame->Xecode = ecode;
460: frame->Xmstart = mstart;
461: frame->Xoffset_top = offset_top;
462: frame->Xims = ims;
463: frame->Xeptrb = eptrb;
464: frame->Xflags = flags;
465: frame->Xrdepth = rdepth;
466:
467: /* This is where control jumps back to to effect "recursion" */
468:
469: HEAP_RECURSE:
470:
471: /* Macros make the argument variables come from the current frame */
472:
473: #define eptr frame->Xeptr
474: #define ecode frame->Xecode
475: #define mstart frame->Xmstart
476: #define offset_top frame->Xoffset_top
477: #define ims frame->Xims
478: #define eptrb frame->Xeptrb
479: #define flags frame->Xflags
480: #define rdepth frame->Xrdepth
481:
482: /* Ditto for the local variables */
483:
484: #ifdef SUPPORT_UTF8
485: #define charptr frame->Xcharptr
486: #endif
487: #define callpat frame->Xcallpat
1.3 ! misha 488: #define codelink frame->Xcodelink
1.1 misha 489: #define data frame->Xdata
490: #define next frame->Xnext
491: #define pp frame->Xpp
492: #define prev frame->Xprev
493: #define saved_eptr frame->Xsaved_eptr
494:
495: #define new_recursive frame->Xnew_recursive
496:
497: #define cur_is_word frame->Xcur_is_word
498: #define condition frame->Xcondition
499: #define prev_is_word frame->Xprev_is_word
500:
501: #define original_ims frame->Xoriginal_ims
502:
503: #ifdef SUPPORT_UCP
504: #define prop_type frame->Xprop_type
505: #define prop_value frame->Xprop_value
506: #define prop_fail_result frame->Xprop_fail_result
507: #define prop_category frame->Xprop_category
508: #define prop_chartype frame->Xprop_chartype
509: #define prop_script frame->Xprop_script
510: #define oclength frame->Xoclength
511: #define occhars frame->Xocchars
512: #endif
513:
514: #define ctype frame->Xctype
515: #define fc frame->Xfc
516: #define fi frame->Xfi
517: #define length frame->Xlength
518: #define max frame->Xmax
519: #define min frame->Xmin
520: #define number frame->Xnumber
521: #define offset frame->Xoffset
522: #define op frame->Xop
523: #define save_capture_last frame->Xsave_capture_last
524: #define save_offset1 frame->Xsave_offset1
525: #define save_offset2 frame->Xsave_offset2
526: #define save_offset3 frame->Xsave_offset3
527: #define stacksave frame->Xstacksave
528:
529: #define newptrb frame->Xnewptrb
530:
531: /* When recursion is being used, local variables are allocated on the stack and
532: get preserved during recursion in the normal way. In this environment, fi and
533: i, and fc and c, can be the same variables. */
534:
535: #else /* NO_RECURSE not defined */
536: #define fi i
537: #define fc c
538:
539:
540: #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
541: const uschar *charptr; /* in small blocks of the code. My normal */
542: #endif /* style of coding would have declared */
543: const uschar *callpat; /* them within each of those blocks. */
544: const uschar *data; /* However, in order to accommodate the */
545: const uschar *next; /* version of this code that uses an */
546: USPTR pp; /* external "stack" implemented on the */
547: const uschar *prev; /* heap, it is easier to declare them all */
548: USPTR saved_eptr; /* here, so the declarations can be cut */
549: /* out in a block. The only declarations */
550: recursion_info new_recursive; /* within blocks below are for variables */
551: /* that do not have to be preserved over */
552: BOOL cur_is_word; /* a recursive call to RMATCH(). */
553: BOOL condition;
554: BOOL prev_is_word;
555:
556: unsigned long int original_ims;
557:
558: #ifdef SUPPORT_UCP
559: int prop_type;
560: int prop_value;
561: int prop_fail_result;
562: int prop_category;
563: int prop_chartype;
564: int prop_script;
565: int oclength;
566: uschar occhars[8];
567: #endif
568:
1.3 ! misha 569: int codelink;
1.1 misha 570: int ctype;
571: int length;
572: int max;
573: int min;
574: int number;
575: int offset;
576: int op;
577: int save_capture_last;
578: int save_offset1, save_offset2, save_offset3;
579: int stacksave[REC_STACK_SAVE_MAX];
580:
581: eptrblock newptrb;
582: #endif /* NO_RECURSE */
583:
584: /* These statements are here to stop the compiler complaining about unitialized
585: variables. */
586:
587: #ifdef SUPPORT_UCP
588: prop_value = 0;
589: prop_fail_result = 0;
590: #endif
591:
592:
593: /* This label is used for tail recursion, which is used in a few cases even
594: when NO_RECURSE is not defined, in order to reduce the amount of stack that is
595: used. Thanks to Ian Taylor for noticing this possibility and sending the
596: original patch. */
597:
598: TAIL_RECURSE:
599:
600: /* OK, now we can get on with the real code of the function. Recursive calls
601: are specified by the macro RMATCH and RRETURN is used to return. When
602: NO_RECURSE is *not* defined, these just turn into a recursive call to match()
603: and a "return", respectively (possibly with some debugging if DEBUG is
604: defined). However, RMATCH isn't like a function call because it's quite a
605: complicated macro. It has to be used in one particular way. This shouldn't,
606: however, impact performance when true recursion is being used. */
607:
608: #ifdef SUPPORT_UTF8
609: utf8 = md->utf8; /* Local copy of the flag */
610: #else
611: utf8 = FALSE;
612: #endif
613:
614: /* First check that we haven't called match() too many times, or that we
615: haven't exceeded the recursive call limit. */
616:
617: if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
618: if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
619:
620: original_ims = ims; /* Save for resetting on ')' */
621:
622: /* At the start of a group with an unlimited repeat that may match an empty
623: string, the match_cbegroup flag is set. When this is the case, add the current
624: subject pointer to the chain of such remembered pointers, to be checked when we
625: hit the closing ket, in order to break infinite loops that match no characters.
626: When match() is called in other circumstances, don't add to the chain. The
627: match_cbegroup flag must NOT be used with tail recursion, because the memory
628: block that is used is on the stack, so a new one may be required for each
629: match(). */
630:
631: if ((flags & match_cbegroup) != 0)
632: {
633: newptrb.epb_saved_eptr = eptr;
634: newptrb.epb_prev = eptrb;
635: eptrb = &newptrb;
636: }
637:
638: /* Now start processing the opcodes. */
639:
640: for (;;)
641: {
642: minimize = possessive = FALSE;
643: op = *ecode;
644:
645: /* For partial matching, remember if we ever hit the end of the subject after
646: matching at least one subject character. */
647:
648: if (md->partial &&
649: eptr >= md->end_subject &&
650: eptr > mstart)
651: md->hitend = TRUE;
652:
653: switch(op)
654: {
655: case OP_FAIL:
656: RRETURN(MATCH_NOMATCH);
657:
658: case OP_PRUNE:
659: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
660: ims, eptrb, flags, RM51);
661: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
662: RRETURN(MATCH_PRUNE);
663:
664: case OP_COMMIT:
665: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
666: ims, eptrb, flags, RM52);
667: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
668: RRETURN(MATCH_COMMIT);
669:
670: case OP_SKIP:
671: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
672: ims, eptrb, flags, RM53);
673: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
674: md->start_match_ptr = eptr; /* Pass back current position */
675: RRETURN(MATCH_SKIP);
676:
677: case OP_THEN:
678: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
679: ims, eptrb, flags, RM54);
680: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
681: RRETURN(MATCH_THEN);
682:
683: /* Handle a capturing bracket. If there is space in the offset vector, save
684: the current subject position in the working slot at the top of the vector.
685: We mustn't change the current values of the data slot, because they may be
686: set from a previous iteration of this group, and be referred to by a
687: reference inside the group.
688:
689: If the bracket fails to match, we need to restore this value and also the
690: values of the final offsets, in case they were set by a previous iteration
691: of the same bracket.
692:
693: If there isn't enough space in the offset vector, treat this as if it were
694: a non-capturing bracket. Don't worry about setting the flag for the error
695: case here; that is handled in the code for KET. */
696:
697: case OP_CBRA:
698: case OP_SCBRA:
699: number = GET2(ecode, 1+LINK_SIZE);
700: offset = number << 1;
701:
702: #ifdef DEBUG
703: printf("start bracket %d\n", number);
704: printf("subject=");
705: pchars(eptr, 16, TRUE, md);
706: printf("\n");
707: #endif
708:
709: if (offset < md->offset_max)
710: {
711: save_offset1 = md->offset_vector[offset];
712: save_offset2 = md->offset_vector[offset+1];
713: save_offset3 = md->offset_vector[md->offset_end - number];
714: save_capture_last = md->capture_last;
715:
716: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
717: md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
718:
719: flags = (op == OP_SCBRA)? match_cbegroup : 0;
720: do
721: {
722: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
723: ims, eptrb, flags, RM1);
724: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
725: md->capture_last = save_capture_last;
726: ecode += GET(ecode, 1);
727: }
728: while (*ecode == OP_ALT);
729:
730: DPRINTF(("bracket %d failed\n", number));
731:
732: md->offset_vector[offset] = save_offset1;
733: md->offset_vector[offset+1] = save_offset2;
734: md->offset_vector[md->offset_end - number] = save_offset3;
735:
736: RRETURN(MATCH_NOMATCH);
737: }
738:
739: /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
740: as a non-capturing bracket. */
741:
742: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
743: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
744:
745: DPRINTF(("insufficient capture room: treat as non-capturing\n"));
746:
747: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
748: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
749:
750: /* Non-capturing bracket. Loop for all the alternatives. When we get to the
751: final alternative within the brackets, we would return the result of a
752: recursive call to match() whatever happened. We can reduce stack usage by
753: turning this into a tail recursion, except in the case when match_cbegroup
754: is set.*/
755:
756: case OP_BRA:
757: case OP_SBRA:
758: DPRINTF(("start non-capturing bracket\n"));
759: flags = (op >= OP_SBRA)? match_cbegroup : 0;
760: for (;;)
761: {
762: if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
763: {
764: if (flags == 0) /* Not a possibly empty group */
765: {
766: ecode += _pcre_OP_lengths[*ecode];
767: DPRINTF(("bracket 0 tail recursion\n"));
768: goto TAIL_RECURSE;
769: }
770:
771: /* Possibly empty group; can't use tail recursion. */
772:
773: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
774: eptrb, flags, RM48);
775: RRETURN(rrc);
776: }
777:
778: /* For non-final alternatives, continue the loop for a NOMATCH result;
779: otherwise return. */
780:
781: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
782: eptrb, flags, RM2);
783: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
784: ecode += GET(ecode, 1);
785: }
786: /* Control never reaches here. */
787:
788: /* Conditional group: compilation checked that there are no more than
789: two branches. If the condition is false, skipping the first branch takes us
790: past the end if there is only one branch, but that's OK because that is
791: exactly what going to the ket would do. As there is only one branch to be
792: obeyed, we can use tail recursion to avoid using another stack frame. */
793:
794: case OP_COND:
795: case OP_SCOND:
1.3 ! misha 796: codelink= GET(ecode, 1);
! 797:
! 798: /* Because of the way auto-callout works during compile, a callout item is
! 799: inserted between OP_COND and an assertion condition. */
! 800:
! 801: if (ecode[LINK_SIZE+1] == OP_CALLOUT)
! 802: {
! 803: if (pcre_callout != NULL)
! 804: {
! 805: pcre_callout_block cb;
! 806: cb.version = 1; /* Version 1 of the callout block */
! 807: cb.callout_number = ecode[LINK_SIZE+2];
! 808: cb.offset_vector = md->offset_vector;
! 809: cb.subject = (PCRE_SPTR)md->start_subject;
! 810: cb.subject_length = md->end_subject - md->start_subject;
! 811: cb.start_match = mstart - md->start_subject;
! 812: cb.current_position = eptr - md->start_subject;
! 813: cb.pattern_position = GET(ecode, LINK_SIZE + 3);
! 814: cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
! 815: cb.capture_top = offset_top/2;
! 816: cb.capture_last = md->capture_last;
! 817: cb.callout_data = md->callout_data;
! 818: if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
! 819: if (rrc < 0) RRETURN(rrc);
! 820: }
! 821: ecode += _pcre_OP_lengths[OP_CALLOUT];
! 822: }
! 823:
! 824: condcode = ecode[LINK_SIZE+1];
! 825:
! 826: /* Now see what the actual condition is */
! 827:
! 828: if (condcode == OP_RREF) /* Recursion test */
1.1 misha 829: {
830: offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
831: condition = md->recursive != NULL &&
832: (offset == RREF_ANY || offset == md->recursive->group_num);
833: ecode += condition? 3 : GET(ecode, 1);
834: }
835:
1.3 ! misha 836: else if (condcode == OP_CREF) /* Group used test */
1.1 misha 837: {
838: offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
839: condition = offset < offset_top && md->offset_vector[offset] >= 0;
840: ecode += condition? 3 : GET(ecode, 1);
841: }
842:
1.3 ! misha 843: else if (condcode == OP_DEF) /* DEFINE - always false */
1.1 misha 844: {
845: condition = FALSE;
846: ecode += GET(ecode, 1);
847: }
848:
849: /* The condition is an assertion. Call match() to evaluate it - setting
850: the final argument match_condassert causes it to stop at the end of an
851: assertion. */
852:
853: else
854: {
855: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
856: match_condassert, RM3);
857: if (rrc == MATCH_MATCH)
858: {
859: condition = TRUE;
860: ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
861: while (*ecode == OP_ALT) ecode += GET(ecode, 1);
862: }
863: else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
864: {
865: RRETURN(rrc); /* Need braces because of following else */
866: }
867: else
868: {
869: condition = FALSE;
1.3 ! misha 870: ecode += codelink;
1.1 misha 871: }
872: }
873:
874: /* We are now at the branch that is to be obeyed. As there is only one,
875: we can use tail recursion to avoid using another stack frame, except when
876: match_cbegroup is required for an unlimited repeat of a possibly empty
877: group. If the second alternative doesn't exist, we can just plough on. */
878:
879: if (condition || *ecode == OP_ALT)
880: {
881: ecode += 1 + LINK_SIZE;
882: if (op == OP_SCOND) /* Possibly empty group */
883: {
884: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
885: RRETURN(rrc);
886: }
887: else /* Group must match something */
888: {
889: flags = 0;
890: goto TAIL_RECURSE;
891: }
892: }
1.3 ! misha 893: else /* Condition false & no alternative */
1.1 misha 894: {
895: ecode += 1 + LINK_SIZE;
896: }
897: break;
898:
899:
900: /* End of the pattern, either real or forced. If we are in a top-level
901: recursion, we should restore the offsets appropriately and continue from
902: after the call. */
903:
904: case OP_ACCEPT:
905: case OP_END:
906: if (md->recursive != NULL && md->recursive->group_num == 0)
907: {
908: recursion_info *rec = md->recursive;
909: DPRINTF(("End of pattern in a (?0) recursion\n"));
910: md->recursive = rec->prevrec;
911: memmove(md->offset_vector, rec->offset_save,
912: rec->saved_max * sizeof(int));
913: mstart = rec->save_start;
914: ims = original_ims;
915: ecode = rec->after_call;
916: break;
917: }
918:
919: /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
920: string - backtracking will then try other alternatives, if any. */
921:
922: if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
923: md->end_match_ptr = eptr; /* Record where we ended */
924: md->end_offset_top = offset_top; /* and how many extracts were taken */
925: md->start_match_ptr = mstart; /* and the start (\K can modify) */
926: RRETURN(MATCH_MATCH);
927:
928: /* Change option settings */
929:
930: case OP_OPT:
931: ims = ecode[1];
932: ecode += 2;
933: DPRINTF(("ims set to %02lx\n", ims));
934: break;
935:
936: /* Assertion brackets. Check the alternative branches in turn - the
937: matching won't pass the KET for an assertion. If any one branch matches,
938: the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
939: start of each branch to move the current point backwards, so the code at
940: this level is identical to the lookahead case. */
941:
942: case OP_ASSERT:
943: case OP_ASSERTBACK:
944: do
945: {
946: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
947: RM4);
948: if (rrc == MATCH_MATCH) break;
949: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
950: ecode += GET(ecode, 1);
951: }
952: while (*ecode == OP_ALT);
953: if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
954:
955: /* If checking an assertion for a condition, return MATCH_MATCH. */
956:
957: if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
958:
959: /* Continue from after the assertion, updating the offsets high water
960: mark, since extracts may have been taken during the assertion. */
961:
962: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
963: ecode += 1 + LINK_SIZE;
964: offset_top = md->end_offset_top;
965: continue;
966:
967: /* Negative assertion: all branches must fail to match */
968:
969: case OP_ASSERT_NOT:
970: case OP_ASSERTBACK_NOT:
971: do
972: {
973: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
974: RM5);
975: if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
976: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
977: ecode += GET(ecode,1);
978: }
979: while (*ecode == OP_ALT);
980:
981: if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
982:
983: ecode += 1 + LINK_SIZE;
984: continue;
985:
986: /* Move the subject pointer back. This occurs only at the start of
987: each branch of a lookbehind assertion. If we are too close to the start to
988: move back, this match function fails. When working with UTF-8 we move
989: back a number of characters, not bytes. */
990:
991: case OP_REVERSE:
992: #ifdef SUPPORT_UTF8
993: if (utf8)
994: {
995: i = GET(ecode, 1);
996: while (i-- > 0)
997: {
998: eptr--;
999: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1000: BACKCHAR(eptr);
1001: }
1002: }
1003: else
1004: #endif
1005:
1006: /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1007:
1008: {
1009: eptr -= GET(ecode, 1);
1010: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1011: }
1012:
1013: /* Skip to next op code */
1014:
1015: ecode += 1 + LINK_SIZE;
1016: break;
1017:
1018: /* The callout item calls an external function, if one is provided, passing
1019: details of the match so far. This is mainly for debugging, though the
1020: function is able to force a failure. */
1021:
1022: case OP_CALLOUT:
1023: if (pcre_callout != NULL)
1024: {
1025: pcre_callout_block cb;
1026: cb.version = 1; /* Version 1 of the callout block */
1027: cb.callout_number = ecode[1];
1028: cb.offset_vector = md->offset_vector;
1029: cb.subject = (PCRE_SPTR)md->start_subject;
1030: cb.subject_length = md->end_subject - md->start_subject;
1031: cb.start_match = mstart - md->start_subject;
1032: cb.current_position = eptr - md->start_subject;
1033: cb.pattern_position = GET(ecode, 2);
1034: cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1035: cb.capture_top = offset_top/2;
1036: cb.capture_last = md->capture_last;
1037: cb.callout_data = md->callout_data;
1038: if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1039: if (rrc < 0) RRETURN(rrc);
1040: }
1041: ecode += 2 + 2*LINK_SIZE;
1042: break;
1043:
1044: /* Recursion either matches the current regex, or some subexpression. The
1045: offset data is the offset to the starting bracket from the start of the
1046: whole pattern. (This is so that it works from duplicated subpatterns.)
1047:
1048: If there are any capturing brackets started but not finished, we have to
1049: save their starting points and reinstate them after the recursion. However,
1050: we don't know how many such there are (offset_top records the completed
1051: total) so we just have to save all the potential data. There may be up to
1052: 65535 such values, which is too large to put on the stack, but using malloc
1053: for small numbers seems expensive. As a compromise, the stack is used when
1054: there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1055: is used. If the malloc fails, the match cannot continue, so it is abandoned
1056: and PCRE_ERROR_NOMEMORY is returned; no attempt is made to fall back to a
1057: partial save on the stack.
1058:
1059: There are also other values that have to be saved. We use a chained
1060: sequence of blocks that actually live on the stack. Thanks to Robin Houston
1061: for the original version of this logic. */
1062:
1063: case OP_RECURSE:
1064: {
1065: callpat = md->start_code + GET(ecode, 1);
1066: new_recursive.group_num = (callpat == md->start_code)? 0 :
1067: GET2(callpat, 1 + LINK_SIZE);
1068:
1069: /* Add to "recursing stack" */
1070:
1071: new_recursive.prevrec = md->recursive;
1072: md->recursive = &new_recursive;
1073:
1074: /* Find where to continue from afterwards */
1075:
1076: ecode += 1 + LINK_SIZE;
1077: new_recursive.after_call = ecode;
1078:
1079: /* Now save the offset data. */
1080:
1081: new_recursive.saved_max = md->offset_end;
1082: if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1083: new_recursive.offset_save = stacksave;
1084: else
1085: {
1086: new_recursive.offset_save =
1087: (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1088: if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1089: }
1090:
1091: memcpy(new_recursive.offset_save, md->offset_vector,
1092: new_recursive.saved_max * sizeof(int));
1093: new_recursive.save_start = mstart;
1094: mstart = eptr;
1095:
1096: /* OK, now we can do the recursion. For each top-level alternative we
1097: restore the offset and recursion data. */
1098:
1099: DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1100: flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1101: do
1102: {
1103: RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1104: md, ims, eptrb, flags, RM6);
1105: if (rrc == MATCH_MATCH)
1106: {
1107: DPRINTF(("Recursion matched\n"));
1108: md->recursive = new_recursive.prevrec;
1109: if (new_recursive.offset_save != stacksave)
1110: (pcre_free)(new_recursive.offset_save);
1111: RRETURN(MATCH_MATCH);
1112: }
1113: else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1114: {
1115: DPRINTF(("Recursion gave error %d\n", rrc));
1.3 ! misha 1116: if (new_recursive.offset_save != stacksave)
! 1117: (pcre_free)(new_recursive.offset_save);
1.1 misha 1118: RRETURN(rrc);
1119: }
1120:
1121: md->recursive = &new_recursive;
1122: memcpy(md->offset_vector, new_recursive.offset_save,
1123: new_recursive.saved_max * sizeof(int));
1124: callpat += GET(callpat, 1);
1125: }
1126: while (*callpat == OP_ALT);
1127:
1128: DPRINTF(("Recursion didn't match\n"));
1129: md->recursive = new_recursive.prevrec;
1130: if (new_recursive.offset_save != stacksave)
1131: (pcre_free)(new_recursive.offset_save);
1132: RRETURN(MATCH_NOMATCH);
1133: }
1134: /* Control never reaches here */
1135:
1136: /* "Once" brackets are like assertion brackets except that after a match,
1137: the point in the subject string is not moved back. Thus there can never be
1138: a move back into the brackets. Friedl calls these "atomic" subpatterns.
1139: Check the alternative branches in turn - the matching won't pass the KET
1140: for this kind of subpattern. If any one branch matches, we carry on as at
1141: the end of a normal bracket, leaving the subject pointer. */
1142:
1143: case OP_ONCE:
1144: prev = ecode;
1145: saved_eptr = eptr;
1146:
1147: do
1148: {
1149: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1150: if (rrc == MATCH_MATCH) break;
1151: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1152: ecode += GET(ecode,1);
1153: }
1154: while (*ecode == OP_ALT);
1155:
1156: /* If hit the end of the group (which could be repeated), fail */
1157:
1158: if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1159:
1160: /* Continue as from after the assertion, updating the offsets high water
1161: mark, since extracts may have been taken. */
1162:
1163: do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1164:
1165: offset_top = md->end_offset_top;
1166: eptr = md->end_match_ptr;
1167:
1168: /* For a non-repeating ket, just continue at this level. This also
1169: happens for a repeating ket if no characters were matched in the group.
1170: This is the forcible breaking of infinite loops as implemented in Perl
1171: 5.005. If there is an options reset, it will get obeyed in the normal
1172: course of events. */
1173:
1174: if (*ecode == OP_KET || eptr == saved_eptr)
1175: {
1176: ecode += 1+LINK_SIZE;
1177: break;
1178: }
1179:
1180: /* The repeating kets try the rest of the pattern or restart from the
1181: preceding bracket, in the appropriate order. The second "call" of match()
1182: uses tail recursion, to avoid using another stack frame. We need to reset
1183: any options that changed within the bracket before re-running it, so
1184: check the next opcode. */
1185:
1186: if (ecode[1+LINK_SIZE] == OP_OPT)
1187: {
1188: ims = (ims & ~PCRE_IMS) | ecode[4];
1189: DPRINTF(("ims set to %02lx at group repeat\n", ims));
1190: }
1191:
1192: if (*ecode == OP_KETRMIN)
1193: {
1194: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1195: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1196: ecode = prev;
1197: flags = 0;
1198: goto TAIL_RECURSE;
1199: }
1200: else /* OP_KETRMAX */
1201: {
1202: RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1203: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1204: ecode += 1 + LINK_SIZE;
1205: flags = 0;
1206: goto TAIL_RECURSE;
1207: }
1208: /* Control never gets here */
1209:
1210: /* An alternation is the end of a branch; scan along to find the end of the
1211: bracketed group and go to there. */
1212:
1213: case OP_ALT:
1214: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1215: break;
1216:
1217: /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1218: indicating that it may occur zero times. It may repeat infinitely, or not
1219: at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1220: with fixed upper repeat limits are compiled as a number of copies, with the
1221: optional ones preceded by BRAZERO or BRAMINZERO. */
1222:
1223: case OP_BRAZERO:
1224: {
1225: next = ecode+1;
1226: RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1227: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1228: do next += GET(next,1); while (*next == OP_ALT);
1229: ecode = next + 1 + LINK_SIZE;
1230: }
1231: break;
1232:
1233: case OP_BRAMINZERO:
1234: {
1235: next = ecode+1;
1236: do next += GET(next, 1); while (*next == OP_ALT);
1237: RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1238: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1239: ecode++;
1240: }
1241: break;
1242:
1243: case OP_SKIPZERO:
1244: {
1245: next = ecode+1;
1246: do next += GET(next,1); while (*next == OP_ALT);
1247: ecode = next + 1 + LINK_SIZE;
1248: }
1249: break;
1250:
1251: /* End of a group, repeated or non-repeating. */
1252:
1253: case OP_KET:
1254: case OP_KETRMIN:
1255: case OP_KETRMAX:
1256: prev = ecode - GET(ecode, 1);
1257:
1258: /* If this was a group that remembered the subject start, in order to break
1259: infinite repeats of empty string matches, retrieve the subject start from
1260: the chain. Otherwise, set it NULL. */
1261:
1262: if (*prev >= OP_SBRA)
1263: {
1264: saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1265: eptrb = eptrb->epb_prev; /* Backup to previous group */
1266: }
1267: else saved_eptr = NULL;
1268:
1269: /* If we are at the end of an assertion group, stop matching and return
1270: MATCH_MATCH, but record the current high water mark for use by positive
1271: assertions. Do this also for the "once" (atomic) groups. */
1272:
1273: if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1274: *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1275: *prev == OP_ONCE)
1276: {
1277: md->end_match_ptr = eptr; /* For ONCE */
1278: md->end_offset_top = offset_top;
1279: RRETURN(MATCH_MATCH);
1280: }
1281:
1282: /* For capturing groups we have to check the group number back at the start
1283: and if necessary complete handling an extraction by setting the offsets and
1284: bumping the high water mark. Note that whole-pattern recursion is coded as
1285: a recurse into group 0, so it won't be picked up here. Instead, we catch it
1286: when the OP_END is reached. Other recursion is handled here. */
1287:
1288: if (*prev == OP_CBRA || *prev == OP_SCBRA)
1289: {
1290: number = GET2(prev, 1+LINK_SIZE);
1291: offset = number << 1;
1292:
1293: #ifdef DEBUG
1294: printf("end bracket %d", number);
1295: printf("\n");
1296: #endif
1297:
1298: md->capture_last = number;
1299: if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1300: {
1301: md->offset_vector[offset] =
1302: md->offset_vector[md->offset_end - number];
1303: md->offset_vector[offset+1] = eptr - md->start_subject;
1304: if (offset_top <= offset) offset_top = offset + 2;
1305: }
1306:
1307: /* Handle a recursively called group. Restore the offsets
1308: appropriately and continue from after the call. */
1309:
1310: if (md->recursive != NULL && md->recursive->group_num == number)
1311: {
1312: recursion_info *rec = md->recursive;
1313: DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1314: md->recursive = rec->prevrec;
1315: mstart = rec->save_start;
1316: memcpy(md->offset_vector, rec->offset_save,
1317: rec->saved_max * sizeof(int));
1318: ecode = rec->after_call;
1319: ims = original_ims;
1320: break;
1321: }
1322: }
1323:
1324: /* For both capturing and non-capturing groups, reset the value of the ims
1325: flags, in case they got changed during the group. */
1326:
1327: ims = original_ims;
1328: DPRINTF(("ims reset to %02lx\n", ims));
1329:
1330: /* For a non-repeating ket, just continue at this level. This also
1331: happens for a repeating ket if no characters were matched in the group.
1332: This is the forcible breaking of infinite loops as implemented in Perl
1333: 5.005. If there is an options reset, it will get obeyed in the normal
1334: course of events. */
1335:
1336: if (*ecode == OP_KET || eptr == saved_eptr)
1337: {
1338: ecode += 1 + LINK_SIZE;
1339: break;
1340: }
1341:
1342: /* The repeating kets try the rest of the pattern or restart from the
1343: preceding bracket, in the appropriate order. In the second case, we can use
1344: tail recursion to avoid using another stack frame, unless we have an
1345: unlimited repeat of a group that can match an empty string. */
1346:
1347: flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1348:
1349: if (*ecode == OP_KETRMIN)
1350: {
1351: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1352: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1353: if (flags != 0) /* Could match an empty string */
1354: {
1355: RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1356: RRETURN(rrc);
1357: }
1358: ecode = prev;
1359: goto TAIL_RECURSE;
1360: }
1361: else /* OP_KETRMAX */
1362: {
1363: RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1364: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1365: ecode += 1 + LINK_SIZE;
1366: flags = 0;
1367: goto TAIL_RECURSE;
1368: }
1369: /* Control never gets here */
1370:
1371: /* Start of subject unless notbol, or after internal newline if multiline */
1372:
1373: case OP_CIRC:
1374: if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1375: if ((ims & PCRE_MULTILINE) != 0)
1376: {
1377: if (eptr != md->start_subject &&
1378: (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1379: RRETURN(MATCH_NOMATCH);
1380: ecode++;
1381: break;
1382: }
1383: /* ... else fall through */
1384:
1385: /* Start of subject assertion */
1386:
1387: case OP_SOD:
1388: if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1389: ecode++;
1390: break;
1391:
1392: /* Start of match assertion */
1393:
1394: case OP_SOM:
1395: if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1396: ecode++;
1397: break;
1398:
1399: /* Reset the start of match point */
1400:
1401: case OP_SET_SOM:
1402: mstart = eptr;
1403: ecode++;
1404: break;
1405:
1406: /* Assert before internal newline if multiline, or before a terminating
1407: newline unless endonly is set, else end of subject unless noteol is set. */
1408:
1409: case OP_DOLL:
1410: if ((ims & PCRE_MULTILINE) != 0)
1411: {
1412: if (eptr < md->end_subject)
1413: { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1414: else
1415: { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1416: ecode++;
1417: break;
1418: }
1419: else
1420: {
1421: if (md->noteol) RRETURN(MATCH_NOMATCH);
1422: if (!md->endonly)
1423: {
1424: if (eptr != md->end_subject &&
1425: (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1426: RRETURN(MATCH_NOMATCH);
1427: ecode++;
1428: break;
1429: }
1430: }
1431: /* ... else fall through for endonly */
1432:
1433: /* End of subject assertion (\z) */
1434:
1435: case OP_EOD:
1436: if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1437: ecode++;
1438: break;
1439:
1440: /* End of subject or ending \n assertion (\Z) */
1441:
1442: case OP_EODN:
1443: if (eptr != md->end_subject &&
1444: (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1445: RRETURN(MATCH_NOMATCH);
1446: ecode++;
1447: break;
1448:
1449: /* Word boundary assertions */
1450:
1451: case OP_NOT_WORD_BOUNDARY:
1452: case OP_WORD_BOUNDARY:
1453: {
1454:
1455: /* Find out if the previous and current characters are "word" characters.
1456: It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1457: be "non-word" characters. */
1458:
1459: #ifdef SUPPORT_UTF8
1460: if (utf8)
1461: {
1462: if (eptr == md->start_subject) prev_is_word = FALSE; else
1463: {
1.3 ! misha 1464: USPTR lastptr = eptr - 1;
1.1 misha 1465: while((*lastptr & 0xc0) == 0x80) lastptr--;
1466: GETCHAR(c, lastptr);
1467: prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1468: }
1469: if (eptr >= md->end_subject) cur_is_word = FALSE; else
1470: {
1471: GETCHAR(c, eptr);
1472: cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1473: }
1474: }
1475: else
1476: #endif
1477:
1478: /* More streamlined when not in UTF-8 mode */
1479:
1480: {
1481: prev_is_word = (eptr != md->start_subject) &&
1482: ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1483: cur_is_word = (eptr < md->end_subject) &&
1484: ((md->ctypes[*eptr] & ctype_word) != 0);
1485: }
1486:
1487: /* Now see if the situation is what we want */
1488:
1489: if ((*ecode++ == OP_WORD_BOUNDARY)?
1490: cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1491: RRETURN(MATCH_NOMATCH);
1492: }
1493: break;
1494:
1495: /* Match a single character type; inline for speed */
1496:
1497: case OP_ANY:
1498: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1499: /* Fall through */
1500:
1501: case OP_ALLANY:
1502: if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1503: if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1504: ecode++;
1505: break;
1506:
1507: /* Match a single byte, even in UTF-8 mode. This opcode really does match
1508: any byte, even newline, independent of the setting of PCRE_DOTALL. */
1509:
1510: case OP_ANYBYTE:
1511: if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1512: ecode++;
1513: break;
1514:
1515: case OP_NOT_DIGIT:
1516: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1517: GETCHARINCTEST(c, eptr);
1518: if (
1519: #ifdef SUPPORT_UTF8
1520: c < 256 &&
1521: #endif
1522: (md->ctypes[c] & ctype_digit) != 0
1523: )
1524: RRETURN(MATCH_NOMATCH);
1525: ecode++;
1526: break;
1527:
1528: case OP_DIGIT:
1529: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1530: GETCHARINCTEST(c, eptr);
1531: if (
1532: #ifdef SUPPORT_UTF8
1533: c >= 256 ||
1534: #endif
1535: (md->ctypes[c] & ctype_digit) == 0
1536: )
1537: RRETURN(MATCH_NOMATCH);
1538: ecode++;
1539: break;
1540:
1541: case OP_NOT_WHITESPACE:
1542: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1543: GETCHARINCTEST(c, eptr);
1544: if (
1545: #ifdef SUPPORT_UTF8
1546: c < 256 &&
1547: #endif
1548: (md->ctypes[c] & ctype_space) != 0
1549: )
1550: RRETURN(MATCH_NOMATCH);
1551: ecode++;
1552: break;
1553:
1554: case OP_WHITESPACE:
1555: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1556: GETCHARINCTEST(c, eptr);
1557: if (
1558: #ifdef SUPPORT_UTF8
1559: c >= 256 ||
1560: #endif
1561: (md->ctypes[c] & ctype_space) == 0
1562: )
1563: RRETURN(MATCH_NOMATCH);
1564: ecode++;
1565: break;
1566:
1567: case OP_NOT_WORDCHAR:
1568: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1569: GETCHARINCTEST(c, eptr);
1570: if (
1571: #ifdef SUPPORT_UTF8
1572: c < 256 &&
1573: #endif
1574: (md->ctypes[c] & ctype_word) != 0
1575: )
1576: RRETURN(MATCH_NOMATCH);
1577: ecode++;
1578: break;
1579:
1580: case OP_WORDCHAR:
1581: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1582: GETCHARINCTEST(c, eptr);
1583: if (
1584: #ifdef SUPPORT_UTF8
1585: c >= 256 ||
1586: #endif
1587: (md->ctypes[c] & ctype_word) == 0
1588: )
1589: RRETURN(MATCH_NOMATCH);
1590: ecode++;
1591: break;
1592:
1593: case OP_ANYNL:
1594: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1595: GETCHARINCTEST(c, eptr);
1596: switch(c)
1597: {
1598: default: RRETURN(MATCH_NOMATCH);
1599: case 0x000d:
1600: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1601: break;
1602:
1603: case 0x000a:
1604: break;
1605:
1606: case 0x000b:
1607: case 0x000c:
1608: case 0x0085:
1609: case 0x2028:
1610: case 0x2029:
1611: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1612: break;
1613: }
1614: ecode++;
1615: break;
1616:
1617: case OP_NOT_HSPACE:
1618: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1619: GETCHARINCTEST(c, eptr);
1620: switch(c)
1621: {
1622: default: break;
1623: case 0x09: /* HT */
1624: case 0x20: /* SPACE */
1625: case 0xa0: /* NBSP */
1626: case 0x1680: /* OGHAM SPACE MARK */
1627: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1628: case 0x2000: /* EN QUAD */
1629: case 0x2001: /* EM QUAD */
1630: case 0x2002: /* EN SPACE */
1631: case 0x2003: /* EM SPACE */
1632: case 0x2004: /* THREE-PER-EM SPACE */
1633: case 0x2005: /* FOUR-PER-EM SPACE */
1634: case 0x2006: /* SIX-PER-EM SPACE */
1635: case 0x2007: /* FIGURE SPACE */
1636: case 0x2008: /* PUNCTUATION SPACE */
1637: case 0x2009: /* THIN SPACE */
1638: case 0x200A: /* HAIR SPACE */
1639: case 0x202f: /* NARROW NO-BREAK SPACE */
1640: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1641: case 0x3000: /* IDEOGRAPHIC SPACE */
1642: RRETURN(MATCH_NOMATCH);
1643: }
1644: ecode++;
1645: break;
1646:
1647: case OP_HSPACE:
1648: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1649: GETCHARINCTEST(c, eptr);
1650: switch(c)
1651: {
1652: default: RRETURN(MATCH_NOMATCH);
1653: case 0x09: /* HT */
1654: case 0x20: /* SPACE */
1655: case 0xa0: /* NBSP */
1656: case 0x1680: /* OGHAM SPACE MARK */
1657: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1658: case 0x2000: /* EN QUAD */
1659: case 0x2001: /* EM QUAD */
1660: case 0x2002: /* EN SPACE */
1661: case 0x2003: /* EM SPACE */
1662: case 0x2004: /* THREE-PER-EM SPACE */
1663: case 0x2005: /* FOUR-PER-EM SPACE */
1664: case 0x2006: /* SIX-PER-EM SPACE */
1665: case 0x2007: /* FIGURE SPACE */
1666: case 0x2008: /* PUNCTUATION SPACE */
1667: case 0x2009: /* THIN SPACE */
1668: case 0x200A: /* HAIR SPACE */
1669: case 0x202f: /* NARROW NO-BREAK SPACE */
1670: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1671: case 0x3000: /* IDEOGRAPHIC SPACE */
1672: break;
1673: }
1674: ecode++;
1675: break;
1676:
1677: case OP_NOT_VSPACE:
1678: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1679: GETCHARINCTEST(c, eptr);
1680: switch(c)
1681: {
1682: default: break;
1683: case 0x0a: /* LF */
1684: case 0x0b: /* VT */
1685: case 0x0c: /* FF */
1686: case 0x0d: /* CR */
1687: case 0x85: /* NEL */
1688: case 0x2028: /* LINE SEPARATOR */
1689: case 0x2029: /* PARAGRAPH SEPARATOR */
1690: RRETURN(MATCH_NOMATCH);
1691: }
1692: ecode++;
1693: break;
1694:
1695: case OP_VSPACE:
1696: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1697: GETCHARINCTEST(c, eptr);
1698: switch(c)
1699: {
1700: default: RRETURN(MATCH_NOMATCH);
1701: case 0x0a: /* LF */
1702: case 0x0b: /* VT */
1703: case 0x0c: /* FF */
1704: case 0x0d: /* CR */
1705: case 0x85: /* NEL */
1706: case 0x2028: /* LINE SEPARATOR */
1707: case 0x2029: /* PARAGRAPH SEPARATOR */
1708: break;
1709: }
1710: ecode++;
1711: break;
1712:
1713: #ifdef SUPPORT_UCP
1714: /* Check the next character by Unicode property. We will get here only
1715: if the support is in the binary; otherwise a compile-time error occurs. */
1716:
1717: case OP_PROP:
1718: case OP_NOTPROP:
1719: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1720: GETCHARINCTEST(c, eptr);
1721: {
1.3 ! misha 1722: const ucd_record *prop = GET_UCD(c);
1.1 misha 1723:
1724: switch(ecode[1])
1725: {
1726: case PT_ANY:
1727: if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1728: break;
1729:
1730: case PT_LAMP:
1.2 misha 1731: if ((prop->chartype == ucp_Lu ||
1732: prop->chartype == ucp_Ll ||
1733: prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1.1 misha 1734: RRETURN(MATCH_NOMATCH);
1735: break;
1736:
1737: case PT_GC:
1.2 misha 1738: if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1.1 misha 1739: RRETURN(MATCH_NOMATCH);
1740: break;
1741:
1742: case PT_PC:
1.2 misha 1743: if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1.1 misha 1744: RRETURN(MATCH_NOMATCH);
1745: break;
1746:
1747: case PT_SC:
1.2 misha 1748: if ((ecode[2] != prop->script) == (op == OP_PROP))
1.1 misha 1749: RRETURN(MATCH_NOMATCH);
1750: break;
1751:
1752: default:
1753: RRETURN(PCRE_ERROR_INTERNAL);
1754: }
1755:
1756: ecode += 3;
1757: }
1758: break;
1759:
1760: /* Match an extended Unicode sequence. We will get here only if the support
1761: is in the binary; otherwise a compile-time error occurs. */
1762:
1763: case OP_EXTUNI:
1764: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1765: GETCHARINCTEST(c, eptr);
1766: {
1.2 misha 1767: int category = UCD_CATEGORY(c);
1.1 misha 1768: if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1769: while (eptr < md->end_subject)
1770: {
1771: int len = 1;
1772: if (!utf8) c = *eptr; else
1773: {
1774: GETCHARLEN(c, eptr, len);
1775: }
1.2 misha 1776: category = UCD_CATEGORY(c);
1.1 misha 1777: if (category != ucp_M) break;
1778: eptr += len;
1779: }
1780: }
1781: ecode++;
1782: break;
1783: #endif
1784:
1785:
1786: /* Match a back reference, possibly repeatedly. Look past the end of the
1787: item to see if there is repeat information following. The code is similar
1788: to that for character classes, but repeated for efficiency. Then obey
1789: similar code to character type repeats - written out again for speed.
1790: However, if the referenced string is the empty string, always treat
1791: it as matched, any number of times (otherwise there could be infinite
1792: loops). */
1793:
1794: case OP_REF:
1795: {
1796: offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1797: ecode += 3;
1798:
1799: /* If the reference is unset, there are two possibilities:
1800:
1801: (a) In the default, Perl-compatible state, set the length to be longer
1802: than the amount of subject left; this ensures that every attempt at a
1803: match fails. We can't just fail here, because of the possibility of
1804: quantifiers with zero minima.
1805:
1806: (b) If the JavaScript compatibility flag is set, set the length to zero
1807: so that the back reference matches an empty string.
1808:
1809: Otherwise, set the length to the length of what was matched by the
1810: referenced subpattern. */
1811:
1812: if (offset >= offset_top || md->offset_vector[offset] < 0)
1813: length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1814: else
1815: length = md->offset_vector[offset+1] - md->offset_vector[offset];
1816:
1817: /* Set up for repetition, or handle the non-repeated case */
1818:
1819: switch (*ecode)
1820: {
1821: case OP_CRSTAR:
1822: case OP_CRMINSTAR:
1823: case OP_CRPLUS:
1824: case OP_CRMINPLUS:
1825: case OP_CRQUERY:
1826: case OP_CRMINQUERY:
1827: c = *ecode++ - OP_CRSTAR;
1828: minimize = (c & 1) != 0;
1829: min = rep_min[c]; /* Pick up values from tables; */
1830: max = rep_max[c]; /* zero for max => infinity */
1831: if (max == 0) max = INT_MAX;
1832: break;
1833:
1834: case OP_CRRANGE:
1835: case OP_CRMINRANGE:
1836: minimize = (*ecode == OP_CRMINRANGE);
1837: min = GET2(ecode, 1);
1838: max = GET2(ecode, 3);
1839: if (max == 0) max = INT_MAX;
1840: ecode += 5;
1841: break;
1842:
1843: default: /* No repeat follows */
1844: if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1845: eptr += length;
1846: continue; /* With the main loop */
1847: }
1848:
1849: /* If the length of the reference is zero, just continue with the
1850: main loop. */
1851:
1852: if (length == 0) continue;
1853:
1854: /* First, ensure the minimum number of matches are present. We get back
1855: the length of the reference string explicitly rather than passing the
1856: address of eptr, so that eptr can be a register variable. */
1857:
1858: for (i = 1; i <= min; i++)
1859: {
1860: if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1861: eptr += length;
1862: }
1863:
1864: /* If min = max, continue at the same level without recursion.
1865: They are not both allowed to be zero. */
1866:
1867: if (min == max) continue;
1868:
1869: /* If minimizing, keep trying and advancing the pointer */
1870:
1871: if (minimize)
1872: {
1873: for (fi = min;; fi++)
1874: {
1875: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1876: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1877: if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1878: RRETURN(MATCH_NOMATCH);
1879: eptr += length;
1880: }
1881: /* Control never gets here */
1882: }
1883:
1884: /* If maximizing, find the longest string and work backwards */
1885:
1886: else
1887: {
1888: pp = eptr;
1889: for (i = min; i < max; i++)
1890: {
1891: if (!match_ref(offset, eptr, length, md, ims)) break;
1892: eptr += length;
1893: }
1894: while (eptr >= pp)
1895: {
1896: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1897: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1898: eptr -= length;
1899: }
1900: RRETURN(MATCH_NOMATCH);
1901: }
1902: }
1903: /* Control never gets here */
1904:
1905:
1906:
1907: /* Match a bit-mapped character class, possibly repeatedly. This op code is
1908: used when all the characters in the class have values in the range 0-255,
1909: and either the matching is caseful, or the characters are in the range
1910: 0-127 when UTF-8 processing is enabled. The only difference between
1911: OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1912: encountered.
1913:
1914: First, look past the end of the item to see if there is repeat information
1915: following. Then obey similar code to character type repeats - written out
1916: again for speed. */
1917:
1918: case OP_NCLASS:
1919: case OP_CLASS:
1920: {
1921: data = ecode + 1; /* Save for matching */
1922: ecode += 33; /* Advance past the item */
1923:
1924: switch (*ecode)
1925: {
1926: case OP_CRSTAR:
1927: case OP_CRMINSTAR:
1928: case OP_CRPLUS:
1929: case OP_CRMINPLUS:
1930: case OP_CRQUERY:
1931: case OP_CRMINQUERY:
1932: c = *ecode++ - OP_CRSTAR;
1933: minimize = (c & 1) != 0;
1934: min = rep_min[c]; /* Pick up values from tables; */
1935: max = rep_max[c]; /* zero for max => infinity */
1936: if (max == 0) max = INT_MAX;
1937: break;
1938:
1939: case OP_CRRANGE:
1940: case OP_CRMINRANGE:
1941: minimize = (*ecode == OP_CRMINRANGE);
1942: min = GET2(ecode, 1);
1943: max = GET2(ecode, 3);
1944: if (max == 0) max = INT_MAX;
1945: ecode += 5;
1946: break;
1947:
1948: default: /* No repeat follows */
1949: min = max = 1;
1950: break;
1951: }
1952:
1953: /* First, ensure the minimum number of matches are present. */
1954:
1955: #ifdef SUPPORT_UTF8
1956: /* UTF-8 mode */
1957: if (utf8)
1958: {
1959: for (i = 1; i <= min; i++)
1960: {
1961: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1962: GETCHARINC(c, eptr);
1963: if (c > 255)
1964: {
1965: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1966: }
1967: else
1968: {
1969: if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1970: }
1971: }
1972: }
1973: else
1974: #endif
1975: /* Not UTF-8 mode */
1976: {
1977: for (i = 1; i <= min; i++)
1978: {
1979: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1980: c = *eptr++;
1981: if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1982: }
1983: }
1984:
1985: /* If max == min we can continue with the main loop without the
1986: need to recurse. */
1987:
1988: if (min == max) continue;
1989:
1990: /* If minimizing, keep testing the rest of the expression and advancing
1991: the pointer while it matches the class. */
1992:
1993: if (minimize)
1994: {
1995: #ifdef SUPPORT_UTF8
1996: /* UTF-8 mode */
1997: if (utf8)
1998: {
1999: for (fi = min;; fi++)
2000: {
2001: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2002: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2003: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2004: GETCHARINC(c, eptr);
2005: if (c > 255)
2006: {
2007: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2008: }
2009: else
2010: {
2011: if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2012: }
2013: }
2014: }
2015: else
2016: #endif
2017: /* Not UTF-8 mode */
2018: {
2019: for (fi = min;; fi++)
2020: {
2021: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2022: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2023: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2024: c = *eptr++;
2025: if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2026: }
2027: }
2028: /* Control never gets here */
2029: }
2030:
2031: /* If maximizing, find the longest possible run, then work backwards. */
2032:
2033: else
2034: {
2035: pp = eptr;
2036:
2037: #ifdef SUPPORT_UTF8
2038: /* UTF-8 mode */
2039: if (utf8)
2040: {
2041: for (i = min; i < max; i++)
2042: {
2043: int len = 1;
2044: if (eptr >= md->end_subject) break;
2045: GETCHARLEN(c, eptr, len);
2046: if (c > 255)
2047: {
2048: if (op == OP_CLASS) break;
2049: }
2050: else
2051: {
2052: if ((data[c/8] & (1 << (c&7))) == 0) break;
2053: }
2054: eptr += len;
2055: }
2056: for (;;)
2057: {
2058: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2059: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2060: if (eptr-- == pp) break; /* Stop if tried at original pos */
2061: BACKCHAR(eptr);
2062: }
2063: }
2064: else
2065: #endif
2066: /* Not UTF-8 mode */
2067: {
2068: for (i = min; i < max; i++)
2069: {
2070: if (eptr >= md->end_subject) break;
2071: c = *eptr;
2072: if ((data[c/8] & (1 << (c&7))) == 0) break;
2073: eptr++;
2074: }
2075: while (eptr >= pp)
2076: {
2077: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2078: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2079: eptr--;
2080: }
2081: }
2082:
2083: RRETURN(MATCH_NOMATCH);
2084: }
2085: }
2086: /* Control never gets here */
2087:
2088:
2089: /* Match an extended character class. This opcode is encountered only
1.3 ! misha 2090: when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
! 2091: mode, because Unicode properties are supported in non-UTF-8 mode. */
1.1 misha 2092:
2093: #ifdef SUPPORT_UTF8
2094: case OP_XCLASS:
2095: {
2096: data = ecode + 1 + LINK_SIZE; /* Save for matching */
2097: ecode += GET(ecode, 1); /* Advance past the item */
2098:
2099: switch (*ecode)
2100: {
2101: case OP_CRSTAR:
2102: case OP_CRMINSTAR:
2103: case OP_CRPLUS:
2104: case OP_CRMINPLUS:
2105: case OP_CRQUERY:
2106: case OP_CRMINQUERY:
2107: c = *ecode++ - OP_CRSTAR;
2108: minimize = (c & 1) != 0;
2109: min = rep_min[c]; /* Pick up values from tables; */
2110: max = rep_max[c]; /* zero for max => infinity */
2111: if (max == 0) max = INT_MAX;
2112: break;
2113:
2114: case OP_CRRANGE:
2115: case OP_CRMINRANGE:
2116: minimize = (*ecode == OP_CRMINRANGE);
2117: min = GET2(ecode, 1);
2118: max = GET2(ecode, 3);
2119: if (max == 0) max = INT_MAX;
2120: ecode += 5;
2121: break;
2122:
2123: default: /* No repeat follows */
2124: min = max = 1;
2125: break;
2126: }
2127:
2128: /* First, ensure the minimum number of matches are present. */
2129:
2130: for (i = 1; i <= min; i++)
2131: {
2132: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1.3 ! misha 2133: GETCHARINCTEST(c, eptr);
1.1 misha 2134: if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2135: }
2136:
2137: /* If max == min we can continue with the main loop without the
2138: need to recurse. */
2139:
2140: if (min == max) continue;
2141:
2142: /* If minimizing, keep testing the rest of the expression and advancing
2143: the pointer while it matches the class. */
2144:
2145: if (minimize)
2146: {
2147: for (fi = min;; fi++)
2148: {
2149: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2150: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2151: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1.3 ! misha 2152: GETCHARINCTEST(c, eptr);
1.1 misha 2153: if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2154: }
2155: /* Control never gets here */
2156: }
2157:
2158: /* If maximizing, find the longest possible run, then work backwards. */
2159:
2160: else
2161: {
2162: pp = eptr;
2163: for (i = min; i < max; i++)
2164: {
2165: int len = 1;
2166: if (eptr >= md->end_subject) break;
1.3 ! misha 2167: GETCHARLENTEST(c, eptr, len);
1.1 misha 2168: if (!_pcre_xclass(c, data)) break;
2169: eptr += len;
2170: }
2171: for(;;)
2172: {
2173: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2174: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2175: if (eptr-- == pp) break; /* Stop if tried at original pos */
2176: if (utf8) BACKCHAR(eptr);
2177: }
2178: RRETURN(MATCH_NOMATCH);
2179: }
2180:
2181: /* Control never gets here */
2182: }
2183: #endif /* End of XCLASS */
2184:
2185: /* Match a single character, casefully */
2186:
2187: case OP_CHAR:
2188: #ifdef SUPPORT_UTF8
2189: if (utf8)
2190: {
2191: length = 1;
2192: ecode++;
2193: GETCHARLEN(fc, ecode, length);
2194: if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2195: while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2196: }
2197: else
2198: #endif
2199:
2200: /* Non-UTF-8 mode */
2201: {
2202: if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2203: if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2204: ecode += 2;
2205: }
2206: break;
2207:
2208: /* Match a single character, caselessly */
2209:
2210: case OP_CHARNC:
2211: #ifdef SUPPORT_UTF8
2212: if (utf8)
2213: {
2214: length = 1;
2215: ecode++;
2216: GETCHARLEN(fc, ecode, length);
2217:
2218: if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2219:
2220: /* If the pattern character's value is < 128, we have only one byte, and
2221: can use the fast lookup table. */
2222:
2223: if (fc < 128)
2224: {
2225: if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2226: }
2227:
2228: /* Otherwise we must pick up the subject character */
2229:
2230: else
2231: {
2232: unsigned int dc;
2233: GETCHARINC(dc, eptr);
2234: ecode += length;
2235:
2236: /* If we have Unicode property support, we can use it to test the other
2237: case of the character, if there is one. */
2238:
2239: if (fc != dc)
2240: {
2241: #ifdef SUPPORT_UCP
1.2 misha 2242: if (dc != UCD_OTHERCASE(fc))
1.1 misha 2243: #endif
2244: RRETURN(MATCH_NOMATCH);
2245: }
2246: }
2247: }
2248: else
2249: #endif /* SUPPORT_UTF8 */
2250:
2251: /* Non-UTF-8 mode */
2252: {
2253: if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2254: if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2255: ecode += 2;
2256: }
2257: break;
2258:
2259: /* Match a single character repeatedly. */
2260:
2261: case OP_EXACT:
2262: min = max = GET2(ecode, 1);
2263: ecode += 3;
2264: goto REPEATCHAR;
2265:
2266: case OP_POSUPTO:
2267: possessive = TRUE;
2268: /* Fall through */
2269:
2270: case OP_UPTO:
2271: case OP_MINUPTO:
2272: min = 0;
2273: max = GET2(ecode, 1);
2274: minimize = *ecode == OP_MINUPTO;
2275: ecode += 3;
2276: goto REPEATCHAR;
2277:
2278: case OP_POSSTAR:
2279: possessive = TRUE;
2280: min = 0;
2281: max = INT_MAX;
2282: ecode++;
2283: goto REPEATCHAR;
2284:
2285: case OP_POSPLUS:
2286: possessive = TRUE;
2287: min = 1;
2288: max = INT_MAX;
2289: ecode++;
2290: goto REPEATCHAR;
2291:
2292: case OP_POSQUERY:
2293: possessive = TRUE;
2294: min = 0;
2295: max = 1;
2296: ecode++;
2297: goto REPEATCHAR;
2298:
2299: case OP_STAR:
2300: case OP_MINSTAR:
2301: case OP_PLUS:
2302: case OP_MINPLUS:
2303: case OP_QUERY:
2304: case OP_MINQUERY:
2305: c = *ecode++ - OP_STAR;
2306: minimize = (c & 1) != 0;
2307: min = rep_min[c]; /* Pick up values from tables; */
2308: max = rep_max[c]; /* zero for max => infinity */
2309: if (max == 0) max = INT_MAX;
2310:
2311: /* Common code for all repeated single-character matches. We can give
2312: up quickly if there are fewer than the minimum number of characters left in
2313: the subject. */
2314:
2315: REPEATCHAR:
2316: #ifdef SUPPORT_UTF8
2317: if (utf8)
2318: {
2319: length = 1;
2320: charptr = ecode;
2321: GETCHARLEN(fc, ecode, length);
2322: if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2323: ecode += length;
2324:
2325: /* Handle multibyte character matching specially here. There is
2326: support for caseless matching if UCP support is present. */
2327:
2328: if (length > 1)
2329: {
2330: #ifdef SUPPORT_UCP
2331: unsigned int othercase;
2332: if ((ims & PCRE_CASELESS) != 0 &&
1.2 misha 2333: (othercase = UCD_OTHERCASE(fc)) != fc)
1.1 misha 2334: oclength = _pcre_ord2utf8(othercase, occhars);
2335: else oclength = 0;
2336: #endif /* SUPPORT_UCP */
2337:
2338: for (i = 1; i <= min; i++)
2339: {
2340: if (memcmp(eptr, charptr, length) == 0) eptr += length;
2341: #ifdef SUPPORT_UCP
2342: /* Need braces because of following else */
2343: else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2344: else
2345: {
2346: if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2347: eptr += oclength;
2348: }
2349: #else /* without SUPPORT_UCP */
2350: else { RRETURN(MATCH_NOMATCH); }
2351: #endif /* SUPPORT_UCP */
2352: }
2353:
2354: if (min == max) continue;
2355:
2356: if (minimize)
2357: {
2358: for (fi = min;; fi++)
2359: {
2360: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2361: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2362: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2363: if (memcmp(eptr, charptr, length) == 0) eptr += length;
2364: #ifdef SUPPORT_UCP
2365: /* Need braces because of following else */
2366: else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2367: else
2368: {
2369: if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2370: eptr += oclength;
2371: }
2372: #else /* without SUPPORT_UCP */
2373: else { RRETURN (MATCH_NOMATCH); }
2374: #endif /* SUPPORT_UCP */
2375: }
2376: /* Control never gets here */
2377: }
2378:
2379: else /* Maximize */
2380: {
2381: pp = eptr;
2382: for (i = min; i < max; i++)
2383: {
2384: if (eptr > md->end_subject - length) break;
2385: if (memcmp(eptr, charptr, length) == 0) eptr += length;
2386: #ifdef SUPPORT_UCP
2387: else if (oclength == 0) break;
2388: else
2389: {
2390: if (memcmp(eptr, occhars, oclength) != 0) break;
2391: eptr += oclength;
2392: }
2393: #else /* without SUPPORT_UCP */
2394: else break;
2395: #endif /* SUPPORT_UCP */
2396: }
2397:
2398: if (possessive) continue;
2399: for(;;)
2400: {
2401: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2402: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2403: if (eptr == pp) RRETURN(MATCH_NOMATCH);
2404: #ifdef SUPPORT_UCP
2405: eptr--;
2406: BACKCHAR(eptr);
2407: #else /* without SUPPORT_UCP */
2408: eptr -= length;
2409: #endif /* SUPPORT_UCP */
2410: }
2411: }
2412: /* Control never gets here */
2413: }
2414:
2415: /* If the length of a UTF-8 character is 1, we fall through here, and
2416: obey the code as for non-UTF-8 characters below, though in this case the
2417: value of fc will always be < 128. */
2418: }
2419: else
2420: #endif /* SUPPORT_UTF8 */
2421:
2422: /* When not in UTF-8 mode, load a single-byte character. */
2423: {
2424: if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2425: fc = *ecode++;
2426: }
2427:
2428: /* The value of fc at this point is always less than 256, though we may or
2429: may not be in UTF-8 mode. The code is duplicated for the caseless and
2430: caseful cases, for speed, since matching characters is likely to be quite
2431: common. First, ensure the minimum number of matches are present. If min =
2432: max, continue at the same level without recursing. Otherwise, if
2433: minimizing, keep trying the rest of the expression and advancing one
2434: matching character if failing, up to the maximum. Alternatively, if
2435: maximizing, find the maximum number of characters and work backwards. */
2436:
2437: DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2438: max, eptr));
2439:
2440: if ((ims & PCRE_CASELESS) != 0)
2441: {
2442: fc = md->lcc[fc];
2443: for (i = 1; i <= min; i++)
2444: if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2445: if (min == max) continue;
2446: if (minimize)
2447: {
2448: for (fi = min;; fi++)
2449: {
2450: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2451: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2452: if (fi >= max || eptr >= md->end_subject ||
2453: fc != md->lcc[*eptr++])
2454: RRETURN(MATCH_NOMATCH);
2455: }
2456: /* Control never gets here */
2457: }
2458: else /* Maximize */
2459: {
2460: pp = eptr;
2461: for (i = min; i < max; i++)
2462: {
2463: if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2464: eptr++;
2465: }
2466: if (possessive) continue;
2467: while (eptr >= pp)
2468: {
2469: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2470: eptr--;
2471: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2472: }
2473: RRETURN(MATCH_NOMATCH);
2474: }
2475: /* Control never gets here */
2476: }
2477:
2478: /* Caseful comparisons (includes all multi-byte characters) */
2479:
2480: else
2481: {
2482: for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2483: if (min == max) continue;
2484: if (minimize)
2485: {
2486: for (fi = min;; fi++)
2487: {
2488: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2489: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2490: if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2491: RRETURN(MATCH_NOMATCH);
2492: }
2493: /* Control never gets here */
2494: }
2495: else /* Maximize */
2496: {
2497: pp = eptr;
2498: for (i = min; i < max; i++)
2499: {
2500: if (eptr >= md->end_subject || fc != *eptr) break;
2501: eptr++;
2502: }
2503: if (possessive) continue;
2504: while (eptr >= pp)
2505: {
2506: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2507: eptr--;
2508: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2509: }
2510: RRETURN(MATCH_NOMATCH);
2511: }
2512: }
2513: /* Control never gets here */
2514:
2515: /* Match a negated single one-byte character. The character we are
2516: checking can be multibyte. */
2517:
2518: case OP_NOT:
2519: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2520: ecode++;
2521: GETCHARINCTEST(c, eptr);
2522: if ((ims & PCRE_CASELESS) != 0)
2523: {
2524: #ifdef SUPPORT_UTF8
2525: if (c < 256)
2526: #endif
2527: c = md->lcc[c];
2528: if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2529: }
2530: else
2531: {
2532: if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2533: }
2534: break;
2535:
2536: /* Match a negated single one-byte character repeatedly. This is almost a
2537: repeat of the code for a repeated single character, but I haven't found a
2538: nice way of commoning these up that doesn't require a test of the
2539: positive/negative option for each character match. Maybe that wouldn't add
2540: very much to the time taken, but character matching *is* what this is all
2541: about... */
2542:
2543: case OP_NOTEXACT:
2544: min = max = GET2(ecode, 1);
2545: ecode += 3;
2546: goto REPEATNOTCHAR;
2547:
2548: case OP_NOTUPTO:
2549: case OP_NOTMINUPTO:
2550: min = 0;
2551: max = GET2(ecode, 1);
2552: minimize = *ecode == OP_NOTMINUPTO;
2553: ecode += 3;
2554: goto REPEATNOTCHAR;
2555:
2556: case OP_NOTPOSSTAR:
2557: possessive = TRUE;
2558: min = 0;
2559: max = INT_MAX;
2560: ecode++;
2561: goto REPEATNOTCHAR;
2562:
2563: case OP_NOTPOSPLUS:
2564: possessive = TRUE;
2565: min = 1;
2566: max = INT_MAX;
2567: ecode++;
2568: goto REPEATNOTCHAR;
2569:
2570: case OP_NOTPOSQUERY:
2571: possessive = TRUE;
2572: min = 0;
2573: max = 1;
2574: ecode++;
2575: goto REPEATNOTCHAR;
2576:
2577: case OP_NOTPOSUPTO:
2578: possessive = TRUE;
2579: min = 0;
2580: max = GET2(ecode, 1);
2581: ecode += 3;
2582: goto REPEATNOTCHAR;
2583:
2584: case OP_NOTSTAR:
2585: case OP_NOTMINSTAR:
2586: case OP_NOTPLUS:
2587: case OP_NOTMINPLUS:
2588: case OP_NOTQUERY:
2589: case OP_NOTMINQUERY:
2590: c = *ecode++ - OP_NOTSTAR;
2591: minimize = (c & 1) != 0;
2592: min = rep_min[c]; /* Pick up values from tables; */
2593: max = rep_max[c]; /* zero for max => infinity */
2594: if (max == 0) max = INT_MAX;
2595:
2596: /* Common code for all repeated single-byte matches. We can give up quickly
2597: if there are fewer than the minimum number of bytes left in the
2598: subject. */
2599:
2600: REPEATNOTCHAR:
2601: if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2602: fc = *ecode++;
2603:
2604: /* The code is duplicated for the caseless and caseful cases, for speed,
2605: since matching characters is likely to be quite common. First, ensure the
2606: minimum number of matches are present. If min = max, continue at the same
2607: level without recursing. Otherwise, if minimizing, keep trying the rest of
2608: the expression and advancing one matching character if failing, up to the
2609: maximum. Alternatively, if maximizing, find the maximum number of
2610: characters and work backwards. */
2611:
2612: DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2613: max, eptr));
2614:
2615: if ((ims & PCRE_CASELESS) != 0)
2616: {
2617: fc = md->lcc[fc];
2618:
2619: #ifdef SUPPORT_UTF8
2620: /* UTF-8 mode */
2621: if (utf8)
2622: {
2623: register unsigned int d;
2624: for (i = 1; i <= min; i++)
2625: {
2626: GETCHARINC(d, eptr);
2627: if (d < 256) d = md->lcc[d];
2628: if (fc == d) RRETURN(MATCH_NOMATCH);
2629: }
2630: }
2631: else
2632: #endif
2633:
2634: /* Not UTF-8 mode */
2635: {
2636: for (i = 1; i <= min; i++)
2637: if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2638: }
2639:
2640: if (min == max) continue;
2641:
2642: if (minimize)
2643: {
2644: #ifdef SUPPORT_UTF8
2645: /* UTF-8 mode */
2646: if (utf8)
2647: {
2648: register unsigned int d;
2649: for (fi = min;; fi++)
2650: {
2651: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2652: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.2 misha 2653: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1.1 misha 2654: GETCHARINC(d, eptr);
2655: if (d < 256) d = md->lcc[d];
1.2 misha 2656: if (fc == d) RRETURN(MATCH_NOMATCH);
2657:
1.1 misha 2658: }
2659: }
2660: else
2661: #endif
2662: /* Not UTF-8 mode */
2663: {
2664: for (fi = min;; fi++)
2665: {
2666: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2667: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2668: if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2669: RRETURN(MATCH_NOMATCH);
2670: }
2671: }
2672: /* Control never gets here */
2673: }
2674:
2675: /* Maximize case */
2676:
2677: else
2678: {
2679: pp = eptr;
2680:
2681: #ifdef SUPPORT_UTF8
2682: /* UTF-8 mode */
2683: if (utf8)
2684: {
2685: register unsigned int d;
2686: for (i = min; i < max; i++)
2687: {
2688: int len = 1;
2689: if (eptr >= md->end_subject) break;
2690: GETCHARLEN(d, eptr, len);
2691: if (d < 256) d = md->lcc[d];
2692: if (fc == d) break;
2693: eptr += len;
2694: }
2695: if (possessive) continue;
2696: for(;;)
2697: {
2698: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2699: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2700: if (eptr-- == pp) break; /* Stop if tried at original pos */
2701: BACKCHAR(eptr);
2702: }
2703: }
2704: else
2705: #endif
2706: /* Not UTF-8 mode */
2707: {
2708: for (i = min; i < max; i++)
2709: {
2710: if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2711: eptr++;
2712: }
2713: if (possessive) continue;
2714: while (eptr >= pp)
2715: {
2716: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2717: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2718: eptr--;
2719: }
2720: }
2721:
2722: RRETURN(MATCH_NOMATCH);
2723: }
2724: /* Control never gets here */
2725: }
2726:
2727: /* Caseful comparisons */
2728:
2729: else
2730: {
2731: #ifdef SUPPORT_UTF8
2732: /* UTF-8 mode */
2733: if (utf8)
2734: {
2735: register unsigned int d;
2736: for (i = 1; i <= min; i++)
2737: {
2738: GETCHARINC(d, eptr);
2739: if (fc == d) RRETURN(MATCH_NOMATCH);
2740: }
2741: }
2742: else
2743: #endif
2744: /* Not UTF-8 mode */
2745: {
2746: for (i = 1; i <= min; i++)
2747: if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2748: }
2749:
2750: if (min == max) continue;
2751:
2752: if (minimize)
2753: {
2754: #ifdef SUPPORT_UTF8
2755: /* UTF-8 mode */
2756: if (utf8)
2757: {
2758: register unsigned int d;
2759: for (fi = min;; fi++)
2760: {
2761: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2762: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.2 misha 2763: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1.1 misha 2764: GETCHARINC(d, eptr);
1.2 misha 2765: if (fc == d) RRETURN(MATCH_NOMATCH);
1.1 misha 2766: }
2767: }
2768: else
2769: #endif
2770: /* Not UTF-8 mode */
2771: {
2772: for (fi = min;; fi++)
2773: {
2774: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2775: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2776: if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2777: RRETURN(MATCH_NOMATCH);
2778: }
2779: }
2780: /* Control never gets here */
2781: }
2782:
2783: /* Maximize case */
2784:
2785: else
2786: {
2787: pp = eptr;
2788:
2789: #ifdef SUPPORT_UTF8
2790: /* UTF-8 mode */
2791: if (utf8)
2792: {
2793: register unsigned int d;
2794: for (i = min; i < max; i++)
2795: {
2796: int len = 1;
2797: if (eptr >= md->end_subject) break;
2798: GETCHARLEN(d, eptr, len);
2799: if (fc == d) break;
2800: eptr += len;
2801: }
2802: if (possessive) continue;
2803: for(;;)
2804: {
2805: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2806: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2807: if (eptr-- == pp) break; /* Stop if tried at original pos */
2808: BACKCHAR(eptr);
2809: }
2810: }
2811: else
2812: #endif
2813: /* Not UTF-8 mode */
2814: {
2815: for (i = min; i < max; i++)
2816: {
2817: if (eptr >= md->end_subject || fc == *eptr) break;
2818: eptr++;
2819: }
2820: if (possessive) continue;
2821: while (eptr >= pp)
2822: {
2823: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2824: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2825: eptr--;
2826: }
2827: }
2828:
2829: RRETURN(MATCH_NOMATCH);
2830: }
2831: }
2832: /* Control never gets here */
2833:
2834: /* Match a single character type repeatedly; several different opcodes
2835: share code. This is very similar to the code for single characters, but we
2836: repeat it in the interests of efficiency. */
2837:
2838: case OP_TYPEEXACT:
2839: min = max = GET2(ecode, 1);
2840: minimize = TRUE;
2841: ecode += 3;
2842: goto REPEATTYPE;
2843:
2844: case OP_TYPEUPTO:
2845: case OP_TYPEMINUPTO:
2846: min = 0;
2847: max = GET2(ecode, 1);
2848: minimize = *ecode == OP_TYPEMINUPTO;
2849: ecode += 3;
2850: goto REPEATTYPE;
2851:
2852: case OP_TYPEPOSSTAR:
2853: possessive = TRUE;
2854: min = 0;
2855: max = INT_MAX;
2856: ecode++;
2857: goto REPEATTYPE;
2858:
2859: case OP_TYPEPOSPLUS:
2860: possessive = TRUE;
2861: min = 1;
2862: max = INT_MAX;
2863: ecode++;
2864: goto REPEATTYPE;
2865:
2866: case OP_TYPEPOSQUERY:
2867: possessive = TRUE;
2868: min = 0;
2869: max = 1;
2870: ecode++;
2871: goto REPEATTYPE;
2872:
2873: case OP_TYPEPOSUPTO:
2874: possessive = TRUE;
2875: min = 0;
2876: max = GET2(ecode, 1);
2877: ecode += 3;
2878: goto REPEATTYPE;
2879:
2880: case OP_TYPESTAR:
2881: case OP_TYPEMINSTAR:
2882: case OP_TYPEPLUS:
2883: case OP_TYPEMINPLUS:
2884: case OP_TYPEQUERY:
2885: case OP_TYPEMINQUERY:
2886: c = *ecode++ - OP_TYPESTAR;
2887: minimize = (c & 1) != 0;
2888: min = rep_min[c]; /* Pick up values from tables; */
2889: max = rep_max[c]; /* zero for max => infinity */
2890: if (max == 0) max = INT_MAX;
2891:
2892: /* Common code for all repeated single character type matches. Note that
2893: in UTF-8 mode, '.' matches a character of any length, but for the other
2894: character types, the valid characters are all one-byte long. */
2895:
2896: REPEATTYPE:
2897: ctype = *ecode++; /* Code for the character type */
2898:
2899: #ifdef SUPPORT_UCP
2900: if (ctype == OP_PROP || ctype == OP_NOTPROP)
2901: {
2902: prop_fail_result = ctype == OP_NOTPROP;
2903: prop_type = *ecode++;
2904: prop_value = *ecode++;
2905: }
2906: else prop_type = -1;
2907: #endif
2908:
2909: /* First, ensure the minimum number of matches are present. Use inline
2910: code for maximizing the speed, and do the type test once at the start
2911: (i.e. keep it out of the loop). Also we can test that there are at least
2912: the minimum number of bytes before we start. This isn't as effective in
2913: UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2914: is tidier. Also separate the UCP code, which can be the same for both UTF-8
2915: and single-bytes. */
2916:
2917: if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2918: if (min > 0)
2919: {
2920: #ifdef SUPPORT_UCP
2921: if (prop_type >= 0)
2922: {
2923: switch(prop_type)
2924: {
2925: case PT_ANY:
2926: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2927: for (i = 1; i <= min; i++)
2928: {
2929: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2930: GETCHARINCTEST(c, eptr);
2931: }
2932: break;
2933:
2934: case PT_LAMP:
2935: for (i = 1; i <= min; i++)
2936: {
2937: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2938: GETCHARINCTEST(c, eptr);
1.2 misha 2939: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 2940: if ((prop_chartype == ucp_Lu ||
2941: prop_chartype == ucp_Ll ||
2942: prop_chartype == ucp_Lt) == prop_fail_result)
2943: RRETURN(MATCH_NOMATCH);
2944: }
2945: break;
2946:
2947: case PT_GC:
2948: for (i = 1; i <= min; i++)
2949: {
2950: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2951: GETCHARINCTEST(c, eptr);
1.2 misha 2952: prop_category = UCD_CATEGORY(c);
1.1 misha 2953: if ((prop_category == prop_value) == prop_fail_result)
2954: RRETURN(MATCH_NOMATCH);
2955: }
2956: break;
2957:
2958: case PT_PC:
2959: for (i = 1; i <= min; i++)
2960: {
2961: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2962: GETCHARINCTEST(c, eptr);
1.2 misha 2963: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 2964: if ((prop_chartype == prop_value) == prop_fail_result)
2965: RRETURN(MATCH_NOMATCH);
2966: }
2967: break;
2968:
2969: case PT_SC:
2970: for (i = 1; i <= min; i++)
2971: {
2972: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2973: GETCHARINCTEST(c, eptr);
1.2 misha 2974: prop_script = UCD_SCRIPT(c);
1.1 misha 2975: if ((prop_script == prop_value) == prop_fail_result)
2976: RRETURN(MATCH_NOMATCH);
2977: }
2978: break;
2979:
2980: default:
2981: RRETURN(PCRE_ERROR_INTERNAL);
2982: }
2983: }
2984:
2985: /* Match extended Unicode sequences. We will get here only if the
2986: support is in the binary; otherwise a compile-time error occurs. */
2987:
2988: else if (ctype == OP_EXTUNI)
2989: {
2990: for (i = 1; i <= min; i++)
2991: {
2992: GETCHARINCTEST(c, eptr);
1.2 misha 2993: prop_category = UCD_CATEGORY(c);
1.1 misha 2994: if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2995: while (eptr < md->end_subject)
2996: {
2997: int len = 1;
2998: if (!utf8) c = *eptr; else
2999: {
3000: GETCHARLEN(c, eptr, len);
3001: }
1.2 misha 3002: prop_category = UCD_CATEGORY(c);
1.1 misha 3003: if (prop_category != ucp_M) break;
3004: eptr += len;
3005: }
3006: }
3007: }
3008:
3009: else
3010: #endif /* SUPPORT_UCP */
3011:
3012: /* Handle all other cases when the coding is UTF-8 */
3013:
3014: #ifdef SUPPORT_UTF8
3015: if (utf8) switch(ctype)
3016: {
3017: case OP_ANY:
3018: for (i = 1; i <= min; i++)
3019: {
3020: if (eptr >= md->end_subject || IS_NEWLINE(eptr))
3021: RRETURN(MATCH_NOMATCH);
3022: eptr++;
3023: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3024: }
3025: break;
3026:
3027: case OP_ALLANY:
3028: for (i = 1; i <= min; i++)
3029: {
3030: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3031: eptr++;
3032: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3033: }
3034: break;
3035:
3036: case OP_ANYBYTE:
3037: eptr += min;
3038: break;
3039:
3040: case OP_ANYNL:
3041: for (i = 1; i <= min; i++)
3042: {
3043: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3044: GETCHARINC(c, eptr);
3045: switch(c)
3046: {
3047: default: RRETURN(MATCH_NOMATCH);
3048: case 0x000d:
3049: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3050: break;
3051:
3052: case 0x000a:
3053: break;
3054:
3055: case 0x000b:
3056: case 0x000c:
3057: case 0x0085:
3058: case 0x2028:
3059: case 0x2029:
3060: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3061: break;
3062: }
3063: }
3064: break;
3065:
3066: case OP_NOT_HSPACE:
3067: for (i = 1; i <= min; i++)
3068: {
3069: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3070: GETCHARINC(c, eptr);
3071: switch(c)
3072: {
3073: default: break;
3074: case 0x09: /* HT */
3075: case 0x20: /* SPACE */
3076: case 0xa0: /* NBSP */
3077: case 0x1680: /* OGHAM SPACE MARK */
3078: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3079: case 0x2000: /* EN QUAD */
3080: case 0x2001: /* EM QUAD */
3081: case 0x2002: /* EN SPACE */
3082: case 0x2003: /* EM SPACE */
3083: case 0x2004: /* THREE-PER-EM SPACE */
3084: case 0x2005: /* FOUR-PER-EM SPACE */
3085: case 0x2006: /* SIX-PER-EM SPACE */
3086: case 0x2007: /* FIGURE SPACE */
3087: case 0x2008: /* PUNCTUATION SPACE */
3088: case 0x2009: /* THIN SPACE */
3089: case 0x200A: /* HAIR SPACE */
3090: case 0x202f: /* NARROW NO-BREAK SPACE */
3091: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3092: case 0x3000: /* IDEOGRAPHIC SPACE */
3093: RRETURN(MATCH_NOMATCH);
3094: }
3095: }
3096: break;
3097:
3098: case OP_HSPACE:
3099: for (i = 1; i <= min; i++)
3100: {
3101: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3102: GETCHARINC(c, eptr);
3103: switch(c)
3104: {
3105: default: RRETURN(MATCH_NOMATCH);
3106: case 0x09: /* HT */
3107: case 0x20: /* SPACE */
3108: case 0xa0: /* NBSP */
3109: case 0x1680: /* OGHAM SPACE MARK */
3110: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3111: case 0x2000: /* EN QUAD */
3112: case 0x2001: /* EM QUAD */
3113: case 0x2002: /* EN SPACE */
3114: case 0x2003: /* EM SPACE */
3115: case 0x2004: /* THREE-PER-EM SPACE */
3116: case 0x2005: /* FOUR-PER-EM SPACE */
3117: case 0x2006: /* SIX-PER-EM SPACE */
3118: case 0x2007: /* FIGURE SPACE */
3119: case 0x2008: /* PUNCTUATION SPACE */
3120: case 0x2009: /* THIN SPACE */
3121: case 0x200A: /* HAIR SPACE */
3122: case 0x202f: /* NARROW NO-BREAK SPACE */
3123: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3124: case 0x3000: /* IDEOGRAPHIC SPACE */
3125: break;
3126: }
3127: }
3128: break;
3129:
3130: case OP_NOT_VSPACE:
3131: for (i = 1; i <= min; i++)
3132: {
3133: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3134: GETCHARINC(c, eptr);
3135: switch(c)
3136: {
3137: default: break;
3138: case 0x0a: /* LF */
3139: case 0x0b: /* VT */
3140: case 0x0c: /* FF */
3141: case 0x0d: /* CR */
3142: case 0x85: /* NEL */
3143: case 0x2028: /* LINE SEPARATOR */
3144: case 0x2029: /* PARAGRAPH SEPARATOR */
3145: RRETURN(MATCH_NOMATCH);
3146: }
3147: }
3148: break;
3149:
3150: case OP_VSPACE:
3151: for (i = 1; i <= min; i++)
3152: {
3153: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3154: GETCHARINC(c, eptr);
3155: switch(c)
3156: {
3157: default: RRETURN(MATCH_NOMATCH);
3158: case 0x0a: /* LF */
3159: case 0x0b: /* VT */
3160: case 0x0c: /* FF */
3161: case 0x0d: /* CR */
3162: case 0x85: /* NEL */
3163: case 0x2028: /* LINE SEPARATOR */
3164: case 0x2029: /* PARAGRAPH SEPARATOR */
3165: break;
3166: }
3167: }
3168: break;
3169:
3170: case OP_NOT_DIGIT:
3171: for (i = 1; i <= min; i++)
3172: {
3173: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3174: GETCHARINC(c, eptr);
3175: if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3176: RRETURN(MATCH_NOMATCH);
3177: }
3178: break;
3179:
3180: case OP_DIGIT:
3181: for (i = 1; i <= min; i++)
3182: {
3183: if (eptr >= md->end_subject ||
3184: *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3185: RRETURN(MATCH_NOMATCH);
3186: /* No need to skip more bytes - we know it's a 1-byte character */
3187: }
3188: break;
3189:
3190: case OP_NOT_WHITESPACE:
3191: for (i = 1; i <= min; i++)
3192: {
3193: if (eptr >= md->end_subject ||
3194: (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3195: RRETURN(MATCH_NOMATCH);
3196: while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3197: }
3198: break;
3199:
3200: case OP_WHITESPACE:
3201: for (i = 1; i <= min; i++)
3202: {
3203: if (eptr >= md->end_subject ||
3204: *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3205: RRETURN(MATCH_NOMATCH);
3206: /* No need to skip more bytes - we know it's a 1-byte character */
3207: }
3208: break;
3209:
3210: case OP_NOT_WORDCHAR:
3211: for (i = 1; i <= min; i++)
3212: {
3213: if (eptr >= md->end_subject ||
3214: (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3215: RRETURN(MATCH_NOMATCH);
3216: while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3217: }
3218: break;
3219:
3220: case OP_WORDCHAR:
3221: for (i = 1; i <= min; i++)
3222: {
3223: if (eptr >= md->end_subject ||
3224: *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3225: RRETURN(MATCH_NOMATCH);
3226: /* No need to skip more bytes - we know it's a 1-byte character */
3227: }
3228: break;
3229:
3230: default:
3231: RRETURN(PCRE_ERROR_INTERNAL);
3232: } /* End switch(ctype) */
3233:
3234: else
3235: #endif /* SUPPORT_UTF8 */
3236:
3237: /* Code for the non-UTF-8 case for minimum matching of operators other
3238: than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3239: number of bytes present, as this was tested above. */
3240:
3241: switch(ctype)
3242: {
3243: case OP_ANY:
3244: for (i = 1; i <= min; i++)
3245: {
3246: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3247: eptr++;
3248: }
3249: break;
3250:
3251: case OP_ALLANY:
3252: eptr += min;
3253: break;
3254:
3255: case OP_ANYBYTE:
3256: eptr += min;
3257: break;
3258:
3259: /* Because of the CRLF case, we can't assume the minimum number of
3260: bytes are present in this case. */
3261:
3262: case OP_ANYNL:
3263: for (i = 1; i <= min; i++)
3264: {
3265: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3266: switch(*eptr++)
3267: {
3268: default: RRETURN(MATCH_NOMATCH);
3269: case 0x000d:
3270: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3271: break;
3272: case 0x000a:
3273: break;
3274:
3275: case 0x000b:
3276: case 0x000c:
3277: case 0x0085:
3278: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3279: break;
3280: }
3281: }
3282: break;
3283:
3284: case OP_NOT_HSPACE:
3285: for (i = 1; i <= min; i++)
3286: {
3287: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3288: switch(*eptr++)
3289: {
3290: default: break;
3291: case 0x09: /* HT */
3292: case 0x20: /* SPACE */
3293: case 0xa0: /* NBSP */
3294: RRETURN(MATCH_NOMATCH);
3295: }
3296: }
3297: break;
3298:
3299: case OP_HSPACE:
3300: for (i = 1; i <= min; i++)
3301: {
3302: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3303: switch(*eptr++)
3304: {
3305: default: RRETURN(MATCH_NOMATCH);
3306: case 0x09: /* HT */
3307: case 0x20: /* SPACE */
3308: case 0xa0: /* NBSP */
3309: break;
3310: }
3311: }
3312: break;
3313:
3314: case OP_NOT_VSPACE:
3315: for (i = 1; i <= min; i++)
3316: {
3317: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3318: switch(*eptr++)
3319: {
3320: default: break;
3321: case 0x0a: /* LF */
3322: case 0x0b: /* VT */
3323: case 0x0c: /* FF */
3324: case 0x0d: /* CR */
3325: case 0x85: /* NEL */
3326: RRETURN(MATCH_NOMATCH);
3327: }
3328: }
3329: break;
3330:
3331: case OP_VSPACE:
3332: for (i = 1; i <= min; i++)
3333: {
3334: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3335: switch(*eptr++)
3336: {
3337: default: RRETURN(MATCH_NOMATCH);
3338: case 0x0a: /* LF */
3339: case 0x0b: /* VT */
3340: case 0x0c: /* FF */
3341: case 0x0d: /* CR */
3342: case 0x85: /* NEL */
3343: break;
3344: }
3345: }
3346: break;
3347:
3348: case OP_NOT_DIGIT:
3349: for (i = 1; i <= min; i++)
3350: if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3351: break;
3352:
3353: case OP_DIGIT:
3354: for (i = 1; i <= min; i++)
3355: if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3356: break;
3357:
3358: case OP_NOT_WHITESPACE:
3359: for (i = 1; i <= min; i++)
3360: if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3361: break;
3362:
3363: case OP_WHITESPACE:
3364: for (i = 1; i <= min; i++)
3365: if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3366: break;
3367:
3368: case OP_NOT_WORDCHAR:
3369: for (i = 1; i <= min; i++)
3370: if ((md->ctypes[*eptr++] & ctype_word) != 0)
3371: RRETURN(MATCH_NOMATCH);
3372: break;
3373:
3374: case OP_WORDCHAR:
3375: for (i = 1; i <= min; i++)
3376: if ((md->ctypes[*eptr++] & ctype_word) == 0)
3377: RRETURN(MATCH_NOMATCH);
3378: break;
3379:
3380: default:
3381: RRETURN(PCRE_ERROR_INTERNAL);
3382: }
3383: }
3384:
3385: /* If min = max, continue at the same level without recursing */
3386:
3387: if (min == max) continue;
3388:
3389: /* If minimizing, we have to test the rest of the pattern before each
3390: subsequent match. Again, separate the UTF-8 case for speed, and also
3391: separate the UCP cases. */
3392:
3393: if (minimize)
3394: {
3395: #ifdef SUPPORT_UCP
3396: if (prop_type >= 0)
3397: {
3398: switch(prop_type)
3399: {
3400: case PT_ANY:
3401: for (fi = min;; fi++)
3402: {
3403: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3404: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3405: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3406: GETCHARINC(c, eptr);
3407: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3408: }
3409: /* Control never gets here */
3410:
3411: case PT_LAMP:
3412: for (fi = min;; fi++)
3413: {
3414: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3415: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3416: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3417: GETCHARINC(c, eptr);
1.2 misha 3418: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 3419: if ((prop_chartype == ucp_Lu ||
3420: prop_chartype == ucp_Ll ||
3421: prop_chartype == ucp_Lt) == prop_fail_result)
3422: RRETURN(MATCH_NOMATCH);
3423: }
3424: /* Control never gets here */
3425:
3426: case PT_GC:
3427: for (fi = min;; fi++)
3428: {
3429: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3430: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3431: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3432: GETCHARINC(c, eptr);
1.2 misha 3433: prop_category = UCD_CATEGORY(c);
1.1 misha 3434: if ((prop_category == prop_value) == prop_fail_result)
3435: RRETURN(MATCH_NOMATCH);
3436: }
3437: /* Control never gets here */
3438:
3439: case PT_PC:
3440: for (fi = min;; fi++)
3441: {
3442: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3443: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3444: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3445: GETCHARINC(c, eptr);
1.2 misha 3446: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 3447: if ((prop_chartype == prop_value) == prop_fail_result)
3448: RRETURN(MATCH_NOMATCH);
3449: }
3450: /* Control never gets here */
3451:
3452: case PT_SC:
3453: for (fi = min;; fi++)
3454: {
3455: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3456: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3457: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3458: GETCHARINC(c, eptr);
1.2 misha 3459: prop_script = UCD_SCRIPT(c);
1.1 misha 3460: if ((prop_script == prop_value) == prop_fail_result)
3461: RRETURN(MATCH_NOMATCH);
3462: }
3463: /* Control never gets here */
3464:
3465: default:
3466: RRETURN(PCRE_ERROR_INTERNAL);
3467: }
3468: }
3469:
3470: /* Match extended Unicode sequences. We will get here only if the
3471: support is in the binary; otherwise a compile-time error occurs. */
3472:
3473: else if (ctype == OP_EXTUNI)
3474: {
3475: for (fi = min;; fi++)
3476: {
3477: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3478: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3479: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3480: GETCHARINCTEST(c, eptr);
1.2 misha 3481: prop_category = UCD_CATEGORY(c);
1.1 misha 3482: if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3483: while (eptr < md->end_subject)
3484: {
3485: int len = 1;
3486: if (!utf8) c = *eptr; else
3487: {
3488: GETCHARLEN(c, eptr, len);
3489: }
1.2 misha 3490: prop_category = UCD_CATEGORY(c);
1.1 misha 3491: if (prop_category != ucp_M) break;
3492: eptr += len;
3493: }
3494: }
3495: }
3496:
3497: else
3498: #endif /* SUPPORT_UCP */
3499:
3500: #ifdef SUPPORT_UTF8
3501: /* UTF-8 mode */
3502: if (utf8)
3503: {
3504: for (fi = min;; fi++)
3505: {
3506: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3507: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3508: if (fi >= max || eptr >= md->end_subject ||
3509: (ctype == OP_ANY && IS_NEWLINE(eptr)))
3510: RRETURN(MATCH_NOMATCH);
3511:
3512: GETCHARINC(c, eptr);
3513: switch(ctype)
3514: {
3515: case OP_ANY: /* This is the non-NL case */
3516: case OP_ALLANY:
3517: case OP_ANYBYTE:
3518: break;
3519:
3520: case OP_ANYNL:
3521: switch(c)
3522: {
3523: default: RRETURN(MATCH_NOMATCH);
3524: case 0x000d:
3525: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3526: break;
3527: case 0x000a:
3528: break;
3529:
3530: case 0x000b:
3531: case 0x000c:
3532: case 0x0085:
3533: case 0x2028:
3534: case 0x2029:
3535: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3536: break;
3537: }
3538: break;
3539:
3540: case OP_NOT_HSPACE:
3541: switch(c)
3542: {
3543: default: break;
3544: case 0x09: /* HT */
3545: case 0x20: /* SPACE */
3546: case 0xa0: /* NBSP */
3547: case 0x1680: /* OGHAM SPACE MARK */
3548: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3549: case 0x2000: /* EN QUAD */
3550: case 0x2001: /* EM QUAD */
3551: case 0x2002: /* EN SPACE */
3552: case 0x2003: /* EM SPACE */
3553: case 0x2004: /* THREE-PER-EM SPACE */
3554: case 0x2005: /* FOUR-PER-EM SPACE */
3555: case 0x2006: /* SIX-PER-EM SPACE */
3556: case 0x2007: /* FIGURE SPACE */
3557: case 0x2008: /* PUNCTUATION SPACE */
3558: case 0x2009: /* THIN SPACE */
3559: case 0x200A: /* HAIR SPACE */
3560: case 0x202f: /* NARROW NO-BREAK SPACE */
3561: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3562: case 0x3000: /* IDEOGRAPHIC SPACE */
3563: RRETURN(MATCH_NOMATCH);
3564: }
3565: break;
3566:
3567: case OP_HSPACE:
3568: switch(c)
3569: {
3570: default: RRETURN(MATCH_NOMATCH);
3571: case 0x09: /* HT */
3572: case 0x20: /* SPACE */
3573: case 0xa0: /* NBSP */
3574: case 0x1680: /* OGHAM SPACE MARK */
3575: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3576: case 0x2000: /* EN QUAD */
3577: case 0x2001: /* EM QUAD */
3578: case 0x2002: /* EN SPACE */
3579: case 0x2003: /* EM SPACE */
3580: case 0x2004: /* THREE-PER-EM SPACE */
3581: case 0x2005: /* FOUR-PER-EM SPACE */
3582: case 0x2006: /* SIX-PER-EM SPACE */
3583: case 0x2007: /* FIGURE SPACE */
3584: case 0x2008: /* PUNCTUATION SPACE */
3585: case 0x2009: /* THIN SPACE */
3586: case 0x200A: /* HAIR SPACE */
3587: case 0x202f: /* NARROW NO-BREAK SPACE */
3588: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3589: case 0x3000: /* IDEOGRAPHIC SPACE */
3590: break;
3591: }
3592: break;
3593:
3594: case OP_NOT_VSPACE:
3595: switch(c)
3596: {
3597: default: break;
3598: case 0x0a: /* LF */
3599: case 0x0b: /* VT */
3600: case 0x0c: /* FF */
3601: case 0x0d: /* CR */
3602: case 0x85: /* NEL */
3603: case 0x2028: /* LINE SEPARATOR */
3604: case 0x2029: /* PARAGRAPH SEPARATOR */
3605: RRETURN(MATCH_NOMATCH);
3606: }
3607: break;
3608:
3609: case OP_VSPACE:
3610: switch(c)
3611: {
3612: default: RRETURN(MATCH_NOMATCH);
3613: case 0x0a: /* LF */
3614: case 0x0b: /* VT */
3615: case 0x0c: /* FF */
3616: case 0x0d: /* CR */
3617: case 0x85: /* NEL */
3618: case 0x2028: /* LINE SEPARATOR */
3619: case 0x2029: /* PARAGRAPH SEPARATOR */
3620: break;
3621: }
3622: break;
3623:
3624: case OP_NOT_DIGIT:
3625: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3626: RRETURN(MATCH_NOMATCH);
3627: break;
3628:
3629: case OP_DIGIT:
3630: if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3631: RRETURN(MATCH_NOMATCH);
3632: break;
3633:
3634: case OP_NOT_WHITESPACE:
3635: if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3636: RRETURN(MATCH_NOMATCH);
3637: break;
3638:
3639: case OP_WHITESPACE:
3640: if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3641: RRETURN(MATCH_NOMATCH);
3642: break;
3643:
3644: case OP_NOT_WORDCHAR:
3645: if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3646: RRETURN(MATCH_NOMATCH);
3647: break;
3648:
3649: case OP_WORDCHAR:
3650: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3651: RRETURN(MATCH_NOMATCH);
3652: break;
3653:
3654: default:
3655: RRETURN(PCRE_ERROR_INTERNAL);
3656: }
3657: }
3658: }
3659: else
3660: #endif
3661: /* Not UTF-8 mode */
3662: {
3663: for (fi = min;; fi++)
3664: {
3665: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3666: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3667: if (fi >= max || eptr >= md->end_subject ||
3668: (ctype == OP_ANY && IS_NEWLINE(eptr)))
3669: RRETURN(MATCH_NOMATCH);
3670:
3671: c = *eptr++;
3672: switch(ctype)
3673: {
3674: case OP_ANY: /* This is the non-NL case */
3675: case OP_ALLANY:
3676: case OP_ANYBYTE:
3677: break;
3678:
3679: case OP_ANYNL:
3680: switch(c)
3681: {
3682: default: RRETURN(MATCH_NOMATCH);
3683: case 0x000d:
3684: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3685: break;
3686:
3687: case 0x000a:
3688: break;
3689:
3690: case 0x000b:
3691: case 0x000c:
3692: case 0x0085:
3693: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3694: break;
3695: }
3696: break;
3697:
3698: case OP_NOT_HSPACE:
3699: switch(c)
3700: {
3701: default: break;
3702: case 0x09: /* HT */
3703: case 0x20: /* SPACE */
3704: case 0xa0: /* NBSP */
3705: RRETURN(MATCH_NOMATCH);
3706: }
3707: break;
3708:
3709: case OP_HSPACE:
3710: switch(c)
3711: {
3712: default: RRETURN(MATCH_NOMATCH);
3713: case 0x09: /* HT */
3714: case 0x20: /* SPACE */
3715: case 0xa0: /* NBSP */
3716: break;
3717: }
3718: break;
3719:
3720: case OP_NOT_VSPACE:
3721: switch(c)
3722: {
3723: default: break;
3724: case 0x0a: /* LF */
3725: case 0x0b: /* VT */
3726: case 0x0c: /* FF */
3727: case 0x0d: /* CR */
3728: case 0x85: /* NEL */
3729: RRETURN(MATCH_NOMATCH);
3730: }
3731: break;
3732:
3733: case OP_VSPACE:
3734: switch(c)
3735: {
3736: default: RRETURN(MATCH_NOMATCH);
3737: case 0x0a: /* LF */
3738: case 0x0b: /* VT */
3739: case 0x0c: /* FF */
3740: case 0x0d: /* CR */
3741: case 0x85: /* NEL */
3742: break;
3743: }
3744: break;
3745:
3746: case OP_NOT_DIGIT:
3747: if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3748: break;
3749:
3750: case OP_DIGIT:
3751: if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3752: break;
3753:
3754: case OP_NOT_WHITESPACE:
3755: if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3756: break;
3757:
3758: case OP_WHITESPACE:
3759: if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3760: break;
3761:
3762: case OP_NOT_WORDCHAR:
3763: if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3764: break;
3765:
3766: case OP_WORDCHAR:
3767: if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3768: break;
3769:
3770: default:
3771: RRETURN(PCRE_ERROR_INTERNAL);
3772: }
3773: }
3774: }
3775: /* Control never gets here */
3776: }
3777:
3778: /* If maximizing, it is worth using inline code for speed, doing the type
3779: test once at the start (i.e. keep it out of the loop). Again, keep the
3780: UTF-8 and UCP stuff separate. */
3781:
3782: else
3783: {
3784: pp = eptr; /* Remember where we started */
3785:
3786: #ifdef SUPPORT_UCP
3787: if (prop_type >= 0)
3788: {
3789: switch(prop_type)
3790: {
3791: case PT_ANY:
3792: for (i = min; i < max; i++)
3793: {
3794: int len = 1;
3795: if (eptr >= md->end_subject) break;
3796: GETCHARLEN(c, eptr, len);
3797: if (prop_fail_result) break;
3798: eptr+= len;
3799: }
3800: break;
3801:
3802: case PT_LAMP:
3803: for (i = min; i < max; i++)
3804: {
3805: int len = 1;
3806: if (eptr >= md->end_subject) break;
3807: GETCHARLEN(c, eptr, len);
1.2 misha 3808: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 3809: if ((prop_chartype == ucp_Lu ||
3810: prop_chartype == ucp_Ll ||
3811: prop_chartype == ucp_Lt) == prop_fail_result)
3812: break;
3813: eptr+= len;
3814: }
3815: break;
3816:
3817: case PT_GC:
3818: for (i = min; i < max; i++)
3819: {
3820: int len = 1;
3821: if (eptr >= md->end_subject) break;
3822: GETCHARLEN(c, eptr, len);
1.2 misha 3823: prop_category = UCD_CATEGORY(c);
1.1 misha 3824: if ((prop_category == prop_value) == prop_fail_result)
3825: break;
3826: eptr+= len;
3827: }
3828: break;
3829:
3830: case PT_PC:
3831: for (i = min; i < max; i++)
3832: {
3833: int len = 1;
3834: if (eptr >= md->end_subject) break;
3835: GETCHARLEN(c, eptr, len);
1.2 misha 3836: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 3837: if ((prop_chartype == prop_value) == prop_fail_result)
3838: break;
3839: eptr+= len;
3840: }
3841: break;
3842:
3843: case PT_SC:
3844: for (i = min; i < max; i++)
3845: {
3846: int len = 1;
3847: if (eptr >= md->end_subject) break;
3848: GETCHARLEN(c, eptr, len);
1.2 misha 3849: prop_script = UCD_SCRIPT(c);
1.1 misha 3850: if ((prop_script == prop_value) == prop_fail_result)
3851: break;
3852: eptr+= len;
3853: }
3854: break;
3855: }
3856:
3857: /* eptr is now past the end of the maximum run */
3858:
3859: if (possessive) continue;
3860: for(;;)
3861: {
3862: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3863: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3864: if (eptr-- == pp) break; /* Stop if tried at original pos */
3865: if (utf8) BACKCHAR(eptr);
3866: }
3867: }
3868:
3869: /* Match extended Unicode sequences. We will get here only if the
3870: support is in the binary; otherwise a compile-time error occurs. */
3871:
3872: else if (ctype == OP_EXTUNI)
3873: {
3874: for (i = min; i < max; i++)
3875: {
3876: if (eptr >= md->end_subject) break;
3877: GETCHARINCTEST(c, eptr);
1.2 misha 3878: prop_category = UCD_CATEGORY(c);
1.1 misha 3879: if (prop_category == ucp_M) break;
3880: while (eptr < md->end_subject)
3881: {
3882: int len = 1;
3883: if (!utf8) c = *eptr; else
3884: {
3885: GETCHARLEN(c, eptr, len);
3886: }
1.2 misha 3887: prop_category = UCD_CATEGORY(c);
1.1 misha 3888: if (prop_category != ucp_M) break;
3889: eptr += len;
3890: }
3891: }
3892:
3893: /* eptr is now past the end of the maximum run */
3894:
3895: if (possessive) continue;
3896: for(;;)
3897: {
3898: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3899: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3900: if (eptr-- == pp) break; /* Stop if tried at original pos */
3901: for (;;) /* Move back over one extended */
3902: {
3903: int len = 1;
3904: if (!utf8) c = *eptr; else
3905: {
3906: BACKCHAR(eptr);
3907: GETCHARLEN(c, eptr, len);
3908: }
1.2 misha 3909: prop_category = UCD_CATEGORY(c);
1.1 misha 3910: if (prop_category != ucp_M) break;
3911: eptr--;
3912: }
3913: }
3914: }
3915:
3916: else
3917: #endif /* SUPPORT_UCP */
3918:
3919: #ifdef SUPPORT_UTF8
3920: /* UTF-8 mode */
3921:
3922: if (utf8)
3923: {
3924: switch(ctype)
3925: {
3926: case OP_ANY:
3927: if (max < INT_MAX)
3928: {
3929: for (i = min; i < max; i++)
3930: {
3931: if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3932: eptr++;
3933: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3934: }
3935: }
3936:
3937: /* Handle unlimited UTF-8 repeat */
3938:
3939: else
3940: {
3941: for (i = min; i < max; i++)
3942: {
3943: if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3944: eptr++;
3945: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3946: }
3947: }
3948: break;
3949:
3950: case OP_ALLANY:
3951: if (max < INT_MAX)
3952: {
3953: for (i = min; i < max; i++)
3954: {
3955: if (eptr >= md->end_subject) break;
3956: eptr++;
3957: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3958: }
3959: }
3960: else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3961: break;
3962:
3963: /* The byte case is the same as non-UTF8 */
3964:
3965: case OP_ANYBYTE:
3966: c = max - min;
3967: if (c > (unsigned int)(md->end_subject - eptr))
3968: c = md->end_subject - eptr;
3969: eptr += c;
3970: break;
3971:
3972: case OP_ANYNL:
3973: for (i = min; i < max; i++)
3974: {
3975: int len = 1;
3976: if (eptr >= md->end_subject) break;
3977: GETCHARLEN(c, eptr, len);
3978: if (c == 0x000d)
3979: {
3980: if (++eptr >= md->end_subject) break;
3981: if (*eptr == 0x000a) eptr++;
3982: }
3983: else
3984: {
3985: if (c != 0x000a &&
3986: (md->bsr_anycrlf ||
3987: (c != 0x000b && c != 0x000c &&
3988: c != 0x0085 && c != 0x2028 && c != 0x2029)))
3989: break;
3990: eptr += len;
3991: }
3992: }
3993: break;
3994:
3995: case OP_NOT_HSPACE:
3996: case OP_HSPACE:
3997: for (i = min; i < max; i++)
3998: {
3999: BOOL gotspace;
4000: int len = 1;
4001: if (eptr >= md->end_subject) break;
4002: GETCHARLEN(c, eptr, len);
4003: switch(c)
4004: {
4005: default: gotspace = FALSE; break;
4006: case 0x09: /* HT */
4007: case 0x20: /* SPACE */
4008: case 0xa0: /* NBSP */
4009: case 0x1680: /* OGHAM SPACE MARK */
4010: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4011: case 0x2000: /* EN QUAD */
4012: case 0x2001: /* EM QUAD */
4013: case 0x2002: /* EN SPACE */
4014: case 0x2003: /* EM SPACE */
4015: case 0x2004: /* THREE-PER-EM SPACE */
4016: case 0x2005: /* FOUR-PER-EM SPACE */
4017: case 0x2006: /* SIX-PER-EM SPACE */
4018: case 0x2007: /* FIGURE SPACE */
4019: case 0x2008: /* PUNCTUATION SPACE */
4020: case 0x2009: /* THIN SPACE */
4021: case 0x200A: /* HAIR SPACE */
4022: case 0x202f: /* NARROW NO-BREAK SPACE */
4023: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4024: case 0x3000: /* IDEOGRAPHIC SPACE */
4025: gotspace = TRUE;
4026: break;
4027: }
4028: if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4029: eptr += len;
4030: }
4031: break;
4032:
4033: case OP_NOT_VSPACE:
4034: case OP_VSPACE:
4035: for (i = min; i < max; i++)
4036: {
4037: BOOL gotspace;
4038: int len = 1;
4039: if (eptr >= md->end_subject) break;
4040: GETCHARLEN(c, eptr, len);
4041: switch(c)
4042: {
4043: default: gotspace = FALSE; break;
4044: case 0x0a: /* LF */
4045: case 0x0b: /* VT */
4046: case 0x0c: /* FF */
4047: case 0x0d: /* CR */
4048: case 0x85: /* NEL */
4049: case 0x2028: /* LINE SEPARATOR */
4050: case 0x2029: /* PARAGRAPH SEPARATOR */
4051: gotspace = TRUE;
4052: break;
4053: }
4054: if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4055: eptr += len;
4056: }
4057: break;
4058:
4059: case OP_NOT_DIGIT:
4060: for (i = min; i < max; i++)
4061: {
4062: int len = 1;
4063: if (eptr >= md->end_subject) break;
4064: GETCHARLEN(c, eptr, len);
4065: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4066: eptr+= len;
4067: }
4068: break;
4069:
4070: case OP_DIGIT:
4071: for (i = min; i < max; i++)
4072: {
4073: int len = 1;
4074: if (eptr >= md->end_subject) break;
4075: GETCHARLEN(c, eptr, len);
4076: if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4077: eptr+= len;
4078: }
4079: break;
4080:
4081: case OP_NOT_WHITESPACE:
4082: for (i = min; i < max; i++)
4083: {
4084: int len = 1;
4085: if (eptr >= md->end_subject) break;
4086: GETCHARLEN(c, eptr, len);
4087: if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4088: eptr+= len;
4089: }
4090: break;
4091:
4092: case OP_WHITESPACE:
4093: for (i = min; i < max; i++)
4094: {
4095: int len = 1;
4096: if (eptr >= md->end_subject) break;
4097: GETCHARLEN(c, eptr, len);
4098: if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4099: eptr+= len;
4100: }
4101: break;
4102:
4103: case OP_NOT_WORDCHAR:
4104: for (i = min; i < max; i++)
4105: {
4106: int len = 1;
4107: if (eptr >= md->end_subject) break;
4108: GETCHARLEN(c, eptr, len);
4109: if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4110: eptr+= len;
4111: }
4112: break;
4113:
4114: case OP_WORDCHAR:
4115: for (i = min; i < max; i++)
4116: {
4117: int len = 1;
4118: if (eptr >= md->end_subject) break;
4119: GETCHARLEN(c, eptr, len);
4120: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4121: eptr+= len;
4122: }
4123: break;
4124:
4125: default:
4126: RRETURN(PCRE_ERROR_INTERNAL);
4127: }
4128:
4129: /* eptr is now past the end of the maximum run */
4130:
4131: if (possessive) continue;
4132: for(;;)
4133: {
4134: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4135: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4136: if (eptr-- == pp) break; /* Stop if tried at original pos */
4137: BACKCHAR(eptr);
4138: }
4139: }
4140: else
4141: #endif /* SUPPORT_UTF8 */
4142:
4143: /* Not UTF-8 mode */
4144: {
4145: switch(ctype)
4146: {
4147: case OP_ANY:
4148: for (i = min; i < max; i++)
4149: {
4150: if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4151: eptr++;
4152: }
4153: break;
4154:
4155: case OP_ALLANY:
4156: case OP_ANYBYTE:
4157: c = max - min;
4158: if (c > (unsigned int)(md->end_subject - eptr))
4159: c = md->end_subject - eptr;
4160: eptr += c;
4161: break;
4162:
4163: case OP_ANYNL:
4164: for (i = min; i < max; i++)
4165: {
4166: if (eptr >= md->end_subject) break;
4167: c = *eptr;
4168: if (c == 0x000d)
4169: {
4170: if (++eptr >= md->end_subject) break;
4171: if (*eptr == 0x000a) eptr++;
4172: }
4173: else
4174: {
4175: if (c != 0x000a &&
4176: (md->bsr_anycrlf ||
4177: (c != 0x000b && c != 0x000c && c != 0x0085)))
4178: break;
4179: eptr++;
4180: }
4181: }
4182: break;
4183:
4184: case OP_NOT_HSPACE:
4185: for (i = min; i < max; i++)
4186: {
4187: if (eptr >= md->end_subject) break;
4188: c = *eptr;
4189: if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4190: eptr++;
4191: }
4192: break;
4193:
4194: case OP_HSPACE:
4195: for (i = min; i < max; i++)
4196: {
4197: if (eptr >= md->end_subject) break;
4198: c = *eptr;
4199: if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4200: eptr++;
4201: }
4202: break;
4203:
4204: case OP_NOT_VSPACE:
4205: for (i = min; i < max; i++)
4206: {
4207: if (eptr >= md->end_subject) break;
4208: c = *eptr;
4209: if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4210: break;
4211: eptr++;
4212: }
4213: break;
4214:
4215: case OP_VSPACE:
4216: for (i = min; i < max; i++)
4217: {
4218: if (eptr >= md->end_subject) break;
4219: c = *eptr;
4220: if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4221: break;
4222: eptr++;
4223: }
4224: break;
4225:
4226: case OP_NOT_DIGIT:
4227: for (i = min; i < max; i++)
4228: {
4229: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4230: break;
4231: eptr++;
4232: }
4233: break;
4234:
4235: case OP_DIGIT:
4236: for (i = min; i < max; i++)
4237: {
4238: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4239: break;
4240: eptr++;
4241: }
4242: break;
4243:
4244: case OP_NOT_WHITESPACE:
4245: for (i = min; i < max; i++)
4246: {
4247: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4248: break;
4249: eptr++;
4250: }
4251: break;
4252:
4253: case OP_WHITESPACE:
4254: for (i = min; i < max; i++)
4255: {
4256: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4257: break;
4258: eptr++;
4259: }
4260: break;
4261:
4262: case OP_NOT_WORDCHAR:
4263: for (i = min; i < max; i++)
4264: {
4265: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4266: break;
4267: eptr++;
4268: }
4269: break;
4270:
4271: case OP_WORDCHAR:
4272: for (i = min; i < max; i++)
4273: {
4274: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4275: break;
4276: eptr++;
4277: }
4278: break;
4279:
4280: default:
4281: RRETURN(PCRE_ERROR_INTERNAL);
4282: }
4283:
4284: /* eptr is now past the end of the maximum run */
4285:
4286: if (possessive) continue;
4287: while (eptr >= pp)
4288: {
4289: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4290: eptr--;
4291: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4292: }
4293: }
4294:
4295: /* Get here if we can't make it match with any permitted repetitions */
4296:
4297: RRETURN(MATCH_NOMATCH);
4298: }
4299: /* Control never gets here */
4300:
4301: /* There's been some horrible disaster. Arrival here can only mean there is
4302: something seriously wrong in the code above or the OP_xxx definitions. */
4303:
4304: default:
4305: DPRINTF(("Unknown opcode %d\n", *ecode));
4306: RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4307: }
4308:
4309: /* Do not stick any code in here without much thought; it is assumed
4310: that "continue" in the code above comes out to here to repeat the main
4311: loop. */
4312:
4313: } /* End of main loop */
4314: /* Control never reaches here */
4315:
4316:
4317: /* When compiling to use the heap rather than the stack for recursive calls to
4318: match(), the RRETURN() macro jumps here. The number that is saved in
4319: frame->Xwhere indicates which label we actually want to return to. */
4320:
4321: #ifdef NO_RECURSE
4322: #define LBL(val) case val: goto L_RM##val;
4323: HEAP_RETURN:
4324: switch (frame->Xwhere)
4325: {
4326: LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4327: LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4328: LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4329: LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4330: LBL(53) LBL(54)
4331: #ifdef SUPPORT_UTF8
4332: LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4333: LBL(32) LBL(34) LBL(42) LBL(46)
4334: #ifdef SUPPORT_UCP
4335: LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4336: #endif /* SUPPORT_UCP */
4337: #endif /* SUPPORT_UTF8 */
4338: default:
4339: DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4340: return PCRE_ERROR_INTERNAL;
4341: }
4342: #undef LBL
4343: #endif /* NO_RECURSE */
4344: }
4345:
4346:
4347: /***************************************************************************
4348: ****************************************************************************
4349: RECURSION IN THE match() FUNCTION
4350:
4351: Undefine all the macros that were defined above to handle this. */
4352: /* From here on, none of the names listed below is a macro any more. */
4353: #ifdef NO_RECURSE /* build that uses the heap, not the stack, for match() recursion */
4354: #undef eptr
4355: #undef ecode
4356: #undef mstart
4357: #undef offset_top
4358: #undef ims
4359: #undef eptrb
4360: #undef flags
4361: /* NOTE(review): the blank-line grouping presumably mirrors the matching #define list, which is outside this excerpt -- confirm. */
4362: #undef callpat
4363: #undef charptr
4364: #undef data
4365: #undef next
4366: #undef pp
4367: #undef prev
4368: #undef saved_eptr
4369:
4370: #undef new_recursive
4371:
4372: #undef cur_is_word
4373: #undef condition
4374: #undef prev_is_word
4375:
4376: #undef original_ims
4377:
4378: #undef ctype
4379: #undef length
4380: #undef max
4381: #undef min
4382: #undef number
4383: #undef offset
4384: #undef op
4385: #undef save_capture_last
4386: #undef save_offset1
4387: #undef save_offset2
4388: #undef save_offset3
4389: #undef stacksave
4390:
4391: #undef newptrb
4392:
4393: #endif /* NO_RECURSE */
4394:
4395: /* These two are defined as macros in both cases */
4396: /* (that is, with or without NO_RECURSE), hence no #ifdef around them. */
4397: #undef fc
4398: #undef fi
4399:
4400: /***************************************************************************
4401: ***************************************************************************/
4402:
4403:
4404:
4405: /*************************************************
4406: * Execute a Regular Expression *
4407: *************************************************/
4408:
4409: /* This function applies a compiled re to a subject string and picks out
4410: portions of the string if it matches. Two elements in the vector are set for
4411: each substring: the offsets to the start and end of the substring.
4412:
4413: Arguments:
4414: argument_re points to the compiled expression
4415: extra_data points to extra data or is NULL
4416: subject points to the subject string
4417: length length of subject string (may contain binary zeros)
4418: start_offset where to start in the subject string
4419: options option bits
4420: offsets points to a vector of ints to be filled in with offsets;
NULL is accepted only when offsetcount is 0
4421: offsetcount the number of elements in the vector
4422:
4423: Returns: > 0 => success; value is md->end_offset_top/2, i.e. a count of
offset pairs rather than of individual ints
4424: = 0 => success, but offsets is not big enough (this is also the
result whenever offsetcount < 2)
4425: -1 => failed to match
4426: < -1 => some kind of unexpected problem

The negative values returned directly by this function are
PCRE_ERROR_BADOPTION, PCRE_ERROR_NULL, PCRE_ERROR_BADCOUNT,
PCRE_ERROR_BADMAGIC, PCRE_ERROR_BADNEWLINE, PCRE_ERROR_BADPARTIAL,
PCRE_ERROR_BADUTF8, PCRE_ERROR_BADUTF8_OFFSET, PCRE_ERROR_NOMEMORY,
PCRE_ERROR_PARTIAL and PCRE_ERROR_NOMATCH; any other value returned by match()
is passed straight back to the caller.
4427: */
4428:
1.2 misha 4429: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
1.1 misha 4430: pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4431: PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4432: int offsetcount)
4433: {
4434: int rc, resetcount, ocount;
4435: int first_byte = -1;
4436: int req_byte = -1;
4437: int req_byte2 = -1;
4438: int newline;
4439: unsigned long int ims;
4440: BOOL using_temporary_offsets = FALSE;
4441: BOOL anchored;
4442: BOOL startline;
4443: BOOL firstline;
4444: BOOL first_byte_caseless = FALSE;
4445: BOOL req_byte_caseless = FALSE;
4446: BOOL utf8;
4447: match_data match_block;
4448: match_data *md = &match_block;
4449: const uschar *tables;
4450: const uschar *start_bits = NULL;
4451: USPTR start_match = (USPTR)subject + start_offset;
4452: USPTR end_subject;
/* req_byte_ptr starts one byte before the first start position so that the
"p > req_byte_ptr" test in the main loop is true on the first pass and the
required-byte search is actually performed. */
4453: USPTR req_byte_ptr = start_match - 1;
4454:
4455: pcre_study_data internal_study;
4456: const pcre_study_data *study;
4457:
4458: real_pcre internal_re;
4459: const real_pcre *external_re = (const real_pcre *)argument_re;
4460: const real_pcre *re = external_re;
4461:
4462: /* Plausibility checks */
4463:
4464: if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4465: if (re == NULL || subject == NULL ||
4466: (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4467: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
/* NOTE(review): neither start_offset nor length is range-checked here, yet
subject + start_offset and subject + length are used as the start and end
pointers - confirm that callers guarantee 0 <= start_offset <= length. */
4468:
4469: /* Fish out the optional data from the extra_data structure, first setting
4470: the default values. */
4471:
4472: study = NULL;
4473: md->match_limit = MATCH_LIMIT;
4474: md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4475: md->callout_data = NULL;
4476:
4477: /* The table pointer is always in native byte order. */
4478:
4479: tables = external_re->tables;
4480:
4481: if (extra_data != NULL)
4482: {
4483: register unsigned int flags = extra_data->flags;
4484: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4485: study = (const pcre_study_data *)extra_data->study_data;
4486: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4487: md->match_limit = extra_data->match_limit;
4488: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4489: md->match_limit_recursion = extra_data->match_limit_recursion;
4490: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4491: md->callout_data = extra_data->callout_data;
4492: if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4493: }
4494:
4495: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4496: is a feature that makes it possible to save compiled regex and re-use them
4497: in other programs later. */
4498:
4499: if (tables == NULL) tables = _pcre_default_tables;
4500:
4501: /* Check that the first field in the block is the magic number. If it is not,
4502: test for a regex that was compiled on a host of opposite endianness. If this is
4503: the case, flipped values are put in internal_re and internal_study if there was
4504: study data too. */
4505:
4506: if (re->magic_number != MAGIC_NUMBER)
4507: {
4508: re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4509: if (re == NULL) return PCRE_ERROR_BADMAGIC;
4510: if (study != NULL) study = &internal_study;
4511: }
4512:
4513: /* Set up other data */
4514:
4515: anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4516: startline = (re->flags & PCRE_STARTLINE) != 0;
4517: firstline = (re->options & PCRE_FIRSTLINE) != 0;
4518:
4519: /* The code starts after the real_pcre block and the capture name table. */
4520:
4521: md->start_code = (const uschar *)external_re + re->name_table_offset +
4522: re->name_count * re->name_entry_size;
4523:
4524: md->start_subject = (USPTR)subject;
4525: md->start_offset = start_offset;
4526: md->end_subject = md->start_subject + length;
4527: end_subject = md->end_subject;
4528:
4529: md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4530: utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4531: md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4532:
4533: md->notbol = (options & PCRE_NOTBOL) != 0;
4534: md->noteol = (options & PCRE_NOTEOL) != 0;
4535: md->notempty = (options & PCRE_NOTEMPTY) != 0;
4536: md->partial = (options & PCRE_PARTIAL) != 0;
4537: md->hitend = FALSE;
4538:
4539: md->recursive = NULL; /* No recursion at top level */
4540:
4541: md->lcc = tables + lcc_offset;
4542: md->ctypes = tables + ctypes_offset;
4543:
4544: /* Handle different \R options. A run-time setting takes precedence; if none
is given, the compile-time setting in re->options is used, and failing that the
build-time BSR_ANYCRLF default. Setting both bits at run time is an error. */
4545:
4546: switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4547: {
4548: case 0:
4549: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4550: md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4551: else
4552: #ifdef BSR_ANYCRLF
4553: md->bsr_anycrlf = TRUE;
4554: #else
4555: md->bsr_anycrlf = FALSE;
4556: #endif
4557: break;
4558:
4559: case PCRE_BSR_ANYCRLF:
4560: md->bsr_anycrlf = TRUE;
4561: break;
4562:
4563: case PCRE_BSR_UNICODE:
4564: md->bsr_anycrlf = FALSE;
4565: break;
4566:
4567: default: return PCRE_ERROR_BADNEWLINE;
4568: }
4569:
4570: /* Handle different types of newline. The three bits give eight cases. If
4571: nothing is set at run time, whatever was used at compile time applies. */
4572:
4573: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4574: (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4575: {
4576: case 0: newline = NEWLINE; break; /* Compile-time default */
1.3 ! misha 4577: case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
! 4578: case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
1.1 misha 4579: case PCRE_NEWLINE_CR+
1.3 ! misha 4580: PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
1.1 misha 4581: case PCRE_NEWLINE_ANY: newline = -1; break;
4582: case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4583: default: return PCRE_ERROR_BADNEWLINE;
4584: }
4585:
/* Decode the value chosen above: -2 selects ANYCRLF, any other negative value
selects ANY, and a non-negative value is a fixed newline that is either a
single byte or two bytes packed as (first << 8) | second. */
4586: if (newline == -2)
4587: {
4588: md->nltype = NLTYPE_ANYCRLF;
4589: }
4590: else if (newline < 0)
4591: {
4592: md->nltype = NLTYPE_ANY;
4593: }
4594: else
4595: {
4596: md->nltype = NLTYPE_FIXED;
4597: if (newline > 255)
4598: {
4599: md->nllen = 2;
4600: md->nl[0] = (newline >> 8) & 255;
4601: md->nl[1] = newline & 255;
4602: }
4603: else
4604: {
4605: md->nllen = 1;
4606: md->nl[0] = newline;
4607: }
4608: }
4609:
4610: /* Partial matching is supported only for a restricted set of regexes at the
4611: moment. */
4612:
4613: if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4614: return PCRE_ERROR_BADPARTIAL;
4615:
4616: /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4617: back the character offset. When start_offset lies strictly inside the subject,
the byte it points at must not be a UTF-8 continuation byte (10xxxxxx). */
4618:
4619: #ifdef SUPPORT_UTF8
4620: if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4621: {
1.3 ! misha 4622: if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
1.1 misha 4623: return PCRE_ERROR_BADUTF8;
4624: if (start_offset > 0 && start_offset < length)
4625: {
1.3 ! misha 4626: int tb = ((USPTR)subject)[start_offset];
1.1 misha 4627: if (tb > 127)
4628: {
4629: tb &= 0xc0;
4630: if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4631: }
4632: }
4633: }
4634: #endif
4635:
4636: /* The ims options can vary during the matching as a result of the presence
4637: of (?ims) items in the pattern. They are kept in a local variable so that
4638: restoring at the exit of a group is easy. */
4639:
4640: ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4641:
4642: /* If the expression has got more back references than the offsets supplied can
4643: hold, we get a temporary chunk of working store to use during the matching.
4644: Otherwise, we can use the vector supplied, rounding down its size to a multiple
4645: of 3. */
4646:
4647: ocount = offsetcount - (offsetcount % 3);
4648:
4649: if (re->top_backref > 0 && re->top_backref >= ocount/3)
4650: {
4651: ocount = re->top_backref * 3 + 3;
4652: md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4653: if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4654: using_temporary_offsets = TRUE;
4655: DPRINTF(("Got memory to hold back references\n"));
4656: }
4657: else md->offset_vector = offsets;
4658:
4659: md->offset_end = ocount;
4660: md->offset_max = (2*ocount)/3;
4661: md->offset_overflow = FALSE;
4662: md->capture_last = -1;
4663:
4664: /* Compute the minimum number of offsets that we need to reset each time. Doing
4665: this makes a huge difference to execution time when there aren't many brackets
4666: in the pattern. */
4667:
4668: resetcount = 2 + re->top_bracket * 2;
4669: if (resetcount > offsetcount) resetcount = ocount;
4670:
4671: /* Reset the working variable associated with each extraction. These should
4672: never be used unless previously set, but they get saved and restored, and so we
4673: initialize them to avoid reading uninitialized locations. */
4674:
4675: if (md->offset_vector != NULL)
4676: {
4677: register int *iptr = md->offset_vector + ocount;
4678: register int *iend = iptr - resetcount/2 + 1;
4679: while (--iptr >= iend) *iptr = -1;
4680: }
4681:
4682: /* Set up the first character to match, if available. The first_byte value is
4683: never set for an anchored regular expression, but the anchoring may be forced
4684: at run time, so we have to test for anchoring. The first char may be unset for
4685: an unanchored pattern, of course. If there's no first char and the pattern was
4686: studied, there may be a bitmap of possible first characters. */
4687:
4688: if (!anchored)
4689: {
4690: if ((re->flags & PCRE_FIRSTSET) != 0)
4691: {
4692: first_byte = re->first_byte & 255;
4693: if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4694: first_byte = md->lcc[first_byte];
4695: }
4696: else
4697: if (!startline && study != NULL &&
4698: (study->options & PCRE_STUDY_MAPPED) != 0)
4699: start_bits = study->start_bits;
4700: }
4701:
4702: /* For anchored or unanchored matches, there may be a "last known required
4703: character" set. */
4704:
4705: if ((re->flags & PCRE_REQCHSET) != 0)
4706: {
4707: req_byte = re->req_byte & 255;
4708: req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4709: req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4710: }
4711:
4712:
4713: /* ==========================================================================*/
4714:
4715: /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4716: the loop runs just once. */
4717:
4718: for(;;)
4719: {
4720: USPTR save_end_subject = end_subject;
4721: USPTR new_start_match;
4722:
4723: /* Reset the maximum number of extractions we might see. */
4724:
4725: if (md->offset_vector != NULL)
4726: {
4727: register int *iptr = md->offset_vector;
4728: register int *iend = iptr + resetcount;
4729: while (iptr < iend) *iptr++ = -1;
4730: }
4731:
1.3 ! misha 4732: /* If firstline is TRUE, the start of the match is constrained to the first
! 4733: line of a multiline string. That is, the match must be before or at the first
! 4734: newline. Implement this by temporarily adjusting end_subject so that we stop
! 4735: scanning at a newline. If the match fails at the newline, later code breaks
! 4736: this loop. */
1.1 misha 4737:
4738: if (firstline)
4739: {
4740: USPTR t = start_match;
1.2 misha 4741: #ifdef SUPPORT_UTF8
4742: if (utf8)
4743: {
4744: while (t < md->end_subject && !IS_NEWLINE(t))
4745: {
4746: t++;
4747: while (t < end_subject && (*t & 0xc0) == 0x80) t++;
4748: }
4749: }
4750: else
4751: #endif
1.1 misha 4752: while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4753: end_subject = t;
4754: }
4755:
1.3 ! misha 4756: /* There are some optimizations that avoid running the match if a known
! 4757: starting point is not found, or if a known later character is not present.
! 4758: However, there is an option that disables these, for testing and for ensuring
! 4759: that all callouts do actually occur. */
1.1 misha 4760:
1.3 ! misha 4761: if ((options & PCRE_NO_START_OPTIMIZE) == 0)
1.1 misha 4762: {
1.3 ! misha 4763: /* Advance to a unique first byte if there is one. */
! 4764:
! 4765: if (first_byte >= 0)
! 4766: {
! 4767: if (first_byte_caseless)
! 4768: while (start_match < end_subject && md->lcc[*start_match] != first_byte)
! 4769: start_match++;
! 4770: else
! 4771: while (start_match < end_subject && *start_match != first_byte)
! 4772: start_match++;
! 4773: }
1.1 misha 4774:
1.3 ! misha 4775: /* Or to just after a linebreak for a multiline match */
1.1 misha 4776:
1.3 ! misha 4777: else if (startline)
1.1 misha 4778: {
/* Only scan for a preceding newline once we have moved past the initial
start position; start_match[-1] is read further down in this branch. */
1.3 ! misha 4779: if (start_match > md->start_subject + start_offset)
! 4780: {
1.2 misha 4781: #ifdef SUPPORT_UTF8
1.3 ! misha 4782: if (utf8)
1.2 misha 4783: {
1.3 ! misha 4784: while (start_match < end_subject && !WAS_NEWLINE(start_match))
! 4785: {
1.2 misha 4786: start_match++;
1.3 ! misha 4787: while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
! 4788: start_match++;
! 4789: }
1.2 misha 4790: }
1.3 ! misha 4791: else
1.2 misha 4792: #endif
1.3 ! misha 4793: while (start_match < end_subject && !WAS_NEWLINE(start_match))
! 4794: start_match++;
1.1 misha 4795:
1.3 ! misha 4796: /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
! 4797: and we are now at a LF, advance the match position by one more character.
! 4798: */
! 4799:
! 4800: if (start_match[-1] == CHAR_CR &&
! 4801: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
! 4802: start_match < end_subject &&
! 4803: *start_match == CHAR_NL)
! 4804: start_match++;
! 4805: }
1.1 misha 4806: }
4807:
1.3 ! misha 4808: /* Or to a non-unique first byte after study */
1.1 misha 4809:
1.3 ! misha 4810: else if (start_bits != NULL)
1.1 misha 4811: {
1.3 ! misha 4812: while (start_match < end_subject)
! 4813: {
! 4814: register unsigned int c = *start_match;
! 4815: if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
! 4816: else break;
! 4817: }
1.1 misha 4818: }
1.3 ! misha 4819: } /* Starting optimizations */
1.1 misha 4820:
4821: /* Restore fudged end_subject */
4822:
4823: end_subject = save_end_subject;
4824:
4825: #ifdef DEBUG /* Sigh. Some compilers never learn. */
4826: printf(">>>> Match against: ");
4827: pchars(start_match, end_subject - start_match, TRUE, md);
4828: printf("\n");
4829: #endif
4830:
1.3 ! misha 4831: /* If req_byte is set, we know that that character must appear in the
! 4832: subject for the match to succeed. If the first character is set, req_byte
! 4833: must be later in the subject; otherwise the test starts at the match point.
! 4834: This optimization can save a huge amount of backtracking in patterns with
! 4835: nested unlimited repeats that aren't going to match. Writing separate code
! 4836: for cased/caseless versions makes it go faster, as does using an
! 4837: autoincrement and backing off on a match.
! 4838:
! 4839: HOWEVER: when the subject string is very, very long, searching to its end
! 4840: can take a long time, and give bad performance on quite ordinary patterns.
! 4841: This showed up when somebody was matching something like /^\d+C/ on a
! 4842: 32-megabyte string... so we don't do this when the string is sufficiently
! 4843: long.
1.1 misha 4844:
1.3 ! misha 4845: ALSO: this processing is disabled when partial matching is requested, or if
! 4846: disabling is explicitly requested. */
1.1 misha 4847:
1.3 ! misha 4848: if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
! 4849: req_byte >= 0 &&
1.1 misha 4850: end_subject - start_match < REQ_BYTE_MAX &&
4851: !md->partial)
4852: {
4853: register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4854:
4855: /* We don't need to repeat the search if we haven't yet reached the
4856: place we found it at last time. */
4857:
4858: if (p > req_byte_ptr)
4859: {
4860: if (req_byte_caseless)
4861: {
4862: while (p < end_subject)
4863: {
4864: register int pp = *p++;
4865: if (pp == req_byte || pp == req_byte2) { p--; break; }
4866: }
4867: }
4868: else
4869: {
4870: while (p < end_subject)
4871: {
4872: if (*p++ == req_byte) { p--; break; }
4873: }
4874: }
4875:
4876: /* If we can't find the required character, break the matching loop,
4877: forcing a match failure. */
4878:
4879: if (p >= end_subject)
4880: {
4881: rc = MATCH_NOMATCH;
4882: break;
4883: }
4884:
4885: /* If we have found the required character, save the point where we
4886: found it, so that we don't search again next time round the loop if
4887: the start hasn't passed this character yet. */
4888:
4889: req_byte_ptr = p;
4890: }
4891: }
4892:
4893: /* OK, we can now run the match. */
4894:
4895: md->start_match_ptr = start_match;
4896: md->match_call_count = 0;
4897: rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4898:
4899: switch(rc)
4900: {
4901: /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4902: exactly like PRUNE. */
4903:
4904: case MATCH_NOMATCH:
4905: case MATCH_PRUNE:
4906: case MATCH_THEN:
4907: new_start_match = start_match + 1;
4908: #ifdef SUPPORT_UTF8
4909: if (utf8)
4910: while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4911: new_start_match++;
4912: #endif
4913: break;
4914:
4915: /* SKIP passes back the next starting point explicitly. */
4916:
4917: case MATCH_SKIP:
4918: new_start_match = md->start_match_ptr;
4919: break;
4920:
4921: /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4922:
4923: case MATCH_COMMIT:
4924: rc = MATCH_NOMATCH;
4925: goto ENDLOOP;
4926:
4927: /* Any other return is either a successful match (MATCH_MATCH) or some kind
of error; both leave the loop with rc unchanged. */
4928:
4929: default:
4930: goto ENDLOOP;
4931: }
4932:
4933: /* Control reaches here for the various types of "no match at this point"
4934: result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4935:
4936: rc = MATCH_NOMATCH;
4937:
4938: /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4939: newline in the subject (though it may continue over the newline). Therefore,
4940: if we have just failed to match, starting at a newline, do not continue. */
4941:
4942: if (firstline && IS_NEWLINE(start_match)) break;
4943:
4944: /* Advance to new matching position */
4945:
4946: start_match = new_start_match;
4947:
4948: /* Break the loop if the pattern is anchored or if we have passed the end of
4949: the subject. */
4950:
4951: if (anchored || start_match > end_subject) break;
4952:
4953: /* If we have just passed a CR and we are now at a LF, and the pattern does
4954: not contain any explicit matches for \r or \n, and the newline option is CRLF
4955: or ANY or ANYCRLF, advance the match position by one more character. */
4956:
1.3 ! misha 4957: if (start_match[-1] == CHAR_CR &&
1.1 misha 4958: start_match < end_subject &&
1.3 ! misha 4959: *start_match == CHAR_NL &&
1.1 misha 4960: (re->flags & PCRE_HASCRORLF) == 0 &&
4961: (md->nltype == NLTYPE_ANY ||
4962: md->nltype == NLTYPE_ANYCRLF ||
4963: md->nllen == 2))
4964: start_match++;
4965:
4966: } /* End of for(;;) "bumpalong" loop */
4967:
4968: /* ==========================================================================*/
4969:
4970: /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4971: conditions is true:
4972:
4973: (1) The pattern is anchored or the match was failed by (*COMMIT);
4974:
4975: (2) We are past the end of the subject;
4976:
4977: (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4978: this option requests that a match occur at or before the first newline in
4979: the subject.
4980:
4981: When we have a match and the offset vector is big enough to deal with any
4982: backreferences, captured substring offsets will already be set up. In the case
4983: where we had to get some local store to hold offsets for backreference
4984: processing, copy those that we can. In this case there need not be overflow if
4985: certain parts of the pattern were not used, even though there are more
4986: capturing parentheses than vector slots. */
4987:
4988: ENDLOOP:
4989:
4990: if (rc == MATCH_MATCH)
4991: {
4992: if (using_temporary_offsets)
4993: {
/* Elements 0 and 1 are not copied here; they are written directly from
md->start_match_ptr and md->end_match_ptr further down. */
4994: if (offsetcount >= 4)
4995: {
4996: memcpy(offsets + 2, md->offset_vector + 2,
4997: (offsetcount - 2) * sizeof(int));
4998: DPRINTF(("Copied offsets from temporary memory\n"));
4999: }
5000: if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
5001: DPRINTF(("Freeing temporary memory\n"));
5002: (pcre_free)(md->offset_vector);
5003: }
5004:
5005: /* Set the return code to the number of captured strings, or 0 if there are
5006: too many to fit into the vector. */
5007:
5008: rc = md->offset_overflow? 0 : md->end_offset_top/2;
5009:
5010: /* If there is space, set up the whole thing as substring 0. The value of
5011: md->start_match_ptr might be modified if \K was encountered on the successful
5012: matching path. */
5013:
5014: if (offsetcount < 2) rc = 0; else
5015: {
5016: offsets[0] = md->start_match_ptr - md->start_subject;
5017: offsets[1] = md->end_match_ptr - md->start_subject;
5018: }
5019:
5020: DPRINTF((">>>> returning %d\n", rc));
5021: return rc;
5022: }
5023:
5024: /* Control gets here if there has been an error, or if the overall match
5025: attempt has failed at all permitted starting positions. */
5026:
5027: if (using_temporary_offsets)
5028: {
5029: DPRINTF(("Freeing temporary memory\n"));
5030: (pcre_free)(md->offset_vector);
5031: }
5032:
/* Anything other than MATCH_NOMATCH is returned unchanged. A plain failure is
reported as PCRE_ERROR_PARTIAL when partial matching was requested and
md->hitend is set, and as PCRE_ERROR_NOMATCH otherwise. */
5033: if (rc != MATCH_NOMATCH)
5034: {
5035: DPRINTF((">>>> error: returning %d\n", rc));
5036: return rc;
5037: }
5038: else if (md->partial && md->hitend)
5039: {
5040: DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5041: return PCRE_ERROR_PARTIAL;
5042: }
5043: else
5044: {
5045: DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5046: return PCRE_ERROR_NOMATCH;
5047: }
5048: }
5049:
5050: /* End of pcre_exec.c */
E-mail: