Annotation of win32/pcre/pcre_exec.c, revision 1.2
1.1 misha 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
9: Copyright (c) 1997-2008 University of Cambridge
10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40:
41: /* This module contains pcre_exec(), the externally visible function that does
42: pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43: possible. There are also some static supporting functions. */
44:
45: #ifdef HAVE_CONFIG_H
46: #include "config.h"
47: #endif
48:
49: #define NLBLOCK md /* Block containing newline information */
50: #define PSSTART start_subject /* Field containing processed string start */
51: #define PSEND end_subject /* Field containing processed string end */
52:
53: #include "pcre_internal.h"
54:
55: /* Undefine some potentially clashing cpp symbols */
56:
57: #undef min
58: #undef max
59:
60: /* Flag bits for the match() function */
61:
62: #define match_condassert 0x01 /* Called to check a condition assertion */
63: #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64:
65: /* Non-error returns from the match() function. Error returns are externally
66: defined PCRE_ERROR_xxx codes, which are all negative. */
67:
68: #define MATCH_MATCH 1
69: #define MATCH_NOMATCH 0
70:
71: /* Special internal returns from the match() function. Make them sufficiently
72: negative to avoid the external error codes. */
73:
74: #define MATCH_COMMIT (-999)
75: #define MATCH_PRUNE (-998)
76: #define MATCH_SKIP (-997)
77: #define MATCH_THEN (-996)
78:
79: /* Maximum number of ints of offset to save on the stack for recursive calls.
80: If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81: because the offset vector is always a multiple of 3 long. */
82:
83: #define REC_STACK_SAVE_MAX 30
84:
/* Lower and upper repeat counts for the common quantifiers, one entry per
quantifier in each table. In rep_max a value of 0 stands for "no upper limit".
The entry order presumably follows the repeat opcodes (*, *?, +, +?, ?, ??) -
confirm against the code that indexes these tables. */

static const char rep_min[] = {
  0, 0,
  1, 1,
  0, 0
};

static const char rep_max[] = {
  0, 0,
  0, 0,
  1, 1
};
89:
90:
91:
92: #ifdef DEBUG
93: /*************************************************
94: * Debugging function to print chars *
95: *************************************************/
96:
97: /* Print a sequence of chars in printable format, stopping at the end of the
98: subject if requested.
99:
100: Arguments:
101: p points to characters
102: length number to print
103: is_subject TRUE if printing from within md->start_subject
104: md pointer to matching data block, if is_subject is TRUE
105:
106: Returns: nothing
107: */
108:
109: static void
110: pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111: {
112: unsigned int c;
113: if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114: while (length-- > 0)
115: if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116: }
117: #endif
118:
119:
120:
121: /*************************************************
122: * Match a back-reference *
123: *************************************************/
124:
125: /* If a back reference hasn't been set, the length that is passed is greater
126: than the number of characters left in the string, so the match fails.
127:
128: Arguments:
129: offset index into the offset vector
130: eptr points into the subject
131: length length to be matched
132: md points to match data block
133: ims the ims flags
134:
135: Returns: TRUE if matched
136: */
137:
138: static BOOL
139: match_ref(int offset, register USPTR eptr, int length, match_data *md,
140: unsigned long int ims)
141: {
142: USPTR p = md->start_subject + md->offset_vector[offset];
143:
144: #ifdef DEBUG
145: if (eptr >= md->end_subject)
146: printf("matching subject <null>");
147: else
148: {
149: printf("matching subject ");
150: pchars(eptr, length, TRUE, md);
151: }
152: printf(" against backref ");
153: pchars(p, length, FALSE, md);
154: printf("\n");
155: #endif
156:
157: /* Always fail if not enough characters left */
158:
159: if (length > md->end_subject - eptr) return FALSE;
160:
1.2 ! misha 161: /* Separate the caseless case for speed. In UTF-8 mode we can only do this
! 162: properly if Unicode properties are supported. Otherwise, we can check only
! 163: ASCII characters. */
1.1 misha 164:
165: if ((ims & PCRE_CASELESS) != 0)
166: {
1.2 ! misha 167: #ifdef SUPPORT_UTF8
! 168: #ifdef SUPPORT_UCP
! 169: if (md->utf8)
! 170: {
! 171: USPTR endptr = eptr + length;
! 172: while (eptr < endptr)
! 173: {
! 174: int c, d;
! 175: GETCHARINC(c, eptr);
! 176: GETCHARINC(d, p);
! 177: if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
! 178: }
! 179: }
! 180: else
! 181: #endif
! 182: #endif
! 183:
! 184: /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
! 185: is no UCP support. */
! 186:
1.1 misha 187: while (length-- > 0)
1.2 ! misha 188: { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
1.1 misha 189: }
1.2 ! misha 190:
! 191: /* In the caseful case, we can just compare the bytes, whether or not we
! 192: are in UTF-8 mode. */
! 193:
1.1 misha 194: else
195: { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196:
197: return TRUE;
198: }
199:
200:
201:
202: /***************************************************************************
203: ****************************************************************************
204: RECURSION IN THE match() FUNCTION
205:
206: The match() function is highly recursive, though not every recursive call
207: increases the recursive depth. Nevertheless, some regular expressions can cause
208: it to recurse to a great depth. I was writing for Unix, so I just let it call
209: itself recursively. This uses the stack for saving everything that has to be
210: saved for a recursive call. On Unix, the stack can be large, and this works
211: fine.
212:
213: It turns out that on some non-Unix-like systems there are problems with
214: programs that use a lot of stack. (This despite the fact that every last chip
215: has oodles of memory these days, and techniques for extending the stack have
216: been known for decades.) So....
217:
218: There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219: calls by keeping local variables that need to be preserved in blocks of memory
220: obtained from malloc() instead of on the stack. Macros are used to
221: achieve this so that the actual code doesn't look very different to what it
222: always used to.
223:
224: The original heap-recursive code used longjmp(). However, it seems that this
225: can be very slow on some operating systems. Following a suggestion from Stan
226: Switzer, the use of longjmp() has been abolished, at the cost of having to
227: provide a unique number for each call to RMATCH. There is no way of generating
228: a sequence of numbers at compile time in C. I have given them names, to make
229: them stand out more clearly.
230:
231: Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232: FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233: tests. Furthermore, not using longjmp() means that local dynamic variables
234: don't have indeterminate values; this has meant that the frame size can be
235: reduced because the result can be "passed back" by straight setting of the
236: variable instead of being passed in the frame.
237: ****************************************************************************
238: ***************************************************************************/
239:
240: /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241: below must be updated in sync. */
242:
enum {
  RM1 = 1, RM2,  RM3,  RM4,  RM5,  RM6,
  RM7,     RM8,  RM9,  RM10, RM11, RM12,
  RM13,    RM14, RM15, RM16, RM17, RM18,
  RM19,    RM20, RM21, RM22, RM23, RM24,
  RM25,    RM26, RM27, RM28, RM29, RM30,
  RM31,    RM32, RM33, RM34, RM35, RM36,
  RM37,    RM38, RM39, RM40, RM41, RM42,
  RM43,    RM44, RM45, RM46, RM47, RM48,
  RM49,    RM50, RM51, RM52, RM53, RM54
};
249:
250: /* These versions of the macros use the stack, as normal. There are debugging
251: versions and production versions. Note that the "rw" argument of RMATCH isn't
252: actuall used in this definition. */
253:
254: #ifndef NO_RECURSE
255: #define REGISTER register
256:
257: #ifdef DEBUG
258: #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
259: { \
260: printf("match() called in line %d\n", __LINE__); \
261: rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); \
262: printf("to line %d\n", __LINE__); \
263: }
264: #define RRETURN(ra) \
265: { \
266: printf("match() returned %d from line %d ", ra, __LINE__); \
267: return ra; \
268: }
269: #else
270: #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
271: rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
272: #define RRETURN(ra) return ra
273: #endif
274:
275: #else
276:
277:
278: /* These versions of the macros manage a private stack on the heap. Note that
279: the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280: argument of match(), which never changes. */
281:
282: #define REGISTER
283:
284: #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
285: {\
286: heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
287: frame->Xwhere = rw; \
288: newframe->Xeptr = ra;\
289: newframe->Xecode = rb;\
290: newframe->Xmstart = mstart;\
291: newframe->Xoffset_top = rc;\
292: newframe->Xims = re;\
293: newframe->Xeptrb = rf;\
294: newframe->Xflags = rg;\
295: newframe->Xrdepth = frame->Xrdepth + 1;\
296: newframe->Xprevframe = frame;\
297: frame = newframe;\
298: DPRINTF(("restarting from line %d\n", __LINE__));\
299: goto HEAP_RECURSE;\
300: L_##rw:\
301: DPRINTF(("jumped back to line %d\n", __LINE__));\
302: }
303:
304: #define RRETURN(ra)\
305: {\
306: heapframe *newframe = frame;\
307: frame = newframe->Xprevframe;\
308: (pcre_stack_free)(newframe);\
309: if (frame != NULL)\
310: {\
311: rrc = ra;\
312: goto HEAP_RETURN;\
313: }\
314: return ra;\
315: }
316:
317:
318: /* Structure for remembering the local variables in a private frame */
319:
320: typedef struct heapframe {
321: struct heapframe *Xprevframe;
322:
323: /* Function arguments that may change */
324:
325: const uschar *Xeptr;
326: const uschar *Xecode;
327: const uschar *Xmstart;
328: int Xoffset_top;
329: long int Xims;
330: eptrblock *Xeptrb;
331: int Xflags;
332: unsigned int Xrdepth;
333:
334: /* Function local variables */
335:
336: const uschar *Xcallpat;
337: const uschar *Xcharptr;
338: const uschar *Xdata;
339: const uschar *Xnext;
340: const uschar *Xpp;
341: const uschar *Xprev;
342: const uschar *Xsaved_eptr;
343:
344: recursion_info Xnew_recursive;
345:
346: BOOL Xcur_is_word;
347: BOOL Xcondition;
348: BOOL Xprev_is_word;
349:
350: unsigned long int Xoriginal_ims;
351:
352: #ifdef SUPPORT_UCP
353: int Xprop_type;
354: int Xprop_value;
355: int Xprop_fail_result;
356: int Xprop_category;
357: int Xprop_chartype;
358: int Xprop_script;
359: int Xoclength;
360: uschar Xocchars[8];
361: #endif
362:
363: int Xctype;
364: unsigned int Xfc;
365: int Xfi;
366: int Xlength;
367: int Xmax;
368: int Xmin;
369: int Xnumber;
370: int Xoffset;
371: int Xop;
372: int Xsave_capture_last;
373: int Xsave_offset1, Xsave_offset2, Xsave_offset3;
374: int Xstacksave[REC_STACK_SAVE_MAX];
375:
376: eptrblock Xnewptrb;
377:
378: /* Where to jump back to */
379:
380: int Xwhere;
381:
382: } heapframe;
383:
384: #endif
385:
386:
387: /***************************************************************************
388: ***************************************************************************/
389:
390:
391:
392: /*************************************************
393: * Match from current position *
394: *************************************************/
395:
396: /* This function is called recursively in many circumstances. Whenever it
397: returns a negative (error) response, the outer incarnation must also return the
398: same response.
399:
400: Performance note: It might be tempting to extract commonly used fields from the
401: md structure (e.g. utf8, end_subject) into individual variables to improve
402: performance. Tests using gcc on a SPARC disproved this; in the first case, it
403: made performance worse.
404:
405: Arguments:
406: eptr pointer to current character in subject
407: ecode pointer to current position in compiled code
408: mstart pointer to the current match start position (can be modified
409: by encountering \K)
410: offset_top current top pointer
411: md pointer to "static" info for the match
412: ims current /i, /m, and /s options
413: eptrb pointer to chain of blocks containing eptr at start of
414: brackets - for testing for empty matches
415: flags can contain
416: match_condassert - this is an assertion condition
417: match_cbegroup - this is the start of an unlimited repeat
418: group that can match an empty string
419: rdepth the recursion depth
420:
421: Returns: MATCH_MATCH if matched ) these values are >= 0
422: MATCH_NOMATCH if failed to match )
423: a negative PCRE_ERROR_xxx value if aborted by an error condition
424: (e.g. stopped by repeated call or recursion limit)
425: */
426:
427: static int
428: match(REGISTER USPTR eptr, REGISTER const uschar *ecode, const uschar *mstart,
429: int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
430: int flags, unsigned int rdepth)
431: {
432: /* These variables do not need to be preserved over recursion in this function,
433: so they can be ordinary variables in all cases. Mark some of them with
434: "register" because they are used a lot in loops. */
435:
436: register int rrc; /* Returns from recursive calls */
437: register int i; /* Used for loops not involving calls to RMATCH() */
438: register unsigned int c; /* Character values not kept over RMATCH() calls */
439: register BOOL utf8; /* Local copy of UTF-8 flag for speed */
440:
441: BOOL minimize, possessive; /* Quantifier options */
442:
443: /* When recursion is not being used, all "local" variables that have to be
444: preserved over calls to RMATCH() are part of a "frame" which is obtained from
445: heap storage. Set up the top-level frame here; others are obtained from the
446: heap whenever RMATCH() does a "recursion". See the macro definitions above. */
447:
448: #ifdef NO_RECURSE
449: heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
450: frame->Xprevframe = NULL; /* Marks the top level */
451:
452: /* Copy in the original argument variables */
453:
454: frame->Xeptr = eptr;
455: frame->Xecode = ecode;
456: frame->Xmstart = mstart;
457: frame->Xoffset_top = offset_top;
458: frame->Xims = ims;
459: frame->Xeptrb = eptrb;
460: frame->Xflags = flags;
461: frame->Xrdepth = rdepth;
462:
463: /* This is where control jumps back to in order to effect "recursion" */
464:
465: HEAP_RECURSE:
466:
467: /* Macros make the argument variables come from the current frame */
468:
469: #define eptr frame->Xeptr
470: #define ecode frame->Xecode
471: #define mstart frame->Xmstart
472: #define offset_top frame->Xoffset_top
473: #define ims frame->Xims
474: #define eptrb frame->Xeptrb
475: #define flags frame->Xflags
476: #define rdepth frame->Xrdepth
477:
478: /* Ditto for the local variables */
479:
480: #ifdef SUPPORT_UTF8
481: #define charptr frame->Xcharptr
482: #endif
483: #define callpat frame->Xcallpat
484: #define data frame->Xdata
485: #define next frame->Xnext
486: #define pp frame->Xpp
487: #define prev frame->Xprev
488: #define saved_eptr frame->Xsaved_eptr
489:
490: #define new_recursive frame->Xnew_recursive
491:
492: #define cur_is_word frame->Xcur_is_word
493: #define condition frame->Xcondition
494: #define prev_is_word frame->Xprev_is_word
495:
496: #define original_ims frame->Xoriginal_ims
497:
498: #ifdef SUPPORT_UCP
499: #define prop_type frame->Xprop_type
500: #define prop_value frame->Xprop_value
501: #define prop_fail_result frame->Xprop_fail_result
502: #define prop_category frame->Xprop_category
503: #define prop_chartype frame->Xprop_chartype
504: #define prop_script frame->Xprop_script
505: #define oclength frame->Xoclength
506: #define occhars frame->Xocchars
507: #endif
508:
509: #define ctype frame->Xctype
510: #define fc frame->Xfc
511: #define fi frame->Xfi
512: #define length frame->Xlength
513: #define max frame->Xmax
514: #define min frame->Xmin
515: #define number frame->Xnumber
516: #define offset frame->Xoffset
517: #define op frame->Xop
518: #define save_capture_last frame->Xsave_capture_last
519: #define save_offset1 frame->Xsave_offset1
520: #define save_offset2 frame->Xsave_offset2
521: #define save_offset3 frame->Xsave_offset3
522: #define stacksave frame->Xstacksave
523:
524: #define newptrb frame->Xnewptrb
525:
526: /* When recursion is being used, local variables are allocated on the stack and
527: get preserved during recursion in the normal way. In this environment, fi and
528: i, and fc and c, can be the same variables. */
529:
530: #else /* NO_RECURSE not defined */
531: #define fi i
532: #define fc c
533:
534:
535: #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
536: const uschar *charptr; /* in small blocks of the code. My normal */
537: #endif /* style of coding would have declared */
538: const uschar *callpat; /* them within each of those blocks. */
539: const uschar *data; /* However, in order to accommodate the */
540: const uschar *next; /* version of this code that uses an */
541: USPTR pp; /* external "stack" implemented on the */
542: const uschar *prev; /* heap, it is easier to declare them all */
543: USPTR saved_eptr; /* here, so the declarations can be cut */
544: /* out in a block. The only declarations */
545: recursion_info new_recursive; /* within blocks below are for variables */
546: /* that do not have to be preserved over */
547: BOOL cur_is_word; /* a recursive call to RMATCH(). */
548: BOOL condition;
549: BOOL prev_is_word;
550:
551: unsigned long int original_ims;
552:
553: #ifdef SUPPORT_UCP
554: int prop_type;
555: int prop_value;
556: int prop_fail_result;
557: int prop_category;
558: int prop_chartype;
559: int prop_script;
560: int oclength;
561: uschar occhars[8];
562: #endif
563:
564: int ctype;
565: int length;
566: int max;
567: int min;
568: int number;
569: int offset;
570: int op;
571: int save_capture_last;
572: int save_offset1, save_offset2, save_offset3;
573: int stacksave[REC_STACK_SAVE_MAX];
574:
575: eptrblock newptrb;
576: #endif /* NO_RECURSE */
577:
578: /* These statements are here to stop the compiler complaining about uninitialized
579: variables. */
580:
581: #ifdef SUPPORT_UCP
582: prop_value = 0;
583: prop_fail_result = 0;
584: #endif
585:
586:
587: /* This label is used for tail recursion, which is used in a few cases even
588: when NO_RECURSE is not defined, in order to reduce the amount of stack that is
589: used. Thanks to Ian Taylor for noticing this possibility and sending the
590: original patch. */
591:
592: TAIL_RECURSE:
593:
594: /* OK, now we can get on with the real code of the function. Recursive calls
595: are specified by the macro RMATCH and RRETURN is used to return. When
596: NO_RECURSE is *not* defined, these just turn into a recursive call to match()
597: and a "return", respectively (possibly with some debugging if DEBUG is
598: defined). However, RMATCH isn't like a function call because it's quite a
599: complicated macro. It has to be used in one particular way. This shouldn't,
600: however, impact performance when true recursion is being used. */
601:
602: #ifdef SUPPORT_UTF8
603: utf8 = md->utf8; /* Local copy of the flag */
604: #else
605: utf8 = FALSE;
606: #endif
607:
608: /* First check that we haven't called match() too many times, or that we
609: haven't exceeded the recursive call limit. */
610:
611: if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
612: if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
613:
614: original_ims = ims; /* Save for resetting on ')' */
615:
616: /* At the start of a group with an unlimited repeat that may match an empty
617: string, the match_cbegroup flag is set. When this is the case, add the current
618: subject pointer to the chain of such remembered pointers, to be checked when we
619: hit the closing ket, in order to break infinite loops that match no characters.
620: When match() is called in other circumstances, don't add to the chain. The
621: match_cbegroup flag must NOT be used with tail recursion, because the memory
622: block that is used is on the stack, so a new one may be required for each
623: match(). */
624:
625: if ((flags & match_cbegroup) != 0)
626: {
627: newptrb.epb_saved_eptr = eptr;
628: newptrb.epb_prev = eptrb;
629: eptrb = &newptrb;
630: }
631:
632: /* Now start processing the opcodes. */
633:
634: for (;;)
635: {
636: minimize = possessive = FALSE;
637: op = *ecode;
638:
639: /* For partial matching, remember if we ever hit the end of the subject after
640: matching at least one subject character. */
641:
642: if (md->partial &&
643: eptr >= md->end_subject &&
644: eptr > mstart)
645: md->hitend = TRUE;
646:
647: switch(op)
648: {
649: case OP_FAIL:
650: RRETURN(MATCH_NOMATCH);
651:
652: case OP_PRUNE:
653: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
654: ims, eptrb, flags, RM51);
655: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
656: RRETURN(MATCH_PRUNE);
657:
658: case OP_COMMIT:
659: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
660: ims, eptrb, flags, RM52);
661: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
662: RRETURN(MATCH_COMMIT);
663:
664: case OP_SKIP:
665: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
666: ims, eptrb, flags, RM53);
667: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
668: md->start_match_ptr = eptr; /* Pass back current position */
669: RRETURN(MATCH_SKIP);
670:
671: case OP_THEN:
672: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
673: ims, eptrb, flags, RM54);
674: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
675: RRETURN(MATCH_THEN);
676:
677: /* Handle a capturing bracket. If there is space in the offset vector, save
678: the current subject position in the working slot at the top of the vector.
679: We mustn't change the current values of the data slot, because they may be
680: set from a previous iteration of this group, and be referred to by a
681: reference inside the group.
682:
683: If the bracket fails to match, we need to restore this value and also the
684: values of the final offsets, in case they were set by a previous iteration
685: of the same bracket.
686:
687: If there isn't enough space in the offset vector, treat this as if it were
688: a non-capturing bracket. Don't worry about setting the flag for the error
689: case here; that is handled in the code for KET. */
690:
691: case OP_CBRA:
692: case OP_SCBRA:
693: number = GET2(ecode, 1+LINK_SIZE);
694: offset = number << 1;
695:
696: #ifdef DEBUG
697: printf("start bracket %d\n", number);
698: printf("subject=");
699: pchars(eptr, 16, TRUE, md);
700: printf("\n");
701: #endif
702:
703: if (offset < md->offset_max)
704: {
705: save_offset1 = md->offset_vector[offset];
706: save_offset2 = md->offset_vector[offset+1];
707: save_offset3 = md->offset_vector[md->offset_end - number];
708: save_capture_last = md->capture_last;
709:
710: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
711: md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
712:
713: flags = (op == OP_SCBRA)? match_cbegroup : 0;
714: do
715: {
716: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
717: ims, eptrb, flags, RM1);
718: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
719: md->capture_last = save_capture_last;
720: ecode += GET(ecode, 1);
721: }
722: while (*ecode == OP_ALT);
723:
724: DPRINTF(("bracket %d failed\n", number));
725:
726: md->offset_vector[offset] = save_offset1;
727: md->offset_vector[offset+1] = save_offset2;
728: md->offset_vector[md->offset_end - number] = save_offset3;
729:
730: RRETURN(MATCH_NOMATCH);
731: }
732:
733: /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
734: as a non-capturing bracket. */
735:
736: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
737: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
738:
739: DPRINTF(("insufficient capture room: treat as non-capturing\n"));
740:
741: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
742: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
743:
744: /* Non-capturing bracket. Loop for all the alternatives. When we get to the
745: final alternative within the brackets, we would return the result of a
746: recursive call to match() whatever happened. We can reduce stack usage by
747: turning this into a tail recursion, except in the case when match_cbegroup
748: is set.*/
749:
750: case OP_BRA:
751: case OP_SBRA:
752: DPRINTF(("start non-capturing bracket\n"));
753: flags = (op >= OP_SBRA)? match_cbegroup : 0;
754: for (;;)
755: {
756: if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
757: {
758: if (flags == 0) /* Not a possibly empty group */
759: {
760: ecode += _pcre_OP_lengths[*ecode];
761: DPRINTF(("bracket 0 tail recursion\n"));
762: goto TAIL_RECURSE;
763: }
764:
765: /* Possibly empty group; can't use tail recursion. */
766:
767: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
768: eptrb, flags, RM48);
769: RRETURN(rrc);
770: }
771:
772: /* For non-final alternatives, continue the loop for a NOMATCH result;
773: otherwise return. */
774:
775: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
776: eptrb, flags, RM2);
777: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
778: ecode += GET(ecode, 1);
779: }
780: /* Control never reaches here. */
781:
782: /* Conditional group: compilation checked that there are no more than
783: two branches. If the condition is false, skipping the first branch takes us
784: past the end if there is only one branch, but that's OK because that is
785: exactly what going to the ket would do. As there is only one branch to be
786: obeyed, we can use tail recursion to avoid using another stack frame. */
787:
788: case OP_COND:
789: case OP_SCOND:
790: if (ecode[LINK_SIZE+1] == OP_RREF) /* Recursion test */
791: {
792: offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
793: condition = md->recursive != NULL &&
794: (offset == RREF_ANY || offset == md->recursive->group_num);
795: ecode += condition? 3 : GET(ecode, 1);
796: }
797:
798: else if (ecode[LINK_SIZE+1] == OP_CREF) /* Group used test */
799: {
800: offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
801: condition = offset < offset_top && md->offset_vector[offset] >= 0;
802: ecode += condition? 3 : GET(ecode, 1);
803: }
804:
805: else if (ecode[LINK_SIZE+1] == OP_DEF) /* DEFINE - always false */
806: {
807: condition = FALSE;
808: ecode += GET(ecode, 1);
809: }
810:
811: /* The condition is an assertion. Call match() to evaluate it - setting
812: the final argument match_condassert causes it to stop at the end of an
813: assertion. */
814:
815: else
816: {
817: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
818: match_condassert, RM3);
819: if (rrc == MATCH_MATCH)
820: {
821: condition = TRUE;
822: ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
823: while (*ecode == OP_ALT) ecode += GET(ecode, 1);
824: }
825: else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
826: {
827: RRETURN(rrc); /* Need braces because of following else */
828: }
829: else
830: {
831: condition = FALSE;
832: ecode += GET(ecode, 1);
833: }
834: }
835:
836: /* We are now at the branch that is to be obeyed. As there is only one,
837: we can use tail recursion to avoid using another stack frame, except when
838: match_cbegroup is required for an unlimited repeat of a possibly empty
839: group. If the second alternative doesn't exist, we can just plough on. */
840:
841: if (condition || *ecode == OP_ALT)
842: {
843: ecode += 1 + LINK_SIZE;
844: if (op == OP_SCOND) /* Possibly empty group */
845: {
846: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
847: RRETURN(rrc);
848: }
849: else /* Group must match something */
850: {
851: flags = 0;
852: goto TAIL_RECURSE;
853: }
854: }
855: else /* Condition false & no 2nd alternative */
856: {
857: ecode += 1 + LINK_SIZE;
858: }
859: break;
860:
861:
862: /* End of the pattern, either real or forced. If we are in a top-level
863: recursion, we should restore the offsets appropriately and continue from
864: after the call. */
865:
866: case OP_ACCEPT:
867: case OP_END:
868: if (md->recursive != NULL && md->recursive->group_num == 0)
869: {
870: recursion_info *rec = md->recursive;
871: DPRINTF(("End of pattern in a (?0) recursion\n"));
872: md->recursive = rec->prevrec;
873: memmove(md->offset_vector, rec->offset_save,
874: rec->saved_max * sizeof(int));
875: mstart = rec->save_start;
876: ims = original_ims;
877: ecode = rec->after_call;
878: break;
879: }
880:
881: /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
882: string - backtracking will then try other alternatives, if any. */
883:
884: if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
885: md->end_match_ptr = eptr; /* Record where we ended */
886: md->end_offset_top = offset_top; /* and how many extracts were taken */
887: md->start_match_ptr = mstart; /* and the start (\K can modify) */
888: RRETURN(MATCH_MATCH);
889:
890: /* Change option settings */
891:
892: case OP_OPT:
893: ims = ecode[1];
894: ecode += 2;
895: DPRINTF(("ims set to %02lx\n", ims));
896: break;
897:
898: /* Assertion brackets. Check the alternative branches in turn - the
899: matching won't pass the KET for an assertion. If any one branch matches,
900: the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
901: start of each branch to move the current point backwards, so the code at
902: this level is identical to the lookahead case. */
903:
904: case OP_ASSERT:
905: case OP_ASSERTBACK:
906: do
907: {
908: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
909: RM4);
910: if (rrc == MATCH_MATCH) break;
911: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
912: ecode += GET(ecode, 1);
913: }
914: while (*ecode == OP_ALT);
915: if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
916:
917: /* If checking an assertion for a condition, return MATCH_MATCH. */
918:
919: if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
920:
921: /* Continue from after the assertion, updating the offsets high water
922: mark, since extracts may have been taken during the assertion. */
923:
924: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
925: ecode += 1 + LINK_SIZE;
926: offset_top = md->end_offset_top;
927: continue;
928:
929: /* Negative assertion: all branches must fail to match */
930:
931: case OP_ASSERT_NOT:
932: case OP_ASSERTBACK_NOT:
933: do
934: {
935: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
936: RM5);
937: if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
938: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
939: ecode += GET(ecode,1);
940: }
941: while (*ecode == OP_ALT);
942:
943: if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
944:
945: ecode += 1 + LINK_SIZE;
946: continue;
947:
948: /* Move the subject pointer back. This occurs only at the start of
949: each branch of a lookbehind assertion. If we are too close to the start to
950: move back, this match function fails. When working with UTF-8 we move
951: back a number of characters, not bytes. */
952:
953: case OP_REVERSE:
954: #ifdef SUPPORT_UTF8
955: if (utf8)
956: {
957: i = GET(ecode, 1);
958: while (i-- > 0)
959: {
960: eptr--;
961: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
962: BACKCHAR(eptr);
963: }
964: }
965: else
966: #endif
967:
968: /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
969:
970: {
971: eptr -= GET(ecode, 1);
972: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
973: }
974:
975: /* Skip to next op code */
976:
977: ecode += 1 + LINK_SIZE;
978: break;
979:
980: /* The callout item calls an external function, if one is provided, passing
981: details of the match so far. This is mainly for debugging, though the
982: function is able to force a failure. */
983:
984: case OP_CALLOUT:
985: if (pcre_callout != NULL)
986: {
987: pcre_callout_block cb;
988: cb.version = 1; /* Version 1 of the callout block */
989: cb.callout_number = ecode[1];
990: cb.offset_vector = md->offset_vector;
991: cb.subject = (PCRE_SPTR)md->start_subject;
992: cb.subject_length = md->end_subject - md->start_subject;
993: cb.start_match = mstart - md->start_subject;
994: cb.current_position = eptr - md->start_subject;
995: cb.pattern_position = GET(ecode, 2);
996: cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
997: cb.capture_top = offset_top/2;
998: cb.capture_last = md->capture_last;
999: cb.callout_data = md->callout_data;
1000: if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1001: if (rrc < 0) RRETURN(rrc);
1002: }
1003: ecode += 2 + 2*LINK_SIZE;
1004: break;
1005:
1006: /* Recursion either matches the current regex, or some subexpression. The
1007: offset data is the offset to the starting bracket from the start of the
1008: whole pattern. (This is so that it works from duplicated subpatterns.)
1009:
1010: If there are any capturing brackets started but not finished, we have to
1011: save their starting points and reinstate them after the recursion. However,
1012: we don't know how many such there are (offset_top records the completed
1013: total) so we just have to save all the potential data. There may be up to
1014: 65535 such values, which is too large to put on the stack, but using malloc
1015: for small numbers seems expensive. As a compromise, the stack is used when
1016: there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1017: is used. If the malloc fails, the match is abandoned at this point and
1018: PCRE_ERROR_NOMEMORY is returned; there is no fallback that saves only part
1019: of the offset data on the stack.
1020:
1021: There are also other values that have to be saved. We use a chained
1022: sequence of blocks that actually live on the stack. Thanks to Robin Houston
1023: for the original version of this logic. */
1024:
1025: case OP_RECURSE:
1026: {
1027: callpat = md->start_code + GET(ecode, 1);
1028: new_recursive.group_num = (callpat == md->start_code)? 0 :
1029: GET2(callpat, 1 + LINK_SIZE);
1030:
1031: /* Add to "recursing stack" */
1032:
1033: new_recursive.prevrec = md->recursive;
1034: md->recursive = &new_recursive;
1035:
1036: /* Find where to continue from afterwards */
1037:
1038: ecode += 1 + LINK_SIZE;
1039: new_recursive.after_call = ecode;
1040:
1041: /* Now save the offset data. */
1042:
1043: new_recursive.saved_max = md->offset_end;
1044: if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1045: new_recursive.offset_save = stacksave;
1046: else
1047: {
1048: new_recursive.offset_save =
1049: (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1050: if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1051: }
1052:
1053: memcpy(new_recursive.offset_save, md->offset_vector,
1054: new_recursive.saved_max * sizeof(int));
1055: new_recursive.save_start = mstart;
1056: mstart = eptr;
1057:
1058: /* OK, now we can do the recursion. For each top-level alternative we
1059: restore the offset and recursion data. */
1060:
1061: DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1062: flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1063: do
1064: {
1065: RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1066: md, ims, eptrb, flags, RM6);
1067: if (rrc == MATCH_MATCH)
1068: {
1069: DPRINTF(("Recursion matched\n"));
1070: md->recursive = new_recursive.prevrec;
1071: if (new_recursive.offset_save != stacksave)
1072: (pcre_free)(new_recursive.offset_save);
1073: RRETURN(MATCH_MATCH);
1074: }
1075: else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1076: {
1077: DPRINTF(("Recursion gave error %d\n", rrc));
1078: RRETURN(rrc);
1079: }
1080:
1081: md->recursive = &new_recursive;
1082: memcpy(md->offset_vector, new_recursive.offset_save,
1083: new_recursive.saved_max * sizeof(int));
1084: callpat += GET(callpat, 1);
1085: }
1086: while (*callpat == OP_ALT);
1087:
1088: DPRINTF(("Recursion didn't match\n"));
1089: md->recursive = new_recursive.prevrec;
1090: if (new_recursive.offset_save != stacksave)
1091: (pcre_free)(new_recursive.offset_save);
1092: RRETURN(MATCH_NOMATCH);
1093: }
1094: /* Control never reaches here */
1095:
1096: /* "Once" brackets are like assertion brackets except that after a match,
1097: the point in the subject string is not moved back. Thus there can never be
1098: a move back into the brackets. Friedl calls these "atomic" subpatterns.
1099: Check the alternative branches in turn - the matching won't pass the KET
1100: for this kind of subpattern. If any one branch matches, we carry on as at
1101: the end of a normal bracket, leaving the subject pointer. */
1102:
1103: case OP_ONCE:
1104: prev = ecode;
1105: saved_eptr = eptr;
1106:
1107: do
1108: {
1109: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1110: if (rrc == MATCH_MATCH) break;
1111: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1112: ecode += GET(ecode,1);
1113: }
1114: while (*ecode == OP_ALT);
1115:
1116: /* If we hit the end of the group (which could be repeated), fail */
1117:
1118: if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1119:
1120: /* Continue as from after the assertion, updating the offsets high water
1121: mark, since extracts may have been taken. */
1122:
1123: do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1124:
1125: offset_top = md->end_offset_top;
1126: eptr = md->end_match_ptr;
1127:
1128: /* For a non-repeating ket, just continue at this level. This also
1129: happens for a repeating ket if no characters were matched in the group.
1130: This is the forcible breaking of infinite loops as implemented in Perl
1131: 5.005. If there is an options reset, it will get obeyed in the normal
1132: course of events. */
1133:
1134: if (*ecode == OP_KET || eptr == saved_eptr)
1135: {
1136: ecode += 1+LINK_SIZE;
1137: break;
1138: }
1139:
1140: /* The repeating kets try the rest of the pattern or restart from the
1141: preceding bracket, in the appropriate order. The second "call" of match()
1142: uses tail recursion, to avoid using another stack frame. We need to reset
1143: any options that changed within the bracket before re-running it, so
1144: check the next opcode. */
1145:
1146: if (ecode[1+LINK_SIZE] == OP_OPT)
1147: {
1148: ims = (ims & ~PCRE_IMS) | ecode[4];
1149: DPRINTF(("ims set to %02lx at group repeat\n", ims));
1150: }
1151:
1152: if (*ecode == OP_KETRMIN)
1153: {
1154: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1155: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1156: ecode = prev;
1157: flags = 0;
1158: goto TAIL_RECURSE;
1159: }
1160: else /* OP_KETRMAX */
1161: {
1162: RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1163: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1164: ecode += 1 + LINK_SIZE;
1165: flags = 0;
1166: goto TAIL_RECURSE;
1167: }
1168: /* Control never gets here */
1169:
1170: /* An alternation is the end of a branch; scan along to find the end of the
1171: bracketed group and go to there. */
1172:
1173: case OP_ALT:
1174: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1175: break;
1176:
1177: /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1178: indicating that it may occur zero times. It may repeat infinitely, or not
1179: at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1180: with fixed upper repeat limits are compiled as a number of copies, with the
1181: optional ones preceded by BRAZERO or BRAMINZERO. */
1182:
1183: case OP_BRAZERO:
1184: {
1185: next = ecode+1;
1186: RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1187: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1188: do next += GET(next,1); while (*next == OP_ALT);
1189: ecode = next + 1 + LINK_SIZE;
1190: }
1191: break;
1192:
1193: case OP_BRAMINZERO:
1194: {
1195: next = ecode+1;
1196: do next += GET(next, 1); while (*next == OP_ALT);
1197: RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1198: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1199: ecode++;
1200: }
1201: break;
1202:
1203: case OP_SKIPZERO:
1204: {
1205: next = ecode+1;
1206: do next += GET(next,1); while (*next == OP_ALT);
1207: ecode = next + 1 + LINK_SIZE;
1208: }
1209: break;
1210:
1211: /* End of a group, repeated or non-repeating. */
1212:
1213: case OP_KET:
1214: case OP_KETRMIN:
1215: case OP_KETRMAX:
1216: prev = ecode - GET(ecode, 1);
1217:
1218: /* If this was a group that remembered the subject start, in order to break
1219: infinite repeats of empty string matches, retrieve the subject start from
1220: the chain. Otherwise, set it NULL. */
1221:
1222: if (*prev >= OP_SBRA)
1223: {
1224: saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1225: eptrb = eptrb->epb_prev; /* Backup to previous group */
1226: }
1227: else saved_eptr = NULL;
1228:
1229: /* If we are at the end of an assertion group, stop matching and return
1230: MATCH_MATCH, but record the current high water mark for use by positive
1231: assertions. Do this also for the "once" (atomic) groups. */
1232:
1233: if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1234: *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1235: *prev == OP_ONCE)
1236: {
1237: md->end_match_ptr = eptr; /* For ONCE */
1238: md->end_offset_top = offset_top;
1239: RRETURN(MATCH_MATCH);
1240: }
1241:
1242: /* For capturing groups we have to check the group number back at the start
1243: and if necessary complete handling an extraction by setting the offsets and
1244: bumping the high water mark. Note that whole-pattern recursion is coded as
1245: a recurse into group 0, so it won't be picked up here. Instead, we catch it
1246: when the OP_END is reached. Other recursion is handled here. */
1247:
1248: if (*prev == OP_CBRA || *prev == OP_SCBRA)
1249: {
1250: number = GET2(prev, 1+LINK_SIZE);
1251: offset = number << 1;
1252:
1253: #ifdef DEBUG
1254: printf("end bracket %d", number);
1255: printf("\n");
1256: #endif
1257:
1258: md->capture_last = number;
1259: if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1260: {
1261: md->offset_vector[offset] =
1262: md->offset_vector[md->offset_end - number];
1263: md->offset_vector[offset+1] = eptr - md->start_subject;
1264: if (offset_top <= offset) offset_top = offset + 2;
1265: }
1266:
1267: /* Handle a recursively called group. Restore the offsets
1268: appropriately and continue from after the call. */
1269:
1270: if (md->recursive != NULL && md->recursive->group_num == number)
1271: {
1272: recursion_info *rec = md->recursive;
1273: DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1274: md->recursive = rec->prevrec;
1275: mstart = rec->save_start;
1276: memcpy(md->offset_vector, rec->offset_save,
1277: rec->saved_max * sizeof(int));
1278: ecode = rec->after_call;
1279: ims = original_ims;
1280: break;
1281: }
1282: }
1283:
1284: /* For both capturing and non-capturing groups, reset the value of the ims
1285: flags, in case they got changed during the group. */
1286:
1287: ims = original_ims;
1288: DPRINTF(("ims reset to %02lx\n", ims));
1289:
1290: /* For a non-repeating ket, just continue at this level. This also
1291: happens for a repeating ket if no characters were matched in the group.
1292: This is the forcible breaking of infinite loops as implemented in Perl
1293: 5.005. If there is an options reset, it will get obeyed in the normal
1294: course of events. */
1295:
1296: if (*ecode == OP_KET || eptr == saved_eptr)
1297: {
1298: ecode += 1 + LINK_SIZE;
1299: break;
1300: }
1301:
1302: /* The repeating kets try the rest of the pattern or restart from the
1303: preceding bracket, in the appropriate order. In the second case, we can use
1304: tail recursion to avoid using another stack frame, unless we have an
1305: unlimited repeat of a group that can match an empty string. */
1306:
1307: flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1308:
1309: if (*ecode == OP_KETRMIN)
1310: {
1311: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1312: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1313: if (flags != 0) /* Could match an empty string */
1314: {
1315: RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1316: RRETURN(rrc);
1317: }
1318: ecode = prev;
1319: goto TAIL_RECURSE;
1320: }
1321: else /* OP_KETRMAX */
1322: {
1323: RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1324: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1325: ecode += 1 + LINK_SIZE;
1326: flags = 0;
1327: goto TAIL_RECURSE;
1328: }
1329: /* Control never gets here */
1330:
1331: /* Start of subject unless notbol, or after internal newline if multiline */
1332:
1333: case OP_CIRC:
1334: if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1335: if ((ims & PCRE_MULTILINE) != 0)
1336: {
1337: if (eptr != md->start_subject &&
1338: (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1339: RRETURN(MATCH_NOMATCH);
1340: ecode++;
1341: break;
1342: }
1343: /* ... else fall through */
1344:
1345: /* Start of subject assertion */
1346:
1347: case OP_SOD:
1348: if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1349: ecode++;
1350: break;
1351:
1352: /* Start of match assertion */
1353:
1354: case OP_SOM:
1355: if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1356: ecode++;
1357: break;
1358:
1359: /* Reset the start of match point */
1360:
1361: case OP_SET_SOM:
1362: mstart = eptr;
1363: ecode++;
1364: break;
1365:
1366: /* Assert before internal newline if multiline, or before a terminating
1367: newline unless endonly is set, else end of subject unless noteol is set. */
1368:
1369: case OP_DOLL:
1370: if ((ims & PCRE_MULTILINE) != 0)
1371: {
1372: if (eptr < md->end_subject)
1373: { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1374: else
1375: { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1376: ecode++;
1377: break;
1378: }
1379: else
1380: {
1381: if (md->noteol) RRETURN(MATCH_NOMATCH);
1382: if (!md->endonly)
1383: {
1384: if (eptr != md->end_subject &&
1385: (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1386: RRETURN(MATCH_NOMATCH);
1387: ecode++;
1388: break;
1389: }
1390: }
1391: /* ... else fall through for endonly */
1392:
1393: /* End of subject assertion (\z) */
1394:
1395: case OP_EOD:
1396: if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1397: ecode++;
1398: break;
1399:
1400: /* End of subject or ending \n assertion (\Z) */
1401:
1402: case OP_EODN:
1403: if (eptr != md->end_subject &&
1404: (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1405: RRETURN(MATCH_NOMATCH);
1406: ecode++;
1407: break;
1408:
1409: /* Word boundary assertions */
1410:
1411: case OP_NOT_WORD_BOUNDARY:
1412: case OP_WORD_BOUNDARY:
1413: {
1414:
1415: /* Find out if the previous and current characters are "word" characters.
1416: It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1417: be "non-word" characters. */
1418:
1419: #ifdef SUPPORT_UTF8
1420: if (utf8)
1421: {
1422: if (eptr == md->start_subject) prev_is_word = FALSE; else
1423: {
1424: const uschar *lastptr = eptr - 1;
1425: while((*lastptr & 0xc0) == 0x80) lastptr--;
1426: GETCHAR(c, lastptr);
1427: prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1428: }
1429: if (eptr >= md->end_subject) cur_is_word = FALSE; else
1430: {
1431: GETCHAR(c, eptr);
1432: cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1433: }
1434: }
1435: else
1436: #endif
1437:
1438: /* More streamlined when not in UTF-8 mode */
1439:
1440: {
1441: prev_is_word = (eptr != md->start_subject) &&
1442: ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1443: cur_is_word = (eptr < md->end_subject) &&
1444: ((md->ctypes[*eptr] & ctype_word) != 0);
1445: }
1446:
1447: /* Now see if the situation is what we want */
1448:
1449: if ((*ecode++ == OP_WORD_BOUNDARY)?
1450: cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1451: RRETURN(MATCH_NOMATCH);
1452: }
1453: break;
1454:
1455: /* Match a single character type; inline for speed */
1456:
1457: case OP_ANY:
1458: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1459: /* Fall through */
1460:
1461: case OP_ALLANY:
1462: if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1463: if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1464: ecode++;
1465: break;
1466:
1467: /* Match a single byte, even in UTF-8 mode. This opcode really does match
1468: any byte, even newline, independent of the setting of PCRE_DOTALL. */
1469:
1470: case OP_ANYBYTE:
1471: if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1472: ecode++;
1473: break;
1474:
1475: case OP_NOT_DIGIT:
1476: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1477: GETCHARINCTEST(c, eptr);
1478: if (
1479: #ifdef SUPPORT_UTF8
1480: c < 256 &&
1481: #endif
1482: (md->ctypes[c] & ctype_digit) != 0
1483: )
1484: RRETURN(MATCH_NOMATCH);
1485: ecode++;
1486: break;
1487:
1488: case OP_DIGIT:
1489: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1490: GETCHARINCTEST(c, eptr);
1491: if (
1492: #ifdef SUPPORT_UTF8
1493: c >= 256 ||
1494: #endif
1495: (md->ctypes[c] & ctype_digit) == 0
1496: )
1497: RRETURN(MATCH_NOMATCH);
1498: ecode++;
1499: break;
1500:
1501: case OP_NOT_WHITESPACE:
1502: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1503: GETCHARINCTEST(c, eptr);
1504: if (
1505: #ifdef SUPPORT_UTF8
1506: c < 256 &&
1507: #endif
1508: (md->ctypes[c] & ctype_space) != 0
1509: )
1510: RRETURN(MATCH_NOMATCH);
1511: ecode++;
1512: break;
1513:
1514: case OP_WHITESPACE:
1515: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1516: GETCHARINCTEST(c, eptr);
1517: if (
1518: #ifdef SUPPORT_UTF8
1519: c >= 256 ||
1520: #endif
1521: (md->ctypes[c] & ctype_space) == 0
1522: )
1523: RRETURN(MATCH_NOMATCH);
1524: ecode++;
1525: break;
1526:
1527: case OP_NOT_WORDCHAR:
1528: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1529: GETCHARINCTEST(c, eptr);
1530: if (
1531: #ifdef SUPPORT_UTF8
1532: c < 256 &&
1533: #endif
1534: (md->ctypes[c] & ctype_word) != 0
1535: )
1536: RRETURN(MATCH_NOMATCH);
1537: ecode++;
1538: break;
1539:
1540: case OP_WORDCHAR:
1541: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1542: GETCHARINCTEST(c, eptr);
1543: if (
1544: #ifdef SUPPORT_UTF8
1545: c >= 256 ||
1546: #endif
1547: (md->ctypes[c] & ctype_word) == 0
1548: )
1549: RRETURN(MATCH_NOMATCH);
1550: ecode++;
1551: break;
1552:
1553: case OP_ANYNL:
1554: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1555: GETCHARINCTEST(c, eptr);
1556: switch(c)
1557: {
1558: default: RRETURN(MATCH_NOMATCH);
1559: case 0x000d:
1560: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1561: break;
1562:
1563: case 0x000a:
1564: break;
1565:
1566: case 0x000b:
1567: case 0x000c:
1568: case 0x0085:
1569: case 0x2028:
1570: case 0x2029:
1571: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1572: break;
1573: }
1574: ecode++;
1575: break;
1576:
1577: case OP_NOT_HSPACE:
1578: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1579: GETCHARINCTEST(c, eptr);
1580: switch(c)
1581: {
1582: default: break;
1583: case 0x09: /* HT */
1584: case 0x20: /* SPACE */
1585: case 0xa0: /* NBSP */
1586: case 0x1680: /* OGHAM SPACE MARK */
1587: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1588: case 0x2000: /* EN QUAD */
1589: case 0x2001: /* EM QUAD */
1590: case 0x2002: /* EN SPACE */
1591: case 0x2003: /* EM SPACE */
1592: case 0x2004: /* THREE-PER-EM SPACE */
1593: case 0x2005: /* FOUR-PER-EM SPACE */
1594: case 0x2006: /* SIX-PER-EM SPACE */
1595: case 0x2007: /* FIGURE SPACE */
1596: case 0x2008: /* PUNCTUATION SPACE */
1597: case 0x2009: /* THIN SPACE */
1598: case 0x200A: /* HAIR SPACE */
1599: case 0x202f: /* NARROW NO-BREAK SPACE */
1600: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1601: case 0x3000: /* IDEOGRAPHIC SPACE */
1602: RRETURN(MATCH_NOMATCH);
1603: }
1604: ecode++;
1605: break;
1606:
1607: case OP_HSPACE:
1608: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1609: GETCHARINCTEST(c, eptr);
1610: switch(c)
1611: {
1612: default: RRETURN(MATCH_NOMATCH);
1613: case 0x09: /* HT */
1614: case 0x20: /* SPACE */
1615: case 0xa0: /* NBSP */
1616: case 0x1680: /* OGHAM SPACE MARK */
1617: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1618: case 0x2000: /* EN QUAD */
1619: case 0x2001: /* EM QUAD */
1620: case 0x2002: /* EN SPACE */
1621: case 0x2003: /* EM SPACE */
1622: case 0x2004: /* THREE-PER-EM SPACE */
1623: case 0x2005: /* FOUR-PER-EM SPACE */
1624: case 0x2006: /* SIX-PER-EM SPACE */
1625: case 0x2007: /* FIGURE SPACE */
1626: case 0x2008: /* PUNCTUATION SPACE */
1627: case 0x2009: /* THIN SPACE */
1628: case 0x200A: /* HAIR SPACE */
1629: case 0x202f: /* NARROW NO-BREAK SPACE */
1630: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1631: case 0x3000: /* IDEOGRAPHIC SPACE */
1632: break;
1633: }
1634: ecode++;
1635: break;
1636:
1637: case OP_NOT_VSPACE:
1638: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1639: GETCHARINCTEST(c, eptr);
1640: switch(c)
1641: {
1642: default: break;
1643: case 0x0a: /* LF */
1644: case 0x0b: /* VT */
1645: case 0x0c: /* FF */
1646: case 0x0d: /* CR */
1647: case 0x85: /* NEL */
1648: case 0x2028: /* LINE SEPARATOR */
1649: case 0x2029: /* PARAGRAPH SEPARATOR */
1650: RRETURN(MATCH_NOMATCH);
1651: }
1652: ecode++;
1653: break;
1654:
1655: case OP_VSPACE:
1656: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1657: GETCHARINCTEST(c, eptr);
1658: switch(c)
1659: {
1660: default: RRETURN(MATCH_NOMATCH);
1661: case 0x0a: /* LF */
1662: case 0x0b: /* VT */
1663: case 0x0c: /* FF */
1664: case 0x0d: /* CR */
1665: case 0x85: /* NEL */
1666: case 0x2028: /* LINE SEPARATOR */
1667: case 0x2029: /* PARAGRAPH SEPARATOR */
1668: break;
1669: }
1670: ecode++;
1671: break;
1672:
1673: #ifdef SUPPORT_UCP
1674: /* Check the next character by Unicode property. We will get here only
1675: if the support is in the binary; otherwise a compile-time error occurs. */
1676:
1677: case OP_PROP:
1678: case OP_NOTPROP:
1679: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1680: GETCHARINCTEST(c, eptr);
1681: {
1.2 ! misha 1682: const ucd_record * prop = GET_UCD(c);
1.1 misha 1683:
1684: switch(ecode[1])
1685: {
1686: case PT_ANY:
1687: if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1688: break;
1689:
1690: case PT_LAMP:
1.2 ! misha 1691: if ((prop->chartype == ucp_Lu ||
! 1692: prop->chartype == ucp_Ll ||
! 1693: prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1.1 misha 1694: RRETURN(MATCH_NOMATCH);
1695: break;
1696:
1697: case PT_GC:
1.2 ! misha 1698: if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1.1 misha 1699: RRETURN(MATCH_NOMATCH);
1700: break;
1701:
1702: case PT_PC:
1.2 ! misha 1703: if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1.1 misha 1704: RRETURN(MATCH_NOMATCH);
1705: break;
1706:
1707: case PT_SC:
1.2 ! misha 1708: if ((ecode[2] != prop->script) == (op == OP_PROP))
1.1 misha 1709: RRETURN(MATCH_NOMATCH);
1710: break;
1711:
1712: default:
1713: RRETURN(PCRE_ERROR_INTERNAL);
1714: }
1715:
1716: ecode += 3;
1717: }
1718: break;
1719:
1720: /* Match an extended Unicode sequence. We will get here only if the support
1721: is in the binary; otherwise a compile-time error occurs. */
1722:
1723: case OP_EXTUNI:
1724: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1725: GETCHARINCTEST(c, eptr);
1726: {
1.2 ! misha 1727: int category = UCD_CATEGORY(c);
1.1 misha 1728: if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1729: while (eptr < md->end_subject)
1730: {
1731: int len = 1;
1732: if (!utf8) c = *eptr; else
1733: {
1734: GETCHARLEN(c, eptr, len);
1735: }
1.2 ! misha 1736: category = UCD_CATEGORY(c);
1.1 misha 1737: if (category != ucp_M) break;
1738: eptr += len;
1739: }
1740: }
1741: ecode++;
1742: break;
1743: #endif
1744:
1745:
1746: /* Match a back reference, possibly repeatedly. Look past the end of the
1747: item to see if there is repeat information following. The code is similar
1748: to that for character classes, but repeated for efficiency. Then obey
1749: similar code to character type repeats - written out again for speed.
1750: However, if the referenced string is the empty string, always treat
1751: it as matched, any number of times (otherwise there could be infinite
1752: loops). */
1753:
1754: case OP_REF:
1755: {
1756: offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1757: ecode += 3;
1758:
1759: /* If the reference is unset, there are two possibilities:
1760:
1761: (a) In the default, Perl-compatible state, set the length to be longer
1762: than the amount of subject left; this ensures that every attempt at a
1763: match fails. We can't just fail here, because of the possibility of
1764: quantifiers with zero minima.
1765:
1766: (b) If the JavaScript compatibility flag is set, set the length to zero
1767: so that the back reference matches an empty string.
1768:
1769: Otherwise, set the length to the length of what was matched by the
1770: referenced subpattern. */
1771:
1772: if (offset >= offset_top || md->offset_vector[offset] < 0)
1773: length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1774: else
1775: length = md->offset_vector[offset+1] - md->offset_vector[offset];
1776:
1777: /* Set up for repetition, or handle the non-repeated case */
1778:
1779: switch (*ecode)
1780: {
1781: case OP_CRSTAR:
1782: case OP_CRMINSTAR:
1783: case OP_CRPLUS:
1784: case OP_CRMINPLUS:
1785: case OP_CRQUERY:
1786: case OP_CRMINQUERY:
1787: c = *ecode++ - OP_CRSTAR;
1788: minimize = (c & 1) != 0;
1789: min = rep_min[c]; /* Pick up values from tables; */
1790: max = rep_max[c]; /* zero for max => infinity */
1791: if (max == 0) max = INT_MAX;
1792: break;
1793:
1794: case OP_CRRANGE:
1795: case OP_CRMINRANGE:
1796: minimize = (*ecode == OP_CRMINRANGE);
1797: min = GET2(ecode, 1);
1798: max = GET2(ecode, 3);
1799: if (max == 0) max = INT_MAX;
1800: ecode += 5;
1801: break;
1802:
1803: default: /* No repeat follows */
1804: if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1805: eptr += length;
1806: continue; /* With the main loop */
1807: }
1808:
1809: /* If the length of the reference is zero, just continue with the
1810: main loop. */
1811:
1812: if (length == 0) continue;
1813:
1814: /* First, ensure the minimum number of matches are present. We get back
1815: the length of the reference string explicitly rather than passing the
1816: address of eptr, so that eptr can be a register variable. */
1817:
1818: for (i = 1; i <= min; i++)
1819: {
1820: if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1821: eptr += length;
1822: }
1823:
1824: /* If min = max, continue at the same level without recursion.
1825: They are not both allowed to be zero. */
1826:
1827: if (min == max) continue;
1828:
1829: /* If minimizing, keep trying and advancing the pointer */
1830:
1831: if (minimize)
1832: {
1833: for (fi = min;; fi++)
1834: {
1835: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1836: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1837: if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1838: RRETURN(MATCH_NOMATCH);
1839: eptr += length;
1840: }
1841: /* Control never gets here */
1842: }
1843:
1844: /* If maximizing, find the longest string and work backwards */
1845:
1846: else
1847: {
1848: pp = eptr;
1849: for (i = min; i < max; i++)
1850: {
1851: if (!match_ref(offset, eptr, length, md, ims)) break;
1852: eptr += length;
1853: }
1854: while (eptr >= pp)
1855: {
1856: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1857: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1858: eptr -= length;
1859: }
1860: RRETURN(MATCH_NOMATCH);
1861: }
1862: }
1863: /* Control never gets here */
1864:
1865:
1866:
1867: /* Match a bit-mapped character class, possibly repeatedly. This op code is
1868: used when all the characters in the class have values in the range 0-255,
1869: and either the matching is caseful, or the characters are in the range
1870: 0-127 when UTF-8 processing is enabled. The only difference between
1871: OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1872: encountered.
1873:
1874: First, look past the end of the item to see if there is repeat information
1875: following. Then obey similar code to character type repeats - written out
1876: again for speed. */
1877:
1878: case OP_NCLASS:
1879: case OP_CLASS:
1880: {
1881: data = ecode + 1; /* Save for matching */
1882: ecode += 33; /* Advance past the item */
1883:
1884: switch (*ecode)
1885: {
1886: case OP_CRSTAR:
1887: case OP_CRMINSTAR:
1888: case OP_CRPLUS:
1889: case OP_CRMINPLUS:
1890: case OP_CRQUERY:
1891: case OP_CRMINQUERY:
1892: c = *ecode++ - OP_CRSTAR;
1893: minimize = (c & 1) != 0;
1894: min = rep_min[c]; /* Pick up values from tables; */
1895: max = rep_max[c]; /* zero for max => infinity */
1896: if (max == 0) max = INT_MAX;
1897: break;
1898:
1899: case OP_CRRANGE:
1900: case OP_CRMINRANGE:
1901: minimize = (*ecode == OP_CRMINRANGE);
1902: min = GET2(ecode, 1);
1903: max = GET2(ecode, 3);
1904: if (max == 0) max = INT_MAX;
1905: ecode += 5;
1906: break;
1907:
1908: default: /* No repeat follows */
1909: min = max = 1;
1910: break;
1911: }
1912:
1913: /* First, ensure the minimum number of matches are present. */
1914:
1915: #ifdef SUPPORT_UTF8
1916: /* UTF-8 mode */
1917: if (utf8)
1918: {
1919: for (i = 1; i <= min; i++)
1920: {
1921: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1922: GETCHARINC(c, eptr);
1923: if (c > 255)
1924: {
1925: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1926: }
1927: else
1928: {
1929: if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1930: }
1931: }
1932: }
1933: else
1934: #endif
1935: /* Not UTF-8 mode */
1936: {
1937: for (i = 1; i <= min; i++)
1938: {
1939: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1940: c = *eptr++;
1941: if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1942: }
1943: }
1944:
1945: /* If max == min we can continue with the main loop without the
1946: need to recurse. */
1947:
1948: if (min == max) continue;
1949:
1950: /* If minimizing, keep testing the rest of the expression and advancing
1951: the pointer while it matches the class. */
1952:
1953: if (minimize)
1954: {
1955: #ifdef SUPPORT_UTF8
1956: /* UTF-8 mode */
1957: if (utf8)
1958: {
1959: for (fi = min;; fi++)
1960: {
1961: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
1962: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1963: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1964: GETCHARINC(c, eptr);
1965: if (c > 255)
1966: {
1967: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1968: }
1969: else
1970: {
1971: if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1972: }
1973: }
1974: }
1975: else
1976: #endif
1977: /* Not UTF-8 mode */
1978: {
1979: for (fi = min;; fi++)
1980: {
1981: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
1982: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1983: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1984: c = *eptr++;
1985: if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1986: }
1987: }
1988: /* Control never gets here */
1989: }
1990:
1991: /* If maximizing, find the longest possible run, then work backwards. */
1992:
1993: else
1994: {
1995: pp = eptr;
1996:
1997: #ifdef SUPPORT_UTF8
1998: /* UTF-8 mode */
1999: if (utf8)
2000: {
2001: for (i = min; i < max; i++)
2002: {
2003: int len = 1;
2004: if (eptr >= md->end_subject) break;
2005: GETCHARLEN(c, eptr, len);
2006: if (c > 255)
2007: {
2008: if (op == OP_CLASS) break;
2009: }
2010: else
2011: {
2012: if ((data[c/8] & (1 << (c&7))) == 0) break;
2013: }
2014: eptr += len;
2015: }
2016: for (;;)
2017: {
2018: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2019: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2020: if (eptr-- == pp) break; /* Stop if tried at original pos */
2021: BACKCHAR(eptr);
2022: }
2023: }
2024: else
2025: #endif
2026: /* Not UTF-8 mode */
2027: {
2028: for (i = min; i < max; i++)
2029: {
2030: if (eptr >= md->end_subject) break;
2031: c = *eptr;
2032: if ((data[c/8] & (1 << (c&7))) == 0) break;
2033: eptr++;
2034: }
2035: while (eptr >= pp)
2036: {
2037: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2038: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2039: eptr--;
2040: }
2041: }
2042:
2043: RRETURN(MATCH_NOMATCH);
2044: }
2045: }
2046: /* Control never gets here */
2047:
2048:
2049: /* Match an extended character class. This opcode is encountered only
2050: in UTF-8 mode, because that's the only time it is compiled. */
2051:
2052: #ifdef SUPPORT_UTF8
2053: case OP_XCLASS:
2054: {
2055: data = ecode + 1 + LINK_SIZE; /* Save for matching */
2056: ecode += GET(ecode, 1); /* Advance past the item */
2057:
2058: switch (*ecode)
2059: {
2060: case OP_CRSTAR:
2061: case OP_CRMINSTAR:
2062: case OP_CRPLUS:
2063: case OP_CRMINPLUS:
2064: case OP_CRQUERY:
2065: case OP_CRMINQUERY:
2066: c = *ecode++ - OP_CRSTAR;
2067: minimize = (c & 1) != 0;
2068: min = rep_min[c]; /* Pick up values from tables; */
2069: max = rep_max[c]; /* zero for max => infinity */
2070: if (max == 0) max = INT_MAX;
2071: break;
2072:
2073: case OP_CRRANGE:
2074: case OP_CRMINRANGE:
2075: minimize = (*ecode == OP_CRMINRANGE);
2076: min = GET2(ecode, 1);
2077: max = GET2(ecode, 3);
2078: if (max == 0) max = INT_MAX;
2079: ecode += 5;
2080: break;
2081:
2082: default: /* No repeat follows */
2083: min = max = 1;
2084: break;
2085: }
2086:
2087: /* First, ensure the minimum number of matches are present. */
2088:
2089: for (i = 1; i <= min; i++)
2090: {
2091: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2092: GETCHARINC(c, eptr);
2093: if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2094: }
2095:
2096: /* If max == min we can continue with the main loop without the
2097: need to recurse. */
2098:
2099: if (min == max) continue;
2100:
2101: /* If minimizing, keep testing the rest of the expression and advancing
2102: the pointer while it matches the class. */
2103:
2104: if (minimize)
2105: {
2106: for (fi = min;; fi++)
2107: {
2108: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2109: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2110: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2111: GETCHARINC(c, eptr);
2112: if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2113: }
2114: /* Control never gets here */
2115: }
2116:
2117: /* If maximizing, find the longest possible run, then work backwards. */
2118:
2119: else
2120: {
2121: pp = eptr;
2122: for (i = min; i < max; i++)
2123: {
2124: int len = 1;
2125: if (eptr >= md->end_subject) break;
2126: GETCHARLEN(c, eptr, len);
2127: if (!_pcre_xclass(c, data)) break;
2128: eptr += len;
2129: }
2130: for(;;)
2131: {
2132: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2133: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2134: if (eptr-- == pp) break; /* Stop if tried at original pos */
2135: if (utf8) BACKCHAR(eptr);
2136: }
2137: RRETURN(MATCH_NOMATCH);
2138: }
2139:
2140: /* Control never gets here */
2141: }
2142: #endif /* End of XCLASS */
2143:
2144: /* Match a single character, casefully */
2145:
2146: case OP_CHAR:
2147: #ifdef SUPPORT_UTF8
2148: if (utf8)
2149: {
2150: length = 1;
2151: ecode++;
2152: GETCHARLEN(fc, ecode, length);
2153: if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2154: while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2155: }
2156: else
2157: #endif
2158:
2159: /* Non-UTF-8 mode */
2160: {
2161: if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2162: if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2163: ecode += 2;
2164: }
2165: break;
2166:
2167: /* Match a single character, caselessly */
2168:
2169: case OP_CHARNC:
2170: #ifdef SUPPORT_UTF8
2171: if (utf8)
2172: {
2173: length = 1;
2174: ecode++;
2175: GETCHARLEN(fc, ecode, length);
2176:
2177: if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2178:
2179: /* If the pattern character's value is < 128, we have only one byte, and
2180: can use the fast lookup table. */
2181:
2182: if (fc < 128)
2183: {
2184: if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2185: }
2186:
2187: /* Otherwise we must pick up the subject character */
2188:
2189: else
2190: {
2191: unsigned int dc;
2192: GETCHARINC(dc, eptr);
2193: ecode += length;
2194:
2195: /* If we have Unicode property support, we can use it to test the other
2196: case of the character, if there is one. */
2197:
2198: if (fc != dc)
2199: {
2200: #ifdef SUPPORT_UCP
1.2 ! misha 2201: if (dc != UCD_OTHERCASE(fc))
1.1 misha 2202: #endif
2203: RRETURN(MATCH_NOMATCH);
2204: }
2205: }
2206: }
2207: else
2208: #endif /* SUPPORT_UTF8 */
2209:
2210: /* Non-UTF-8 mode */
2211: {
2212: if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2213: if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2214: ecode += 2;
2215: }
2216: break;
2217:
2218: /* Match a single character repeatedly. */
2219:
2220: case OP_EXACT:
2221: min = max = GET2(ecode, 1);
2222: ecode += 3;
2223: goto REPEATCHAR;
2224:
2225: case OP_POSUPTO:
2226: possessive = TRUE;
2227: /* Fall through */
2228:
2229: case OP_UPTO:
2230: case OP_MINUPTO:
2231: min = 0;
2232: max = GET2(ecode, 1);
2233: minimize = *ecode == OP_MINUPTO;
2234: ecode += 3;
2235: goto REPEATCHAR;
2236:
2237: case OP_POSSTAR:
2238: possessive = TRUE;
2239: min = 0;
2240: max = INT_MAX;
2241: ecode++;
2242: goto REPEATCHAR;
2243:
2244: case OP_POSPLUS:
2245: possessive = TRUE;
2246: min = 1;
2247: max = INT_MAX;
2248: ecode++;
2249: goto REPEATCHAR;
2250:
2251: case OP_POSQUERY:
2252: possessive = TRUE;
2253: min = 0;
2254: max = 1;
2255: ecode++;
2256: goto REPEATCHAR;
2257:
2258: case OP_STAR:
2259: case OP_MINSTAR:
2260: case OP_PLUS:
2261: case OP_MINPLUS:
2262: case OP_QUERY:
2263: case OP_MINQUERY:
2264: c = *ecode++ - OP_STAR;
2265: minimize = (c & 1) != 0;
2266: min = rep_min[c]; /* Pick up values from tables; */
2267: max = rep_max[c]; /* zero for max => infinity */
2268: if (max == 0) max = INT_MAX;
2269:
2270: /* Common code for all repeated single-character matches. We can give
2271: up quickly if there are fewer than the minimum number of characters left in
2272: the subject. */
2273:
2274: REPEATCHAR:
2275: #ifdef SUPPORT_UTF8
2276: if (utf8)
2277: {
2278: length = 1;
2279: charptr = ecode;
2280: GETCHARLEN(fc, ecode, length);
2281: if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2282: ecode += length;
2283:
2284: /* Handle multibyte character matching specially here. There is
2285: support for caseless matching if UCP support is present. */
2286:
2287: if (length > 1)
2288: {
2289: #ifdef SUPPORT_UCP
2290: unsigned int othercase;
2291: if ((ims & PCRE_CASELESS) != 0 &&
1.2 ! misha 2292: (othercase = UCD_OTHERCASE(fc)) != fc)
1.1 misha 2293: oclength = _pcre_ord2utf8(othercase, occhars);
2294: else oclength = 0;
2295: #endif /* SUPPORT_UCP */
2296:
2297: for (i = 1; i <= min; i++)
2298: {
2299: if (memcmp(eptr, charptr, length) == 0) eptr += length;
2300: #ifdef SUPPORT_UCP
2301: /* Need braces because of following else */
2302: else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2303: else
2304: {
2305: if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2306: eptr += oclength;
2307: }
2308: #else /* without SUPPORT_UCP */
2309: else { RRETURN(MATCH_NOMATCH); }
2310: #endif /* SUPPORT_UCP */
2311: }
2312:
2313: if (min == max) continue;
2314:
2315: if (minimize)
2316: {
2317: for (fi = min;; fi++)
2318: {
2319: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2320: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2321: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2322: if (memcmp(eptr, charptr, length) == 0) eptr += length;
2323: #ifdef SUPPORT_UCP
2324: /* Need braces because of following else */
2325: else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2326: else
2327: {
2328: if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2329: eptr += oclength;
2330: }
2331: #else /* without SUPPORT_UCP */
2332: else { RRETURN (MATCH_NOMATCH); }
2333: #endif /* SUPPORT_UCP */
2334: }
2335: /* Control never gets here */
2336: }
2337:
2338: else /* Maximize */
2339: {
2340: pp = eptr;
2341: for (i = min; i < max; i++)
2342: {
2343: if (eptr > md->end_subject - length) break;
2344: if (memcmp(eptr, charptr, length) == 0) eptr += length;
2345: #ifdef SUPPORT_UCP
2346: else if (oclength == 0) break;
2347: else
2348: {
2349: if (memcmp(eptr, occhars, oclength) != 0) break;
2350: eptr += oclength;
2351: }
2352: #else /* without SUPPORT_UCP */
2353: else break;
2354: #endif /* SUPPORT_UCP */
2355: }
2356:
2357: if (possessive) continue;
2358: for(;;)
2359: {
2360: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2361: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2362: if (eptr == pp) RRETURN(MATCH_NOMATCH);
2363: #ifdef SUPPORT_UCP
2364: eptr--;
2365: BACKCHAR(eptr);
2366: #else /* without SUPPORT_UCP */
2367: eptr -= length;
2368: #endif /* SUPPORT_UCP */
2369: }
2370: }
2371: /* Control never gets here */
2372: }
2373:
2374: /* If the length of a UTF-8 character is 1, we fall through here, and
2375: obey the code as for non-UTF-8 characters below, though in this case the
2376: value of fc will always be < 128. */
2377: }
2378: else
2379: #endif /* SUPPORT_UTF8 */
2380:
2381: /* When not in UTF-8 mode, load a single-byte character. */
2382: {
2383: if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2384: fc = *ecode++;
2385: }
2386:
2387: /* The value of fc at this point is always less than 256, though we may or
2388: may not be in UTF-8 mode. The code is duplicated for the caseless and
2389: caseful cases, for speed, since matching characters is likely to be quite
2390: common. First, ensure the minimum number of matches are present. If min =
2391: max, continue at the same level without recursing. Otherwise, if
2392: minimizing, keep trying the rest of the expression and advancing one
2393: matching character if failing, up to the maximum. Alternatively, if
2394: maximizing, find the maximum number of characters and work backwards. */
2395:
2396: DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2397: max, eptr));
2398:
2399: if ((ims & PCRE_CASELESS) != 0)
2400: {
2401: fc = md->lcc[fc];
2402: for (i = 1; i <= min; i++)
2403: if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2404: if (min == max) continue;
2405: if (minimize)
2406: {
2407: for (fi = min;; fi++)
2408: {
2409: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2410: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2411: if (fi >= max || eptr >= md->end_subject ||
2412: fc != md->lcc[*eptr++])
2413: RRETURN(MATCH_NOMATCH);
2414: }
2415: /* Control never gets here */
2416: }
2417: else /* Maximize */
2418: {
2419: pp = eptr;
2420: for (i = min; i < max; i++)
2421: {
2422: if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2423: eptr++;
2424: }
2425: if (possessive) continue;
2426: while (eptr >= pp)
2427: {
2428: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2429: eptr--;
2430: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2431: }
2432: RRETURN(MATCH_NOMATCH);
2433: }
2434: /* Control never gets here */
2435: }
2436:
2437: /* Caseful comparisons (includes all multi-byte characters) */
2438:
2439: else
2440: {
2441: for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2442: if (min == max) continue;
2443: if (minimize)
2444: {
2445: for (fi = min;; fi++)
2446: {
2447: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2448: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2449: if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2450: RRETURN(MATCH_NOMATCH);
2451: }
2452: /* Control never gets here */
2453: }
2454: else /* Maximize */
2455: {
2456: pp = eptr;
2457: for (i = min; i < max; i++)
2458: {
2459: if (eptr >= md->end_subject || fc != *eptr) break;
2460: eptr++;
2461: }
2462: if (possessive) continue;
2463: while (eptr >= pp)
2464: {
2465: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2466: eptr--;
2467: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2468: }
2469: RRETURN(MATCH_NOMATCH);
2470: }
2471: }
2472: /* Control never gets here */
2473:
2474: /* Match a negated single one-byte character. The character we are
2475: checking can be multibyte. */
2476:
2477: case OP_NOT:
2478: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2479: ecode++;
2480: GETCHARINCTEST(c, eptr);
2481: if ((ims & PCRE_CASELESS) != 0)
2482: {
2483: #ifdef SUPPORT_UTF8
2484: if (c < 256)
2485: #endif
2486: c = md->lcc[c];
2487: if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2488: }
2489: else
2490: {
2491: if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2492: }
2493: break;
2494:
2495: /* Match a negated single one-byte character repeatedly. This is almost a
2496: repeat of the code for a repeated single character, but I haven't found a
2497: nice way of commoning these up that doesn't require a test of the
2498: positive/negative option for each character match. Maybe that wouldn't add
2499: very much to the time taken, but character matching *is* what this is all
2500: about... */
2501:
2502: case OP_NOTEXACT:
2503: min = max = GET2(ecode, 1);
2504: ecode += 3;
2505: goto REPEATNOTCHAR;
2506:
2507: case OP_NOTUPTO:
2508: case OP_NOTMINUPTO:
2509: min = 0;
2510: max = GET2(ecode, 1);
2511: minimize = *ecode == OP_NOTMINUPTO;
2512: ecode += 3;
2513: goto REPEATNOTCHAR;
2514:
2515: case OP_NOTPOSSTAR:
2516: possessive = TRUE;
2517: min = 0;
2518: max = INT_MAX;
2519: ecode++;
2520: goto REPEATNOTCHAR;
2521:
2522: case OP_NOTPOSPLUS:
2523: possessive = TRUE;
2524: min = 1;
2525: max = INT_MAX;
2526: ecode++;
2527: goto REPEATNOTCHAR;
2528:
2529: case OP_NOTPOSQUERY:
2530: possessive = TRUE;
2531: min = 0;
2532: max = 1;
2533: ecode++;
2534: goto REPEATNOTCHAR;
2535:
2536: case OP_NOTPOSUPTO:
2537: possessive = TRUE;
2538: min = 0;
2539: max = GET2(ecode, 1);
2540: ecode += 3;
2541: goto REPEATNOTCHAR;
2542:
2543: case OP_NOTSTAR:
2544: case OP_NOTMINSTAR:
2545: case OP_NOTPLUS:
2546: case OP_NOTMINPLUS:
2547: case OP_NOTQUERY:
2548: case OP_NOTMINQUERY:
2549: c = *ecode++ - OP_NOTSTAR;
2550: minimize = (c & 1) != 0;
2551: min = rep_min[c]; /* Pick up values from tables; */
2552: max = rep_max[c]; /* zero for max => infinity */
2553: if (max == 0) max = INT_MAX;
2554:
2555: /* Common code for all repeated single-byte matches. We can give up quickly
2556: if there are fewer than the minimum number of bytes left in the
2557: subject. */
2558:
2559: REPEATNOTCHAR:
2560: if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2561: fc = *ecode++;
2562:
2563: /* The code is duplicated for the caseless and caseful cases, for speed,
2564: since matching characters is likely to be quite common. First, ensure the
2565: minimum number of matches are present. If min = max, continue at the same
2566: level without recursing. Otherwise, if minimizing, keep trying the rest of
2567: the expression and advancing one matching character if failing, up to the
2568: maximum. Alternatively, if maximizing, find the maximum number of
2569: characters and work backwards. */
2570:
2571: DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2572: max, eptr));
2573:
2574: if ((ims & PCRE_CASELESS) != 0)
2575: {
2576: fc = md->lcc[fc];
2577:
2578: #ifdef SUPPORT_UTF8
2579: /* UTF-8 mode */
2580: if (utf8)
2581: {
2582: register unsigned int d;
2583: for (i = 1; i <= min; i++)
2584: {
2585: GETCHARINC(d, eptr);
2586: if (d < 256) d = md->lcc[d];
2587: if (fc == d) RRETURN(MATCH_NOMATCH);
2588: }
2589: }
2590: else
2591: #endif
2592:
2593: /* Not UTF-8 mode */
2594: {
2595: for (i = 1; i <= min; i++)
2596: if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2597: }
2598:
2599: if (min == max) continue;
2600:
2601: if (minimize)
2602: {
2603: #ifdef SUPPORT_UTF8
2604: /* UTF-8 mode */
2605: if (utf8)
2606: {
2607: register unsigned int d;
2608: for (fi = min;; fi++)
2609: {
2610: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2611: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.2 ! misha 2612: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1.1 misha 2613: GETCHARINC(d, eptr);
2614: if (d < 256) d = md->lcc[d];
1.2 ! misha 2615: if (fc == d) RRETURN(MATCH_NOMATCH);
! 2616:
1.1 misha 2617: }
2618: }
2619: else
2620: #endif
2621: /* Not UTF-8 mode */
2622: {
2623: for (fi = min;; fi++)
2624: {
2625: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2626: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2627: if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2628: RRETURN(MATCH_NOMATCH);
2629: }
2630: }
2631: /* Control never gets here */
2632: }
2633:
2634: /* Maximize case */
2635:
2636: else
2637: {
2638: pp = eptr;
2639:
2640: #ifdef SUPPORT_UTF8
2641: /* UTF-8 mode */
2642: if (utf8)
2643: {
2644: register unsigned int d;
2645: for (i = min; i < max; i++)
2646: {
2647: int len = 1;
2648: if (eptr >= md->end_subject) break;
2649: GETCHARLEN(d, eptr, len);
2650: if (d < 256) d = md->lcc[d];
2651: if (fc == d) break;
2652: eptr += len;
2653: }
2654: if (possessive) continue;
2655: for(;;)
2656: {
2657: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2658: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2659: if (eptr-- == pp) break; /* Stop if tried at original pos */
2660: BACKCHAR(eptr);
2661: }
2662: }
2663: else
2664: #endif
2665: /* Not UTF-8 mode */
2666: {
2667: for (i = min; i < max; i++)
2668: {
2669: if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2670: eptr++;
2671: }
2672: if (possessive) continue;
2673: while (eptr >= pp)
2674: {
2675: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2676: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2677: eptr--;
2678: }
2679: }
2680:
2681: RRETURN(MATCH_NOMATCH);
2682: }
2683: /* Control never gets here */
2684: }
2685:
2686: /* Caseful comparisons */
2687:
2688: else
2689: {
2690: #ifdef SUPPORT_UTF8
2691: /* UTF-8 mode */
2692: if (utf8)
2693: {
2694: register unsigned int d;
2695: for (i = 1; i <= min; i++)
2696: {
2697: GETCHARINC(d, eptr);
2698: if (fc == d) RRETURN(MATCH_NOMATCH);
2699: }
2700: }
2701: else
2702: #endif
2703: /* Not UTF-8 mode */
2704: {
2705: for (i = 1; i <= min; i++)
2706: if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2707: }
2708:
2709: if (min == max) continue;
2710:
2711: if (minimize)
2712: {
2713: #ifdef SUPPORT_UTF8
2714: /* UTF-8 mode */
2715: if (utf8)
2716: {
2717: register unsigned int d;
2718: for (fi = min;; fi++)
2719: {
2720: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2721: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.2 ! misha 2722: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1.1 misha 2723: GETCHARINC(d, eptr);
1.2 ! misha 2724: if (fc == d) RRETURN(MATCH_NOMATCH);
1.1 misha 2725: }
2726: }
2727: else
2728: #endif
2729: /* Not UTF-8 mode */
2730: {
2731: for (fi = min;; fi++)
2732: {
2733: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2734: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2735: if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2736: RRETURN(MATCH_NOMATCH);
2737: }
2738: }
2739: /* Control never gets here */
2740: }
2741:
2742: /* Maximize case */
2743:
2744: else
2745: {
2746: pp = eptr;
2747:
2748: #ifdef SUPPORT_UTF8
2749: /* UTF-8 mode */
2750: if (utf8)
2751: {
2752: register unsigned int d;
2753: for (i = min; i < max; i++)
2754: {
2755: int len = 1;
2756: if (eptr >= md->end_subject) break;
2757: GETCHARLEN(d, eptr, len);
2758: if (fc == d) break;
2759: eptr += len;
2760: }
2761: if (possessive) continue;
2762: for(;;)
2763: {
2764: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2765: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2766: if (eptr-- == pp) break; /* Stop if tried at original pos */
2767: BACKCHAR(eptr);
2768: }
2769: }
2770: else
2771: #endif
2772: /* Not UTF-8 mode */
2773: {
2774: for (i = min; i < max; i++)
2775: {
2776: if (eptr >= md->end_subject || fc == *eptr) break;
2777: eptr++;
2778: }
2779: if (possessive) continue;
2780: while (eptr >= pp)
2781: {
2782: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2783: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2784: eptr--;
2785: }
2786: }
2787:
2788: RRETURN(MATCH_NOMATCH);
2789: }
2790: }
2791: /* Control never gets here */
2792:
2793: /* Match a single character type repeatedly; several different opcodes
2794: share code. This is very similar to the code for single characters, but we
2795: repeat it in the interests of efficiency. */
2796:
2797: case OP_TYPEEXACT:
2798: min = max = GET2(ecode, 1);
2799: minimize = TRUE;
2800: ecode += 3;
2801: goto REPEATTYPE;
2802:
2803: case OP_TYPEUPTO:
2804: case OP_TYPEMINUPTO:
2805: min = 0;
2806: max = GET2(ecode, 1);
2807: minimize = *ecode == OP_TYPEMINUPTO;
2808: ecode += 3;
2809: goto REPEATTYPE;
2810:
2811: case OP_TYPEPOSSTAR:
2812: possessive = TRUE;
2813: min = 0;
2814: max = INT_MAX;
2815: ecode++;
2816: goto REPEATTYPE;
2817:
2818: case OP_TYPEPOSPLUS:
2819: possessive = TRUE;
2820: min = 1;
2821: max = INT_MAX;
2822: ecode++;
2823: goto REPEATTYPE;
2824:
2825: case OP_TYPEPOSQUERY:
2826: possessive = TRUE;
2827: min = 0;
2828: max = 1;
2829: ecode++;
2830: goto REPEATTYPE;
2831:
2832: case OP_TYPEPOSUPTO:
2833: possessive = TRUE;
2834: min = 0;
2835: max = GET2(ecode, 1);
2836: ecode += 3;
2837: goto REPEATTYPE;
2838:
2839: case OP_TYPESTAR:
2840: case OP_TYPEMINSTAR:
2841: case OP_TYPEPLUS:
2842: case OP_TYPEMINPLUS:
2843: case OP_TYPEQUERY:
2844: case OP_TYPEMINQUERY:
2845: c = *ecode++ - OP_TYPESTAR;
2846: minimize = (c & 1) != 0;
2847: min = rep_min[c]; /* Pick up values from tables; */
2848: max = rep_max[c]; /* zero for max => infinity */
2849: if (max == 0) max = INT_MAX;
2850:
2851: /* Common code for all repeated single character type matches. Note that
2852: in UTF-8 mode, '.' matches a character of any length, but for the other
2853: character types, the valid characters are all one-byte long. */
2854:
2855: REPEATTYPE:
2856: ctype = *ecode++; /* Code for the character type */
2857:
2858: #ifdef SUPPORT_UCP
2859: if (ctype == OP_PROP || ctype == OP_NOTPROP)
2860: {
2861: prop_fail_result = ctype == OP_NOTPROP;
2862: prop_type = *ecode++;
2863: prop_value = *ecode++;
2864: }
2865: else prop_type = -1;
2866: #endif
2867:
2868: /* First, ensure the minimum number of matches are present. Use inline
2869: code for maximizing the speed, and do the type test once at the start
2870: (i.e. keep it out of the loop). Also we can test that there are at least
2871: the minimum number of bytes before we start. This isn't as effective in
2872: UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2873: is tidier. Also separate the UCP code, which can be the same for both UTF-8
2874: and single-bytes. */
2875:
2876: if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2877: if (min > 0)
2878: {
2879: #ifdef SUPPORT_UCP
2880: if (prop_type >= 0)
2881: {
2882: switch(prop_type)
2883: {
2884: case PT_ANY:
2885: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2886: for (i = 1; i <= min; i++)
2887: {
2888: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2889: GETCHARINCTEST(c, eptr);
2890: }
2891: break;
2892:
2893: case PT_LAMP:
2894: for (i = 1; i <= min; i++)
2895: {
2896: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2897: GETCHARINCTEST(c, eptr);
1.2 ! misha 2898: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 2899: if ((prop_chartype == ucp_Lu ||
2900: prop_chartype == ucp_Ll ||
2901: prop_chartype == ucp_Lt) == prop_fail_result)
2902: RRETURN(MATCH_NOMATCH);
2903: }
2904: break;
2905:
2906: case PT_GC:
2907: for (i = 1; i <= min; i++)
2908: {
2909: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2910: GETCHARINCTEST(c, eptr);
1.2 ! misha 2911: prop_category = UCD_CATEGORY(c);
1.1 misha 2912: if ((prop_category == prop_value) == prop_fail_result)
2913: RRETURN(MATCH_NOMATCH);
2914: }
2915: break;
2916:
2917: case PT_PC:
2918: for (i = 1; i <= min; i++)
2919: {
2920: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2921: GETCHARINCTEST(c, eptr);
1.2 ! misha 2922: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 2923: if ((prop_chartype == prop_value) == prop_fail_result)
2924: RRETURN(MATCH_NOMATCH);
2925: }
2926: break;
2927:
2928: case PT_SC:
2929: for (i = 1; i <= min; i++)
2930: {
2931: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2932: GETCHARINCTEST(c, eptr);
1.2 ! misha 2933: prop_script = UCD_SCRIPT(c);
1.1 misha 2934: if ((prop_script == prop_value) == prop_fail_result)
2935: RRETURN(MATCH_NOMATCH);
2936: }
2937: break;
2938:
2939: default:
2940: RRETURN(PCRE_ERROR_INTERNAL);
2941: }
2942: }
2943:
2944: /* Match extended Unicode sequences. We will get here only if the
2945: support is in the binary; otherwise a compile-time error occurs. */
2946:
2947: else if (ctype == OP_EXTUNI)
2948: {
2949: for (i = 1; i <= min; i++)
2950: {
2951: GETCHARINCTEST(c, eptr);
1.2 ! misha 2952: prop_category = UCD_CATEGORY(c);
1.1 misha 2953: if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2954: while (eptr < md->end_subject)
2955: {
2956: int len = 1;
2957: if (!utf8) c = *eptr; else
2958: {
2959: GETCHARLEN(c, eptr, len);
2960: }
1.2 ! misha 2961: prop_category = UCD_CATEGORY(c);
1.1 misha 2962: if (prop_category != ucp_M) break;
2963: eptr += len;
2964: }
2965: }
2966: }
2967:
2968: else
2969: #endif /* SUPPORT_UCP */
2970:
2971: /* Handle all other cases when the coding is UTF-8 */
2972:
2973: #ifdef SUPPORT_UTF8
2974: if (utf8) switch(ctype)
2975: {
2976: case OP_ANY:
2977: for (i = 1; i <= min; i++)
2978: {
2979: if (eptr >= md->end_subject || IS_NEWLINE(eptr))
2980: RRETURN(MATCH_NOMATCH);
2981: eptr++;
2982: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2983: }
2984: break;
2985:
2986: case OP_ALLANY:
2987: for (i = 1; i <= min; i++)
2988: {
2989: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2990: eptr++;
2991: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
2992: }
2993: break;
2994:
2995: case OP_ANYBYTE:
2996: eptr += min;
2997: break;
2998:
2999: case OP_ANYNL:
3000: for (i = 1; i <= min; i++)
3001: {
3002: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3003: GETCHARINC(c, eptr);
3004: switch(c)
3005: {
3006: default: RRETURN(MATCH_NOMATCH);
3007: case 0x000d:
3008: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3009: break;
3010:
3011: case 0x000a:
3012: break;
3013:
3014: case 0x000b:
3015: case 0x000c:
3016: case 0x0085:
3017: case 0x2028:
3018: case 0x2029:
3019: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3020: break;
3021: }
3022: }
3023: break;
3024:
3025: case OP_NOT_HSPACE:
3026: for (i = 1; i <= min; i++)
3027: {
3028: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3029: GETCHARINC(c, eptr);
3030: switch(c)
3031: {
3032: default: break;
3033: case 0x09: /* HT */
3034: case 0x20: /* SPACE */
3035: case 0xa0: /* NBSP */
3036: case 0x1680: /* OGHAM SPACE MARK */
3037: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3038: case 0x2000: /* EN QUAD */
3039: case 0x2001: /* EM QUAD */
3040: case 0x2002: /* EN SPACE */
3041: case 0x2003: /* EM SPACE */
3042: case 0x2004: /* THREE-PER-EM SPACE */
3043: case 0x2005: /* FOUR-PER-EM SPACE */
3044: case 0x2006: /* SIX-PER-EM SPACE */
3045: case 0x2007: /* FIGURE SPACE */
3046: case 0x2008: /* PUNCTUATION SPACE */
3047: case 0x2009: /* THIN SPACE */
3048: case 0x200A: /* HAIR SPACE */
3049: case 0x202f: /* NARROW NO-BREAK SPACE */
3050: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3051: case 0x3000: /* IDEOGRAPHIC SPACE */
3052: RRETURN(MATCH_NOMATCH);
3053: }
3054: }
3055: break;
3056:
3057: case OP_HSPACE:
3058: for (i = 1; i <= min; i++)
3059: {
3060: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3061: GETCHARINC(c, eptr);
3062: switch(c)
3063: {
3064: default: RRETURN(MATCH_NOMATCH);
3065: case 0x09: /* HT */
3066: case 0x20: /* SPACE */
3067: case 0xa0: /* NBSP */
3068: case 0x1680: /* OGHAM SPACE MARK */
3069: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3070: case 0x2000: /* EN QUAD */
3071: case 0x2001: /* EM QUAD */
3072: case 0x2002: /* EN SPACE */
3073: case 0x2003: /* EM SPACE */
3074: case 0x2004: /* THREE-PER-EM SPACE */
3075: case 0x2005: /* FOUR-PER-EM SPACE */
3076: case 0x2006: /* SIX-PER-EM SPACE */
3077: case 0x2007: /* FIGURE SPACE */
3078: case 0x2008: /* PUNCTUATION SPACE */
3079: case 0x2009: /* THIN SPACE */
3080: case 0x200A: /* HAIR SPACE */
3081: case 0x202f: /* NARROW NO-BREAK SPACE */
3082: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3083: case 0x3000: /* IDEOGRAPHIC SPACE */
3084: break;
3085: }
3086: }
3087: break;
3088:
3089: case OP_NOT_VSPACE:
3090: for (i = 1; i <= min; i++)
3091: {
3092: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3093: GETCHARINC(c, eptr);
3094: switch(c)
3095: {
3096: default: break;
3097: case 0x0a: /* LF */
3098: case 0x0b: /* VT */
3099: case 0x0c: /* FF */
3100: case 0x0d: /* CR */
3101: case 0x85: /* NEL */
3102: case 0x2028: /* LINE SEPARATOR */
3103: case 0x2029: /* PARAGRAPH SEPARATOR */
3104: RRETURN(MATCH_NOMATCH);
3105: }
3106: }
3107: break;
3108:
3109: case OP_VSPACE:
3110: for (i = 1; i <= min; i++)
3111: {
3112: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3113: GETCHARINC(c, eptr);
3114: switch(c)
3115: {
3116: default: RRETURN(MATCH_NOMATCH);
3117: case 0x0a: /* LF */
3118: case 0x0b: /* VT */
3119: case 0x0c: /* FF */
3120: case 0x0d: /* CR */
3121: case 0x85: /* NEL */
3122: case 0x2028: /* LINE SEPARATOR */
3123: case 0x2029: /* PARAGRAPH SEPARATOR */
3124: break;
3125: }
3126: }
3127: break;
3128:
3129: case OP_NOT_DIGIT:
3130: for (i = 1; i <= min; i++)
3131: {
3132: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3133: GETCHARINC(c, eptr);
3134: if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3135: RRETURN(MATCH_NOMATCH);
3136: }
3137: break;
3138:
3139: case OP_DIGIT:
3140: for (i = 1; i <= min; i++)
3141: {
3142: if (eptr >= md->end_subject ||
3143: *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3144: RRETURN(MATCH_NOMATCH);
3145: /* No need to skip more bytes - we know it's a 1-byte character */
3146: }
3147: break;
3148:
3149: case OP_NOT_WHITESPACE:
3150: for (i = 1; i <= min; i++)
3151: {
3152: if (eptr >= md->end_subject ||
3153: (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3154: RRETURN(MATCH_NOMATCH);
3155: while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3156: }
3157: break;
3158:
3159: case OP_WHITESPACE:
3160: for (i = 1; i <= min; i++)
3161: {
3162: if (eptr >= md->end_subject ||
3163: *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3164: RRETURN(MATCH_NOMATCH);
3165: /* No need to skip more bytes - we know it's a 1-byte character */
3166: }
3167: break;
3168:
3169: case OP_NOT_WORDCHAR:
3170: for (i = 1; i <= min; i++)
3171: {
3172: if (eptr >= md->end_subject ||
3173: (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3174: RRETURN(MATCH_NOMATCH);
3175: while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3176: }
3177: break;
3178:
3179: case OP_WORDCHAR:
3180: for (i = 1; i <= min; i++)
3181: {
3182: if (eptr >= md->end_subject ||
3183: *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3184: RRETURN(MATCH_NOMATCH);
3185: /* No need to skip more bytes - we know it's a 1-byte character */
3186: }
3187: break;
3188:
3189: default:
3190: RRETURN(PCRE_ERROR_INTERNAL);
3191: } /* End switch(ctype) */
3192:
3193: else
3194: #endif /* SUPPORT_UTF8 */
3195:
3196: /* Code for the non-UTF-8 case for minimum matching of operators other
3197: than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3198: number of bytes present, as this was tested above. */
3199:
3200: switch(ctype)
3201: {
3202: case OP_ANY:
3203: for (i = 1; i <= min; i++)
3204: {
3205: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3206: eptr++;
3207: }
3208: break;
3209:
3210: case OP_ALLANY:
3211: eptr += min;
3212: break;
3213:
3214: case OP_ANYBYTE:
3215: eptr += min;
3216: break;
3217:
3218: /* Because of the CRLF case, we can't assume the minimum number of
3219: bytes are present in this case. */
3220:
3221: case OP_ANYNL:
3222: for (i = 1; i <= min; i++)
3223: {
3224: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3225: switch(*eptr++)
3226: {
3227: default: RRETURN(MATCH_NOMATCH);
3228: case 0x000d:
3229: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3230: break;
3231: case 0x000a:
3232: break;
3233:
3234: case 0x000b:
3235: case 0x000c:
3236: case 0x0085:
3237: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3238: break;
3239: }
3240: }
3241: break;
3242:
3243: case OP_NOT_HSPACE:
3244: for (i = 1; i <= min; i++)
3245: {
3246: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3247: switch(*eptr++)
3248: {
3249: default: break;
3250: case 0x09: /* HT */
3251: case 0x20: /* SPACE */
3252: case 0xa0: /* NBSP */
3253: RRETURN(MATCH_NOMATCH);
3254: }
3255: }
3256: break;
3257:
3258: case OP_HSPACE:
3259: for (i = 1; i <= min; i++)
3260: {
3261: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3262: switch(*eptr++)
3263: {
3264: default: RRETURN(MATCH_NOMATCH);
3265: case 0x09: /* HT */
3266: case 0x20: /* SPACE */
3267: case 0xa0: /* NBSP */
3268: break;
3269: }
3270: }
3271: break;
3272:
3273: case OP_NOT_VSPACE:
3274: for (i = 1; i <= min; i++)
3275: {
3276: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3277: switch(*eptr++)
3278: {
3279: default: break;
3280: case 0x0a: /* LF */
3281: case 0x0b: /* VT */
3282: case 0x0c: /* FF */
3283: case 0x0d: /* CR */
3284: case 0x85: /* NEL */
3285: RRETURN(MATCH_NOMATCH);
3286: }
3287: }
3288: break;
3289:
3290: case OP_VSPACE:
3291: for (i = 1; i <= min; i++)
3292: {
3293: if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3294: switch(*eptr++)
3295: {
3296: default: RRETURN(MATCH_NOMATCH);
3297: case 0x0a: /* LF */
3298: case 0x0b: /* VT */
3299: case 0x0c: /* FF */
3300: case 0x0d: /* CR */
3301: case 0x85: /* NEL */
3302: break;
3303: }
3304: }
3305: break;
3306:
3307: case OP_NOT_DIGIT:
3308: for (i = 1; i <= min; i++)
3309: if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3310: break;
3311:
3312: case OP_DIGIT:
3313: for (i = 1; i <= min; i++)
3314: if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3315: break;
3316:
3317: case OP_NOT_WHITESPACE:
3318: for (i = 1; i <= min; i++)
3319: if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3320: break;
3321:
3322: case OP_WHITESPACE:
3323: for (i = 1; i <= min; i++)
3324: if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3325: break;
3326:
3327: case OP_NOT_WORDCHAR:
3328: for (i = 1; i <= min; i++)
3329: if ((md->ctypes[*eptr++] & ctype_word) != 0)
3330: RRETURN(MATCH_NOMATCH);
3331: break;
3332:
3333: case OP_WORDCHAR:
3334: for (i = 1; i <= min; i++)
3335: if ((md->ctypes[*eptr++] & ctype_word) == 0)
3336: RRETURN(MATCH_NOMATCH);
3337: break;
3338:
3339: default:
3340: RRETURN(PCRE_ERROR_INTERNAL);
3341: }
3342: }
3343:
3344: /* If min = max, continue at the same level without recursing */
3345:
3346: if (min == max) continue;
3347:
3348: /* If minimizing, we have to test the rest of the pattern before each
3349: subsequent match. Again, separate the UTF-8 case for speed, and also
3350: separate the UCP cases. */
3351:
3352: if (minimize)
3353: {
3354: #ifdef SUPPORT_UCP
3355: if (prop_type >= 0)
3356: {
3357: switch(prop_type)
3358: {
3359: case PT_ANY:
3360: for (fi = min;; fi++)
3361: {
3362: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3363: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3364: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3365: GETCHARINC(c, eptr);
3366: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3367: }
3368: /* Control never gets here */
3369:
3370: case PT_LAMP:
3371: for (fi = min;; fi++)
3372: {
3373: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3374: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3375: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3376: GETCHARINC(c, eptr);
1.2 ! misha 3377: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 3378: if ((prop_chartype == ucp_Lu ||
3379: prop_chartype == ucp_Ll ||
3380: prop_chartype == ucp_Lt) == prop_fail_result)
3381: RRETURN(MATCH_NOMATCH);
3382: }
3383: /* Control never gets here */
3384:
3385: case PT_GC:
3386: for (fi = min;; fi++)
3387: {
3388: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3389: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3390: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3391: GETCHARINC(c, eptr);
1.2 ! misha 3392: prop_category = UCD_CATEGORY(c);
1.1 misha 3393: if ((prop_category == prop_value) == prop_fail_result)
3394: RRETURN(MATCH_NOMATCH);
3395: }
3396: /* Control never gets here */
3397:
3398: case PT_PC:
3399: for (fi = min;; fi++)
3400: {
3401: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3402: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3403: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3404: GETCHARINC(c, eptr);
1.2 ! misha 3405: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 3406: if ((prop_chartype == prop_value) == prop_fail_result)
3407: RRETURN(MATCH_NOMATCH);
3408: }
3409: /* Control never gets here */
3410:
3411: case PT_SC:
3412: for (fi = min;; fi++)
3413: {
3414: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3415: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3416: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3417: GETCHARINC(c, eptr);
1.2 ! misha 3418: prop_script = UCD_SCRIPT(c);
1.1 misha 3419: if ((prop_script == prop_value) == prop_fail_result)
3420: RRETURN(MATCH_NOMATCH);
3421: }
3422: /* Control never gets here */
3423:
3424: default:
3425: RRETURN(PCRE_ERROR_INTERNAL);
3426: }
3427: }
3428:
3429: /* Match extended Unicode sequences. We will get here only if the
3430: support is in the binary; otherwise a compile-time error occurs. */
3431:
3432: else if (ctype == OP_EXTUNI)
3433: {
3434: for (fi = min;; fi++)
3435: {
3436: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3437: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3438: if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3439: GETCHARINCTEST(c, eptr);
1.2 ! misha 3440: prop_category = UCD_CATEGORY(c);
1.1 misha 3441: if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3442: while (eptr < md->end_subject)
3443: {
3444: int len = 1;
3445: if (!utf8) c = *eptr; else
3446: {
3447: GETCHARLEN(c, eptr, len);
3448: }
1.2 ! misha 3449: prop_category = UCD_CATEGORY(c);
1.1 misha 3450: if (prop_category != ucp_M) break;
3451: eptr += len;
3452: }
3453: }
3454: }
3455:
3456: else
3457: #endif /* SUPPORT_UCP */
3458:
3459: #ifdef SUPPORT_UTF8
3460: /* UTF-8 mode */
3461: if (utf8)
3462: {
3463: for (fi = min;; fi++)
3464: {
3465: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3466: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3467: if (fi >= max || eptr >= md->end_subject ||
3468: (ctype == OP_ANY && IS_NEWLINE(eptr)))
3469: RRETURN(MATCH_NOMATCH);
3470:
3471: GETCHARINC(c, eptr);
3472: switch(ctype)
3473: {
3474: case OP_ANY: /* This is the non-NL case */
3475: case OP_ALLANY:
3476: case OP_ANYBYTE:
3477: break;
3478:
3479: case OP_ANYNL:
3480: switch(c)
3481: {
3482: default: RRETURN(MATCH_NOMATCH);
3483: case 0x000d:
3484: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3485: break;
3486: case 0x000a:
3487: break;
3488:
3489: case 0x000b:
3490: case 0x000c:
3491: case 0x0085:
3492: case 0x2028:
3493: case 0x2029:
3494: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3495: break;
3496: }
3497: break;
3498:
3499: case OP_NOT_HSPACE:
3500: switch(c)
3501: {
3502: default: break;
3503: case 0x09: /* HT */
3504: case 0x20: /* SPACE */
3505: case 0xa0: /* NBSP */
3506: case 0x1680: /* OGHAM SPACE MARK */
3507: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3508: case 0x2000: /* EN QUAD */
3509: case 0x2001: /* EM QUAD */
3510: case 0x2002: /* EN SPACE */
3511: case 0x2003: /* EM SPACE */
3512: case 0x2004: /* THREE-PER-EM SPACE */
3513: case 0x2005: /* FOUR-PER-EM SPACE */
3514: case 0x2006: /* SIX-PER-EM SPACE */
3515: case 0x2007: /* FIGURE SPACE */
3516: case 0x2008: /* PUNCTUATION SPACE */
3517: case 0x2009: /* THIN SPACE */
3518: case 0x200A: /* HAIR SPACE */
3519: case 0x202f: /* NARROW NO-BREAK SPACE */
3520: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3521: case 0x3000: /* IDEOGRAPHIC SPACE */
3522: RRETURN(MATCH_NOMATCH);
3523: }
3524: break;
3525:
3526: case OP_HSPACE:
3527: switch(c)
3528: {
3529: default: RRETURN(MATCH_NOMATCH);
3530: case 0x09: /* HT */
3531: case 0x20: /* SPACE */
3532: case 0xa0: /* NBSP */
3533: case 0x1680: /* OGHAM SPACE MARK */
3534: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3535: case 0x2000: /* EN QUAD */
3536: case 0x2001: /* EM QUAD */
3537: case 0x2002: /* EN SPACE */
3538: case 0x2003: /* EM SPACE */
3539: case 0x2004: /* THREE-PER-EM SPACE */
3540: case 0x2005: /* FOUR-PER-EM SPACE */
3541: case 0x2006: /* SIX-PER-EM SPACE */
3542: case 0x2007: /* FIGURE SPACE */
3543: case 0x2008: /* PUNCTUATION SPACE */
3544: case 0x2009: /* THIN SPACE */
3545: case 0x200A: /* HAIR SPACE */
3546: case 0x202f: /* NARROW NO-BREAK SPACE */
3547: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3548: case 0x3000: /* IDEOGRAPHIC SPACE */
3549: break;
3550: }
3551: break;
3552:
3553: case OP_NOT_VSPACE:
3554: switch(c)
3555: {
3556: default: break;
3557: case 0x0a: /* LF */
3558: case 0x0b: /* VT */
3559: case 0x0c: /* FF */
3560: case 0x0d: /* CR */
3561: case 0x85: /* NEL */
3562: case 0x2028: /* LINE SEPARATOR */
3563: case 0x2029: /* PARAGRAPH SEPARATOR */
3564: RRETURN(MATCH_NOMATCH);
3565: }
3566: break;
3567:
3568: case OP_VSPACE:
3569: switch(c)
3570: {
3571: default: RRETURN(MATCH_NOMATCH);
3572: case 0x0a: /* LF */
3573: case 0x0b: /* VT */
3574: case 0x0c: /* FF */
3575: case 0x0d: /* CR */
3576: case 0x85: /* NEL */
3577: case 0x2028: /* LINE SEPARATOR */
3578: case 0x2029: /* PARAGRAPH SEPARATOR */
3579: break;
3580: }
3581: break;
3582:
3583: case OP_NOT_DIGIT:
3584: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3585: RRETURN(MATCH_NOMATCH);
3586: break;
3587:
3588: case OP_DIGIT:
3589: if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3590: RRETURN(MATCH_NOMATCH);
3591: break;
3592:
3593: case OP_NOT_WHITESPACE:
3594: if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3595: RRETURN(MATCH_NOMATCH);
3596: break;
3597:
3598: case OP_WHITESPACE:
3599: if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3600: RRETURN(MATCH_NOMATCH);
3601: break;
3602:
3603: case OP_NOT_WORDCHAR:
3604: if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3605: RRETURN(MATCH_NOMATCH);
3606: break;
3607:
3608: case OP_WORDCHAR:
3609: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3610: RRETURN(MATCH_NOMATCH);
3611: break;
3612:
3613: default:
3614: RRETURN(PCRE_ERROR_INTERNAL);
3615: }
3616: }
3617: }
3618: else
3619: #endif
3620: /* Not UTF-8 mode */
3621: {
3622: for (fi = min;; fi++)
3623: {
3624: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3625: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3626: if (fi >= max || eptr >= md->end_subject ||
3627: (ctype == OP_ANY && IS_NEWLINE(eptr)))
3628: RRETURN(MATCH_NOMATCH);
3629:
3630: c = *eptr++;
3631: switch(ctype)
3632: {
3633: case OP_ANY: /* This is the non-NL case */
3634: case OP_ALLANY:
3635: case OP_ANYBYTE:
3636: break;
3637:
3638: case OP_ANYNL:
3639: switch(c)
3640: {
3641: default: RRETURN(MATCH_NOMATCH);
3642: case 0x000d:
3643: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3644: break;
3645:
3646: case 0x000a:
3647: break;
3648:
3649: case 0x000b:
3650: case 0x000c:
3651: case 0x0085:
3652: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3653: break;
3654: }
3655: break;
3656:
3657: case OP_NOT_HSPACE:
3658: switch(c)
3659: {
3660: default: break;
3661: case 0x09: /* HT */
3662: case 0x20: /* SPACE */
3663: case 0xa0: /* NBSP */
3664: RRETURN(MATCH_NOMATCH);
3665: }
3666: break;
3667:
3668: case OP_HSPACE:
3669: switch(c)
3670: {
3671: default: RRETURN(MATCH_NOMATCH);
3672: case 0x09: /* HT */
3673: case 0x20: /* SPACE */
3674: case 0xa0: /* NBSP */
3675: break;
3676: }
3677: break;
3678:
3679: case OP_NOT_VSPACE:
3680: switch(c)
3681: {
3682: default: break;
3683: case 0x0a: /* LF */
3684: case 0x0b: /* VT */
3685: case 0x0c: /* FF */
3686: case 0x0d: /* CR */
3687: case 0x85: /* NEL */
3688: RRETURN(MATCH_NOMATCH);
3689: }
3690: break;
3691:
3692: case OP_VSPACE:
3693: switch(c)
3694: {
3695: default: RRETURN(MATCH_NOMATCH);
3696: case 0x0a: /* LF */
3697: case 0x0b: /* VT */
3698: case 0x0c: /* FF */
3699: case 0x0d: /* CR */
3700: case 0x85: /* NEL */
3701: break;
3702: }
3703: break;
3704:
3705: case OP_NOT_DIGIT:
3706: if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3707: break;
3708:
3709: case OP_DIGIT:
3710: if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3711: break;
3712:
3713: case OP_NOT_WHITESPACE:
3714: if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3715: break;
3716:
3717: case OP_WHITESPACE:
3718: if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3719: break;
3720:
3721: case OP_NOT_WORDCHAR:
3722: if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3723: break;
3724:
3725: case OP_WORDCHAR:
3726: if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3727: break;
3728:
3729: default:
3730: RRETURN(PCRE_ERROR_INTERNAL);
3731: }
3732: }
3733: }
3734: /* Control never gets here */
3735: }
3736:
3737: /* If maximizing, it is worth using inline code for speed, doing the type
3738: test once at the start (i.e. keep it out of the loop). Again, keep the
3739: UTF-8 and UCP stuff separate. */
3740:
3741: else
3742: {
3743: pp = eptr; /* Remember where we started */
3744:
3745: #ifdef SUPPORT_UCP
3746: if (prop_type >= 0)
3747: {
3748: switch(prop_type)
3749: {
3750: case PT_ANY:
3751: for (i = min; i < max; i++)
3752: {
3753: int len = 1;
3754: if (eptr >= md->end_subject) break;
3755: GETCHARLEN(c, eptr, len);
3756: if (prop_fail_result) break;
3757: eptr+= len;
3758: }
3759: break;
3760:
3761: case PT_LAMP:
3762: for (i = min; i < max; i++)
3763: {
3764: int len = 1;
3765: if (eptr >= md->end_subject) break;
3766: GETCHARLEN(c, eptr, len);
1.2 ! misha 3767: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 3768: if ((prop_chartype == ucp_Lu ||
3769: prop_chartype == ucp_Ll ||
3770: prop_chartype == ucp_Lt) == prop_fail_result)
3771: break;
3772: eptr+= len;
3773: }
3774: break;
3775:
3776: case PT_GC:
3777: for (i = min; i < max; i++)
3778: {
3779: int len = 1;
3780: if (eptr >= md->end_subject) break;
3781: GETCHARLEN(c, eptr, len);
1.2 ! misha 3782: prop_category = UCD_CATEGORY(c);
1.1 misha 3783: if ((prop_category == prop_value) == prop_fail_result)
3784: break;
3785: eptr+= len;
3786: }
3787: break;
3788:
3789: case PT_PC:
3790: for (i = min; i < max; i++)
3791: {
3792: int len = 1;
3793: if (eptr >= md->end_subject) break;
3794: GETCHARLEN(c, eptr, len);
1.2 ! misha 3795: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 3796: if ((prop_chartype == prop_value) == prop_fail_result)
3797: break;
3798: eptr+= len;
3799: }
3800: break;
3801:
3802: case PT_SC:
3803: for (i = min; i < max; i++)
3804: {
3805: int len = 1;
3806: if (eptr >= md->end_subject) break;
3807: GETCHARLEN(c, eptr, len);
1.2 ! misha 3808: prop_script = UCD_SCRIPT(c);
1.1 misha 3809: if ((prop_script == prop_value) == prop_fail_result)
3810: break;
3811: eptr+= len;
3812: }
3813: break;
3814: }
3815:
3816: /* eptr is now past the end of the maximum run */
3817:
3818: if (possessive) continue;
3819: for(;;)
3820: {
3821: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3822: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3823: if (eptr-- == pp) break; /* Stop if tried at original pos */
3824: if (utf8) BACKCHAR(eptr);
3825: }
3826: }
3827:
3828: /* Match extended Unicode sequences. We will get here only if the
3829: support is in the binary; otherwise a compile-time error occurs. */
3830:
3831: else if (ctype == OP_EXTUNI)
3832: {
3833: for (i = min; i < max; i++)
3834: {
3835: if (eptr >= md->end_subject) break;
3836: GETCHARINCTEST(c, eptr);
1.2 ! misha 3837: prop_category = UCD_CATEGORY(c);
1.1 misha 3838: if (prop_category == ucp_M) break;
3839: while (eptr < md->end_subject)
3840: {
3841: int len = 1;
3842: if (!utf8) c = *eptr; else
3843: {
3844: GETCHARLEN(c, eptr, len);
3845: }
1.2 ! misha 3846: prop_category = UCD_CATEGORY(c);
1.1 misha 3847: if (prop_category != ucp_M) break;
3848: eptr += len;
3849: }
3850: }
3851:
3852: /* eptr is now past the end of the maximum run */
3853:
3854: if (possessive) continue;
3855: for(;;)
3856: {
3857: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3858: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3859: if (eptr-- == pp) break; /* Stop if tried at original pos */
3860: for (;;) /* Move back over one extended */
3861: {
3862: int len = 1;
3863: if (!utf8) c = *eptr; else
3864: {
3865: BACKCHAR(eptr);
3866: GETCHARLEN(c, eptr, len);
3867: }
1.2 ! misha 3868: prop_category = UCD_CATEGORY(c);
1.1 misha 3869: if (prop_category != ucp_M) break;
3870: eptr--;
3871: }
3872: }
3873: }
3874:
3875: else
3876: #endif /* SUPPORT_UCP */
3877:
3878: #ifdef SUPPORT_UTF8
3879: /* UTF-8 mode */
3880:
3881: if (utf8)
3882: {
3883: switch(ctype)
3884: {
3885: case OP_ANY:
3886: if (max < INT_MAX)
3887: {
3888: for (i = min; i < max; i++)
3889: {
3890: if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3891: eptr++;
3892: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3893: }
3894: }
3895:
3896: /* Handle unlimited UTF-8 repeat */
3897:
3898: else
3899: {
3900: for (i = min; i < max; i++)
3901: {
3902: if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3903: eptr++;
3904: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3905: }
3906: }
3907: break;
3908:
3909: case OP_ALLANY:
3910: if (max < INT_MAX)
3911: {
3912: for (i = min; i < max; i++)
3913: {
3914: if (eptr >= md->end_subject) break;
3915: eptr++;
3916: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3917: }
3918: }
3919: else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3920: break;
3921:
3922: /* The byte case is the same as non-UTF8 */
3923:
3924: case OP_ANYBYTE:
3925: c = max - min;
3926: if (c > (unsigned int)(md->end_subject - eptr))
3927: c = md->end_subject - eptr;
3928: eptr += c;
3929: break;
3930:
3931: case OP_ANYNL:
3932: for (i = min; i < max; i++)
3933: {
3934: int len = 1;
3935: if (eptr >= md->end_subject) break;
3936: GETCHARLEN(c, eptr, len);
3937: if (c == 0x000d)
3938: {
3939: if (++eptr >= md->end_subject) break;
3940: if (*eptr == 0x000a) eptr++;
3941: }
3942: else
3943: {
3944: if (c != 0x000a &&
3945: (md->bsr_anycrlf ||
3946: (c != 0x000b && c != 0x000c &&
3947: c != 0x0085 && c != 0x2028 && c != 0x2029)))
3948: break;
3949: eptr += len;
3950: }
3951: }
3952: break;
3953:
3954: case OP_NOT_HSPACE:
3955: case OP_HSPACE:
3956: for (i = min; i < max; i++)
3957: {
3958: BOOL gotspace;
3959: int len = 1;
3960: if (eptr >= md->end_subject) break;
3961: GETCHARLEN(c, eptr, len);
3962: switch(c)
3963: {
3964: default: gotspace = FALSE; break;
3965: case 0x09: /* HT */
3966: case 0x20: /* SPACE */
3967: case 0xa0: /* NBSP */
3968: case 0x1680: /* OGHAM SPACE MARK */
3969: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3970: case 0x2000: /* EN QUAD */
3971: case 0x2001: /* EM QUAD */
3972: case 0x2002: /* EN SPACE */
3973: case 0x2003: /* EM SPACE */
3974: case 0x2004: /* THREE-PER-EM SPACE */
3975: case 0x2005: /* FOUR-PER-EM SPACE */
3976: case 0x2006: /* SIX-PER-EM SPACE */
3977: case 0x2007: /* FIGURE SPACE */
3978: case 0x2008: /* PUNCTUATION SPACE */
3979: case 0x2009: /* THIN SPACE */
3980: case 0x200A: /* HAIR SPACE */
3981: case 0x202f: /* NARROW NO-BREAK SPACE */
3982: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3983: case 0x3000: /* IDEOGRAPHIC SPACE */
3984: gotspace = TRUE;
3985: break;
3986: }
3987: if (gotspace == (ctype == OP_NOT_HSPACE)) break;
3988: eptr += len;
3989: }
3990: break;
3991:
3992: case OP_NOT_VSPACE:
3993: case OP_VSPACE:
3994: for (i = min; i < max; i++)
3995: {
3996: BOOL gotspace;
3997: int len = 1;
3998: if (eptr >= md->end_subject) break;
3999: GETCHARLEN(c, eptr, len);
4000: switch(c)
4001: {
4002: default: gotspace = FALSE; break;
4003: case 0x0a: /* LF */
4004: case 0x0b: /* VT */
4005: case 0x0c: /* FF */
4006: case 0x0d: /* CR */
4007: case 0x85: /* NEL */
4008: case 0x2028: /* LINE SEPARATOR */
4009: case 0x2029: /* PARAGRAPH SEPARATOR */
4010: gotspace = TRUE;
4011: break;
4012: }
4013: if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4014: eptr += len;
4015: }
4016: break;
4017:
4018: case OP_NOT_DIGIT:
4019: for (i = min; i < max; i++)
4020: {
4021: int len = 1;
4022: if (eptr >= md->end_subject) break;
4023: GETCHARLEN(c, eptr, len);
4024: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4025: eptr+= len;
4026: }
4027: break;
4028:
4029: case OP_DIGIT:
4030: for (i = min; i < max; i++)
4031: {
4032: int len = 1;
4033: if (eptr >= md->end_subject) break;
4034: GETCHARLEN(c, eptr, len);
4035: if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4036: eptr+= len;
4037: }
4038: break;
4039:
4040: case OP_NOT_WHITESPACE:
4041: for (i = min; i < max; i++)
4042: {
4043: int len = 1;
4044: if (eptr >= md->end_subject) break;
4045: GETCHARLEN(c, eptr, len);
4046: if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4047: eptr+= len;
4048: }
4049: break;
4050:
4051: case OP_WHITESPACE:
4052: for (i = min; i < max; i++)
4053: {
4054: int len = 1;
4055: if (eptr >= md->end_subject) break;
4056: GETCHARLEN(c, eptr, len);
4057: if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4058: eptr+= len;
4059: }
4060: break;
4061:
4062: case OP_NOT_WORDCHAR:
4063: for (i = min; i < max; i++)
4064: {
4065: int len = 1;
4066: if (eptr >= md->end_subject) break;
4067: GETCHARLEN(c, eptr, len);
4068: if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4069: eptr+= len;
4070: }
4071: break;
4072:
4073: case OP_WORDCHAR:
4074: for (i = min; i < max; i++)
4075: {
4076: int len = 1;
4077: if (eptr >= md->end_subject) break;
4078: GETCHARLEN(c, eptr, len);
4079: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4080: eptr+= len;
4081: }
4082: break;
4083:
4084: default:
4085: RRETURN(PCRE_ERROR_INTERNAL);
4086: }
4087:
4088: /* eptr is now past the end of the maximum run */
4089:
4090: if (possessive) continue;
4091: for(;;)
4092: {
4093: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4094: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4095: if (eptr-- == pp) break; /* Stop if tried at original pos */
4096: BACKCHAR(eptr);
4097: }
4098: }
4099: else
4100: #endif /* SUPPORT_UTF8 */
4101:
4102: /* Not UTF-8 mode */
4103: {
4104: switch(ctype)
4105: {
4106: case OP_ANY:
4107: for (i = min; i < max; i++)
4108: {
4109: if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4110: eptr++;
4111: }
4112: break;
4113:
4114: case OP_ALLANY:
4115: case OP_ANYBYTE:
4116: c = max - min;
4117: if (c > (unsigned int)(md->end_subject - eptr))
4118: c = md->end_subject - eptr;
4119: eptr += c;
4120: break;
4121:
4122: case OP_ANYNL:
4123: for (i = min; i < max; i++)
4124: {
4125: if (eptr >= md->end_subject) break;
4126: c = *eptr;
4127: if (c == 0x000d)
4128: {
4129: if (++eptr >= md->end_subject) break;
4130: if (*eptr == 0x000a) eptr++;
4131: }
4132: else
4133: {
4134: if (c != 0x000a &&
4135: (md->bsr_anycrlf ||
4136: (c != 0x000b && c != 0x000c && c != 0x0085)))
4137: break;
4138: eptr++;
4139: }
4140: }
4141: break;
4142:
4143: case OP_NOT_HSPACE:
4144: for (i = min; i < max; i++)
4145: {
4146: if (eptr >= md->end_subject) break;
4147: c = *eptr;
4148: if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4149: eptr++;
4150: }
4151: break;
4152:
4153: case OP_HSPACE:
4154: for (i = min; i < max; i++)
4155: {
4156: if (eptr >= md->end_subject) break;
4157: c = *eptr;
4158: if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4159: eptr++;
4160: }
4161: break;
4162:
4163: case OP_NOT_VSPACE:
4164: for (i = min; i < max; i++)
4165: {
4166: if (eptr >= md->end_subject) break;
4167: c = *eptr;
4168: if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4169: break;
4170: eptr++;
4171: }
4172: break;
4173:
4174: case OP_VSPACE:
4175: for (i = min; i < max; i++)
4176: {
4177: if (eptr >= md->end_subject) break;
4178: c = *eptr;
4179: if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4180: break;
4181: eptr++;
4182: }
4183: break;
4184:
4185: case OP_NOT_DIGIT:
4186: for (i = min; i < max; i++)
4187: {
4188: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4189: break;
4190: eptr++;
4191: }
4192: break;
4193:
4194: case OP_DIGIT:
4195: for (i = min; i < max; i++)
4196: {
4197: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4198: break;
4199: eptr++;
4200: }
4201: break;
4202:
4203: case OP_NOT_WHITESPACE:
4204: for (i = min; i < max; i++)
4205: {
4206: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4207: break;
4208: eptr++;
4209: }
4210: break;
4211:
4212: case OP_WHITESPACE:
4213: for (i = min; i < max; i++)
4214: {
4215: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4216: break;
4217: eptr++;
4218: }
4219: break;
4220:
4221: case OP_NOT_WORDCHAR:
4222: for (i = min; i < max; i++)
4223: {
4224: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4225: break;
4226: eptr++;
4227: }
4228: break;
4229:
4230: case OP_WORDCHAR:
4231: for (i = min; i < max; i++)
4232: {
4233: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4234: break;
4235: eptr++;
4236: }
4237: break;
4238:
4239: default:
4240: RRETURN(PCRE_ERROR_INTERNAL);
4241: }
4242:
4243: /* eptr is now past the end of the maximum run */
4244:
4245: if (possessive) continue;
4246: while (eptr >= pp)
4247: {
4248: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4249: eptr--;
4250: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4251: }
4252: }
4253:
4254: /* Get here if we can't make it match with any permitted repetitions */
4255:
4256: RRETURN(MATCH_NOMATCH);
4257: }
4258: /* Control never gets here */
4259:
4260: /* There's been some horrible disaster. Arrival here can only mean there is
4261: something seriously wrong in the code above or the OP_xxx definitions. */
4262:
4263: default:
4264: DPRINTF(("Unknown opcode %d\n", *ecode));
4265: RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
4266: }
4267:
4268: /* Do not stick any code in here without much thought; it is assumed
4269: that "continue" in the code above comes out to here to repeat the main
4270: loop. */
4271:
4272: } /* End of main loop */
4273: /* Control never reaches here */
4274:
4275:
4276: /* When compiling to use the heap rather than the stack for recursive calls to
4277: match(), the RRETURN() macro jumps here. The number that is saved in
4278: frame->Xwhere indicates which label we actually want to return to. */
4279:
4280: #ifdef NO_RECURSE
4281: #define LBL(val) case val: goto L_RM##val;
4282: HEAP_RETURN:
4283: switch (frame->Xwhere)
4284: {
4285: LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4286: LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4287: LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4288: LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4289: LBL(53) LBL(54)
4290: #ifdef SUPPORT_UTF8
4291: LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4292: LBL(32) LBL(34) LBL(42) LBL(46)
4293: #ifdef SUPPORT_UCP
4294: LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4295: #endif /* SUPPORT_UCP */
4296: #endif /* SUPPORT_UTF8 */
4297: default:
4298: DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4299: return PCRE_ERROR_INTERNAL;
4300: }
4301: #undef LBL
4302: #endif /* NO_RECURSE */
4303: }
4304:
4305:
4306: /***************************************************************************
4307: ****************************************************************************
4308: RECURSION IN THE match() FUNCTION
4309:
4310: Undefine all the macros that were defined above to handle this. */
4311:
4312: #ifdef NO_RECURSE /* build that uses the heap rather than the stack for recursive calls to match() */
4313: #undef eptr
4314: #undef ecode
4315: #undef mstart
4316: #undef offset_top
4317: #undef ims /* pcre_exec() below declares its own local named ims */
4318: #undef eptrb
4319: #undef flags
4320: /* NOTE(review): the matching #define for each name is outside this view; the blank-line grouping presumably mirrors those definitions -- confirm */
4321: #undef callpat
4322: #undef charptr
4323: #undef data
4324: #undef next
4325: #undef pp
4326: #undef prev
4327: #undef saved_eptr
4328:
4329: #undef new_recursive
4330:
4331: #undef cur_is_word
4332: #undef condition
4333: #undef prev_is_word
4334:
4335: #undef original_ims
4336:
4337: #undef ctype
4338: #undef length /* pcre_exec() below takes a parameter named length */
4339: #undef max
4340: #undef min
4341: #undef number
4342: #undef offset
4343: #undef op
4344: #undef save_capture_last
4345: #undef save_offset1
4346: #undef save_offset2
4347: #undef save_offset3
4348: #undef stacksave
4349:
4350: #undef newptrb
4351:
4352: #endif /* NO_RECURSE */
4353:
4354: /* fc and fi are defined as macros both with and without NO_RECURSE, so they are undefined unconditionally, outside the #ifdef above. */
4355:
4356: #undef fc
4357: #undef fi
4358:
4359: /***************************************************************************
4360: ***************************************************************************/
4361:
4362:
4363:
4364: /*************************************************
4365: * Execute a Regular Expression *
4366: *************************************************/
4367:
4368: /* This function applies a compiled re to a subject string and picks out
4369: portions of the string if it matches. Two elements in the vector are set for
4370: each substring: the offsets to the start and end of the substring.
4371:
4372: Arguments:
4373: argument_re points to the compiled expression
4374: extra_data points to extra data or is NULL
4375: subject points to the subject string
4376: length length of subject string (may contain binary zeros)
4377: start_offset where to start in the subject string
4378: options option bits
4379: offsets points to a vector of ints to be filled in with offsets
4380: offsetcount the number of elements in the vector
4381:
4382: Returns: > 0 => success; value is the number of elements filled in
4383: = 0 => success, but offsets is not big enough
4384: -1 => failed to match
4385: < -1 => some kind of unexpected problem
4386: */
4387:
1.2 ! misha 4388: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
1.1 misha 4389: pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4390: PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4391: int offsetcount)
4392: {
4393: int rc, resetcount, ocount;
4394: int first_byte = -1; /* -1 = no known first byte (tested with >= 0 below) */
4395: int req_byte = -1; /* -1 = no known required byte (tested with >= 0 below) */
4396: int req_byte2 = -1; /* other-case form of req_byte, set together with it */
4397: int newline;
4398: unsigned long int ims;
4399: BOOL using_temporary_offsets = FALSE; /* TRUE once md->offset_vector is our own allocation */
4400: BOOL anchored;
4401: BOOL startline;
4402: BOOL firstline;
4403: BOOL first_byte_caseless = FALSE;
4404: BOOL req_byte_caseless = FALSE;
4405: BOOL utf8;
4406: match_data match_block;
4407: match_data *md = &match_block;
4408: const uschar *tables;
4409: const uschar *start_bits = NULL;
4410: USPTR start_match = (USPTR)subject + start_offset; /* NOTE(review): formed before subject is NULL-checked below; neither start_offset nor length is range-checked in this function - callers presumably must pass 0 <= start_offset <= length, TODO confirm */
4411: USPTR end_subject;
4412: USPTR req_byte_ptr = start_match - 1; /* "not located yet": the first search position always compares greater */
4413:
4414: pcre_study_data internal_study;
4415: const pcre_study_data *study;
4416:
4417: real_pcre internal_re;
4418: const real_pcre *external_re = (const real_pcre *)argument_re;
4419: const real_pcre *re = external_re;
4420:
4421: /* Plausibility checks: unknown option bits, NULL pointers, negative count. */
4422:
4423: if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4424: if (re == NULL || subject == NULL ||
4425: (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL; /* offsets may be NULL only when offsetcount is 0 */
4426: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4427:
4428: /* Fish out the optional data from the extra_data structure, first setting
4429: the default values. */
4430:
4431: study = NULL;
4432: md->match_limit = MATCH_LIMIT;
4433: md->match_limit_recursion = MATCH_LIMIT_RECURSION;
4434: md->callout_data = NULL;
4435:
4436: /* The table pointer is always in native byte order. */
4437:
4438: tables = external_re->tables;
4439:
4440: if (extra_data != NULL)
4441: {
4442: register unsigned int flags = extra_data->flags; /* each field is honoured only if its flag bit is set */
4443: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4444: study = (const pcre_study_data *)extra_data->study_data;
4445: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4446: md->match_limit = extra_data->match_limit;
4447: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4448: md->match_limit_recursion = extra_data->match_limit_recursion;
4449: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4450: md->callout_data = extra_data->callout_data;
4451: if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables; /* overrides the pointer stored in the pattern */
4452: }
4453:
4454: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4455: is a feature that makes it possible to save compiled regexes and re-use them
4456: in other programs later. */
4457:
4458: if (tables == NULL) tables = _pcre_default_tables;
4459:
4460: /* Check that the first field in the block is the magic number. If it is not,
4461: test for a regex that was compiled on a host of opposite endianness. If this is
4462: the case, flipped values are put in internal_re and internal_study if there was
4463: study data too. */
4464:
4465: if (re->magic_number != MAGIC_NUMBER)
4466: {
4467: re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4468: if (re == NULL) return PCRE_ERROR_BADMAGIC;
4469: if (study != NULL) study = &internal_study; /* from here on use the flipped copy */
4470: }
4471:
4472: /* Set up other data */
4473:
4474: anchored = ((re->options | options) & PCRE_ANCHORED) != 0; /* compile-time or run-time request */
4475: startline = (re->flags & PCRE_STARTLINE) != 0;
4476: firstline = (re->options & PCRE_FIRSTLINE) != 0;
4477:
4478: /* The code starts after the real_pcre block and the capture name table. The
4479: base address is the caller's block (external_re); only the header fields are
4480: read through re, which may be the byte-flipped copy. */
4480: md->start_code = (const uschar *)external_re + re->name_table_offset +
4481: re->name_count * re->name_entry_size;
4482:
4483: md->start_subject = (USPTR)subject;
4484: md->start_offset = start_offset;
4485: md->end_subject = md->start_subject + length;
4486: end_subject = md->end_subject;
4487:
4488: md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4489: utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4490: md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4491:
4492: md->notbol = (options & PCRE_NOTBOL) != 0;
4493: md->noteol = (options & PCRE_NOTEOL) != 0;
4494: md->notempty = (options & PCRE_NOTEMPTY) != 0;
4495: md->partial = (options & PCRE_PARTIAL) != 0;
4496: md->hitend = FALSE;
4497:
4498: md->recursive = NULL; /* No recursion at top level */
4499:
4500: md->lcc = tables + lcc_offset;
4501: md->ctypes = tables + ctypes_offset;
4502:
4503: /* Handle different \R options. A run-time setting wins; with none, the
4504: compile-time setting applies; with neither, the build default is used. */
4505: switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4506: {
4507: case 0:
4508: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4509: md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4510: else
4511: #ifdef BSR_ANYCRLF
4512: md->bsr_anycrlf = TRUE;
4513: #else
4514: md->bsr_anycrlf = FALSE;
4515: #endif
4516: break;
4517:
4518: case PCRE_BSR_ANYCRLF:
4519: md->bsr_anycrlf = TRUE;
4520: break;
4521:
4522: case PCRE_BSR_UNICODE:
4523: md->bsr_anycrlf = FALSE;
4524: break;
4525:
4526: default: return PCRE_ERROR_BADNEWLINE; /* both BSR bits set at run time */
4527: }
4528:
4529: /* Handle different types of newline. The three bits give eight cases. If
4530: nothing is set at run time, whatever was used at compile time applies. */
4531:
4532: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4533: (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4534: {
4535: case 0: newline = NEWLINE; break; /* Compile-time default */
4536: case PCRE_NEWLINE_CR: newline = '\r'; break;
4537: case PCRE_NEWLINE_LF: newline = '\n'; break;
4538: case PCRE_NEWLINE_CR+
4539: PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break; /* two bytes packed: CR in the high byte, LF in the low */
4540: case PCRE_NEWLINE_ANY: newline = -1; break; /* negative values are type codes, not characters */
4541: case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4542: default: return PCRE_ERROR_BADNEWLINE;
4543: }
4544:
4545: if (newline == -2)
4546: {
4547: md->nltype = NLTYPE_ANYCRLF;
4548: }
4549: else if (newline < 0)
4550: {
4551: md->nltype = NLTYPE_ANY;
4552: }
4553: else
4554: {
4555: md->nltype = NLTYPE_FIXED;
4556: if (newline > 255) /* packed two-byte sequence: unpack high byte first */
4557: {
4558: md->nllen = 2;
4559: md->nl[0] = (newline >> 8) & 255;
4560: md->nl[1] = newline & 255;
4561: }
4562: else
4563: {
4564: md->nllen = 1;
4565: md->nl[0] = newline;
4566: }
4567: }
4568:
4569: /* Partial matching is supported only for a restricted set of regexes at the
4570: moment. */
4571:
4572: if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4573: return PCRE_ERROR_BADPARTIAL;
4574:
4575: /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4576: back the character offset. */
4577:
4578: #ifdef SUPPORT_UTF8
4579: if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4580: {
4581: if (_pcre_valid_utf8((uschar *)subject, length) >= 0) /* only a negative result is accepted as valid */
4582: return PCRE_ERROR_BADUTF8;
4583: if (start_offset > 0 && start_offset < length)
4584: {
4585: int tb = ((uschar *)subject)[start_offset];
4586: if (tb > 127)
4587: {
4588: tb &= 0xc0;
4589: if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET; /* top bits are 10: the offset lands on a continuation byte */
4590: }
4591: }
4592: }
4593: #endif
4594:
4595: /* The ims options can vary during the matching as a result of the presence
4596: of (?ims) items in the pattern. They are kept in a local variable so that
4597: restoring at the exit of a group is easy. */
4598:
4599: ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4600:
4601: /* If the expression has got more back references than the offsets supplied can
4602: hold, we get a temporary chunk of working store to use during the matching.
4603: Otherwise, we can use the vector supplied, rounding down its size to a multiple
4604: of 3. */
4605:
4606: ocount = offsetcount - (offsetcount % 3);
4607:
4608: if (re->top_backref > 0 && re->top_backref >= ocount/3)
4609: {
4610: ocount = re->top_backref * 3 + 3; /* three ints per group, for groups 0..top_backref */
4611: md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4612: if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4613: using_temporary_offsets = TRUE; /* every exit after this point frees the vector */
4614: DPRINTF(("Got memory to hold back references\n"));
4615: }
4616: else md->offset_vector = offsets;
4617:
4618: md->offset_end = ocount;
4619: md->offset_max = (2*ocount)/3; /* two-thirds of the vector; presumably the capture area, the rest being workspace - confirm in match() */
4620: md->offset_overflow = FALSE;
4621: md->capture_last = -1;
4622:
4623: /* Compute the minimum number of offsets that we need to reset each time. Doing
4624: this makes a huge difference to execution time when there aren't many brackets
4625: in the pattern. */
4626:
4627: resetcount = 2 + re->top_bracket * 2; /* one pair for the whole match plus one pair per bracket */
4628: if (resetcount > offsetcount) resetcount = ocount;
4629:
4630: /* Reset the working variable associated with each extraction. These should
4631: never be used unless previously set, but they get saved and restored, and so we
4632: initialize them to avoid reading uninitialized locations. */
4633:
4634: if (md->offset_vector != NULL)
4635: {
4636: register int *iptr = md->offset_vector + ocount;
4637: register int *iend = iptr - resetcount/2 + 1;
4638: while (--iptr >= iend) *iptr = -1; /* fills the last resetcount/2 - 1 slots, working backwards from the end */
4639: }
4640:
4641: /* Set up the first character to match, if available. The first_byte value is
4642: never set for an anchored regular expression, but the anchoring may be forced
4643: at run time, so we have to test for anchoring. The first char may be unset for
4644: an unanchored pattern, of course. If there's no first char and the pattern was
4645: studied, there may be a bitmap of possible first characters. */
4646:
4647: if (!anchored)
4648: {
4649: if ((re->flags & PCRE_FIRSTSET) != 0)
4650: {
4651: first_byte = re->first_byte & 255; /* low 8 bits are the byte; REQ_CASELESS is a flag bit above them */
4652: if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4653: first_byte = md->lcc[first_byte]; /* store the lcc-mapped form; the scan below maps subject bytes the same way */
4654: }
4655: else
4656: if (!startline && study != NULL &&
4657: (study->options & PCRE_STUDY_MAPPED) != 0)
4658: start_bits = study->start_bits;
4659: }
4660:
4661: /* For anchored or unanchored matches, there may be a "last known required
4662: character" set. */
4663:
4664: if ((re->flags & PCRE_REQCHSET) != 0)
4665: {
4666: req_byte = re->req_byte & 255;
4667: req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4668: req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4669: }
4670:
4671:
4672: /* ==========================================================================*/
4673:
4674: /* Loop for handling unanchored repeated matching attempts; for anchored regexes
4675: the loop runs just once. */
4676:
4677: for(;;)
4678: {
4679: USPTR save_end_subject = end_subject; /* end_subject may be shortened below for PCRE_FIRSTLINE */
4680: USPTR new_start_match;
4681:
4682: /* Reset the maximum number of extractions we might see. */
4683:
4684: if (md->offset_vector != NULL)
4685: {
4686: register int *iptr = md->offset_vector;
4687: register int *iend = iptr + resetcount;
4688: while (iptr < iend) *iptr++ = -1;
4689: }
4690:
4691: /* Advance to a unique first char if possible. If firstline is TRUE, the
4692: start of the match is constrained to the first line of a multiline string.
4693: That is, the match must be before or at the first newline. Implement this by
4694: temporarily adjusting end_subject so that we stop scanning at a newline. If
4695: the match fails at the newline, later code breaks this loop. */
4696:
4697: if (firstline)
4698: {
4699: USPTR t = start_match;
1.2 ! misha 4700: #ifdef SUPPORT_UTF8
! 4701: if (utf8)
! 4702: {
! 4703: while (t < md->end_subject && !IS_NEWLINE(t))
! 4704: {
! 4705: t++;
! 4706: while (t < end_subject && (*t & 0xc0) == 0x80) t++; /* step over UTF-8 continuation bytes (10xxxxxx) */
! 4707: }
! 4708: }
! 4709: else
! 4710: #endif
1.1 misha 4711: while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4712: end_subject = t; /* temporary limit; restored from save_end_subject after the scan */
4713: }
4714:
1.2 ! misha 4715: /* Now advance to a unique first byte if there is one. */
1.1 misha 4716:
4717: if (first_byte >= 0)
4718: {
4719: if (first_byte_caseless)
1.2 ! misha 4720: while (start_match < end_subject && md->lcc[*start_match] != first_byte)
! 4721: start_match++;
1.1 misha 4722: else
4723: while (start_match < end_subject && *start_match != first_byte)
1.2 ! misha 4724: start_match++;
1.1 misha 4725: }
4726:
1.2 ! misha 4727: /* Or to just after a linebreak for a multiline match */
1.1 misha 4728:
4729: else if (startline)
4730: {
4731: if (start_match > md->start_subject + start_offset) /* false on the first attempt, so the initial position is tried as it stands */
4732: {
1.2 ! misha 4733: #ifdef SUPPORT_UTF8
! 4734: if (utf8)
! 4735: {
! 4736: while (start_match < end_subject && !WAS_NEWLINE(start_match))
! 4737: {
! 4738: start_match++;
! 4739: while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
! 4740: start_match++;
! 4741: }
! 4742: }
! 4743: else
! 4744: #endif
! 4745: while (start_match < end_subject && !WAS_NEWLINE(start_match))
! 4746: start_match++;
1.1 misha 4747:
4748: /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4749: and we are now at a LF, advance the match position by one more character.
4750: */
4751:
4752: if (start_match[-1] == '\r' &&
4753: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4754: start_match < end_subject &&
4755: *start_match == '\n')
4756: start_match++;
4757: }
4758: }
4759:
1.2 ! misha 4760: /* Or to a non-unique first byte after study */
1.1 misha 4761:
4762: else if (start_bits != NULL)
4763: {
4764: while (start_match < end_subject)
4765: {
4766: register unsigned int c = *start_match;
1.2 ! misha 4767: if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; /* bit for byte c is clear: it cannot start a match */
! 4768: else break;
1.1 misha 4769: }
4770: }
4771:
4772: /* Restore fudged end_subject */
4773:
4774: end_subject = save_end_subject;
4775:
4776: #ifdef DEBUG /* Sigh. Some compilers never learn. */
4777: printf(">>>> Match against: ");
4778: pchars(start_match, end_subject - start_match, TRUE, md);
4779: printf("\n");
4780: #endif
4781:
4782: /* If req_byte is set, we know that that character must appear in the subject
4783: for the match to succeed. If the first character is set, req_byte must be
4784: later in the subject; otherwise the test starts at the match point. This
4785: optimization can save a huge amount of backtracking in patterns with nested
4786: unlimited repeats that aren't going to match. Writing separate code for
4787: cased/caseless versions makes it go faster, as does using an autoincrement
4788: and backing off on a match.
4789:
4790: HOWEVER: when the subject string is very, very long, searching to its end can
4791: take a long time, and give bad performance on quite ordinary patterns. This
4792: showed up when somebody was matching something like /^\d+C/ on a 32-megabyte
4793: string... so we don't do this when the string is sufficiently long.
4794:
4795: ALSO: this processing is disabled when partial matching is requested.
4796: */
4797:
4798: if (req_byte >= 0 &&
4799: end_subject - start_match < REQ_BYTE_MAX &&
4800: !md->partial)
4801: {
4802: register USPTR p = start_match + ((first_byte >= 0)? 1 : 0); /* with a known first byte, the required byte must come after it */
4803:
4804: /* We don't need to repeat the search if we haven't yet reached the
4805: place we found it at last time. */
4806:
4807: if (p > req_byte_ptr)
4808: {
4809: if (req_byte_caseless)
4810: {
4811: while (p < end_subject)
4812: {
4813: register int pp = *p++;
4814: if (pp == req_byte || pp == req_byte2) { p--; break; } /* back off the autoincrement so p points at the hit */
4815: }
4816: }
4817: else
4818: {
4819: while (p < end_subject)
4820: {
4821: if (*p++ == req_byte) { p--; break; }
4822: }
4823: }
4824:
4825: /* If we can't find the required character, break the matching loop,
4826: forcing a match failure. */
4827:
4828: if (p >= end_subject)
4829: {
4830: rc = MATCH_NOMATCH;
4831: break;
4832: }
4833:
4834: /* If we have found the required character, save the point where we
4835: found it, so that we don't search again next time round the loop if
4836: the start hasn't passed this character yet. */
4837:
4838: req_byte_ptr = p;
4839: }
4840: }
4841:
4842: /* OK, we can now run the match. */
4843:
4844: md->start_match_ptr = start_match;
4845: md->match_call_count = 0; /* reset for every starting position */
4846: rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4847:
4848: switch(rc)
4849: {
4850: /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4851: exactly like PRUNE. */
4852:
4853: case MATCH_NOMATCH:
4854: case MATCH_PRUNE:
4855: case MATCH_THEN:
4856: new_start_match = start_match + 1;
4857: #ifdef SUPPORT_UTF8
4858: if (utf8)
4859: while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4860: new_start_match++;
4861: #endif
4862: break;
4863:
4864: /* SKIP passes back the next starting point explicitly. */
4865:
4866: case MATCH_SKIP:
4867: new_start_match = md->start_match_ptr;
4868: break;
4869:
4870: /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4871:
4872: case MATCH_COMMIT:
4873: rc = MATCH_NOMATCH;
4874: goto ENDLOOP;
4875:
4876: /* Any other return is a successful match or some kind of error; both leave
4877: the loop with rc unchanged. */
4878: default:
4879: goto ENDLOOP;
4880: }
4881:
4882: /* Control reaches here for the various types of "no match at this point"
4883: result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4884:
4885: rc = MATCH_NOMATCH;
4886:
4887: /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4888: newline in the subject (though it may continue over the newline). Therefore,
4889: if we have just failed to match, starting at a newline, do not continue. */
4890:
4891: if (firstline && IS_NEWLINE(start_match)) break;
4892:
4893: /* Advance to new matching position */
4894:
4895: start_match = new_start_match;
4896:
4897: /* Break the loop if the pattern is anchored or if we have passed the end of
4898: the subject. */
4899:
4900: if (anchored || start_match > end_subject) break; /* strictly greater: an attempt exactly at end_subject is still made */
4901:
4902: /* If we have just passed a CR and we are now at a LF, and the pattern does
4903: not contain any explicit matches for \r or \n, and the newline option is CRLF
4904: or ANY or ANYCRLF, advance the match position by one more character. */
4905:
4906: if (start_match[-1] == '\r' &&
4907: start_match < end_subject &&
4908: *start_match == '\n' &&
4909: (re->flags & PCRE_HASCRORLF) == 0 &&
4910: (md->nltype == NLTYPE_ANY ||
4911: md->nltype == NLTYPE_ANYCRLF ||
4912: md->nllen == 2))
4913: start_match++;
4914:
4915: } /* End of for(;;) "bumpalong" loop */
4916:
4917: /* ==========================================================================*/
4918:
4919: /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4920: conditions is true:
4921:
4922: (1) The pattern is anchored or the match was failed by (*COMMIT);
4923:
4924: (2) We are past the end of the subject;
4925:
4926: (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4927: this option requests that a match occur at or before the first newline in
4928: the subject.
4929:
4930: When we have a match and the offset vector is big enough to deal with any
4931: backreferences, captured substring offsets will already be set up. In the case
4932: where we had to get some local store to hold offsets for backreference
4933: processing, copy those that we can. In this case there need not be overflow if
4934: certain parts of the pattern were not used, even though there are more
4935: capturing parentheses than vector slots. */
4936:
4937: ENDLOOP:
4938:
4939: if (rc == MATCH_MATCH)
4940: {
4941: if (using_temporary_offsets)
4942: {
4943: if (offsetcount >= 4) /* slots 0 and 1 are written separately below, so copy from slot 2 on */
4944: {
4945: memcpy(offsets + 2, md->offset_vector + 2,
4946: (offsetcount - 2) * sizeof(int));
4947: DPRINTF(("Copied offsets from temporary memory\n"));
4948: }
4949: if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
4950: DPRINTF(("Freeing temporary memory\n"));
4951: (pcre_free)(md->offset_vector);
4952: }
4953:
4954: /* Set the return code to the number of captured strings, or 0 if there are
4955: too many to fit into the vector. */
4956:
4957: rc = md->offset_overflow? 0 : md->end_offset_top/2; /* two ints per captured string */
4958:
4959: /* If there is space, set up the whole thing as substring 0. The value of
4960: md->start_match_ptr might be modified if \K was encountered on the successful
4961: matching path. */
4962:
4963: if (offsetcount < 2) rc = 0; else
4964: {
4965: offsets[0] = md->start_match_ptr - md->start_subject;
4966: offsets[1] = md->end_match_ptr - md->start_subject;
4967: }
4968:
4969: DPRINTF((">>>> returning %d\n", rc));
4970: return rc;
4971: }
4972:
4973: /* Control gets here if there has been an error, or if the overall match
4974: attempt has failed at all permitted starting positions. */
4975:
4976: if (using_temporary_offsets)
4977: {
4978: DPRINTF(("Freeing temporary memory\n"));
4979: (pcre_free)(md->offset_vector);
4980: }
4981:
4982: if (rc != MATCH_NOMATCH) /* an error code from match(): pass it through unchanged */
4983: {
4984: DPRINTF((">>>> error: returning %d\n", rc));
4985: return rc;
4986: }
4987: else if (md->partial && md->hitend) /* partial matching was requested and md->hitend was set during matching */
4988: {
4989: DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
4990: return PCRE_ERROR_PARTIAL;
4991: }
4992: else
4993: {
4994: DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
4995: return PCRE_ERROR_NOMATCH;
4996: }
4997: }
4998:
4999: /* End of pcre_exec.c */
E-mail: