Annotation of win32/pcre/pcre_exec.c, revision 1.4
1.1 misha 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
1.4 ! misha 9: Copyright (c) 1997-2010 University of Cambridge
1.1 misha 10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40:
41: /* This module contains pcre_exec(), the externally visible function that does
42: pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43: possible. There are also some static supporting functions. */
44:
45: #ifdef HAVE_CONFIG_H
46: #include "config.h"
47: #endif
48:
49: #define NLBLOCK md /* Block containing newline information */
50: #define PSSTART start_subject /* Field containing processed string start */
51: #define PSEND end_subject /* Field containing processed string end */
52:
53: #include "pcre_internal.h"
54:
55: /* Undefine some potentially clashing cpp symbols */
56:
57: #undef min
58: #undef max
59:
60: /* Flag bits for the match() function */
61:
62: #define match_condassert 0x01 /* Called to check a condition assertion */
63: #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64:
65: /* Non-error returns from the match() function. Error returns are externally
66: defined PCRE_ERROR_xxx codes, which are all negative. */
67:
68: #define MATCH_MATCH 1
69: #define MATCH_NOMATCH 0
70:
71: /* Special internal returns from the match() function. Make them sufficiently
72: negative to avoid the external error codes. */
73:
/* In the opcode handling at the start of match(): an OP_MARK whose name equals
the pending skip name turns MATCH_SKIP_ARG into MATCH_SKIP, and MATCH_THEN is
handled like MATCH_NOMATCH while stepping through the alternatives of a
bracket. */
1.4 ! misha 74: #define MATCH_ACCEPT (-999)
! 75: #define MATCH_COMMIT (-998)
! 76: #define MATCH_PRUNE (-997)
! 77: #define MATCH_SKIP (-996)
! 78: #define MATCH_SKIP_ARG (-995)
! 79: #define MATCH_THEN (-994)
! 80:
! 81: /* This is a convenience macro for code that occurs many times. */
! 82:
/* MRRETURN(ra) copies the current markptr into md->mark and then leaves through
RRETURN(ra). It expands to a bare brace block that refers to md and markptr by
name, so both must be in scope wherever it is used. */
! 83: #define MRRETURN(ra) \
! 84: { \
! 85: md->mark = markptr; \
! 86: RRETURN(ra); \
! 87: }
1.1 misha 88:
89: /* Maximum number of ints of offset to save on the stack for recursive calls.
90: If the offset vector is bigger, malloc is used. This should be a multiple of 3,
91: because the offset vector is always a multiple of 3 long. */
92:
/* This value sizes the stacksave array that match() keeps per activation
(Xstacksave in the heap frame, stacksave in the recursive build). */
93: #define REC_STACK_SAVE_MAX 30
94:
95: /* Min and max values for the common repeats; for the maxima, 0 => infinity */
96:
/* NOTE(review): both tables have six entries; presumably they are indexed by a
repeat opcode relative to the first opcode of its group (star, minstar, plus,
minplus, query, minquery) - confirm against the code that reads them. */
97: static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
98: static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
99:
100:
101:
1.4 ! misha 102: #ifdef PCRE_DEBUG
1.1 misha 103: /*************************************************
104: * Debugging function to print chars *
105: *************************************************/
106:
107: /* Print a sequence of chars in printable format, stopping at the end of the
108: subject if the requested.
109:
110: Arguments:
111: p points to characters
112: length number to print
113: is_subject TRUE if printing from within md->start_subject
114: md pointer to matching data block, if is_subject is TRUE
115:
116: Returns: nothing
117: */
118:
119: static void
120: pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
121: {
122: unsigned int c;
123: if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
124: while (length-- > 0)
125: if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
126: }
127: #endif
128:
129:
130:
131: /*************************************************
132: * Match a back-reference *
133: *************************************************/
134:
135: /* If a back reference hasn't been set, the length that is passed is greater
136: than the number of characters left in the string, so the match fails.
137:
138: Arguments:
139: offset index into the offset vector
140: eptr points into the subject
141: length length to be matched
142: md points to match data block
143: ims the ims flags
144:
145: Returns: TRUE if matched
146: */
147:
148: static BOOL
149: match_ref(int offset, register USPTR eptr, int length, match_data *md,
150: unsigned long int ims)
151: {
152: USPTR p = md->start_subject + md->offset_vector[offset];
153:
1.4 ! misha 154: #ifdef PCRE_DEBUG
1.1 misha 155: if (eptr >= md->end_subject)
156: printf("matching subject <null>");
157: else
158: {
159: printf("matching subject ");
160: pchars(eptr, length, TRUE, md);
161: }
162: printf(" against backref ");
163: pchars(p, length, FALSE, md);
164: printf("\n");
165: #endif
166:
167: /* Always fail if not enough characters left */
168:
169: if (length > md->end_subject - eptr) return FALSE;
170:
1.2 misha 171: /* Separate the caseless case for speed. In UTF-8 mode we can only do this
172: properly if Unicode properties are supported. Otherwise, we can check only
173: ASCII characters. */
1.1 misha 174:
175: if ((ims & PCRE_CASELESS) != 0)
176: {
1.2 misha 177: #ifdef SUPPORT_UTF8
178: #ifdef SUPPORT_UCP
179: if (md->utf8)
180: {
181: USPTR endptr = eptr + length;
182: while (eptr < endptr)
183: {
184: int c, d;
185: GETCHARINC(c, eptr);
186: GETCHARINC(d, p);
187: if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
188: }
189: }
190: else
191: #endif
192: #endif
193:
194: /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
195: is no UCP support. */
196:
1.1 misha 197: while (length-- > 0)
1.2 misha 198: { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
1.1 misha 199: }
1.2 misha 200:
201: /* In the caseful case, we can just compare the bytes, whether or not we
202: are in UTF-8 mode. */
203:
1.1 misha 204: else
205: { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
206:
207: return TRUE;
208: }
209:
210:
211:
212: /***************************************************************************
213: ****************************************************************************
214: RECURSION IN THE match() FUNCTION
215:
216: The match() function is highly recursive, though not every recursive call
217: increases the recursive depth. Nevertheless, some regular expressions can cause
218: it to recurse to a great depth. I was writing for Unix, so I just let it call
219: itself recursively. This uses the stack for saving everything that has to be
220: saved for a recursive call. On Unix, the stack can be large, and this works
221: fine.
222:
223: It turns out that on some non-Unix-like systems there are problems with
224: programs that use a lot of stack. (This despite the fact that every last chip
225: has oodles of memory these days, and techniques for extending the stack have
226: been known for decades.) So....
227:
228: There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
229: calls by keeping local variables that need to be preserved in blocks of memory
230: obtained from malloc() instead of on the stack. Macros are used to
231: achieve this so that the actual code doesn't look very different to what it
232: always used to.
233:
234: The original heap-recursive code used longjmp(). However, it seems that this
235: can be very slow on some operating systems. Following a suggestion from Stan
236: Switzer, the use of longjmp() has been abolished, at the cost of having to
237: provide a unique number for each call to RMATCH. There is no way of generating
238: a sequence of numbers at compile time in C. I have given them names, to make
239: them stand out more clearly.
240:
241: Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
242: FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
243: tests. Furthermore, not using longjmp() means that local dynamic variables
244: don't have indeterminate values; this has meant that the frame size can be
245: reduced because the result can be "passed back" by straight setting of the
246: variable instead of being passed in the frame.
247: ****************************************************************************
248: ***************************************************************************/
249:
250: /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
251: below must be updated in sync. */
252:
/* Each RMn value is passed as the final (rw) argument of an RMATCH call. The
heap-based RMATCH stores it in frame->Xwhere and pastes it into a label name
(L_RMn) that marks the point where execution resumes; the stack-based RMATCH
ignores it. */
253: enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
254: RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
255: RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
256: RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
257: RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
1.4 ! misha 258: RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
! 259: RM61, RM62 };
1.1 misha 260:
261: /* These versions of the macros use the stack, as normal. There are debugging
262: versions and production versions. Note that the "rw" argument of RMATCH isn't
1.4 ! misha 263: actually used in this definition. */
1.1 misha 264:
265: #ifndef NO_RECURSE
266: #define REGISTER register
267:
/* Debug build: RMATCH prints the calling line, performs a real recursive call
to match() and stores its result in rrc, then prints the line again. mstart,
markptr and rdepth are picked up by name from the point of use, and the depth
passed down is rdepth+1. RRETURN prints the value and line before returning. */
1.4 ! misha 268: #ifdef PCRE_DEBUG
1.1 misha 269: #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
270: { \
271: printf("match() called in line %d\n", __LINE__); \
1.4 ! misha 272: rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1); \
1.1 misha 273: printf("to line %d\n", __LINE__); \
274: }
275: #define RRETURN(ra) \
276: { \
277: printf("match() returned %d from line %d ", ra, __LINE__); \
278: return ra; \
279: }
280: #else
/* Production build: RMATCH is just the recursive call with its result assigned
to rrc, and RRETURN is a plain return statement. Neither supplies a trailing
semicolon or braces of its own. */
281: #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
1.4 ! misha 282: rrc = match(ra,rb,mstart,markptr,rc,rd,re,rf,rg,rdepth+1)
1.1 misha 283: #define RRETURN(ra) return ra
284: #endif
285:
286: #else
287:
288:
289: /* These versions of the macros manage a private stack on the heap. Note that
290: the "rd" argument of RMATCH isn't actually used in this definition. It's the md
291: argument of match(), which never changes. */
292:
293: #define REGISTER
294:
/* RMATCH (heap version): obtain a new heapframe from pcre_stack_malloc, record
the resume number rw in the current frame's Xwhere, fill in the new frame's
argument slots (mstart and markptr are taken by name from the point of use,
and Xrdepth is one more than the current frame's), chain the new frame to the
current one, make it current and jump to HEAP_RECURSE. The label L_<rw> is
where execution continues afterwards. If the allocation fails,
PCRE_ERROR_NOMEMORY is passed back through RRETURN. */
295: #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
296: {\
297: heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
1.4 ! misha 298: if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
1.1 misha 299: frame->Xwhere = rw; \
300: newframe->Xeptr = ra;\
301: newframe->Xecode = rb;\
302: newframe->Xmstart = mstart;\
1.4 ! misha 303: newframe->Xmarkptr = markptr;\
1.1 misha 304: newframe->Xoffset_top = rc;\
305: newframe->Xims = re;\
306: newframe->Xeptrb = rf;\
307: newframe->Xflags = rg;\
308: newframe->Xrdepth = frame->Xrdepth + 1;\
309: newframe->Xprevframe = frame;\
310: frame = newframe;\
311: DPRINTF(("restarting from line %d\n", __LINE__));\
312: goto HEAP_RECURSE;\
313: L_##rw:\
314: DPRINTF(("jumped back to line %d\n", __LINE__));\
315: }
316:
/* RRETURN (heap version): release the current frame with pcre_stack_free and
step back to its predecessor. If there is a predecessor, the result is left in
rrc and control goes to HEAP_RETURN; otherwise this was the top-level frame and
the function returns ra.
NOTE(review): frame is dereferenced unconditionally, so this macro must not be
used while frame is NULL. The allocation check at the top of match()
("if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);") invokes it in exactly that
state - confirm, and consider a plain return there. */
317: #define RRETURN(ra)\
318: {\
1.4 ! misha 319: heapframe *oldframe = frame;\
! 320: frame = oldframe->Xprevframe;\
! 321: (pcre_stack_free)(oldframe);\
1.1 misha 322: if (frame != NULL)\
323: {\
324: rrc = ra;\
325: goto HEAP_RETURN;\
326: }\
327: return ra;\
328: }
329:
330:
331: /* Structure for remembering the local variables in a private frame */
332:
333: typedef struct heapframe {
334: struct heapframe *Xprevframe;
335:
336: /* Function arguments that may change */
337:
1.3 misha 338: USPTR Xeptr;
1.1 misha 339: const uschar *Xecode;
1.3 misha 340: USPTR Xmstart;
1.4 ! misha 341: USPTR Xmarkptr;
1.1 misha 342: int Xoffset_top;
343: long int Xims;
344: eptrblock *Xeptrb;
345: int Xflags;
346: unsigned int Xrdepth;
347:
348: /* Function local variables */
349:
1.3 misha 350: USPTR Xcallpat;
351: #ifdef SUPPORT_UTF8
352: USPTR Xcharptr;
353: #endif
354: USPTR Xdata;
355: USPTR Xnext;
356: USPTR Xpp;
357: USPTR Xprev;
358: USPTR Xsaved_eptr;
1.1 misha 359:
360: recursion_info Xnew_recursive;
361:
362: BOOL Xcur_is_word;
363: BOOL Xcondition;
364: BOOL Xprev_is_word;
365:
366: unsigned long int Xoriginal_ims;
367:
368: #ifdef SUPPORT_UCP
369: int Xprop_type;
370: int Xprop_value;
371: int Xprop_fail_result;
372: int Xprop_category;
373: int Xprop_chartype;
374: int Xprop_script;
375: int Xoclength;
376: uschar Xocchars[8];
377: #endif
378:
1.3 misha 379: int Xcodelink;
1.1 misha 380: int Xctype;
381: unsigned int Xfc;
382: int Xfi;
383: int Xlength;
384: int Xmax;
385: int Xmin;
386: int Xnumber;
387: int Xoffset;
388: int Xop;
389: int Xsave_capture_last;
390: int Xsave_offset1, Xsave_offset2, Xsave_offset3;
391: int Xstacksave[REC_STACK_SAVE_MAX];
392:
393: eptrblock Xnewptrb;
394:
395: /* Where to jump back to */
396:
397: int Xwhere;
398:
399: } heapframe;
400:
401: #endif
402:
403:
404: /***************************************************************************
405: ***************************************************************************/
406:
407:
408:
409: /*************************************************
410: * Match from current position *
411: *************************************************/
412:
413: /* This function is called recursively in many circumstances. Whenever it
414: returns a negative (error) response, the outer incarnation must also return the
1.4 ! misha 415: same response. */
! 416:
! 417: /* These macros pack up tests that are used for partial matching, and which
! 418: appear several times in the code. We set the "hit end" flag if the pointer is
! 419: at the end of the subject and also past the start of the subject (i.e.
! 420: something has been matched). For hard partial matching, we then return
! 421: immediately. The second one is used when we already know we are past the end of
! 422: the subject. */
! 423:
/* CHECK_PARTIAL: when partial matching is enabled (md->partial != 0) and eptr
has reached md->end_subject after moving beyond mstart, set md->hitend; if
md->partial is greater than 1, return PCRE_ERROR_PARTIAL at once through
MRRETURN. The expansion is a bare "if" statement that names md, eptr and mstart
directly, so use it only as a complete statement where those are in scope. */
! 424: #define CHECK_PARTIAL()\
! 425: if (md->partial != 0 && eptr >= md->end_subject && eptr > mstart)\
! 426: {\
! 427: md->hitend = TRUE;\
! 428: if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\
! 429: }
1.1 misha 430:
/* SCHECK_PARTIAL: the same action without the end-of-subject test, for places
that already know eptr is at or beyond the end of the subject. */
1.4 ! misha 431: #define SCHECK_PARTIAL()\
! 432: if (md->partial != 0 && eptr > mstart)\
! 433: {\
! 434: md->hitend = TRUE;\
! 435: if (md->partial > 1) MRRETURN(PCRE_ERROR_PARTIAL);\
! 436: }
! 437:
! 438:
! 439: /* Performance note: It might be tempting to extract commonly used fields from
! 440: the md structure (e.g. utf8, end_subject) into individual variables to improve
1.1 misha 441: performance. Tests using gcc on a SPARC disproved this; in the first case, it
442: made performance worse.
443:
444: Arguments:
445: eptr pointer to current character in subject
446: ecode pointer to current position in compiled code
447: mstart pointer to the current match start position (can be modified
448: by encountering \K)
1.4 ! misha 449: markptr pointer to the most recent MARK name, or NULL
1.1 misha 450: offset_top current top pointer
451: md pointer to "static" info for the match
452: ims current /i, /m, and /s options
453: eptrb pointer to chain of blocks containing eptr at start of
454: brackets - for testing for empty matches
455: flags can contain
456: match_condassert - this is an assertion condition
457: match_cbegroup - this is the start of an unlimited repeat
458: group that can match an empty string
459: rdepth the recursion depth
460:
461: Returns: MATCH_MATCH if matched ) these values are >= 0
462: MATCH_NOMATCH if failed to match )
1.4 ! misha 463: a negative MATCH_xxx value for PRUNE, SKIP, etc
1.1 misha 464: a negative PCRE_ERROR_xxx value if aborted by an error condition
465: (e.g. stopped by repeated call or recursion limit)
466: */
467:
468: static int
1.3 misha 469: match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
1.4 ! misha 470: const uschar *markptr, int offset_top, match_data *md, unsigned long int ims,
! 471: eptrblock *eptrb, int flags, unsigned int rdepth)
1.1 misha 472: {
473: /* These variables do not need to be preserved over recursion in this function,
474: so they can be ordinary variables in all cases. Mark some of them with
475: "register" because they are used a lot in loops. */
476:
477: register int rrc; /* Returns from recursive calls */
478: register int i; /* Used for loops not involving calls to RMATCH() */
479: register unsigned int c; /* Character values not kept over RMATCH() calls */
480: register BOOL utf8; /* Local copy of UTF-8 flag for speed */
481:
482: BOOL minimize, possessive; /* Quantifier options */
1.3 misha 483: int condcode;
1.1 misha 484:
485: /* When recursion is not being used, all "local" variables that have to be
486: preserved over calls to RMATCH() are part of a "frame" which is obtained from
487: heap storage. Set up the top-level frame here; others are obtained from the
488: heap whenever RMATCH() does a "recursion". See the macro definitions above. */
489:
490: #ifdef NO_RECURSE
491: heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
1.4 ! misha 492: if (frame == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1.1 misha 493: frame->Xprevframe = NULL; /* Marks the top level */
494:
495: /* Copy in the original argument variables */
496:
497: frame->Xeptr = eptr;
498: frame->Xecode = ecode;
499: frame->Xmstart = mstart;
1.4 ! misha 500: frame->Xmarkptr = markptr;
1.1 misha 501: frame->Xoffset_top = offset_top;
502: frame->Xims = ims;
503: frame->Xeptrb = eptrb;
504: frame->Xflags = flags;
505: frame->Xrdepth = rdepth;
506:
507: /* This is where control jumps back to to effect "recursion" */
508:
509: HEAP_RECURSE:
510:
511: /* Macros make the argument variables come from the current frame */
512:
513: #define eptr frame->Xeptr
514: #define ecode frame->Xecode
515: #define mstart frame->Xmstart
1.4 ! misha 516: #define markptr frame->Xmarkptr
1.1 misha 517: #define offset_top frame->Xoffset_top
518: #define ims frame->Xims
519: #define eptrb frame->Xeptrb
520: #define flags frame->Xflags
521: #define rdepth frame->Xrdepth
522:
523: /* Ditto for the local variables */
524:
525: #ifdef SUPPORT_UTF8
526: #define charptr frame->Xcharptr
527: #endif
528: #define callpat frame->Xcallpat
1.3 misha 529: #define codelink frame->Xcodelink
1.1 misha 530: #define data frame->Xdata
531: #define next frame->Xnext
532: #define pp frame->Xpp
533: #define prev frame->Xprev
534: #define saved_eptr frame->Xsaved_eptr
535:
536: #define new_recursive frame->Xnew_recursive
537:
538: #define cur_is_word frame->Xcur_is_word
539: #define condition frame->Xcondition
540: #define prev_is_word frame->Xprev_is_word
541:
542: #define original_ims frame->Xoriginal_ims
543:
544: #ifdef SUPPORT_UCP
545: #define prop_type frame->Xprop_type
546: #define prop_value frame->Xprop_value
547: #define prop_fail_result frame->Xprop_fail_result
548: #define prop_category frame->Xprop_category
549: #define prop_chartype frame->Xprop_chartype
550: #define prop_script frame->Xprop_script
551: #define oclength frame->Xoclength
552: #define occhars frame->Xocchars
553: #endif
554:
555: #define ctype frame->Xctype
556: #define fc frame->Xfc
557: #define fi frame->Xfi
558: #define length frame->Xlength
559: #define max frame->Xmax
560: #define min frame->Xmin
561: #define number frame->Xnumber
562: #define offset frame->Xoffset
563: #define op frame->Xop
564: #define save_capture_last frame->Xsave_capture_last
565: #define save_offset1 frame->Xsave_offset1
566: #define save_offset2 frame->Xsave_offset2
567: #define save_offset3 frame->Xsave_offset3
568: #define stacksave frame->Xstacksave
569:
570: #define newptrb frame->Xnewptrb
571:
572: /* When recursion is being used, local variables are allocated on the stack and
573: get preserved during recursion in the normal way. In this environment, fi and
574: i, and fc and c, can be the same variables. */
575:
576: #else /* NO_RECURSE not defined */
577: #define fi i
578: #define fc c
579:
580:
581: #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
582: const uschar *charptr; /* in small blocks of the code. My normal */
583: #endif /* style of coding would have declared */
584: const uschar *callpat; /* them within each of those blocks. */
585: const uschar *data; /* However, in order to accommodate the */
586: const uschar *next; /* version of this code that uses an */
587: USPTR pp; /* external "stack" implemented on the */
588: const uschar *prev; /* heap, it is easier to declare them all */
589: USPTR saved_eptr; /* here, so the declarations can be cut */
590: /* out in a block. The only declarations */
591: recursion_info new_recursive; /* within blocks below are for variables */
592: /* that do not have to be preserved over */
593: BOOL cur_is_word; /* a recursive call to RMATCH(). */
594: BOOL condition;
595: BOOL prev_is_word;
596:
597: unsigned long int original_ims;
598:
599: #ifdef SUPPORT_UCP
600: int prop_type;
601: int prop_value;
602: int prop_fail_result;
603: int prop_category;
604: int prop_chartype;
605: int prop_script;
606: int oclength;
607: uschar occhars[8];
608: #endif
609:
1.3 misha 610: int codelink;
1.1 misha 611: int ctype;
612: int length;
613: int max;
614: int min;
615: int number;
616: int offset;
617: int op;
618: int save_capture_last;
619: int save_offset1, save_offset2, save_offset3;
620: int stacksave[REC_STACK_SAVE_MAX];
621:
622: eptrblock newptrb;
623: #endif /* NO_RECURSE */
624:
625: /* These statements are here to stop the compiler complaining about uninitialized
626: variables. */
627:
628: #ifdef SUPPORT_UCP
629: prop_value = 0;
630: prop_fail_result = 0;
631: #endif
632:
633:
634: /* This label is used for tail recursion, which is used in a few cases even
635: when NO_RECURSE is not defined, in order to reduce the amount of stack that is
636: used. Thanks to Ian Taylor for noticing this possibility and sending the
637: original patch. */
638:
639: TAIL_RECURSE:
640:
641: /* OK, now we can get on with the real code of the function. Recursive calls
642: are specified by the macro RMATCH and RRETURN is used to return. When
643: NO_RECURSE is *not* defined, these just turn into a recursive call to match()
1.4 ! misha 644: and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
1.1 misha 645: defined). However, RMATCH isn't like a function call because it's quite a
646: complicated macro. It has to be used in one particular way. This shouldn't,
647: however, impact performance when true recursion is being used. */
648:
649: #ifdef SUPPORT_UTF8
650: utf8 = md->utf8; /* Local copy of the flag */
651: #else
652: utf8 = FALSE;
653: #endif
654:
655: /* First check that we haven't called match() too many times, or that we
656: haven't exceeded the recursive call limit. */
657:
658: if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
659: if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
660:
661: original_ims = ims; /* Save for resetting on ')' */
662:
663: /* At the start of a group with an unlimited repeat that may match an empty
664: string, the match_cbegroup flag is set. When this is the case, add the current
665: subject pointer to the chain of such remembered pointers, to be checked when we
666: hit the closing ket, in order to break infinite loops that match no characters.
667: When match() is called in other circumstances, don't add to the chain. The
668: match_cbegroup flag must NOT be used with tail recursion, because the memory
669: block that is used is on the stack, so a new one may be required for each
670: match(). */
671:
672: if ((flags & match_cbegroup) != 0)
673: {
674: newptrb.epb_saved_eptr = eptr;
675: newptrb.epb_prev = eptrb;
676: eptrb = &newptrb;
677: }
678:
679: /* Now start processing the opcodes. */
680:
681: for (;;)
682: {
683: minimize = possessive = FALSE;
684: op = *ecode;
685:
1.4 ! misha 686: switch(op)
! 687: {
! 688: case OP_MARK:
! 689: markptr = ecode + 2;
! 690: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
! 691: ims, eptrb, flags, RM55);
! 692:
! 693: /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
! 694: argument, and we must check whether that argument matches this MARK's
! 695: argument. It is passed back in md->start_match_ptr (an overloading of that
! 696: variable). If it does match, we reset that variable to the current subject
! 697: position and return MATCH_SKIP. Otherwise, pass back the return code
! 698: unaltered. */
! 699:
! 700: if (rrc == MATCH_SKIP_ARG &&
! 701: strcmp((char *)markptr, (char *)(md->start_match_ptr)) == 0)
! 702: {
! 703: md->start_match_ptr = eptr;
! 704: RRETURN(MATCH_SKIP);
! 705: }
1.1 misha 706:
1.4 ! misha 707: if (md->mark == NULL) md->mark = markptr;
! 708: RRETURN(rrc);
1.1 misha 709:
710: case OP_FAIL:
1.4 ! misha 711: MRRETURN(MATCH_NOMATCH);
! 712:
! 713: case OP_COMMIT:
! 714: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
! 715: ims, eptrb, flags, RM52);
! 716: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 717: MRRETURN(MATCH_COMMIT);
1.1 misha 718:
719: case OP_PRUNE:
720: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
721: ims, eptrb, flags, RM51);
722: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 723: MRRETURN(MATCH_PRUNE);
1.1 misha 724:
1.4 ! misha 725: case OP_PRUNE_ARG:
! 726: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
! 727: ims, eptrb, flags, RM56);
1.1 misha 728: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 729: md->mark = ecode + 2;
! 730: RRETURN(MATCH_PRUNE);
1.1 misha 731:
732: case OP_SKIP:
733: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
734: ims, eptrb, flags, RM53);
735: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
736: md->start_match_ptr = eptr; /* Pass back current position */
1.4 ! misha 737: MRRETURN(MATCH_SKIP);
! 738:
! 739: case OP_SKIP_ARG:
! 740: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
! 741: ims, eptrb, flags, RM57);
! 742: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 743:
! 744: /* Pass back the current skip name by overloading md->start_match_ptr and
! 745: returning the special MATCH_SKIP_ARG return code. This will either be
! 746: caught by a matching MARK, or get to the top, where it is treated the same
! 747: as PRUNE. */
! 748:
! 749: md->start_match_ptr = ecode + 2;
! 750: RRETURN(MATCH_SKIP_ARG);
1.1 misha 751:
752: case OP_THEN:
753: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
754: ims, eptrb, flags, RM54);
755: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 756: MRRETURN(MATCH_THEN);
! 757:
! 758: case OP_THEN_ARG:
! 759: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode] + ecode[1], offset_top, md,
! 760: ims, eptrb, flags, RM58);
! 761: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 762: md->mark = ecode + 2;
1.1 misha 763: RRETURN(MATCH_THEN);
764:
765: /* Handle a capturing bracket. If there is space in the offset vector, save
766: the current subject position in the working slot at the top of the vector.
767: We mustn't change the current values of the data slot, because they may be
768: set from a previous iteration of this group, and be referred to by a
769: reference inside the group.
770:
771: If the bracket fails to match, we need to restore this value and also the
772: values of the final offsets, in case they were set by a previous iteration
773: of the same bracket.
774:
775: If there isn't enough space in the offset vector, treat this as if it were
776: a non-capturing bracket. Don't worry about setting the flag for the error
777: case here; that is handled in the code for KET. */
778:
779: case OP_CBRA:
780: case OP_SCBRA:
781: number = GET2(ecode, 1+LINK_SIZE);
782: offset = number << 1;
783:
1.4 ! misha 784: #ifdef PCRE_DEBUG
1.1 misha 785: printf("start bracket %d\n", number);
786: printf("subject=");
787: pchars(eptr, 16, TRUE, md);
788: printf("\n");
789: #endif
790:
791: if (offset < md->offset_max)
792: {
793: save_offset1 = md->offset_vector[offset];
794: save_offset2 = md->offset_vector[offset+1];
795: save_offset3 = md->offset_vector[md->offset_end - number];
796: save_capture_last = md->capture_last;
797:
798: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1.4 ! misha 799: md->offset_vector[md->offset_end - number] =
! 800: (int)(eptr - md->start_subject);
1.1 misha 801:
802: flags = (op == OP_SCBRA)? match_cbegroup : 0;
803: do
804: {
805: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
806: ims, eptrb, flags, RM1);
807: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
808: md->capture_last = save_capture_last;
809: ecode += GET(ecode, 1);
810: }
811: while (*ecode == OP_ALT);
812:
813: DPRINTF(("bracket %d failed\n", number));
814:
815: md->offset_vector[offset] = save_offset1;
816: md->offset_vector[offset+1] = save_offset2;
817: md->offset_vector[md->offset_end - number] = save_offset3;
818:
1.4 ! misha 819: if (rrc != MATCH_THEN) md->mark = markptr;
1.1 misha 820: RRETURN(MATCH_NOMATCH);
821: }
822:
823: /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
824: as a non-capturing bracket. */
825:
826: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
827: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
828:
829: DPRINTF(("insufficient capture room: treat as non-capturing\n"));
830:
831: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
832: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
833:
834: /* Non-capturing bracket. Loop for all the alternatives. When we get to the
835: final alternative within the brackets, we would return the result of a
836: recursive call to match() whatever happened. We can reduce stack usage by
837: turning this into a tail recursion, except in the case when match_cbegroup
838: is set.*/
839:
840: case OP_BRA:
841: case OP_SBRA:
842: DPRINTF(("start non-capturing bracket\n"));
843: flags = (op >= OP_SBRA)? match_cbegroup : 0;
844: for (;;)
845: {
846: if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
847: {
848: if (flags == 0) /* Not a possibly empty group */
849: {
850: ecode += _pcre_OP_lengths[*ecode];
851: DPRINTF(("bracket 0 tail recursion\n"));
852: goto TAIL_RECURSE;
853: }
854:
855: /* Possibly empty group; can't use tail recursion. */
856:
857: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
858: eptrb, flags, RM48);
1.4 ! misha 859: if (rrc == MATCH_NOMATCH) md->mark = markptr;
1.1 misha 860: RRETURN(rrc);
861: }
862:
863: /* For non-final alternatives, continue the loop for a NOMATCH result;
864: otherwise return. */
865:
866: RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
867: eptrb, flags, RM2);
868: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
869: ecode += GET(ecode, 1);
870: }
871: /* Control never reaches here. */
872:
873: /* Conditional group: compilation checked that there are no more than
874: two branches. If the condition is false, skipping the first branch takes us
875: past the end if there is only one branch, but that's OK because that is
876: exactly what going to the ket would do. As there is only one branch to be
877: obeyed, we can use tail recursion to avoid using another stack frame. */
878:
879: case OP_COND:
880: case OP_SCOND:
1.3 misha 881: codelink= GET(ecode, 1);
882:
883: /* Because of the way auto-callout works during compile, a callout item is
884: inserted between OP_COND and an assertion condition. */
885:
886: if (ecode[LINK_SIZE+1] == OP_CALLOUT)
887: {
888: if (pcre_callout != NULL)
889: {
890: pcre_callout_block cb;
891: cb.version = 1; /* Version 1 of the callout block */
892: cb.callout_number = ecode[LINK_SIZE+2];
893: cb.offset_vector = md->offset_vector;
894: cb.subject = (PCRE_SPTR)md->start_subject;
1.4 ! misha 895: cb.subject_length = (int)(md->end_subject - md->start_subject);
! 896: cb.start_match = (int)(mstart - md->start_subject);
! 897: cb.current_position = (int)(eptr - md->start_subject);
1.3 misha 898: cb.pattern_position = GET(ecode, LINK_SIZE + 3);
899: cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
900: cb.capture_top = offset_top/2;
901: cb.capture_last = md->capture_last;
902: cb.callout_data = md->callout_data;
1.4 ! misha 903: if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1.3 misha 904: if (rrc < 0) RRETURN(rrc);
905: }
906: ecode += _pcre_OP_lengths[OP_CALLOUT];
907: }
908:
909: condcode = ecode[LINK_SIZE+1];
910:
911: /* Now see what the actual condition is */
912:
1.4 ! misha 913: if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1.1 misha 914: {
1.4 ! misha 915: if (md->recursive == NULL) /* Not recursing => FALSE */
! 916: {
! 917: condition = FALSE;
! 918: ecode += GET(ecode, 1);
! 919: }
! 920: else
! 921: {
! 922: int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
! 923: condition = (recno == RREF_ANY || recno == md->recursive->group_num);
! 924:
! 925: /* If the test is for recursion into a specific subpattern, and it is
! 926: false, but the test was set up by name, scan the table to see if the
! 927: name refers to any other numbers, and test them. The condition is true
! 928: if any one is set. */
! 929:
! 930: if (!condition && condcode == OP_NRREF && recno != RREF_ANY)
! 931: {
! 932: uschar *slotA = md->name_table;
! 933: for (i = 0; i < md->name_count; i++)
! 934: {
! 935: if (GET2(slotA, 0) == recno) break;
! 936: slotA += md->name_entry_size;
! 937: }
! 938:
! 939: /* Found a name for the number - there can be only one; duplicate
! 940: names for different numbers are allowed, but not vice versa. First
! 941: scan down for duplicates. */
! 942:
! 943: if (i < md->name_count)
! 944: {
! 945: uschar *slotB = slotA;
! 946: while (slotB > md->name_table)
! 947: {
! 948: slotB -= md->name_entry_size;
! 949: if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
! 950: {
! 951: condition = GET2(slotB, 0) == md->recursive->group_num;
! 952: if (condition) break;
! 953: }
! 954: else break;
! 955: }
! 956:
! 957: /* Scan up for duplicates */
! 958:
! 959: if (!condition)
! 960: {
! 961: slotB = slotA;
! 962: for (i++; i < md->name_count; i++)
! 963: {
! 964: slotB += md->name_entry_size;
! 965: if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
! 966: {
! 967: condition = GET2(slotB, 0) == md->recursive->group_num;
! 968: if (condition) break;
! 969: }
! 970: else break;
! 971: }
! 972: }
! 973: }
! 974: }
! 975:
! 976: /* Choose branch according to the condition */
! 977:
! 978: ecode += condition? 3 : GET(ecode, 1);
! 979: }
1.1 misha 980: }
981:
1.4 ! misha 982: else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1.1 misha 983: {
984: offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
985: condition = offset < offset_top && md->offset_vector[offset] >= 0;
1.4 ! misha 986:
! 987: /* If the numbered capture is unset, but the reference was by name,
! 988: scan the table to see if the name refers to any other numbers, and test
! 989: them. The condition is true if any one is set. This is tediously similar
! 990: to the code above, but not close enough to try to amalgamate. */
! 991:
! 992: if (!condition && condcode == OP_NCREF)
! 993: {
! 994: int refno = offset >> 1;
! 995: uschar *slotA = md->name_table;
! 996:
! 997: for (i = 0; i < md->name_count; i++)
! 998: {
! 999: if (GET2(slotA, 0) == refno) break;
! 1000: slotA += md->name_entry_size;
! 1001: }
! 1002:
! 1003: /* Found a name for the number - there can be only one; duplicate names
! 1004: for different numbers are allowed, but not vice versa. First scan down
! 1005: for duplicates. */
! 1006:
! 1007: if (i < md->name_count)
! 1008: {
! 1009: uschar *slotB = slotA;
! 1010: while (slotB > md->name_table)
! 1011: {
! 1012: slotB -= md->name_entry_size;
! 1013: if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
! 1014: {
! 1015: offset = GET2(slotB, 0) << 1;
! 1016: condition = offset < offset_top &&
! 1017: md->offset_vector[offset] >= 0;
! 1018: if (condition) break;
! 1019: }
! 1020: else break;
! 1021: }
! 1022:
! 1023: /* Scan up for duplicates */
! 1024:
! 1025: if (!condition)
! 1026: {
! 1027: slotB = slotA;
! 1028: for (i++; i < md->name_count; i++)
! 1029: {
! 1030: slotB += md->name_entry_size;
! 1031: if (strcmp((char *)slotA + 2, (char *)slotB + 2) == 0)
! 1032: {
! 1033: offset = GET2(slotB, 0) << 1;
! 1034: condition = offset < offset_top &&
! 1035: md->offset_vector[offset] >= 0;
! 1036: if (condition) break;
! 1037: }
! 1038: else break;
! 1039: }
! 1040: }
! 1041: }
! 1042: }
! 1043:
! 1044: /* Choose branch according to the condition */
! 1045:
1.1 misha 1046: ecode += condition? 3 : GET(ecode, 1);
1047: }
1048:
1.3 misha 1049: else if (condcode == OP_DEF) /* DEFINE - always false */
1.1 misha 1050: {
1051: condition = FALSE;
1052: ecode += GET(ecode, 1);
1053: }
1054:
1055: /* The condition is an assertion. Call match() to evaluate it - setting
1056: the final argument match_condassert causes it to stop at the end of an
1057: assertion. */
1058:
1059: else
1060: {
1061: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
1062: match_condassert, RM3);
1063: if (rrc == MATCH_MATCH)
1064: {
1065: condition = TRUE;
1066: ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1067: while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1068: }
1069: else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1070: {
1071: RRETURN(rrc); /* Need braces because of following else */
1072: }
1073: else
1074: {
1075: condition = FALSE;
1.3 misha 1076: ecode += codelink;
1.1 misha 1077: }
1078: }
1079:
1080: /* We are now at the branch that is to be obeyed. As there is only one,
1081: we can use tail recursion to avoid using another stack frame, except when
1082: match_cbegroup is required for an unlimited repeat of a possibly empty
1083: group. If the second alternative doesn't exist, we can just plough on. */
1084:
1085: if (condition || *ecode == OP_ALT)
1086: {
1087: ecode += 1 + LINK_SIZE;
1088: if (op == OP_SCOND) /* Possibly empty group */
1089: {
1090: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
1091: RRETURN(rrc);
1092: }
1093: else /* Group must match something */
1094: {
1095: flags = 0;
1096: goto TAIL_RECURSE;
1097: }
1098: }
1.3 misha 1099: else /* Condition false & no alternative */
1.1 misha 1100: {
1101: ecode += 1 + LINK_SIZE;
1102: }
1103: break;
1104:
1105:
1.4 ! misha 1106: /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
! 1107: to close any currently open capturing brackets. */
! 1108:
! 1109: case OP_CLOSE:
! 1110: number = GET2(ecode, 1);
! 1111: offset = number << 1;
! 1112:
! 1113: #ifdef PCRE_DEBUG
! 1114: printf("end bracket %d at *ACCEPT", number);
! 1115: printf("\n");
! 1116: #endif
! 1117:
! 1118: md->capture_last = number;
! 1119: if (offset >= md->offset_max) md->offset_overflow = TRUE; else
! 1120: {
! 1121: md->offset_vector[offset] =
! 1122: md->offset_vector[md->offset_end - number];
! 1123: md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
! 1124: if (offset_top <= offset) offset_top = offset + 2;
! 1125: }
! 1126: ecode += 3;
! 1127: break;
! 1128:
! 1129:
1.1 misha 1130: /* End of the pattern, either real or forced. If we are in a top-level
1131: recursion, we should restore the offsets appropriately and continue from
1132: after the call. */
1133:
1134: case OP_ACCEPT:
1135: case OP_END:
1136: if (md->recursive != NULL && md->recursive->group_num == 0)
1137: {
1138: recursion_info *rec = md->recursive;
1139: DPRINTF(("End of pattern in a (?0) recursion\n"));
1140: md->recursive = rec->prevrec;
1141: memmove(md->offset_vector, rec->offset_save,
1142: rec->saved_max * sizeof(int));
1.4 ! misha 1143: offset_top = rec->save_offset_top;
1.1 misha 1144: ims = original_ims;
1145: ecode = rec->after_call;
1146: break;
1147: }
1148:
1.4 ! misha 1149: /* Otherwise, if we have matched an empty string, fail if PCRE_NOTEMPTY is
! 1150: set, or if PCRE_NOTEMPTY_ATSTART is set and we have matched at the start of
! 1151: the subject. In both cases, backtracking will then try other alternatives,
! 1152: if any. */
! 1153:
! 1154: if (eptr == mstart &&
! 1155: (md->notempty ||
! 1156: (md->notempty_atstart &&
! 1157: mstart == md->start_subject + md->start_offset)))
! 1158: MRRETURN(MATCH_NOMATCH);
! 1159:
! 1160: /* Otherwise, we have a match. */
1.1 misha 1161:
1162: md->end_match_ptr = eptr; /* Record where we ended */
1163: md->end_offset_top = offset_top; /* and how many extracts were taken */
1164: md->start_match_ptr = mstart; /* and the start (\K can modify) */
1.4 ! misha 1165:
! 1166: /* For some reason, the macros don't work properly if an expression is
! 1167: given as the argument to MRRETURN when the heap is in use. */
! 1168:
! 1169: rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
! 1170: MRRETURN(rrc);
1.1 misha 1171:
1172: /* Change option settings */
1173:
1174: case OP_OPT:
1175: ims = ecode[1];
1176: ecode += 2;
1177: DPRINTF(("ims set to %02lx\n", ims));
1178: break;
1179:
1180: /* Assertion brackets. Check the alternative branches in turn - the
1181: matching won't pass the KET for an assertion. If any one branch matches,
1182: the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1183: start of each branch to move the current point backwards, so the code at
1184: this level is identical to the lookahead case. */
1185:
1186: case OP_ASSERT:
1187: case OP_ASSERTBACK:
1188: do
1189: {
1190: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1191: RM4);
1.4 ! misha 1192: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
! 1193: {
! 1194: mstart = md->start_match_ptr; /* In case \K reset it */
! 1195: break;
! 1196: }
1.1 misha 1197: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1198: ecode += GET(ecode, 1);
1199: }
1200: while (*ecode == OP_ALT);
1.4 ! misha 1201: if (*ecode == OP_KET) MRRETURN(MATCH_NOMATCH);
1.1 misha 1202:
1203: /* If checking an assertion for a condition, return MATCH_MATCH. */
1204:
1205: if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1206:
1207: /* Continue from after the assertion, updating the offsets high water
1208: mark, since extracts may have been taken during the assertion. */
1209:
1210: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1211: ecode += 1 + LINK_SIZE;
1212: offset_top = md->end_offset_top;
1213: continue;
1214:
1.4 ! misha 1215: /* Negative assertion: all branches must fail to match. Encountering SKIP,
! 1216: PRUNE, or COMMIT means we must assume failure without checking subsequent
! 1217: branches. */
1.1 misha 1218:
1219: case OP_ASSERT_NOT:
1220: case OP_ASSERTBACK_NOT:
1221: do
1222: {
1223: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
1224: RM5);
1.4 ! misha 1225: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) MRRETURN(MATCH_NOMATCH);
! 1226: if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
! 1227: {
! 1228: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
! 1229: break;
! 1230: }
1.1 misha 1231: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1232: ecode += GET(ecode,1);
1233: }
1234: while (*ecode == OP_ALT);
1235:
1236: if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
1237:
1238: ecode += 1 + LINK_SIZE;
1239: continue;
1240:
1241: /* Move the subject pointer back. This occurs only at the start of
1242: each branch of a lookbehind assertion. If we are too close to the start to
1243: move back, this match function fails. When working with UTF-8 we move
1244: back a number of characters, not bytes. */
1245:
1246: case OP_REVERSE:
1247: #ifdef SUPPORT_UTF8
1248: if (utf8)
1249: {
1250: i = GET(ecode, 1);
1251: while (i-- > 0)
1252: {
1253: eptr--;
1.4 ! misha 1254: if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1.1 misha 1255: BACKCHAR(eptr);
1256: }
1257: }
1258: else
1259: #endif
1260:
1261: /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1262:
1263: {
1264: eptr -= GET(ecode, 1);
1.4 ! misha 1265: if (eptr < md->start_subject) MRRETURN(MATCH_NOMATCH);
1.1 misha 1266: }
1267:
1.4 ! misha 1268: /* Save the earliest consulted character, then skip to next op code */
1.1 misha 1269:
1.4 ! misha 1270: if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1.1 misha 1271: ecode += 1 + LINK_SIZE;
1272: break;
1273:
1274: /* The callout item calls an external function, if one is provided, passing
1275: details of the match so far. This is mainly for debugging, though the
1276: function is able to force a failure. */
1277:
1278: case OP_CALLOUT:
1279: if (pcre_callout != NULL)
1280: {
1281: pcre_callout_block cb;
1282: cb.version = 1; /* Version 1 of the callout block */
1283: cb.callout_number = ecode[1];
1284: cb.offset_vector = md->offset_vector;
1285: cb.subject = (PCRE_SPTR)md->start_subject;
1.4 ! misha 1286: cb.subject_length = (int)(md->end_subject - md->start_subject);
! 1287: cb.start_match = (int)(mstart - md->start_subject);
! 1288: cb.current_position = (int)(eptr - md->start_subject);
1.1 misha 1289: cb.pattern_position = GET(ecode, 2);
1290: cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1291: cb.capture_top = offset_top/2;
1292: cb.capture_last = md->capture_last;
1293: cb.callout_data = md->callout_data;
1.4 ! misha 1294: if ((rrc = (*pcre_callout)(&cb)) > 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 1295: if (rrc < 0) RRETURN(rrc);
1296: }
1297: ecode += 2 + 2*LINK_SIZE;
1298: break;
1299:
1300: /* Recursion either matches the current regex, or some subexpression. The
1301: offset data is the offset to the starting bracket from the start of the
1302: whole pattern. (This is so that it works from duplicated subpatterns.)
1303:
1304: If there are any capturing brackets started but not finished, we have to
1305: save their starting points and reinstate them after the recursion. However,
1306: we don't know how many such there are (offset_top records the completed
1307: total) so we just have to save all the potential data. There may be up to
1308: 65535 such values, which is too large to put on the stack, but using malloc
1309: for small numbers seems expensive. As a compromise, the stack is used when
1310: there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1311: is used. If the malloc fails, nothing is saved and the recursion is not
1312: attempted: this call of match() gives up at once, handing back
1313: PCRE_ERROR_NOMEMORY via RRETURN.
1314:
1315: There are also other values that have to be saved. We use a chained
1316: sequence of blocks that actually live on the stack. Thanks to Robin Houston
1317: for the original version of this logic. */
1318:
1319: case OP_RECURSE:
1320: {
1321: callpat = md->start_code + GET(ecode, 1);
1322: new_recursive.group_num = (callpat == md->start_code)? 0 :
1323: GET2(callpat, 1 + LINK_SIZE);
1324:
1325: /* Add to "recursing stack" */
1326:
1327: new_recursive.prevrec = md->recursive;
1328: md->recursive = &new_recursive;
1329:
1330: /* Find where to continue from afterwards */
1331:
1332: ecode += 1 + LINK_SIZE;
1333: new_recursive.after_call = ecode;
1334:
1335: /* Now save the offset data. */
1336:
1337: new_recursive.saved_max = md->offset_end;
1338: if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1339: new_recursive.offset_save = stacksave;
1340: else
1341: {
1342: new_recursive.offset_save =
1343: (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1344: if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1345: }
1346:
1347: memcpy(new_recursive.offset_save, md->offset_vector,
1348: new_recursive.saved_max * sizeof(int));
1.4 ! misha 1349: new_recursive.save_offset_top = offset_top;
1.1 misha 1350:
1351: /* OK, now we can do the recursion. For each top-level alternative we
1352: restore the offset and recursion data. */
1353:
1354: DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1355: flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1356: do
1357: {
1358: RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1359: md, ims, eptrb, flags, RM6);
1.4 ! misha 1360: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1.1 misha 1361: {
1362: DPRINTF(("Recursion matched\n"));
1363: md->recursive = new_recursive.prevrec;
1364: if (new_recursive.offset_save != stacksave)
1365: (pcre_free)(new_recursive.offset_save);
1.4 ! misha 1366: MRRETURN(MATCH_MATCH);
1.1 misha 1367: }
1368: else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1369: {
1370: DPRINTF(("Recursion gave error %d\n", rrc));
1.3 misha 1371: if (new_recursive.offset_save != stacksave)
1372: (pcre_free)(new_recursive.offset_save);
1.1 misha 1373: RRETURN(rrc);
1374: }
1375:
1376: md->recursive = &new_recursive;
1377: memcpy(md->offset_vector, new_recursive.offset_save,
1378: new_recursive.saved_max * sizeof(int));
1379: callpat += GET(callpat, 1);
1380: }
1381: while (*callpat == OP_ALT);
1382:
1383: DPRINTF(("Recursion didn't match\n"));
1384: md->recursive = new_recursive.prevrec;
1385: if (new_recursive.offset_save != stacksave)
1386: (pcre_free)(new_recursive.offset_save);
1.4 ! misha 1387: MRRETURN(MATCH_NOMATCH);
1.1 misha 1388: }
1389: /* Control never reaches here */
1390:
1391: /* "Once" brackets are like assertion brackets except that after a match,
1392: the point in the subject string is not moved back. Thus there can never be
1393: a move back into the brackets. Friedl calls these "atomic" subpatterns.
1394: Check the alternative branches in turn - the matching won't pass the KET
1395: for this kind of subpattern. If any one branch matches, we carry on as at
1.4 ! misha 1396: the end of a normal bracket, leaving the subject pointer, but resetting
! 1397: the start-of-match value in case it was changed by \K. */
1.1 misha 1398:
1399: case OP_ONCE:
1400: prev = ecode;
1401: saved_eptr = eptr;
1402:
1403: do
1404: {
1405: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1.4 ! misha 1406: if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
! 1407: {
! 1408: mstart = md->start_match_ptr;
! 1409: break;
! 1410: }
1.1 misha 1411: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1412: ecode += GET(ecode,1);
1413: }
1414: while (*ecode == OP_ALT);
1415:
1416: /* If hit the end of the group (which could be repeated), fail */
1417:
1418: if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1419:
1420: /* Continue as from after the assertion, updating the offsets high water
1421: mark, since extracts may have been taken. */
1422:
1423: do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1424:
1425: offset_top = md->end_offset_top;
1426: eptr = md->end_match_ptr;
1427:
1428: /* For a non-repeating ket, just continue at this level. This also
1429: happens for a repeating ket if no characters were matched in the group.
1430: This is the forcible breaking of infinite loops as implemented in Perl
1431: 5.005. If there is an options reset, it will get obeyed in the normal
1432: course of events. */
1433:
1434: if (*ecode == OP_KET || eptr == saved_eptr)
1435: {
1436: ecode += 1+LINK_SIZE;
1437: break;
1438: }
1439:
1440: /* The repeating kets try the rest of the pattern or restart from the
1441: preceding bracket, in the appropriate order. The second "call" of match()
1442: uses tail recursion, to avoid using another stack frame. We need to reset
1443: any options that changed within the bracket before re-running it, so
1444: check the next opcode. */
1445:
1446: if (ecode[1+LINK_SIZE] == OP_OPT)
1447: {
1448: ims = (ims & ~PCRE_IMS) | ecode[4];
1449: DPRINTF(("ims set to %02lx at group repeat\n", ims));
1450: }
1451:
1452: if (*ecode == OP_KETRMIN)
1453: {
1454: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1455: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1456: ecode = prev;
1457: flags = 0;
1458: goto TAIL_RECURSE;
1459: }
1460: else /* OP_KETRMAX */
1461: {
1462: RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1463: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1464: ecode += 1 + LINK_SIZE;
1465: flags = 0;
1466: goto TAIL_RECURSE;
1467: }
1468: /* Control never gets here */
1469:
1470: /* An alternation is the end of a branch; scan along to find the end of the
1471: bracketed group and go to there. */
1472:
1473: case OP_ALT:
1474: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1475: break;
1476:
1477: /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1478: indicating that it may occur zero times. It may repeat infinitely, or not
1479: at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1480: with fixed upper repeat limits are compiled as a number of copies, with the
1481: optional ones preceded by BRAZERO or BRAMINZERO. */
1482:
1483: case OP_BRAZERO:
1484: {
1485: next = ecode+1;
1486: RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1487: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1488: do next += GET(next,1); while (*next == OP_ALT);
1489: ecode = next + 1 + LINK_SIZE;
1490: }
1491: break;
1492:
1493: case OP_BRAMINZERO:
1494: {
1495: next = ecode+1;
1496: do next += GET(next, 1); while (*next == OP_ALT);
1497: RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1498: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1499: ecode++;
1500: }
1501: break;
1502:
1503: case OP_SKIPZERO:
1504: {
1505: next = ecode+1;
1506: do next += GET(next,1); while (*next == OP_ALT);
1507: ecode = next + 1 + LINK_SIZE;
1508: }
1509: break;
1510:
1511: /* End of a group, repeated or non-repeating. */
1512:
1513: case OP_KET:
1514: case OP_KETRMIN:
1515: case OP_KETRMAX:
1516: prev = ecode - GET(ecode, 1);
1517:
1518: /* If this was a group that remembered the subject start, in order to break
1519: infinite repeats of empty string matches, retrieve the subject start from
1520: the chain. Otherwise, set it NULL. */
1521:
1522: if (*prev >= OP_SBRA)
1523: {
1524: saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1525: eptrb = eptrb->epb_prev; /* Backup to previous group */
1526: }
1527: else saved_eptr = NULL;
1528:
1.4 ! misha 1529: /* If we are at the end of an assertion group or an atomic group, stop
! 1530: matching and return MATCH_MATCH, but record the current high water mark for
! 1531: use by positive assertions. We also need to record the match start in case
! 1532: it was changed by \K. */
1.1 misha 1533:
1534: if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1535: *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1536: *prev == OP_ONCE)
1537: {
1538: md->end_match_ptr = eptr; /* For ONCE */
1539: md->end_offset_top = offset_top;
1.4 ! misha 1540: md->start_match_ptr = mstart;
! 1541: MRRETURN(MATCH_MATCH);
1.1 misha 1542: }
1543:
1544: /* For capturing groups we have to check the group number back at the start
1545: and if necessary complete handling an extraction by setting the offsets and
1546: bumping the high water mark. Note that whole-pattern recursion is coded as
1547: a recurse into group 0, so it won't be picked up here. Instead, we catch it
1548: when the OP_END is reached. Other recursion is handled here. */
1549:
1550: if (*prev == OP_CBRA || *prev == OP_SCBRA)
1551: {
1552: number = GET2(prev, 1+LINK_SIZE);
1553: offset = number << 1;
1554:
1.4 ! misha 1555: #ifdef PCRE_DEBUG
1.1 misha 1556: printf("end bracket %d", number);
1557: printf("\n");
1558: #endif
1559:
1560: md->capture_last = number;
1561: if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1562: {
1563: md->offset_vector[offset] =
1564: md->offset_vector[md->offset_end - number];
1.4 ! misha 1565: md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1.1 misha 1566: if (offset_top <= offset) offset_top = offset + 2;
1567: }
1568:
1569: /* Handle a recursively called group. Restore the offsets
1570: appropriately and continue from after the call. */
1571:
1572: if (md->recursive != NULL && md->recursive->group_num == number)
1573: {
1574: recursion_info *rec = md->recursive;
1575: DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1576: md->recursive = rec->prevrec;
1577: memcpy(md->offset_vector, rec->offset_save,
1578: rec->saved_max * sizeof(int));
1.4 ! misha 1579: offset_top = rec->save_offset_top;
1.1 misha 1580: ecode = rec->after_call;
1581: ims = original_ims;
1582: break;
1583: }
1584: }
1585:
1586: /* For both capturing and non-capturing groups, reset the value of the ims
1587: flags, in case they got changed during the group. */
1588:
1589: ims = original_ims;
1590: DPRINTF(("ims reset to %02lx\n", ims));
1591:
1592: /* For a non-repeating ket, just continue at this level. This also
1593: happens for a repeating ket if no characters were matched in the group.
1594: This is the forcible breaking of infinite loops as implemented in Perl
1595: 5.005. If there is an options reset, it will get obeyed in the normal
1596: course of events. */
1597:
1598: if (*ecode == OP_KET || eptr == saved_eptr)
1599: {
1600: ecode += 1 + LINK_SIZE;
1601: break;
1602: }
1603:
1604: /* The repeating kets try the rest of the pattern or restart from the
1605: preceding bracket, in the appropriate order. In the second case, we can use
1606: tail recursion to avoid using another stack frame, unless we have an
1607: unlimited repeat of a group that can match an empty string. */
1608:
1609: flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1610:
1611: if (*ecode == OP_KETRMIN)
1612: {
1613: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1614: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1615: if (flags != 0) /* Could match an empty string */
1616: {
1617: RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1618: RRETURN(rrc);
1619: }
1620: ecode = prev;
1621: goto TAIL_RECURSE;
1622: }
1623: else /* OP_KETRMAX */
1624: {
1625: RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1626: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1627: ecode += 1 + LINK_SIZE;
1628: flags = 0;
1629: goto TAIL_RECURSE;
1630: }
1631: /* Control never gets here */
1632:
1633: /* Start of subject unless notbol, or after internal newline if multiline */
1634:
1635: case OP_CIRC:
1.4 ! misha 1636: if (md->notbol && eptr == md->start_subject) MRRETURN(MATCH_NOMATCH);
1.1 misha 1637: if ((ims & PCRE_MULTILINE) != 0)
1638: {
1639: if (eptr != md->start_subject &&
1640: (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1.4 ! misha 1641: MRRETURN(MATCH_NOMATCH);
1.1 misha 1642: ecode++;
1643: break;
1644: }
1645: /* ... else fall through */
1646:
1647: /* Start of subject assertion */
1648:
1649: case OP_SOD:
1.4 ! misha 1650: if (eptr != md->start_subject) MRRETURN(MATCH_NOMATCH);
1.1 misha 1651: ecode++;
1652: break;
1653:
1654: /* Start of match assertion */
1655:
1656: case OP_SOM:
1.4 ! misha 1657: if (eptr != md->start_subject + md->start_offset) MRRETURN(MATCH_NOMATCH);
1.1 misha 1658: ecode++;
1659: break;
1660:
1661: /* Reset the start of match point */
1662:
1663: case OP_SET_SOM:
1664: mstart = eptr;
1665: ecode++;
1666: break;
1667:
1668: /* Assert before internal newline if multiline, or before a terminating
1669: newline unless endonly is set, else end of subject unless noteol is set. */
1670:
1671: case OP_DOLL:
1672: if ((ims & PCRE_MULTILINE) != 0)
1673: {
1674: if (eptr < md->end_subject)
1.4 ! misha 1675: { if (!IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH); }
1.1 misha 1676: else
1.4 ! misha 1677: { if (md->noteol) MRRETURN(MATCH_NOMATCH); }
1.1 misha 1678: ecode++;
1679: break;
1680: }
1681: else
1682: {
1.4 ! misha 1683: if (md->noteol) MRRETURN(MATCH_NOMATCH);
1.1 misha 1684: if (!md->endonly)
1685: {
1686: if (eptr != md->end_subject &&
1687: (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1.4 ! misha 1688: MRRETURN(MATCH_NOMATCH);
1.1 misha 1689: ecode++;
1690: break;
1691: }
1692: }
1693: /* ... else fall through for endonly */
1694:
1695: /* End of subject assertion (\z) */
1696:
1697: case OP_EOD:
1.4 ! misha 1698: if (eptr < md->end_subject) MRRETURN(MATCH_NOMATCH);
1.1 misha 1699: ecode++;
1700: break;
1701:
1702: /* End of subject or ending \n assertion (\Z) */
1703:
1704: case OP_EODN:
1705: if (eptr != md->end_subject &&
1706: (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1.4 ! misha 1707: MRRETURN(MATCH_NOMATCH);
1.1 misha 1708: ecode++;
1709: break;
1710:
1711: /* Word boundary assertions */
1712:
1713: case OP_NOT_WORD_BOUNDARY:
1714: case OP_WORD_BOUNDARY:
1715: {
1716:
1717: /* Find out if the previous and current characters are "word" characters.
1718: It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1.4 ! misha 1719: be "non-word" characters. Remember the earliest consulted character for
! 1720: partial matching. */
1.1 misha 1721:
1722: #ifdef SUPPORT_UTF8
1723: if (utf8)
1724: {
1.4 ! misha 1725: /* Get status of previous character */
! 1726:
1.1 misha 1727: if (eptr == md->start_subject) prev_is_word = FALSE; else
1728: {
1.3 misha 1729: USPTR lastptr = eptr - 1;
1.1 misha 1730: while((*lastptr & 0xc0) == 0x80) lastptr--;
1.4 ! misha 1731: if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1.1 misha 1732: GETCHAR(c, lastptr);
1.4 ! misha 1733: #ifdef SUPPORT_UCP
! 1734: if (md->use_ucp)
! 1735: {
! 1736: if (c == '_') prev_is_word = TRUE; else
! 1737: {
! 1738: int cat = UCD_CATEGORY(c);
! 1739: prev_is_word = (cat == ucp_L || cat == ucp_N);
! 1740: }
! 1741: }
! 1742: else
! 1743: #endif
1.1 misha 1744: prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1745: }
1.4 ! misha 1746:
! 1747: /* Get status of next character */
! 1748:
! 1749: if (eptr >= md->end_subject)
! 1750: {
! 1751: SCHECK_PARTIAL();
! 1752: cur_is_word = FALSE;
! 1753: }
! 1754: else
1.1 misha 1755: {
1756: GETCHAR(c, eptr);
1.4 ! misha 1757: #ifdef SUPPORT_UCP
! 1758: if (md->use_ucp)
! 1759: {
! 1760: if (c == '_') cur_is_word = TRUE; else
! 1761: {
! 1762: int cat = UCD_CATEGORY(c);
! 1763: cur_is_word = (cat == ucp_L || cat == ucp_N);
! 1764: }
! 1765: }
! 1766: else
! 1767: #endif
1.1 misha 1768: cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1769: }
1770: }
1771: else
1772: #endif
1773:
1.4 ! misha 1774: /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
! 1775: consistency with the behaviour of \w we do use it in this case. */
1.1 misha 1776:
1777: {
1.4 ! misha 1778: /* Get status of previous character */
! 1779:
! 1780: if (eptr == md->start_subject) prev_is_word = FALSE; else
! 1781: {
! 1782: if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
! 1783: #ifdef SUPPORT_UCP
! 1784: if (md->use_ucp)
! 1785: {
! 1786: c = eptr[-1];
! 1787: if (c == '_') prev_is_word = TRUE; else
! 1788: {
! 1789: int cat = UCD_CATEGORY(c);
! 1790: prev_is_word = (cat == ucp_L || cat == ucp_N);
! 1791: }
! 1792: }
! 1793: else
! 1794: #endif
! 1795: prev_is_word = ((md->ctypes[eptr[-1]] & ctype_word) != 0);
! 1796: }
! 1797:
! 1798: /* Get status of next character */
! 1799:
! 1800: if (eptr >= md->end_subject)
! 1801: {
! 1802: SCHECK_PARTIAL();
! 1803: cur_is_word = FALSE;
! 1804: }
! 1805: else
! 1806: #ifdef SUPPORT_UCP
! 1807: if (md->use_ucp)
! 1808: {
! 1809: c = *eptr;
! 1810: if (c == '_') cur_is_word = TRUE; else
! 1811: {
! 1812: int cat = UCD_CATEGORY(c);
! 1813: cur_is_word = (cat == ucp_L || cat == ucp_N);
! 1814: }
! 1815: }
! 1816: else
! 1817: #endif
! 1818: cur_is_word = ((md->ctypes[*eptr] & ctype_word) != 0);
1.1 misha 1819: }
1820:
1821: /* Now see if the situation is what we want */
1822:
1823: if ((*ecode++ == OP_WORD_BOUNDARY)?
1824: cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1.4 ! misha 1825: MRRETURN(MATCH_NOMATCH);
1.1 misha 1826: }
1827: break;
1828:
1829: /* Match a single character type; inline for speed */
1830:
1831: case OP_ANY:
1.4 ! misha 1832: if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1.1 misha 1833: /* Fall through */
1834:
1835: case OP_ALLANY:
1.4 ! misha 1836: if (eptr++ >= md->end_subject)
! 1837: {
! 1838: SCHECK_PARTIAL();
! 1839: MRRETURN(MATCH_NOMATCH);
! 1840: }
1.1 misha 1841: if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1842: ecode++;
1843: break;
1844:
1845: /* Match a single byte, even in UTF-8 mode. This opcode really does match
1846: any byte, even newline, independent of the setting of PCRE_DOTALL. */
1847:
1848: case OP_ANYBYTE:
1.4 ! misha 1849: if (eptr++ >= md->end_subject)
! 1850: {
! 1851: SCHECK_PARTIAL();
! 1852: MRRETURN(MATCH_NOMATCH);
! 1853: }
1.1 misha 1854: ecode++;
1855: break;
1856:
1857: case OP_NOT_DIGIT:
1.4 ! misha 1858: if (eptr >= md->end_subject)
! 1859: {
! 1860: SCHECK_PARTIAL();
! 1861: MRRETURN(MATCH_NOMATCH);
! 1862: }
1.1 misha 1863: GETCHARINCTEST(c, eptr);
1864: if (
1865: #ifdef SUPPORT_UTF8
1866: c < 256 &&
1867: #endif
1868: (md->ctypes[c] & ctype_digit) != 0
1869: )
1.4 ! misha 1870: MRRETURN(MATCH_NOMATCH);
1.1 misha 1871: ecode++;
1872: break;
1873:
1874: case OP_DIGIT:
1.4 ! misha 1875: if (eptr >= md->end_subject)
! 1876: {
! 1877: SCHECK_PARTIAL();
! 1878: MRRETURN(MATCH_NOMATCH);
! 1879: }
1.1 misha 1880: GETCHARINCTEST(c, eptr);
1881: if (
1882: #ifdef SUPPORT_UTF8
1883: c >= 256 ||
1884: #endif
1885: (md->ctypes[c] & ctype_digit) == 0
1886: )
1.4 ! misha 1887: MRRETURN(MATCH_NOMATCH);
1.1 misha 1888: ecode++;
1889: break;
1890:
1891: case OP_NOT_WHITESPACE:
1.4 ! misha 1892: if (eptr >= md->end_subject)
! 1893: {
! 1894: SCHECK_PARTIAL();
! 1895: MRRETURN(MATCH_NOMATCH);
! 1896: }
1.1 misha 1897: GETCHARINCTEST(c, eptr);
1898: if (
1899: #ifdef SUPPORT_UTF8
1900: c < 256 &&
1901: #endif
1902: (md->ctypes[c] & ctype_space) != 0
1903: )
1.4 ! misha 1904: MRRETURN(MATCH_NOMATCH);
1.1 misha 1905: ecode++;
1906: break;
1907:
1908: case OP_WHITESPACE:
1.4 ! misha 1909: if (eptr >= md->end_subject)
! 1910: {
! 1911: SCHECK_PARTIAL();
! 1912: MRRETURN(MATCH_NOMATCH);
! 1913: }
1.1 misha 1914: GETCHARINCTEST(c, eptr);
1915: if (
1916: #ifdef SUPPORT_UTF8
1917: c >= 256 ||
1918: #endif
1919: (md->ctypes[c] & ctype_space) == 0
1920: )
1.4 ! misha 1921: MRRETURN(MATCH_NOMATCH);
1.1 misha 1922: ecode++;
1923: break;
1924:
1925: case OP_NOT_WORDCHAR:
1.4 ! misha 1926: if (eptr >= md->end_subject)
! 1927: {
! 1928: SCHECK_PARTIAL();
! 1929: MRRETURN(MATCH_NOMATCH);
! 1930: }
1.1 misha 1931: GETCHARINCTEST(c, eptr);
1932: if (
1933: #ifdef SUPPORT_UTF8
1934: c < 256 &&
1935: #endif
1936: (md->ctypes[c] & ctype_word) != 0
1937: )
1.4 ! misha 1938: MRRETURN(MATCH_NOMATCH);
1.1 misha 1939: ecode++;
1940: break;
1941:
1942: case OP_WORDCHAR:
1.4 ! misha 1943: if (eptr >= md->end_subject)
! 1944: {
! 1945: SCHECK_PARTIAL();
! 1946: MRRETURN(MATCH_NOMATCH);
! 1947: }
1.1 misha 1948: GETCHARINCTEST(c, eptr);
1949: if (
1950: #ifdef SUPPORT_UTF8
1951: c >= 256 ||
1952: #endif
1953: (md->ctypes[c] & ctype_word) == 0
1954: )
1.4 ! misha 1955: MRRETURN(MATCH_NOMATCH);
1.1 misha 1956: ecode++;
1957: break;
1958:
1959: case OP_ANYNL:
1.4 ! misha 1960: if (eptr >= md->end_subject)
! 1961: {
! 1962: SCHECK_PARTIAL();
! 1963: MRRETURN(MATCH_NOMATCH);
! 1964: }
1.1 misha 1965: GETCHARINCTEST(c, eptr);
1966: switch(c)
1967: {
1.4 ! misha 1968: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 1969: case 0x000d:
1970: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1971: break;
1972:
1973: case 0x000a:
1974: break;
1975:
1976: case 0x000b:
1977: case 0x000c:
1978: case 0x0085:
1979: case 0x2028:
1980: case 0x2029:
1.4 ! misha 1981: if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
1.1 misha 1982: break;
1983: }
1984: ecode++;
1985: break;
1986:
1987: case OP_NOT_HSPACE:
1.4 ! misha 1988: if (eptr >= md->end_subject)
! 1989: {
! 1990: SCHECK_PARTIAL();
! 1991: MRRETURN(MATCH_NOMATCH);
! 1992: }
1.1 misha 1993: GETCHARINCTEST(c, eptr);
1994: switch(c)
1995: {
1996: default: break;
1997: case 0x09: /* HT */
1998: case 0x20: /* SPACE */
1999: case 0xa0: /* NBSP */
2000: case 0x1680: /* OGHAM SPACE MARK */
2001: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2002: case 0x2000: /* EN QUAD */
2003: case 0x2001: /* EM QUAD */
2004: case 0x2002: /* EN SPACE */
2005: case 0x2003: /* EM SPACE */
2006: case 0x2004: /* THREE-PER-EM SPACE */
2007: case 0x2005: /* FOUR-PER-EM SPACE */
2008: case 0x2006: /* SIX-PER-EM SPACE */
2009: case 0x2007: /* FIGURE SPACE */
2010: case 0x2008: /* PUNCTUATION SPACE */
2011: case 0x2009: /* THIN SPACE */
2012: case 0x200A: /* HAIR SPACE */
2013: case 0x202f: /* NARROW NO-BREAK SPACE */
2014: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2015: case 0x3000: /* IDEOGRAPHIC SPACE */
1.4 ! misha 2016: MRRETURN(MATCH_NOMATCH);
1.1 misha 2017: }
2018: ecode++;
2019: break;
2020:
2021: case OP_HSPACE:
1.4 ! misha 2022: if (eptr >= md->end_subject)
! 2023: {
! 2024: SCHECK_PARTIAL();
! 2025: MRRETURN(MATCH_NOMATCH);
! 2026: }
1.1 misha 2027: GETCHARINCTEST(c, eptr);
2028: switch(c)
2029: {
1.4 ! misha 2030: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 2031: case 0x09: /* HT */
2032: case 0x20: /* SPACE */
2033: case 0xa0: /* NBSP */
2034: case 0x1680: /* OGHAM SPACE MARK */
2035: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2036: case 0x2000: /* EN QUAD */
2037: case 0x2001: /* EM QUAD */
2038: case 0x2002: /* EN SPACE */
2039: case 0x2003: /* EM SPACE */
2040: case 0x2004: /* THREE-PER-EM SPACE */
2041: case 0x2005: /* FOUR-PER-EM SPACE */
2042: case 0x2006: /* SIX-PER-EM SPACE */
2043: case 0x2007: /* FIGURE SPACE */
2044: case 0x2008: /* PUNCTUATION SPACE */
2045: case 0x2009: /* THIN SPACE */
2046: case 0x200A: /* HAIR SPACE */
2047: case 0x202f: /* NARROW NO-BREAK SPACE */
2048: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2049: case 0x3000: /* IDEOGRAPHIC SPACE */
2050: break;
2051: }
2052: ecode++;
2053: break;
2054:
2055: case OP_NOT_VSPACE:
1.4 ! misha 2056: if (eptr >= md->end_subject)
! 2057: {
! 2058: SCHECK_PARTIAL();
! 2059: MRRETURN(MATCH_NOMATCH);
! 2060: }
1.1 misha 2061: GETCHARINCTEST(c, eptr);
2062: switch(c)
2063: {
2064: default: break;
2065: case 0x0a: /* LF */
2066: case 0x0b: /* VT */
2067: case 0x0c: /* FF */
2068: case 0x0d: /* CR */
2069: case 0x85: /* NEL */
2070: case 0x2028: /* LINE SEPARATOR */
2071: case 0x2029: /* PARAGRAPH SEPARATOR */
1.4 ! misha 2072: MRRETURN(MATCH_NOMATCH);
1.1 misha 2073: }
2074: ecode++;
2075: break;
2076:
2077: case OP_VSPACE:
1.4 ! misha 2078: if (eptr >= md->end_subject)
! 2079: {
! 2080: SCHECK_PARTIAL();
! 2081: MRRETURN(MATCH_NOMATCH);
! 2082: }
1.1 misha 2083: GETCHARINCTEST(c, eptr);
2084: switch(c)
2085: {
1.4 ! misha 2086: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 2087: case 0x0a: /* LF */
2088: case 0x0b: /* VT */
2089: case 0x0c: /* FF */
2090: case 0x0d: /* CR */
2091: case 0x85: /* NEL */
2092: case 0x2028: /* LINE SEPARATOR */
2093: case 0x2029: /* PARAGRAPH SEPARATOR */
2094: break;
2095: }
2096: ecode++;
2097: break;
2098:
2099: #ifdef SUPPORT_UCP
2100: /* Check the next character by Unicode property. We will get here only
2101: if the support is in the binary; otherwise a compile-time error occurs. */
2102:
2103: case OP_PROP:
2104: case OP_NOTPROP:
1.4 ! misha 2105: if (eptr >= md->end_subject)
! 2106: {
! 2107: SCHECK_PARTIAL();
! 2108: MRRETURN(MATCH_NOMATCH);
! 2109: }
1.1 misha 2110: GETCHARINCTEST(c, eptr);
2111: {
1.3 misha 2112: const ucd_record *prop = GET_UCD(c);
1.1 misha 2113:
2114: switch(ecode[1])
2115: {
2116: case PT_ANY:
1.4 ! misha 2117: if (op == OP_NOTPROP) MRRETURN(MATCH_NOMATCH);
1.1 misha 2118: break;
2119:
2120: case PT_LAMP:
1.2 misha 2121: if ((prop->chartype == ucp_Lu ||
2122: prop->chartype == ucp_Ll ||
2123: prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1.4 ! misha 2124: MRRETURN(MATCH_NOMATCH);
! 2125: break;
1.1 misha 2126:
2127: case PT_GC:
1.2 misha 2128: if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1.4 ! misha 2129: MRRETURN(MATCH_NOMATCH);
1.1 misha 2130: break;
2131:
2132: case PT_PC:
1.2 misha 2133: if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1.4 ! misha 2134: MRRETURN(MATCH_NOMATCH);
1.1 misha 2135: break;
2136:
2137: case PT_SC:
1.2 misha 2138: if ((ecode[2] != prop->script) == (op == OP_PROP))
1.4 ! misha 2139: MRRETURN(MATCH_NOMATCH);
! 2140: break;
! 2141:
! 2142: /* These are specials */
! 2143:
! 2144: case PT_ALNUM:
! 2145: if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
! 2146: _pcre_ucp_gentype[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
! 2147: MRRETURN(MATCH_NOMATCH);
! 2148: break;
! 2149:
! 2150: case PT_SPACE: /* Perl space */
! 2151: if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
! 2152: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
! 2153: == (op == OP_NOTPROP))
! 2154: MRRETURN(MATCH_NOMATCH);
! 2155: break;
! 2156:
! 2157: case PT_PXSPACE: /* POSIX space */
! 2158: if ((_pcre_ucp_gentype[prop->chartype] == ucp_Z ||
! 2159: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
! 2160: c == CHAR_FF || c == CHAR_CR)
! 2161: == (op == OP_NOTPROP))
! 2162: MRRETURN(MATCH_NOMATCH);
! 2163: break;
! 2164:
! 2165: case PT_WORD:
! 2166: if ((_pcre_ucp_gentype[prop->chartype] == ucp_L ||
! 2167: _pcre_ucp_gentype[prop->chartype] == ucp_N ||
! 2168: c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
! 2169: MRRETURN(MATCH_NOMATCH);
1.1 misha 2170: break;
2171:
1.4 ! misha 2172: /* This should never occur */
! 2173:
1.1 misha 2174: default:
2175: RRETURN(PCRE_ERROR_INTERNAL);
2176: }
2177:
2178: ecode += 3;
2179: }
2180: break;
2181:
2182: /* Match an extended Unicode sequence. We will get here only if the support
2183: is in the binary; otherwise a compile-time error occurs. */
2184:
2185: case OP_EXTUNI:
1.4 ! misha 2186: if (eptr >= md->end_subject)
! 2187: {
! 2188: SCHECK_PARTIAL();
! 2189: MRRETURN(MATCH_NOMATCH);
! 2190: }
1.1 misha 2191: GETCHARINCTEST(c, eptr);
2192: {
1.2 misha 2193: int category = UCD_CATEGORY(c);
1.4 ! misha 2194: if (category == ucp_M) MRRETURN(MATCH_NOMATCH);
1.1 misha 2195: while (eptr < md->end_subject)
2196: {
2197: int len = 1;
2198: if (!utf8) c = *eptr; else
2199: {
2200: GETCHARLEN(c, eptr, len);
2201: }
1.2 misha 2202: category = UCD_CATEGORY(c);
1.1 misha 2203: if (category != ucp_M) break;
2204: eptr += len;
2205: }
2206: }
2207: ecode++;
2208: break;
2209: #endif
2210:
2211:
2212: /* Match a back reference, possibly repeatedly. Look past the end of the
2213: item to see if there is repeat information following. The code is similar
2214: to that for character classes, but repeated for efficiency. Then obey
2215: similar code to character type repeats - written out again for speed.
2216: However, if the referenced string is the empty string, always treat
2217: it as matched, any number of times (otherwise there could be infinite
2218: loops). */
2219:
2220: case OP_REF:
2221: {
2222: offset = GET2(ecode, 1) << 1; /* Doubled ref number */
2223: ecode += 3;
2224:
2225: /* If the reference is unset, there are two possibilities:
2226:
2227: (a) In the default, Perl-compatible state, set the length to be longer
2228: than the amount of subject left; this ensures that every attempt at a
2229: match fails. We can't just fail here, because of the possibility of
2230: quantifiers with zero minima.
2231:
2232: (b) If the JavaScript compatibility flag is set, set the length to zero
2233: so that the back reference matches an empty string.
2234:
2235: Otherwise, set the length to the length of what was matched by the
2236: referenced subpattern. */
2237:
2238: if (offset >= offset_top || md->offset_vector[offset] < 0)
1.4 ! misha 2239: length = (md->jscript_compat)? 0 : (int)(md->end_subject - eptr + 1);
1.1 misha 2240: else
2241: length = md->offset_vector[offset+1] - md->offset_vector[offset];
2242:
2243: /* Set up for repetition, or handle the non-repeated case */
2244:
2245: switch (*ecode)
2246: {
2247: case OP_CRSTAR:
2248: case OP_CRMINSTAR:
2249: case OP_CRPLUS:
2250: case OP_CRMINPLUS:
2251: case OP_CRQUERY:
2252: case OP_CRMINQUERY:
2253: c = *ecode++ - OP_CRSTAR;
2254: minimize = (c & 1) != 0;
2255: min = rep_min[c]; /* Pick up values from tables; */
2256: max = rep_max[c]; /* zero for max => infinity */
2257: if (max == 0) max = INT_MAX;
2258: break;
2259:
2260: case OP_CRRANGE:
2261: case OP_CRMINRANGE:
2262: minimize = (*ecode == OP_CRMINRANGE);
2263: min = GET2(ecode, 1);
2264: max = GET2(ecode, 3);
2265: if (max == 0) max = INT_MAX;
2266: ecode += 5;
2267: break;
2268:
2269: default: /* No repeat follows */
1.4 ! misha 2270: if (!match_ref(offset, eptr, length, md, ims))
! 2271: {
! 2272: CHECK_PARTIAL();
! 2273: MRRETURN(MATCH_NOMATCH);
! 2274: }
1.1 misha 2275: eptr += length;
2276: continue; /* With the main loop */
2277: }
2278:
2279: /* If the length of the reference is zero, just continue with the
2280: main loop. */
2281:
2282: if (length == 0) continue;
2283:
2284: /* First, ensure the minimum number of matches are present. We get back
2285: the length of the reference string explicitly rather than passing the
2286: address of eptr, so that eptr can be a register variable. */
2287:
2288: for (i = 1; i <= min; i++)
2289: {
1.4 ! misha 2290: if (!match_ref(offset, eptr, length, md, ims))
! 2291: {
! 2292: CHECK_PARTIAL();
! 2293: MRRETURN(MATCH_NOMATCH);
! 2294: }
1.1 misha 2295: eptr += length;
2296: }
2297:
2298: /* If min = max, continue at the same level without recursion.
2299: They are not both allowed to be zero. */
2300:
2301: if (min == max) continue;
2302:
2303: /* If minimizing, keep trying and advancing the pointer */
2304:
2305: if (minimize)
2306: {
2307: for (fi = min;; fi++)
2308: {
2309: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
2310: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 2311: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 2312: if (!match_ref(offset, eptr, length, md, ims))
! 2313: {
! 2314: CHECK_PARTIAL();
! 2315: MRRETURN(MATCH_NOMATCH);
! 2316: }
1.1 misha 2317: eptr += length;
2318: }
2319: /* Control never gets here */
2320: }
2321:
2322: /* If maximizing, find the longest string and work backwards */
2323:
2324: else
2325: {
2326: pp = eptr;
2327: for (i = min; i < max; i++)
2328: {
1.4 ! misha 2329: if (!match_ref(offset, eptr, length, md, ims))
! 2330: {
! 2331: CHECK_PARTIAL();
! 2332: break;
! 2333: }
1.1 misha 2334: eptr += length;
2335: }
2336: while (eptr >= pp)
2337: {
2338: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
2339: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2340: eptr -= length;
2341: }
1.4 ! misha 2342: MRRETURN(MATCH_NOMATCH);
1.1 misha 2343: }
2344: }
2345: /* Control never gets here */
2346:
2347: /* Match a bit-mapped character class, possibly repeatedly. This op code is
2348: used when all the characters in the class have values in the range 0-255,
2349: and either the matching is caseful, or the characters are in the range
2350: 0-127 when UTF-8 processing is enabled. The only difference between
2351: OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2352: encountered.
2353:
2354: First, look past the end of the item to see if there is repeat information
2355: following. Then obey similar code to character type repeats - written out
2356: again for speed. */
2357:
2358: case OP_NCLASS:
2359: case OP_CLASS:
2360: {
2361: data = ecode + 1; /* Save for matching */
2362: ecode += 33; /* Advance past the item */
2363:
2364: switch (*ecode)
2365: {
2366: case OP_CRSTAR:
2367: case OP_CRMINSTAR:
2368: case OP_CRPLUS:
2369: case OP_CRMINPLUS:
2370: case OP_CRQUERY:
2371: case OP_CRMINQUERY:
2372: c = *ecode++ - OP_CRSTAR;
2373: minimize = (c & 1) != 0;
2374: min = rep_min[c]; /* Pick up values from tables; */
2375: max = rep_max[c]; /* zero for max => infinity */
2376: if (max == 0) max = INT_MAX;
2377: break;
2378:
2379: case OP_CRRANGE:
2380: case OP_CRMINRANGE:
2381: minimize = (*ecode == OP_CRMINRANGE);
2382: min = GET2(ecode, 1);
2383: max = GET2(ecode, 3);
2384: if (max == 0) max = INT_MAX;
2385: ecode += 5;
2386: break;
2387:
2388: default: /* No repeat follows */
2389: min = max = 1;
2390: break;
2391: }
2392:
2393: /* First, ensure the minimum number of matches are present. */
2394:
2395: #ifdef SUPPORT_UTF8
2396: /* UTF-8 mode */
2397: if (utf8)
2398: {
2399: for (i = 1; i <= min; i++)
2400: {
1.4 ! misha 2401: if (eptr >= md->end_subject)
! 2402: {
! 2403: SCHECK_PARTIAL();
! 2404: MRRETURN(MATCH_NOMATCH);
! 2405: }
1.1 misha 2406: GETCHARINC(c, eptr);
2407: if (c > 255)
2408: {
1.4 ! misha 2409: if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
1.1 misha 2410: }
2411: else
2412: {
1.4 ! misha 2413: if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 2414: }
2415: }
2416: }
2417: else
2418: #endif
2419: /* Not UTF-8 mode */
2420: {
2421: for (i = 1; i <= min; i++)
2422: {
1.4 ! misha 2423: if (eptr >= md->end_subject)
! 2424: {
! 2425: SCHECK_PARTIAL();
! 2426: MRRETURN(MATCH_NOMATCH);
! 2427: }
1.1 misha 2428: c = *eptr++;
1.4 ! misha 2429: if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 2430: }
2431: }
2432:
2433: /* If max == min we can continue with the main loop without the
2434: need to recurse. */
2435:
2436: if (min == max) continue;
2437:
2438: /* If minimizing, keep testing the rest of the expression and advancing
2439: the pointer while it matches the class. */
2440:
2441: if (minimize)
2442: {
2443: #ifdef SUPPORT_UTF8
2444: /* UTF-8 mode */
2445: if (utf8)
2446: {
2447: for (fi = min;; fi++)
2448: {
2449: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2450: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 2451: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 2452: if (eptr >= md->end_subject)
! 2453: {
! 2454: SCHECK_PARTIAL();
! 2455: MRRETURN(MATCH_NOMATCH);
! 2456: }
1.1 misha 2457: GETCHARINC(c, eptr);
2458: if (c > 255)
2459: {
1.4 ! misha 2460: if (op == OP_CLASS) MRRETURN(MATCH_NOMATCH);
1.1 misha 2461: }
2462: else
2463: {
1.4 ! misha 2464: if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 2465: }
2466: }
2467: }
2468: else
2469: #endif
2470: /* Not UTF-8 mode */
2471: {
2472: for (fi = min;; fi++)
2473: {
2474: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2475: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 2476: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 2477: if (eptr >= md->end_subject)
! 2478: {
! 2479: SCHECK_PARTIAL();
! 2480: MRRETURN(MATCH_NOMATCH);
! 2481: }
1.1 misha 2482: c = *eptr++;
1.4 ! misha 2483: if ((data[c/8] & (1 << (c&7))) == 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 2484: }
2485: }
2486: /* Control never gets here */
2487: }
2488:
2489: /* If maximizing, find the longest possible run, then work backwards. */
2490:
2491: else
2492: {
2493: pp = eptr;
2494:
2495: #ifdef SUPPORT_UTF8
2496: /* UTF-8 mode */
2497: if (utf8)
2498: {
2499: for (i = min; i < max; i++)
2500: {
2501: int len = 1;
1.4 ! misha 2502: if (eptr >= md->end_subject)
! 2503: {
! 2504: SCHECK_PARTIAL();
! 2505: break;
! 2506: }
1.1 misha 2507: GETCHARLEN(c, eptr, len);
2508: if (c > 255)
2509: {
2510: if (op == OP_CLASS) break;
2511: }
2512: else
2513: {
2514: if ((data[c/8] & (1 << (c&7))) == 0) break;
2515: }
2516: eptr += len;
2517: }
2518: for (;;)
2519: {
2520: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2521: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2522: if (eptr-- == pp) break; /* Stop if tried at original pos */
2523: BACKCHAR(eptr);
2524: }
2525: }
2526: else
2527: #endif
2528: /* Not UTF-8 mode */
2529: {
2530: for (i = min; i < max; i++)
2531: {
1.4 ! misha 2532: if (eptr >= md->end_subject)
! 2533: {
! 2534: SCHECK_PARTIAL();
! 2535: break;
! 2536: }
1.1 misha 2537: c = *eptr;
2538: if ((data[c/8] & (1 << (c&7))) == 0) break;
2539: eptr++;
2540: }
2541: while (eptr >= pp)
2542: {
2543: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2544: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2545: eptr--;
2546: }
2547: }
2548:
1.4 ! misha 2549: MRRETURN(MATCH_NOMATCH);
1.1 misha 2550: }
2551: }
2552: /* Control never gets here */
2553:
2554:
2555: /* Match an extended character class. This opcode is encountered only
1.3 misha 2556: when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2557: mode, because Unicode properties are supported in non-UTF-8 mode. */
1.1 misha 2558:
2559: #ifdef SUPPORT_UTF8
2560: case OP_XCLASS:
2561: {
2562: data = ecode + 1 + LINK_SIZE; /* Save for matching */
2563: ecode += GET(ecode, 1); /* Advance past the item */
2564:
2565: switch (*ecode)
2566: {
2567: case OP_CRSTAR:
2568: case OP_CRMINSTAR:
2569: case OP_CRPLUS:
2570: case OP_CRMINPLUS:
2571: case OP_CRQUERY:
2572: case OP_CRMINQUERY:
2573: c = *ecode++ - OP_CRSTAR;
2574: minimize = (c & 1) != 0;
2575: min = rep_min[c]; /* Pick up values from tables; */
2576: max = rep_max[c]; /* zero for max => infinity */
2577: if (max == 0) max = INT_MAX;
2578: break;
2579:
2580: case OP_CRRANGE:
2581: case OP_CRMINRANGE:
2582: minimize = (*ecode == OP_CRMINRANGE);
2583: min = GET2(ecode, 1);
2584: max = GET2(ecode, 3);
2585: if (max == 0) max = INT_MAX;
2586: ecode += 5;
2587: break;
2588:
2589: default: /* No repeat follows */
2590: min = max = 1;
2591: break;
2592: }
2593:
2594: /* First, ensure the minimum number of matches are present. */
2595:
2596: for (i = 1; i <= min; i++)
2597: {
1.4 ! misha 2598: if (eptr >= md->end_subject)
! 2599: {
! 2600: SCHECK_PARTIAL();
! 2601: MRRETURN(MATCH_NOMATCH);
! 2602: }
1.3 misha 2603: GETCHARINCTEST(c, eptr);
1.4 ! misha 2604: if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
1.1 misha 2605: }
2606:
2607: /* If max == min we can continue with the main loop without the
2608: need to recurse. */
2609:
2610: if (min == max) continue;
2611:
2612: /* If minimizing, keep testing the rest of the expression and advancing
2613: the pointer while it matches the class. */
2614:
2615: if (minimize)
2616: {
2617: for (fi = min;; fi++)
2618: {
2619: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2620: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 2621: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 2622: if (eptr >= md->end_subject)
! 2623: {
! 2624: SCHECK_PARTIAL();
! 2625: MRRETURN(MATCH_NOMATCH);
! 2626: }
1.3 misha 2627: GETCHARINCTEST(c, eptr);
1.4 ! misha 2628: if (!_pcre_xclass(c, data)) MRRETURN(MATCH_NOMATCH);
1.1 misha 2629: }
2630: /* Control never gets here */
2631: }
2632:
2633: /* If maximizing, find the longest possible run, then work backwards. */
2634:
2635: else
2636: {
2637: pp = eptr;
2638: for (i = min; i < max; i++)
2639: {
2640: int len = 1;
1.4 ! misha 2641: if (eptr >= md->end_subject)
! 2642: {
! 2643: SCHECK_PARTIAL();
! 2644: break;
! 2645: }
1.3 misha 2646: GETCHARLENTEST(c, eptr, len);
1.1 misha 2647: if (!_pcre_xclass(c, data)) break;
2648: eptr += len;
2649: }
2650: for(;;)
2651: {
2652: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2653: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2654: if (eptr-- == pp) break; /* Stop if tried at original pos */
2655: if (utf8) BACKCHAR(eptr);
2656: }
1.4 ! misha 2657: MRRETURN(MATCH_NOMATCH);
1.1 misha 2658: }
2659:
2660: /* Control never gets here */
2661: }
2662: #endif /* End of XCLASS */
2663:
2664: /* Match a single character, casefully */
2665:
2666: case OP_CHAR:
2667: #ifdef SUPPORT_UTF8
2668: if (utf8)
2669: {
2670: length = 1;
2671: ecode++;
2672: GETCHARLEN(fc, ecode, length);
1.4 ! misha 2673: if (length > md->end_subject - eptr)
! 2674: {
! 2675: CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
! 2676: MRRETURN(MATCH_NOMATCH);
! 2677: }
! 2678: while (length-- > 0) if (*ecode++ != *eptr++) MRRETURN(MATCH_NOMATCH);
1.1 misha 2679: }
2680: else
2681: #endif
2682:
2683: /* Non-UTF-8 mode */
2684: {
1.4 ! misha 2685: if (md->end_subject - eptr < 1)
! 2686: {
! 2687: SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
! 2688: MRRETURN(MATCH_NOMATCH);
! 2689: }
! 2690: if (ecode[1] != *eptr++) MRRETURN(MATCH_NOMATCH);
1.1 misha 2691: ecode += 2;
2692: }
2693: break;
2694:
2695: /* Match a single character, caselessly */
2696:
2697: case OP_CHARNC:
2698: #ifdef SUPPORT_UTF8
2699: if (utf8)
2700: {
2701: length = 1;
2702: ecode++;
2703: GETCHARLEN(fc, ecode, length);
2704:
1.4 ! misha 2705: if (length > md->end_subject - eptr)
! 2706: {
! 2707: CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
! 2708: MRRETURN(MATCH_NOMATCH);
! 2709: }
1.1 misha 2710:
2711: /* If the pattern character's value is < 128, we have only one byte, and
2712: can use the fast lookup table. */
2713:
2714: if (fc < 128)
2715: {
1.4 ! misha 2716: if (md->lcc[*ecode++] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
1.1 misha 2717: }
2718:
2719: /* Otherwise we must pick up the subject character */
2720:
2721: else
2722: {
2723: unsigned int dc;
2724: GETCHARINC(dc, eptr);
2725: ecode += length;
2726:
2727: /* If we have Unicode property support, we can use it to test the other
2728: case of the character, if there is one. */
2729:
2730: if (fc != dc)
2731: {
2732: #ifdef SUPPORT_UCP
1.2 misha 2733: if (dc != UCD_OTHERCASE(fc))
1.1 misha 2734: #endif
1.4 ! misha 2735: MRRETURN(MATCH_NOMATCH);
1.1 misha 2736: }
2737: }
2738: }
2739: else
2740: #endif /* SUPPORT_UTF8 */
2741:
2742: /* Non-UTF-8 mode */
2743: {
1.4 ! misha 2744: if (md->end_subject - eptr < 1)
! 2745: {
! 2746: SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
! 2747: MRRETURN(MATCH_NOMATCH);
! 2748: }
! 2749: if (md->lcc[ecode[1]] != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
1.1 misha 2750: ecode += 2;
2751: }
2752: break;
2753:
2754: /* Match a single character repeatedly. */
2755:
2756: case OP_EXACT:
2757: min = max = GET2(ecode, 1);
2758: ecode += 3;
2759: goto REPEATCHAR;
2760:
2761: case OP_POSUPTO:
2762: possessive = TRUE;
2763: /* Fall through */
2764:
2765: case OP_UPTO:
2766: case OP_MINUPTO:
2767: min = 0;
2768: max = GET2(ecode, 1);
2769: minimize = *ecode == OP_MINUPTO;
2770: ecode += 3;
2771: goto REPEATCHAR;
2772:
2773: case OP_POSSTAR:
2774: possessive = TRUE;
2775: min = 0;
2776: max = INT_MAX;
2777: ecode++;
2778: goto REPEATCHAR;
2779:
2780: case OP_POSPLUS:
2781: possessive = TRUE;
2782: min = 1;
2783: max = INT_MAX;
2784: ecode++;
2785: goto REPEATCHAR;
2786:
2787: case OP_POSQUERY:
2788: possessive = TRUE;
2789: min = 0;
2790: max = 1;
2791: ecode++;
2792: goto REPEATCHAR;
2793:
2794: case OP_STAR:
2795: case OP_MINSTAR:
2796: case OP_PLUS:
2797: case OP_MINPLUS:
2798: case OP_QUERY:
2799: case OP_MINQUERY:
2800: c = *ecode++ - OP_STAR;
2801: minimize = (c & 1) != 0;
1.4 ! misha 2802:
1.1 misha 2803: min = rep_min[c]; /* Pick up values from tables; */
2804: max = rep_max[c]; /* zero for max => infinity */
2805: if (max == 0) max = INT_MAX;
2806:
1.4 ! misha 2807: /* Common code for all repeated single-character matches. */
1.1 misha 2808:
2809: REPEATCHAR:
2810: #ifdef SUPPORT_UTF8
2811: if (utf8)
2812: {
2813: length = 1;
2814: charptr = ecode;
2815: GETCHARLEN(fc, ecode, length);
2816: ecode += length;
2817:
2818: /* Handle multibyte character matching specially here. There is
2819: support for caseless matching if UCP support is present. */
2820:
2821: if (length > 1)
2822: {
2823: #ifdef SUPPORT_UCP
2824: unsigned int othercase;
2825: if ((ims & PCRE_CASELESS) != 0 &&
1.2 misha 2826: (othercase = UCD_OTHERCASE(fc)) != fc)
1.1 misha 2827: oclength = _pcre_ord2utf8(othercase, occhars);
2828: else oclength = 0;
2829: #endif /* SUPPORT_UCP */
2830:
2831: for (i = 1; i <= min; i++)
2832: {
1.4 ! misha 2833: if (eptr <= md->end_subject - length &&
! 2834: memcmp(eptr, charptr, length) == 0) eptr += length;
1.1 misha 2835: #ifdef SUPPORT_UCP
1.4 ! misha 2836: else if (oclength > 0 &&
! 2837: eptr <= md->end_subject - oclength &&
! 2838: memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
! 2839: #endif /* SUPPORT_UCP */
1.1 misha 2840: else
2841: {
1.4 ! misha 2842: CHECK_PARTIAL();
! 2843: MRRETURN(MATCH_NOMATCH);
1.1 misha 2844: }
2845: }
2846:
2847: if (min == max) continue;
2848:
2849: if (minimize)
2850: {
2851: for (fi = min;; fi++)
2852: {
2853: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2854: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 2855: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 2856: if (eptr <= md->end_subject - length &&
! 2857: memcmp(eptr, charptr, length) == 0) eptr += length;
1.1 misha 2858: #ifdef SUPPORT_UCP
1.4 ! misha 2859: else if (oclength > 0 &&
! 2860: eptr <= md->end_subject - oclength &&
! 2861: memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
! 2862: #endif /* SUPPORT_UCP */
1.1 misha 2863: else
2864: {
1.4 ! misha 2865: CHECK_PARTIAL();
! 2866: MRRETURN(MATCH_NOMATCH);
1.1 misha 2867: }
2868: }
2869: /* Control never gets here */
2870: }
2871:
2872: else /* Maximize */
2873: {
2874: pp = eptr;
2875: for (i = min; i < max; i++)
2876: {
1.4 ! misha 2877: if (eptr <= md->end_subject - length &&
! 2878: memcmp(eptr, charptr, length) == 0) eptr += length;
1.1 misha 2879: #ifdef SUPPORT_UCP
1.4 ! misha 2880: else if (oclength > 0 &&
! 2881: eptr <= md->end_subject - oclength &&
! 2882: memcmp(eptr, occhars, oclength) == 0) eptr += oclength;
! 2883: #endif /* SUPPORT_UCP */
1.1 misha 2884: else
2885: {
1.4 ! misha 2886: CHECK_PARTIAL();
! 2887: break;
1.1 misha 2888: }
2889: }
2890:
2891: if (possessive) continue;
1.4 ! misha 2892:
1.1 misha 2893: for(;;)
1.4 ! misha 2894: {
! 2895: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
! 2896: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2897: if (eptr == pp) { MRRETURN(MATCH_NOMATCH); }
1.1 misha 2898: #ifdef SUPPORT_UCP
1.4 ! misha 2899: eptr--;
! 2900: BACKCHAR(eptr);
1.1 misha 2901: #else /* without SUPPORT_UCP */
1.4 ! misha 2902: eptr -= length;
1.1 misha 2903: #endif /* SUPPORT_UCP */
1.4 ! misha 2904: }
1.1 misha 2905: }
2906: /* Control never gets here */
2907: }
2908:
2909: /* If the length of a UTF-8 character is 1, we fall through here, and
2910: obey the code as for non-UTF-8 characters below, though in this case the
2911: value of fc will always be < 128. */
2912: }
2913: else
2914: #endif /* SUPPORT_UTF8 */
2915:
2916: /* When not in UTF-8 mode, load a single-byte character. */
1.4 ! misha 2917:
! 2918: fc = *ecode++;
1.1 misha 2919:
2920: /* The value of fc at this point is always less than 256, though we may or
2921: may not be in UTF-8 mode. The code is duplicated for the caseless and
2922: caseful cases, for speed, since matching characters is likely to be quite
2923: common. First, ensure the minimum number of matches are present. If min =
2924: max, continue at the same level without recursing. Otherwise, if
2925: minimizing, keep trying the rest of the expression and advancing one
2926: matching character if failing, up to the maximum. Alternatively, if
2927: maximizing, find the maximum number of characters and work backwards. */
2928:
2929: DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2930: max, eptr));
2931:
2932: if ((ims & PCRE_CASELESS) != 0)
2933: {
2934: fc = md->lcc[fc];
2935: for (i = 1; i <= min; i++)
1.4 ! misha 2936: {
! 2937: if (eptr >= md->end_subject)
! 2938: {
! 2939: SCHECK_PARTIAL();
! 2940: MRRETURN(MATCH_NOMATCH);
! 2941: }
! 2942: if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
! 2943: }
1.1 misha 2944: if (min == max) continue;
2945: if (minimize)
2946: {
2947: for (fi = min;; fi++)
2948: {
2949: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2950: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 2951: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 2952: if (eptr >= md->end_subject)
! 2953: {
! 2954: SCHECK_PARTIAL();
! 2955: MRRETURN(MATCH_NOMATCH);
! 2956: }
! 2957: if (fc != md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
1.1 misha 2958: }
2959: /* Control never gets here */
2960: }
2961: else /* Maximize */
2962: {
2963: pp = eptr;
2964: for (i = min; i < max; i++)
2965: {
1.4 ! misha 2966: if (eptr >= md->end_subject)
! 2967: {
! 2968: SCHECK_PARTIAL();
! 2969: break;
! 2970: }
! 2971: if (fc != md->lcc[*eptr]) break;
1.1 misha 2972: eptr++;
2973: }
1.4 ! misha 2974:
1.1 misha 2975: if (possessive) continue;
1.4 ! misha 2976:
1.1 misha 2977: while (eptr >= pp)
2978: {
2979: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2980: eptr--;
2981: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2982: }
1.4 ! misha 2983: MRRETURN(MATCH_NOMATCH);
1.1 misha 2984: }
2985: /* Control never gets here */
2986: }
2987:
2988: /* Caseful comparisons (includes all multi-byte characters) */
2989:
2990: else
2991: {
1.4 ! misha 2992: for (i = 1; i <= min; i++)
! 2993: {
! 2994: if (eptr >= md->end_subject)
! 2995: {
! 2996: SCHECK_PARTIAL();
! 2997: MRRETURN(MATCH_NOMATCH);
! 2998: }
! 2999: if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
! 3000: }
! 3001:
1.1 misha 3002: if (min == max) continue;
1.4 ! misha 3003:
1.1 misha 3004: if (minimize)
3005: {
3006: for (fi = min;; fi++)
3007: {
3008: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
3009: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 3010: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 3011: if (eptr >= md->end_subject)
! 3012: {
! 3013: SCHECK_PARTIAL();
! 3014: MRRETURN(MATCH_NOMATCH);
! 3015: }
! 3016: if (fc != *eptr++) MRRETURN(MATCH_NOMATCH);
1.1 misha 3017: }
3018: /* Control never gets here */
3019: }
3020: else /* Maximize */
3021: {
3022: pp = eptr;
3023: for (i = min; i < max; i++)
3024: {
1.4 ! misha 3025: if (eptr >= md->end_subject)
! 3026: {
! 3027: SCHECK_PARTIAL();
! 3028: break;
! 3029: }
! 3030: if (fc != *eptr) break;
1.1 misha 3031: eptr++;
3032: }
3033: if (possessive) continue;
1.4 ! misha 3034:
1.1 misha 3035: while (eptr >= pp)
3036: {
3037: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
3038: eptr--;
3039: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3040: }
1.4 ! misha 3041: MRRETURN(MATCH_NOMATCH);
1.1 misha 3042: }
3043: }
3044: /* Control never gets here */
3045:
3046: /* Match a negated single one-byte character. The pattern character is one
3047: byte; the subject character it is compared with can be multibyte. */
3048:
3049: case OP_NOT:
1.4 ! misha 3050: if (eptr >= md->end_subject)
! 3051: {
! 3052: SCHECK_PARTIAL();
! 3053: MRRETURN(MATCH_NOMATCH);
! 3054: }
1.1 misha 3055: ecode++;
3056: GETCHARINCTEST(c, eptr);
3057: if ((ims & PCRE_CASELESS) != 0)
3058: {
3059: #ifdef SUPPORT_UTF8
3060: if (c < 256)
3061: #endif
3062: c = md->lcc[c];
1.4 ! misha 3063: if (md->lcc[*ecode++] == c) MRRETURN(MATCH_NOMATCH);
1.1 misha 3064: }
3065: else
3066: {
1.4 ! misha 3067: if (*ecode++ == c) MRRETURN(MATCH_NOMATCH);
1.1 misha 3068: }
3069: break;
3070:
3071: /* Match a negated single one-byte character repeatedly. This is almost a
3072: repeat of the code for a repeated single character, but I haven't found a
3073: nice way of commoning these up that doesn't require a test of the
3074: positive/negative option for each character match. Maybe that wouldn't add
3075: very much to the time taken, but character matching *is* what this is all
3076: about... */
3077:
3078: case OP_NOTEXACT:
3079: min = max = GET2(ecode, 1);
3080: ecode += 3;
3081: goto REPEATNOTCHAR;
3082:
3083: case OP_NOTUPTO:
3084: case OP_NOTMINUPTO:
3085: min = 0;
3086: max = GET2(ecode, 1);
3087: minimize = *ecode == OP_NOTMINUPTO;
3088: ecode += 3;
3089: goto REPEATNOTCHAR;
3090:
3091: case OP_NOTPOSSTAR:
3092: possessive = TRUE;
3093: min = 0;
3094: max = INT_MAX;
3095: ecode++;
3096: goto REPEATNOTCHAR;
3097:
3098: case OP_NOTPOSPLUS:
3099: possessive = TRUE;
3100: min = 1;
3101: max = INT_MAX;
3102: ecode++;
3103: goto REPEATNOTCHAR;
3104:
3105: case OP_NOTPOSQUERY:
3106: possessive = TRUE;
3107: min = 0;
3108: max = 1;
3109: ecode++;
3110: goto REPEATNOTCHAR;
3111:
3112: case OP_NOTPOSUPTO:
3113: possessive = TRUE;
3114: min = 0;
3115: max = GET2(ecode, 1);
3116: ecode += 3;
3117: goto REPEATNOTCHAR;
3118:
3119: case OP_NOTSTAR:
3120: case OP_NOTMINSTAR:
3121: case OP_NOTPLUS:
3122: case OP_NOTMINPLUS:
3123: case OP_NOTQUERY:
3124: case OP_NOTMINQUERY:
3125: c = *ecode++ - OP_NOTSTAR;
3126: minimize = (c & 1) != 0;
3127: min = rep_min[c]; /* Pick up values from tables; */
3128: max = rep_max[c]; /* zero for max => infinity */
3129: if (max == 0) max = INT_MAX;
3130:
1.4 ! misha 3131: /* Common code for all repeated single-byte matches. */
1.1 misha 3132:
3133: REPEATNOTCHAR:
3134: fc = *ecode++;
3135:
3136: /* The code is duplicated for the caseless and caseful cases, for speed,
3137: since matching characters is likely to be quite common. First, ensure the
3138: minimum number of matches are present. If min = max, continue at the same
3139: level without recursing. Otherwise, if minimizing, keep trying the rest of
3140: the expression and advancing one matching character if failing, up to the
3141: maximum. Alternatively, if maximizing, find the maximum number of
3142: characters and work backwards. */
3143:
3144: DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3145: max, eptr));
3146:
3147: if ((ims & PCRE_CASELESS) != 0)
3148: {
3149: fc = md->lcc[fc];
3150:
3151: #ifdef SUPPORT_UTF8
3152: /* UTF-8 mode */
3153: if (utf8)
3154: {
3155: register unsigned int d;
3156: for (i = 1; i <= min; i++)
3157: {
1.4 ! misha 3158: if (eptr >= md->end_subject)
! 3159: {
! 3160: SCHECK_PARTIAL();
! 3161: MRRETURN(MATCH_NOMATCH);
! 3162: }
1.1 misha 3163: GETCHARINC(d, eptr);
3164: if (d < 256) d = md->lcc[d];
1.4 ! misha 3165: if (fc == d) MRRETURN(MATCH_NOMATCH);
1.1 misha 3166: }
3167: }
3168: else
3169: #endif
3170:
3171: /* Not UTF-8 mode */
3172: {
3173: for (i = 1; i <= min; i++)
1.4 ! misha 3174: {
! 3175: if (eptr >= md->end_subject)
! 3176: {
! 3177: SCHECK_PARTIAL();
! 3178: MRRETURN(MATCH_NOMATCH);
! 3179: }
! 3180: if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
! 3181: }
1.1 misha 3182: }
3183:
3184: if (min == max) continue;
3185:
3186: if (minimize)
3187: {
3188: #ifdef SUPPORT_UTF8
3189: /* UTF-8 mode */
3190: if (utf8)
3191: {
3192: register unsigned int d;
3193: for (fi = min;; fi++)
3194: {
3195: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
3196: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 3197: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 3198: if (eptr >= md->end_subject)
! 3199: {
! 3200: SCHECK_PARTIAL();
! 3201: MRRETURN(MATCH_NOMATCH);
! 3202: }
1.1 misha 3203: GETCHARINC(d, eptr);
3204: if (d < 256) d = md->lcc[d];
1.4 ! misha 3205: if (fc == d) MRRETURN(MATCH_NOMATCH);
1.1 misha 3206: }
3207: }
3208: else
3209: #endif
3210: /* Not UTF-8 mode */
3211: {
3212: for (fi = min;; fi++)
3213: {
3214: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
3215: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 3216: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 3217: if (eptr >= md->end_subject)
! 3218: {
! 3219: SCHECK_PARTIAL();
! 3220: MRRETURN(MATCH_NOMATCH);
! 3221: }
! 3222: if (fc == md->lcc[*eptr++]) MRRETURN(MATCH_NOMATCH);
1.1 misha 3223: }
3224: }
3225: /* Control never gets here */
3226: }
3227:
3228: /* Maximize case */
3229:
3230: else
3231: {
3232: pp = eptr;
3233:
3234: #ifdef SUPPORT_UTF8
3235: /* UTF-8 mode */
3236: if (utf8)
3237: {
3238: register unsigned int d;
3239: for (i = min; i < max; i++)
3240: {
3241: int len = 1;
1.4 ! misha 3242: if (eptr >= md->end_subject)
! 3243: {
! 3244: SCHECK_PARTIAL();
! 3245: break;
! 3246: }
1.1 misha 3247: GETCHARLEN(d, eptr, len);
3248: if (d < 256) d = md->lcc[d];
3249: if (fc == d) break;
3250: eptr += len;
3251: }
3252: if (possessive) continue;
3253: for(;;)
3254: {
3255: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
3256: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3257: if (eptr-- == pp) break; /* Stop if tried at original pos */
3258: BACKCHAR(eptr);
3259: }
3260: }
3261: else
3262: #endif
3263: /* Not UTF-8 mode */
3264: {
3265: for (i = min; i < max; i++)
3266: {
1.4 ! misha 3267: if (eptr >= md->end_subject)
! 3268: {
! 3269: SCHECK_PARTIAL();
! 3270: break;
! 3271: }
! 3272: if (fc == md->lcc[*eptr]) break;
1.1 misha 3273: eptr++;
3274: }
3275: if (possessive) continue;
3276: while (eptr >= pp)
3277: {
3278: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
3279: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3280: eptr--;
3281: }
3282: }
3283:
1.4 ! misha 3284: MRRETURN(MATCH_NOMATCH);
1.1 misha 3285: }
3286: /* Control never gets here */
3287: }
3288:
3289: /* Caseful comparisons */
3290:
3291: else
3292: {
3293: #ifdef SUPPORT_UTF8
3294: /* UTF-8 mode */
3295: if (utf8)
3296: {
3297: register unsigned int d;
3298: for (i = 1; i <= min; i++)
3299: {
1.4 ! misha 3300: if (eptr >= md->end_subject)
! 3301: {
! 3302: SCHECK_PARTIAL();
! 3303: MRRETURN(MATCH_NOMATCH);
! 3304: }
1.1 misha 3305: GETCHARINC(d, eptr);
1.4 ! misha 3306: if (fc == d) MRRETURN(MATCH_NOMATCH);
1.1 misha 3307: }
3308: }
3309: else
3310: #endif
3311: /* Not UTF-8 mode */
3312: {
3313: for (i = 1; i <= min; i++)
1.4 ! misha 3314: {
! 3315: if (eptr >= md->end_subject)
! 3316: {
! 3317: SCHECK_PARTIAL();
! 3318: MRRETURN(MATCH_NOMATCH);
! 3319: }
! 3320: if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
! 3321: }
1.1 misha 3322: }
3323:
3324: if (min == max) continue;
3325:
3326: if (minimize)
3327: {
3328: #ifdef SUPPORT_UTF8
3329: /* UTF-8 mode */
3330: if (utf8)
3331: {
3332: register unsigned int d;
3333: for (fi = min;; fi++)
3334: {
3335: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
3336: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 3337: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 3338: if (eptr >= md->end_subject)
! 3339: {
! 3340: SCHECK_PARTIAL();
! 3341: MRRETURN(MATCH_NOMATCH);
! 3342: }
1.1 misha 3343: GETCHARINC(d, eptr);
1.4 ! misha 3344: if (fc == d) MRRETURN(MATCH_NOMATCH);
1.1 misha 3345: }
3346: }
3347: else
3348: #endif
3349: /* Not UTF-8 mode */
3350: {
3351: for (fi = min;; fi++)
3352: {
3353: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
3354: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 3355: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 3356: if (eptr >= md->end_subject)
! 3357: {
! 3358: SCHECK_PARTIAL();
! 3359: MRRETURN(MATCH_NOMATCH);
! 3360: }
! 3361: if (fc == *eptr++) MRRETURN(MATCH_NOMATCH);
1.1 misha 3362: }
3363: }
3364: /* Control never gets here */
3365: }
3366:
3367: /* Maximize case */
3368:
3369: else
3370: {
3371: pp = eptr;
3372:
3373: #ifdef SUPPORT_UTF8
3374: /* UTF-8 mode */
3375: if (utf8)
3376: {
3377: register unsigned int d;
3378: for (i = min; i < max; i++)
3379: {
3380: int len = 1;
1.4 ! misha 3381: if (eptr >= md->end_subject)
! 3382: {
! 3383: SCHECK_PARTIAL();
! 3384: break;
! 3385: }
1.1 misha 3386: GETCHARLEN(d, eptr, len);
3387: if (fc == d) break;
3388: eptr += len;
3389: }
3390: if (possessive) continue;
3391: for(;;)
3392: {
3393: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
3394: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3395: if (eptr-- == pp) break; /* Stop if tried at original pos */
3396: BACKCHAR(eptr);
3397: }
3398: }
3399: else
3400: #endif
3401: /* Not UTF-8 mode */
3402: {
3403: for (i = min; i < max; i++)
3404: {
1.4 ! misha 3405: if (eptr >= md->end_subject)
! 3406: {
! 3407: SCHECK_PARTIAL();
! 3408: break;
! 3409: }
! 3410: if (fc == *eptr) break;
1.1 misha 3411: eptr++;
3412: }
3413: if (possessive) continue;
3414: while (eptr >= pp)
3415: {
3416: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
3417: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3418: eptr--;
3419: }
3420: }
3421:
1.4 ! misha 3422: MRRETURN(MATCH_NOMATCH);
1.1 misha 3423: }
3424: }
3425: /* Control never gets here */
3426:
3427: /* Match a single character type repeatedly; several different opcodes
3428: share code. This is very similar to the code for single characters, but we
3429: repeat it in the interests of efficiency. */
3430:
3431: case OP_TYPEEXACT:
3432: min = max = GET2(ecode, 1);
3433: minimize = TRUE;
3434: ecode += 3;
3435: goto REPEATTYPE;
3436:
3437: case OP_TYPEUPTO:
3438: case OP_TYPEMINUPTO:
3439: min = 0;
3440: max = GET2(ecode, 1);
3441: minimize = *ecode == OP_TYPEMINUPTO;
3442: ecode += 3;
3443: goto REPEATTYPE;
3444:
3445: case OP_TYPEPOSSTAR:
3446: possessive = TRUE;
3447: min = 0;
3448: max = INT_MAX;
3449: ecode++;
3450: goto REPEATTYPE;
3451:
3452: case OP_TYPEPOSPLUS:
3453: possessive = TRUE;
3454: min = 1;
3455: max = INT_MAX;
3456: ecode++;
3457: goto REPEATTYPE;
3458:
3459: case OP_TYPEPOSQUERY:
3460: possessive = TRUE;
3461: min = 0;
3462: max = 1;
3463: ecode++;
3464: goto REPEATTYPE;
3465:
3466: case OP_TYPEPOSUPTO:
3467: possessive = TRUE;
3468: min = 0;
3469: max = GET2(ecode, 1);
3470: ecode += 3;
3471: goto REPEATTYPE;
3472:
3473: case OP_TYPESTAR:
3474: case OP_TYPEMINSTAR:
3475: case OP_TYPEPLUS:
3476: case OP_TYPEMINPLUS:
3477: case OP_TYPEQUERY:
3478: case OP_TYPEMINQUERY:
3479: c = *ecode++ - OP_TYPESTAR;
3480: minimize = (c & 1) != 0;
3481: min = rep_min[c]; /* Pick up values from tables; */
3482: max = rep_max[c]; /* zero for max => infinity */
3483: if (max == 0) max = INT_MAX;
3484:
3485: /* Common code for all repeated single character type matches. Note that
3486: in UTF-8 mode, only for OP_DIGIT, OP_WHITESPACE and OP_WORDCHAR are the valid
3487: characters all one byte long; the other types can match multibyte characters. */
3488:
3489: REPEATTYPE:
3490: ctype = *ecode++; /* Code for the character type */
3491:
3492: #ifdef SUPPORT_UCP
3493: if (ctype == OP_PROP || ctype == OP_NOTPROP)
3494: {
3495: prop_fail_result = ctype == OP_NOTPROP;
3496: prop_type = *ecode++;
3497: prop_value = *ecode++;
3498: }
3499: else prop_type = -1;
3500: #endif
3501:
3502: /* First, ensure the minimum number of matches are present. Use inline
3503: code for maximizing the speed, and do the type test once at the start
1.4 ! misha 3504: (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
1.1 misha 3505: is tidier. Also separate the UCP code, which can be the same for both UTF-8
3506: and single-bytes. */
3507:
3508: if (min > 0)
3509: {
3510: #ifdef SUPPORT_UCP
3511: if (prop_type >= 0)
3512: {
3513: switch(prop_type)
3514: {
3515: case PT_ANY:
1.4 ! misha 3516: if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
1.1 misha 3517: for (i = 1; i <= min; i++)
3518: {
1.4 ! misha 3519: if (eptr >= md->end_subject)
! 3520: {
! 3521: SCHECK_PARTIAL();
! 3522: MRRETURN(MATCH_NOMATCH);
! 3523: }
1.1 misha 3524: GETCHARINCTEST(c, eptr);
3525: }
3526: break;
3527:
3528: case PT_LAMP:
3529: for (i = 1; i <= min; i++)
3530: {
1.4 ! misha 3531: if (eptr >= md->end_subject)
! 3532: {
! 3533: SCHECK_PARTIAL();
! 3534: MRRETURN(MATCH_NOMATCH);
! 3535: }
1.1 misha 3536: GETCHARINCTEST(c, eptr);
1.2 misha 3537: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 3538: if ((prop_chartype == ucp_Lu ||
3539: prop_chartype == ucp_Ll ||
3540: prop_chartype == ucp_Lt) == prop_fail_result)
1.4 ! misha 3541: MRRETURN(MATCH_NOMATCH);
1.1 misha 3542: }
3543: break;
3544:
3545: case PT_GC:
3546: for (i = 1; i <= min; i++)
3547: {
1.4 ! misha 3548: if (eptr >= md->end_subject)
! 3549: {
! 3550: SCHECK_PARTIAL();
! 3551: MRRETURN(MATCH_NOMATCH);
! 3552: }
1.1 misha 3553: GETCHARINCTEST(c, eptr);
1.2 misha 3554: prop_category = UCD_CATEGORY(c);
1.1 misha 3555: if ((prop_category == prop_value) == prop_fail_result)
1.4 ! misha 3556: MRRETURN(MATCH_NOMATCH);
1.1 misha 3557: }
3558: break;
3559:
3560: case PT_PC:
3561: for (i = 1; i <= min; i++)
3562: {
1.4 ! misha 3563: if (eptr >= md->end_subject)
! 3564: {
! 3565: SCHECK_PARTIAL();
! 3566: MRRETURN(MATCH_NOMATCH);
! 3567: }
1.1 misha 3568: GETCHARINCTEST(c, eptr);
1.2 misha 3569: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 3570: if ((prop_chartype == prop_value) == prop_fail_result)
1.4 ! misha 3571: MRRETURN(MATCH_NOMATCH);
1.1 misha 3572: }
3573: break;
3574:
3575: case PT_SC:
3576: for (i = 1; i <= min; i++)
3577: {
1.4 ! misha 3578: if (eptr >= md->end_subject)
! 3579: {
! 3580: SCHECK_PARTIAL();
! 3581: MRRETURN(MATCH_NOMATCH);
! 3582: }
1.1 misha 3583: GETCHARINCTEST(c, eptr);
1.2 misha 3584: prop_script = UCD_SCRIPT(c);
1.1 misha 3585: if ((prop_script == prop_value) == prop_fail_result)
1.4 ! misha 3586: MRRETURN(MATCH_NOMATCH);
! 3587: }
! 3588: break;
! 3589:
! 3590: case PT_ALNUM:
! 3591: for (i = 1; i <= min; i++)
! 3592: {
! 3593: if (eptr >= md->end_subject)
! 3594: {
! 3595: SCHECK_PARTIAL();
! 3596: MRRETURN(MATCH_NOMATCH);
! 3597: }
! 3598: GETCHARINCTEST(c, eptr);
! 3599: prop_category = UCD_CATEGORY(c);
! 3600: if ((prop_category == ucp_L || prop_category == ucp_N)
! 3601: == prop_fail_result)
! 3602: MRRETURN(MATCH_NOMATCH);
! 3603: }
! 3604: break;
! 3605:
! 3606: case PT_SPACE: /* Perl space */
! 3607: for (i = 1; i <= min; i++)
! 3608: {
! 3609: if (eptr >= md->end_subject)
! 3610: {
! 3611: SCHECK_PARTIAL();
! 3612: MRRETURN(MATCH_NOMATCH);
! 3613: }
! 3614: GETCHARINCTEST(c, eptr);
! 3615: prop_category = UCD_CATEGORY(c);
! 3616: if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
! 3617: c == CHAR_FF || c == CHAR_CR)
! 3618: == prop_fail_result)
! 3619: MRRETURN(MATCH_NOMATCH);
1.1 misha 3620: }
3621: break;
3622:
1.4 ! misha 3623: case PT_PXSPACE: /* POSIX space */
! 3624: for (i = 1; i <= min; i++)
! 3625: {
! 3626: if (eptr >= md->end_subject)
! 3627: {
! 3628: SCHECK_PARTIAL();
! 3629: MRRETURN(MATCH_NOMATCH);
! 3630: }
! 3631: GETCHARINCTEST(c, eptr);
! 3632: prop_category = UCD_CATEGORY(c);
! 3633: if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
! 3634: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
! 3635: == prop_fail_result)
! 3636: MRRETURN(MATCH_NOMATCH);
! 3637: }
! 3638: break;
! 3639:
! 3640: case PT_WORD:
! 3641: for (i = 1; i <= min; i++)
! 3642: {
! 3643: if (eptr >= md->end_subject)
! 3644: {
! 3645: SCHECK_PARTIAL();
! 3646: MRRETURN(MATCH_NOMATCH);
! 3647: }
! 3648: GETCHARINCTEST(c, eptr);
! 3649: prop_category = UCD_CATEGORY(c);
! 3650: if ((prop_category == ucp_L || prop_category == ucp_N ||
! 3651: c == CHAR_UNDERSCORE)
! 3652: == prop_fail_result)
! 3653: MRRETURN(MATCH_NOMATCH);
! 3654: }
! 3655: break;
! 3656:
! 3657: /* This should not occur */
! 3658:
1.1 misha 3659: default:
3660: RRETURN(PCRE_ERROR_INTERNAL);
3661: }
3662: }
3663:
3664: /* Match extended Unicode sequences. We will get here only if the
3665: support is in the binary; otherwise a compile-time error occurs. */
3666:
3667: else if (ctype == OP_EXTUNI)
3668: {
3669: for (i = 1; i <= min; i++)
3670: {
1.4 ! misha 3671: if (eptr >= md->end_subject)
! 3672: {
! 3673: SCHECK_PARTIAL();
! 3674: MRRETURN(MATCH_NOMATCH);
! 3675: }
1.1 misha 3676: GETCHARINCTEST(c, eptr);
1.2 misha 3677: prop_category = UCD_CATEGORY(c);
1.4 ! misha 3678: if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
1.1 misha 3679: while (eptr < md->end_subject)
3680: {
3681: int len = 1;
1.4 ! misha 3682: if (!utf8) c = *eptr;
! 3683: else { GETCHARLEN(c, eptr, len); }
1.2 misha 3684: prop_category = UCD_CATEGORY(c);
1.1 misha 3685: if (prop_category != ucp_M) break;
3686: eptr += len;
3687: }
3688: }
3689: }
3690:
3691: else
3692: #endif /* SUPPORT_UCP */
3693:
3694: /* Handle all other cases when the coding is UTF-8 */
3695:
3696: #ifdef SUPPORT_UTF8
3697: if (utf8) switch(ctype)
3698: {
3699: case OP_ANY:
3700: for (i = 1; i <= min; i++)
3701: {
1.4 ! misha 3702: if (eptr >= md->end_subject)
! 3703: {
! 3704: SCHECK_PARTIAL();
! 3705: MRRETURN(MATCH_NOMATCH);
! 3706: }
! 3707: if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1.1 misha 3708: eptr++;
3709: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3710: }
3711: break;
3712:
3713: case OP_ALLANY:
3714: for (i = 1; i <= min; i++)
3715: {
1.4 ! misha 3716: if (eptr >= md->end_subject)
! 3717: {
! 3718: SCHECK_PARTIAL();
! 3719: MRRETURN(MATCH_NOMATCH);
! 3720: }
1.1 misha 3721: eptr++;
3722: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3723: }
3724: break;
3725:
3726: case OP_ANYBYTE:
1.4 ! misha 3727: if (eptr > md->end_subject - min) MRRETURN(MATCH_NOMATCH);
1.1 misha 3728: eptr += min;
3729: break;
3730:
3731: case OP_ANYNL:
3732: for (i = 1; i <= min; i++)
3733: {
1.4 ! misha 3734: if (eptr >= md->end_subject)
! 3735: {
! 3736: SCHECK_PARTIAL();
! 3737: MRRETURN(MATCH_NOMATCH);
! 3738: }
1.1 misha 3739: GETCHARINC(c, eptr);
3740: switch(c)
3741: {
1.4 ! misha 3742: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 3743: case 0x000d:
3744: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3745: break;
3746:
3747: case 0x000a:
3748: break;
3749:
3750: case 0x000b:
3751: case 0x000c:
3752: case 0x0085:
3753: case 0x2028:
3754: case 0x2029:
1.4 ! misha 3755: if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
1.1 misha 3756: break;
3757: }
3758: }
3759: break;
3760:
3761: case OP_NOT_HSPACE:
3762: for (i = 1; i <= min; i++)
3763: {
1.4 ! misha 3764: if (eptr >= md->end_subject)
! 3765: {
! 3766: SCHECK_PARTIAL();
! 3767: MRRETURN(MATCH_NOMATCH);
! 3768: }
1.1 misha 3769: GETCHARINC(c, eptr);
3770: switch(c)
3771: {
3772: default: break;
3773: case 0x09: /* HT */
3774: case 0x20: /* SPACE */
3775: case 0xa0: /* NBSP */
3776: case 0x1680: /* OGHAM SPACE MARK */
3777: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3778: case 0x2000: /* EN QUAD */
3779: case 0x2001: /* EM QUAD */
3780: case 0x2002: /* EN SPACE */
3781: case 0x2003: /* EM SPACE */
3782: case 0x2004: /* THREE-PER-EM SPACE */
3783: case 0x2005: /* FOUR-PER-EM SPACE */
3784: case 0x2006: /* SIX-PER-EM SPACE */
3785: case 0x2007: /* FIGURE SPACE */
3786: case 0x2008: /* PUNCTUATION SPACE */
3787: case 0x2009: /* THIN SPACE */
3788: case 0x200A: /* HAIR SPACE */
3789: case 0x202f: /* NARROW NO-BREAK SPACE */
3790: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3791: case 0x3000: /* IDEOGRAPHIC SPACE */
1.4 ! misha 3792: MRRETURN(MATCH_NOMATCH);
1.1 misha 3793: }
3794: }
3795: break;
3796:
3797: case OP_HSPACE:
3798: for (i = 1; i <= min; i++)
3799: {
1.4 ! misha 3800: if (eptr >= md->end_subject)
! 3801: {
! 3802: SCHECK_PARTIAL();
! 3803: MRRETURN(MATCH_NOMATCH);
! 3804: }
1.1 misha 3805: GETCHARINC(c, eptr);
3806: switch(c)
3807: {
1.4 ! misha 3808: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 3809: case 0x09: /* HT */
3810: case 0x20: /* SPACE */
3811: case 0xa0: /* NBSP */
3812: case 0x1680: /* OGHAM SPACE MARK */
3813: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3814: case 0x2000: /* EN QUAD */
3815: case 0x2001: /* EM QUAD */
3816: case 0x2002: /* EN SPACE */
3817: case 0x2003: /* EM SPACE */
3818: case 0x2004: /* THREE-PER-EM SPACE */
3819: case 0x2005: /* FOUR-PER-EM SPACE */
3820: case 0x2006: /* SIX-PER-EM SPACE */
3821: case 0x2007: /* FIGURE SPACE */
3822: case 0x2008: /* PUNCTUATION SPACE */
3823: case 0x2009: /* THIN SPACE */
3824: case 0x200A: /* HAIR SPACE */
3825: case 0x202f: /* NARROW NO-BREAK SPACE */
3826: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3827: case 0x3000: /* IDEOGRAPHIC SPACE */
3828: break;
3829: }
3830: }
3831: break;
3832:
3833: case OP_NOT_VSPACE:
3834: for (i = 1; i <= min; i++)
3835: {
1.4 ! misha 3836: if (eptr >= md->end_subject)
! 3837: {
! 3838: SCHECK_PARTIAL();
! 3839: MRRETURN(MATCH_NOMATCH);
! 3840: }
1.1 misha 3841: GETCHARINC(c, eptr);
3842: switch(c)
3843: {
3844: default: break;
3845: case 0x0a: /* LF */
3846: case 0x0b: /* VT */
3847: case 0x0c: /* FF */
3848: case 0x0d: /* CR */
3849: case 0x85: /* NEL */
3850: case 0x2028: /* LINE SEPARATOR */
3851: case 0x2029: /* PARAGRAPH SEPARATOR */
1.4 ! misha 3852: MRRETURN(MATCH_NOMATCH);
1.1 misha 3853: }
3854: }
3855: break;
3856:
3857: case OP_VSPACE:
3858: for (i = 1; i <= min; i++)
3859: {
1.4 ! misha 3860: if (eptr >= md->end_subject)
! 3861: {
! 3862: SCHECK_PARTIAL();
! 3863: MRRETURN(MATCH_NOMATCH);
! 3864: }
1.1 misha 3865: GETCHARINC(c, eptr);
3866: switch(c)
3867: {
1.4 ! misha 3868: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 3869: case 0x0a: /* LF */
3870: case 0x0b: /* VT */
3871: case 0x0c: /* FF */
3872: case 0x0d: /* CR */
3873: case 0x85: /* NEL */
3874: case 0x2028: /* LINE SEPARATOR */
3875: case 0x2029: /* PARAGRAPH SEPARATOR */
3876: break;
3877: }
3878: }
3879: break;
3880:
3881: case OP_NOT_DIGIT:
3882: for (i = 1; i <= min; i++)
3883: {
1.4 ! misha 3884: if (eptr >= md->end_subject)
! 3885: {
! 3886: SCHECK_PARTIAL();
! 3887: MRRETURN(MATCH_NOMATCH);
! 3888: }
1.1 misha 3889: GETCHARINC(c, eptr);
3890: if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
1.4 ! misha 3891: MRRETURN(MATCH_NOMATCH);
1.1 misha 3892: }
3893: break;
3894:
3895: case OP_DIGIT:
3896: for (i = 1; i <= min; i++)
3897: {
1.4 ! misha 3898: if (eptr >= md->end_subject)
! 3899: {
! 3900: SCHECK_PARTIAL();
! 3901: MRRETURN(MATCH_NOMATCH);
! 3902: }
! 3903: if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
! 3904: MRRETURN(MATCH_NOMATCH);
1.1 misha 3905: /* No need to skip more bytes - we know it's a 1-byte character */
3906: }
3907: break;
3908:
3909: case OP_NOT_WHITESPACE:
3910: for (i = 1; i <= min; i++)
3911: {
1.4 ! misha 3912: if (eptr >= md->end_subject)
! 3913: {
! 3914: SCHECK_PARTIAL();
! 3915: MRRETURN(MATCH_NOMATCH);
! 3916: }
! 3917: if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
! 3918: MRRETURN(MATCH_NOMATCH);
1.1 misha 3919: while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3920: }
3921: break;
3922:
3923: case OP_WHITESPACE:
3924: for (i = 1; i <= min; i++)
3925: {
1.4 ! misha 3926: if (eptr >= md->end_subject)
! 3927: {
! 3928: SCHECK_PARTIAL();
! 3929: MRRETURN(MATCH_NOMATCH);
! 3930: }
! 3931: if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
! 3932: MRRETURN(MATCH_NOMATCH);
1.1 misha 3933: /* No need to skip more bytes - we know it's a 1-byte character */
3934: }
3935: break;
3936:
3937: case OP_NOT_WORDCHAR:
3938: for (i = 1; i <= min; i++)
3939: {
1.4 ! misha 3940: if (eptr >= md->end_subject)
! 3941: {
! 3942: SCHECK_PARTIAL();
! 3943: MRRETURN(MATCH_NOMATCH);
! 3944: }
! 3945: if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
! 3946: MRRETURN(MATCH_NOMATCH);
1.1 misha 3947: while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3948: }
3949: break;
3950:
3951: case OP_WORDCHAR:
3952: for (i = 1; i <= min; i++)
3953: {
1.4 ! misha 3954: if (eptr >= md->end_subject)
! 3955: {
! 3956: SCHECK_PARTIAL();
! 3957: MRRETURN(MATCH_NOMATCH);
! 3958: }
! 3959: if (*eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
! 3960: MRRETURN(MATCH_NOMATCH);
1.1 misha 3961: /* No need to skip more bytes - we know it's a 1-byte character */
3962: }
3963: break;
3964:
3965: default:
3966: RRETURN(PCRE_ERROR_INTERNAL);
3967: } /* End switch(ctype) */
3968:
3969: else
3970: #endif /* SUPPORT_UTF8 */
3971:
3972: /* Code for the non-UTF-8 case for minimum matching of operators other
1.4 ! misha 3973: than OP_PROP and OP_NOTPROP. */
1.1 misha 3974:
3975: switch(ctype)
3976: {
3977: case OP_ANY:
3978: for (i = 1; i <= min; i++)
3979: {
1.4 ! misha 3980: if (eptr >= md->end_subject)
! 3981: {
! 3982: SCHECK_PARTIAL();
! 3983: MRRETURN(MATCH_NOMATCH);
! 3984: }
! 3985: if (IS_NEWLINE(eptr)) MRRETURN(MATCH_NOMATCH);
1.1 misha 3986: eptr++;
3987: }
3988: break;
3989:
3990: case OP_ALLANY:
1.4 ! misha 3991: if (eptr > md->end_subject - min)
! 3992: {
! 3993: SCHECK_PARTIAL();
! 3994: MRRETURN(MATCH_NOMATCH);
! 3995: }
1.1 misha 3996: eptr += min;
3997: break;
3998:
3999: case OP_ANYBYTE:
1.4 ! misha 4000: if (eptr > md->end_subject - min)
! 4001: {
! 4002: SCHECK_PARTIAL();
! 4003: MRRETURN(MATCH_NOMATCH);
! 4004: }
1.1 misha 4005: eptr += min;
4006: break;
4007:
4008: case OP_ANYNL:
4009: for (i = 1; i <= min; i++)
4010: {
1.4 ! misha 4011: if (eptr >= md->end_subject)
! 4012: {
! 4013: SCHECK_PARTIAL();
! 4014: MRRETURN(MATCH_NOMATCH);
! 4015: }
1.1 misha 4016: switch(*eptr++)
4017: {
1.4 ! misha 4018: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 4019: case 0x000d:
4020: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4021: break;
4022: case 0x000a:
4023: break;
4024:
4025: case 0x000b:
4026: case 0x000c:
4027: case 0x0085:
1.4 ! misha 4028: if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
1.1 misha 4029: break;
4030: }
4031: }
4032: break;
4033:
4034: case OP_NOT_HSPACE:
4035: for (i = 1; i <= min; i++)
4036: {
1.4 ! misha 4037: if (eptr >= md->end_subject)
! 4038: {
! 4039: SCHECK_PARTIAL();
! 4040: MRRETURN(MATCH_NOMATCH);
! 4041: }
1.1 misha 4042: switch(*eptr++)
4043: {
4044: default: break;
4045: case 0x09: /* HT */
4046: case 0x20: /* SPACE */
4047: case 0xa0: /* NBSP */
1.4 ! misha 4048: MRRETURN(MATCH_NOMATCH);
1.1 misha 4049: }
4050: }
4051: break;
4052:
4053: case OP_HSPACE:
4054: for (i = 1; i <= min; i++)
4055: {
1.4 ! misha 4056: if (eptr >= md->end_subject)
! 4057: {
! 4058: SCHECK_PARTIAL();
! 4059: MRRETURN(MATCH_NOMATCH);
! 4060: }
1.1 misha 4061: switch(*eptr++)
4062: {
1.4 ! misha 4063: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 4064: case 0x09: /* HT */
4065: case 0x20: /* SPACE */
4066: case 0xa0: /* NBSP */
4067: break;
4068: }
4069: }
4070: break;
4071:
4072: case OP_NOT_VSPACE:
4073: for (i = 1; i <= min; i++)
4074: {
1.4 ! misha 4075: if (eptr >= md->end_subject)
! 4076: {
! 4077: SCHECK_PARTIAL();
! 4078: MRRETURN(MATCH_NOMATCH);
! 4079: }
1.1 misha 4080: switch(*eptr++)
4081: {
4082: default: break;
4083: case 0x0a: /* LF */
4084: case 0x0b: /* VT */
4085: case 0x0c: /* FF */
4086: case 0x0d: /* CR */
4087: case 0x85: /* NEL */
1.4 ! misha 4088: MRRETURN(MATCH_NOMATCH);
1.1 misha 4089: }
4090: }
4091: break;
4092:
4093: case OP_VSPACE:
4094: for (i = 1; i <= min; i++)
4095: {
1.4 ! misha 4096: if (eptr >= md->end_subject)
! 4097: {
! 4098: SCHECK_PARTIAL();
! 4099: MRRETURN(MATCH_NOMATCH);
! 4100: }
1.1 misha 4101: switch(*eptr++)
4102: {
1.4 ! misha 4103: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 4104: case 0x0a: /* LF */
4105: case 0x0b: /* VT */
4106: case 0x0c: /* FF */
4107: case 0x0d: /* CR */
4108: case 0x85: /* NEL */
4109: break;
4110: }
4111: }
4112: break;
4113:
4114: case OP_NOT_DIGIT:
4115: for (i = 1; i <= min; i++)
1.4 ! misha 4116: {
! 4117: if (eptr >= md->end_subject)
! 4118: {
! 4119: SCHECK_PARTIAL();
! 4120: MRRETURN(MATCH_NOMATCH);
! 4121: }
! 4122: if ((md->ctypes[*eptr++] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
! 4123: }
1.1 misha 4124: break;
4125:
4126: case OP_DIGIT:
4127: for (i = 1; i <= min; i++)
1.4 ! misha 4128: {
! 4129: if (eptr >= md->end_subject)
! 4130: {
! 4131: SCHECK_PARTIAL();
! 4132: MRRETURN(MATCH_NOMATCH);
! 4133: }
! 4134: if ((md->ctypes[*eptr++] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
! 4135: }
1.1 misha 4136: break;
4137:
4138: case OP_NOT_WHITESPACE:
4139: for (i = 1; i <= min; i++)
1.4 ! misha 4140: {
! 4141: if (eptr >= md->end_subject)
! 4142: {
! 4143: SCHECK_PARTIAL();
! 4144: MRRETURN(MATCH_NOMATCH);
! 4145: }
! 4146: if ((md->ctypes[*eptr++] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
! 4147: }
1.1 misha 4148: break;
4149:
4150: case OP_WHITESPACE:
4151: for (i = 1; i <= min; i++)
1.4 ! misha 4152: {
! 4153: if (eptr >= md->end_subject)
! 4154: {
! 4155: SCHECK_PARTIAL();
! 4156: MRRETURN(MATCH_NOMATCH);
! 4157: }
! 4158: if ((md->ctypes[*eptr++] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
! 4159: }
1.1 misha 4160: break;
4161:
4162: case OP_NOT_WORDCHAR:
4163: for (i = 1; i <= min; i++)
1.4 ! misha 4164: {
! 4165: if (eptr >= md->end_subject)
! 4166: {
! 4167: SCHECK_PARTIAL();
! 4168: MRRETURN(MATCH_NOMATCH);
! 4169: }
1.1 misha 4170: if ((md->ctypes[*eptr++] & ctype_word) != 0)
1.4 ! misha 4171: MRRETURN(MATCH_NOMATCH);
! 4172: }
1.1 misha 4173: break;
4174:
4175: case OP_WORDCHAR:
4176: for (i = 1; i <= min; i++)
1.4 ! misha 4177: {
! 4178: if (eptr >= md->end_subject)
! 4179: {
! 4180: SCHECK_PARTIAL();
! 4181: MRRETURN(MATCH_NOMATCH);
! 4182: }
1.1 misha 4183: if ((md->ctypes[*eptr++] & ctype_word) == 0)
1.4 ! misha 4184: MRRETURN(MATCH_NOMATCH);
! 4185: }
1.1 misha 4186: break;
4187:
4188: default:
4189: RRETURN(PCRE_ERROR_INTERNAL);
4190: }
4191: }
4192:
4193: /* If min = max, continue at the same level without recursing */
4194:
4195: if (min == max) continue;
4196:
4197: /* If minimizing, we have to test the rest of the pattern before each
4198: subsequent match. Again, separate the UTF-8 case for speed, and also
4199: separate the UCP cases. */
4200:
4201: if (minimize)
4202: {
4203: #ifdef SUPPORT_UCP
4204: if (prop_type >= 0)
4205: {
4206: switch(prop_type)
4207: {
4208: case PT_ANY:
4209: for (fi = min;; fi++)
4210: {
4211: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
4212: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 4213: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 4214: if (eptr >= md->end_subject)
! 4215: {
! 4216: SCHECK_PARTIAL();
! 4217: MRRETURN(MATCH_NOMATCH);
! 4218: }
! 4219: GETCHARINCTEST(c, eptr);
! 4220: if (prop_fail_result) MRRETURN(MATCH_NOMATCH);
1.1 misha 4221: }
4222: /* Control never gets here */
4223:
4224: case PT_LAMP:
4225: for (fi = min;; fi++)
4226: {
4227: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
4228: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 4229: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 4230: if (eptr >= md->end_subject)
! 4231: {
! 4232: SCHECK_PARTIAL();
! 4233: MRRETURN(MATCH_NOMATCH);
! 4234: }
! 4235: GETCHARINCTEST(c, eptr);
1.2 misha 4236: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 4237: if ((prop_chartype == ucp_Lu ||
4238: prop_chartype == ucp_Ll ||
4239: prop_chartype == ucp_Lt) == prop_fail_result)
1.4 ! misha 4240: MRRETURN(MATCH_NOMATCH);
1.1 misha 4241: }
4242: /* Control never gets here */
4243:
4244: case PT_GC:
4245: for (fi = min;; fi++)
4246: {
4247: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
4248: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 4249: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 4250: if (eptr >= md->end_subject)
! 4251: {
! 4252: SCHECK_PARTIAL();
! 4253: MRRETURN(MATCH_NOMATCH);
! 4254: }
! 4255: GETCHARINCTEST(c, eptr);
1.2 misha 4256: prop_category = UCD_CATEGORY(c);
1.1 misha 4257: if ((prop_category == prop_value) == prop_fail_result)
1.4 ! misha 4258: MRRETURN(MATCH_NOMATCH);
1.1 misha 4259: }
4260: /* Control never gets here */
4261:
4262: case PT_PC:
4263: for (fi = min;; fi++)
4264: {
4265: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
4266: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 4267: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 4268: if (eptr >= md->end_subject)
! 4269: {
! 4270: SCHECK_PARTIAL();
! 4271: MRRETURN(MATCH_NOMATCH);
! 4272: }
! 4273: GETCHARINCTEST(c, eptr);
1.2 misha 4274: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 4275: if ((prop_chartype == prop_value) == prop_fail_result)
1.4 ! misha 4276: MRRETURN(MATCH_NOMATCH);
1.1 misha 4277: }
4278: /* Control never gets here */
4279:
4280: case PT_SC:
4281: for (fi = min;; fi++)
4282: {
4283: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
4284: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 4285: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 4286: if (eptr >= md->end_subject)
! 4287: {
! 4288: SCHECK_PARTIAL();
! 4289: MRRETURN(MATCH_NOMATCH);
! 4290: }
! 4291: GETCHARINCTEST(c, eptr);
1.2 misha 4292: prop_script = UCD_SCRIPT(c);
1.1 misha 4293: if ((prop_script == prop_value) == prop_fail_result)
1.4 ! misha 4294: MRRETURN(MATCH_NOMATCH);
! 4295: }
! 4296: /* Control never gets here */
! 4297:
! 4298: case PT_ALNUM:
! 4299: for (fi = min;; fi++)
! 4300: {
! 4301: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM59);
! 4302: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 4303: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 4304: if (eptr >= md->end_subject)
! 4305: {
! 4306: SCHECK_PARTIAL();
! 4307: MRRETURN(MATCH_NOMATCH);
! 4308: }
! 4309: GETCHARINCTEST(c, eptr);
! 4310: prop_category = UCD_CATEGORY(c);
! 4311: if ((prop_category == ucp_L || prop_category == ucp_N)
! 4312: == prop_fail_result)
! 4313: MRRETURN(MATCH_NOMATCH);
! 4314: }
! 4315: /* Control never gets here */
! 4316:
! 4317: case PT_SPACE: /* Perl space */
! 4318: for (fi = min;; fi++)
! 4319: {
! 4320: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM60);
! 4321: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 4322: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 4323: if (eptr >= md->end_subject)
! 4324: {
! 4325: SCHECK_PARTIAL();
! 4326: MRRETURN(MATCH_NOMATCH);
! 4327: }
! 4328: GETCHARINCTEST(c, eptr);
! 4329: prop_category = UCD_CATEGORY(c);
! 4330: if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
! 4331: c == CHAR_FF || c == CHAR_CR)
! 4332: == prop_fail_result)
! 4333: MRRETURN(MATCH_NOMATCH);
! 4334: }
! 4335: /* Control never gets here */
! 4336:
! 4337: case PT_PXSPACE: /* POSIX space */
! 4338: for (fi = min;; fi++)
! 4339: {
! 4340: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM61);
! 4341: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 4342: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 4343: if (eptr >= md->end_subject)
! 4344: {
! 4345: SCHECK_PARTIAL();
! 4346: MRRETURN(MATCH_NOMATCH);
! 4347: }
! 4348: GETCHARINCTEST(c, eptr);
! 4349: prop_category = UCD_CATEGORY(c);
! 4350: if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
! 4351: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
! 4352: == prop_fail_result)
! 4353: MRRETURN(MATCH_NOMATCH);
1.1 misha 4354: }
4355: /* Control never gets here */
4356:
1.4 ! misha 4357: case PT_WORD:
! 4358: for (fi = min;; fi++)
! 4359: {
! 4360: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM62);
! 4361: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 4362: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 4363: if (eptr >= md->end_subject)
! 4364: {
! 4365: SCHECK_PARTIAL();
! 4366: MRRETURN(MATCH_NOMATCH);
! 4367: }
! 4368: GETCHARINCTEST(c, eptr);
! 4369: prop_category = UCD_CATEGORY(c);
! 4370: if ((prop_category == ucp_L ||
! 4371: prop_category == ucp_N ||
! 4372: c == CHAR_UNDERSCORE)
! 4373: == prop_fail_result)
! 4374: MRRETURN(MATCH_NOMATCH);
! 4375: }
! 4376: /* Control never gets here */
! 4377:
! 4378: /* This should never occur */
! 4379:
1.1 misha 4380: default:
4381: RRETURN(PCRE_ERROR_INTERNAL);
4382: }
4383: }
4384:
4385: /* Match extended Unicode sequences. We will get here only if the
4386: support is in the binary; otherwise a compile-time error occurs. */
4387:
4388: else if (ctype == OP_EXTUNI)
4389: {
4390: for (fi = min;; fi++)
4391: {
4392: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
4393: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 4394: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 4395: if (eptr >= md->end_subject)
! 4396: {
! 4397: SCHECK_PARTIAL();
! 4398: MRRETURN(MATCH_NOMATCH);
! 4399: }
1.1 misha 4400: GETCHARINCTEST(c, eptr);
1.2 misha 4401: prop_category = UCD_CATEGORY(c);
1.4 ! misha 4402: if (prop_category == ucp_M) MRRETURN(MATCH_NOMATCH);
1.1 misha 4403: while (eptr < md->end_subject)
4404: {
4405: int len = 1;
1.4 ! misha 4406: if (!utf8) c = *eptr;
! 4407: else { GETCHARLEN(c, eptr, len); }
1.2 misha 4408: prop_category = UCD_CATEGORY(c);
1.1 misha 4409: if (prop_category != ucp_M) break;
4410: eptr += len;
4411: }
4412: }
4413: }
4414:
4415: else
4416: #endif /* SUPPORT_UCP */
4417:
4418: #ifdef SUPPORT_UTF8
4419: /* UTF-8 mode */
4420: if (utf8)
4421: {
4422: for (fi = min;; fi++)
4423: {
4424: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
4425: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 4426: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 4427: if (eptr >= md->end_subject)
! 4428: {
! 4429: SCHECK_PARTIAL();
! 4430: MRRETURN(MATCH_NOMATCH);
! 4431: }
! 4432: if (ctype == OP_ANY && IS_NEWLINE(eptr))
! 4433: MRRETURN(MATCH_NOMATCH);
1.1 misha 4434: GETCHARINC(c, eptr);
4435: switch(ctype)
4436: {
4437: case OP_ANY: /* This is the non-NL case */
4438: case OP_ALLANY:
4439: case OP_ANYBYTE:
4440: break;
4441:
4442: case OP_ANYNL:
4443: switch(c)
4444: {
1.4 ! misha 4445: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 4446: case 0x000d:
4447: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4448: break;
4449: case 0x000a:
4450: break;
4451:
4452: case 0x000b:
4453: case 0x000c:
4454: case 0x0085:
4455: case 0x2028:
4456: case 0x2029:
1.4 ! misha 4457: if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
1.1 misha 4458: break;
4459: }
4460: break;
4461:
4462: case OP_NOT_HSPACE:
4463: switch(c)
4464: {
4465: default: break;
4466: case 0x09: /* HT */
4467: case 0x20: /* SPACE */
4468: case 0xa0: /* NBSP */
4469: case 0x1680: /* OGHAM SPACE MARK */
4470: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4471: case 0x2000: /* EN QUAD */
4472: case 0x2001: /* EM QUAD */
4473: case 0x2002: /* EN SPACE */
4474: case 0x2003: /* EM SPACE */
4475: case 0x2004: /* THREE-PER-EM SPACE */
4476: case 0x2005: /* FOUR-PER-EM SPACE */
4477: case 0x2006: /* SIX-PER-EM SPACE */
4478: case 0x2007: /* FIGURE SPACE */
4479: case 0x2008: /* PUNCTUATION SPACE */
4480: case 0x2009: /* THIN SPACE */
4481: case 0x200A: /* HAIR SPACE */
4482: case 0x202f: /* NARROW NO-BREAK SPACE */
4483: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4484: case 0x3000: /* IDEOGRAPHIC SPACE */
1.4 ! misha 4485: MRRETURN(MATCH_NOMATCH);
1.1 misha 4486: }
4487: break;
4488:
4489: case OP_HSPACE:
4490: switch(c)
4491: {
1.4 ! misha 4492: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 4493: case 0x09: /* HT */
4494: case 0x20: /* SPACE */
4495: case 0xa0: /* NBSP */
4496: case 0x1680: /* OGHAM SPACE MARK */
4497: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4498: case 0x2000: /* EN QUAD */
4499: case 0x2001: /* EM QUAD */
4500: case 0x2002: /* EN SPACE */
4501: case 0x2003: /* EM SPACE */
4502: case 0x2004: /* THREE-PER-EM SPACE */
4503: case 0x2005: /* FOUR-PER-EM SPACE */
4504: case 0x2006: /* SIX-PER-EM SPACE */
4505: case 0x2007: /* FIGURE SPACE */
4506: case 0x2008: /* PUNCTUATION SPACE */
4507: case 0x2009: /* THIN SPACE */
4508: case 0x200A: /* HAIR SPACE */
4509: case 0x202f: /* NARROW NO-BREAK SPACE */
4510: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4511: case 0x3000: /* IDEOGRAPHIC SPACE */
4512: break;
4513: }
4514: break;
4515:
4516: case OP_NOT_VSPACE:
4517: switch(c)
4518: {
4519: default: break;
4520: case 0x0a: /* LF */
4521: case 0x0b: /* VT */
4522: case 0x0c: /* FF */
4523: case 0x0d: /* CR */
4524: case 0x85: /* NEL */
4525: case 0x2028: /* LINE SEPARATOR */
4526: case 0x2029: /* PARAGRAPH SEPARATOR */
1.4 ! misha 4527: MRRETURN(MATCH_NOMATCH);
1.1 misha 4528: }
4529: break;
4530:
4531: case OP_VSPACE:
4532: switch(c)
4533: {
1.4 ! misha 4534: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 4535: case 0x0a: /* LF */
4536: case 0x0b: /* VT */
4537: case 0x0c: /* FF */
4538: case 0x0d: /* CR */
4539: case 0x85: /* NEL */
4540: case 0x2028: /* LINE SEPARATOR */
4541: case 0x2029: /* PARAGRAPH SEPARATOR */
4542: break;
4543: }
4544: break;
4545:
4546: case OP_NOT_DIGIT:
4547: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
1.4 ! misha 4548: MRRETURN(MATCH_NOMATCH);
1.1 misha 4549: break;
4550:
4551: case OP_DIGIT:
4552: if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
1.4 ! misha 4553: MRRETURN(MATCH_NOMATCH);
1.1 misha 4554: break;
4555:
4556: case OP_NOT_WHITESPACE:
4557: if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
1.4 ! misha 4558: MRRETURN(MATCH_NOMATCH);
1.1 misha 4559: break;
4560:
4561: case OP_WHITESPACE:
4562: if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
1.4 ! misha 4563: MRRETURN(MATCH_NOMATCH);
1.1 misha 4564: break;
4565:
4566: case OP_NOT_WORDCHAR:
4567: if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
1.4 ! misha 4568: MRRETURN(MATCH_NOMATCH);
1.1 misha 4569: break;
4570:
4571: case OP_WORDCHAR:
4572: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
1.4 ! misha 4573: MRRETURN(MATCH_NOMATCH);
1.1 misha 4574: break;
4575:
4576: default:
4577: RRETURN(PCRE_ERROR_INTERNAL);
4578: }
4579: }
4580: }
4581: else
4582: #endif
4583: /* Not UTF-8 mode */
4584: {
4585: for (fi = min;; fi++)
4586: {
4587: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
4588: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.4 ! misha 4589: if (fi >= max) MRRETURN(MATCH_NOMATCH);
! 4590: if (eptr >= md->end_subject)
! 4591: {
! 4592: SCHECK_PARTIAL();
! 4593: MRRETURN(MATCH_NOMATCH);
! 4594: }
! 4595: if (ctype == OP_ANY && IS_NEWLINE(eptr))
! 4596: MRRETURN(MATCH_NOMATCH);
1.1 misha 4597: c = *eptr++;
4598: switch(ctype)
4599: {
4600: case OP_ANY: /* This is the non-NL case */
4601: case OP_ALLANY:
4602: case OP_ANYBYTE:
4603: break;
4604:
4605: case OP_ANYNL:
4606: switch(c)
4607: {
1.4 ! misha 4608: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 4609: case 0x000d:
4610: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4611: break;
4612:
4613: case 0x000a:
4614: break;
4615:
4616: case 0x000b:
4617: case 0x000c:
4618: case 0x0085:
1.4 ! misha 4619: if (md->bsr_anycrlf) MRRETURN(MATCH_NOMATCH);
1.1 misha 4620: break;
4621: }
4622: break;
4623:
4624: case OP_NOT_HSPACE:
4625: switch(c)
4626: {
4627: default: break;
4628: case 0x09: /* HT */
4629: case 0x20: /* SPACE */
4630: case 0xa0: /* NBSP */
1.4 ! misha 4631: MRRETURN(MATCH_NOMATCH);
1.1 misha 4632: }
4633: break;
4634:
4635: case OP_HSPACE:
4636: switch(c)
4637: {
1.4 ! misha 4638: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 4639: case 0x09: /* HT */
4640: case 0x20: /* SPACE */
4641: case 0xa0: /* NBSP */
4642: break;
4643: }
4644: break;
4645:
4646: case OP_NOT_VSPACE:
4647: switch(c)
4648: {
4649: default: break;
4650: case 0x0a: /* LF */
4651: case 0x0b: /* VT */
4652: case 0x0c: /* FF */
4653: case 0x0d: /* CR */
4654: case 0x85: /* NEL */
1.4 ! misha 4655: MRRETURN(MATCH_NOMATCH);
1.1 misha 4656: }
4657: break;
4658:
4659: case OP_VSPACE:
4660: switch(c)
4661: {
1.4 ! misha 4662: default: MRRETURN(MATCH_NOMATCH);
1.1 misha 4663: case 0x0a: /* LF */
4664: case 0x0b: /* VT */
4665: case 0x0c: /* FF */
4666: case 0x0d: /* CR */
4667: case 0x85: /* NEL */
4668: break;
4669: }
4670: break;
4671:
4672: case OP_NOT_DIGIT:
1.4 ! misha 4673: if ((md->ctypes[c] & ctype_digit) != 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 4674: break;
4675:
4676: case OP_DIGIT:
1.4 ! misha 4677: if ((md->ctypes[c] & ctype_digit) == 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 4678: break;
4679:
4680: case OP_NOT_WHITESPACE:
1.4 ! misha 4681: if ((md->ctypes[c] & ctype_space) != 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 4682: break;
4683:
4684: case OP_WHITESPACE:
1.4 ! misha 4685: if ((md->ctypes[c] & ctype_space) == 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 4686: break;
4687:
4688: case OP_NOT_WORDCHAR:
1.4 ! misha 4689: if ((md->ctypes[c] & ctype_word) != 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 4690: break;
4691:
4692: case OP_WORDCHAR:
1.4 ! misha 4693: if ((md->ctypes[c] & ctype_word) == 0) MRRETURN(MATCH_NOMATCH);
1.1 misha 4694: break;
4695:
4696: default:
4697: RRETURN(PCRE_ERROR_INTERNAL);
4698: }
4699: }
4700: }
4701: /* Control never gets here */
4702: }
4703:
4704: /* If maximizing, it is worth using inline code for speed, doing the type
4705: test once at the start (i.e. keep it out of the loop). Again, keep the
4706: UTF-8 and UCP stuff separate. */
4707:
4708: else
4709: {
4710: pp = eptr; /* Remember where we started */
4711:
4712: #ifdef SUPPORT_UCP
4713: if (prop_type >= 0)
4714: {
4715: switch(prop_type)
4716: {
4717: case PT_ANY:
4718: for (i = min; i < max; i++)
4719: {
4720: int len = 1;
1.4 ! misha 4721: if (eptr >= md->end_subject)
! 4722: {
! 4723: SCHECK_PARTIAL();
! 4724: break;
! 4725: }
! 4726: GETCHARLENTEST(c, eptr, len);
1.1 misha 4727: if (prop_fail_result) break;
4728: eptr+= len;
4729: }
4730: break;
4731:
4732: case PT_LAMP:
4733: for (i = min; i < max; i++)
4734: {
4735: int len = 1;
1.4 ! misha 4736: if (eptr >= md->end_subject)
! 4737: {
! 4738: SCHECK_PARTIAL();
! 4739: break;
! 4740: }
! 4741: GETCHARLENTEST(c, eptr, len);
1.2 misha 4742: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 4743: if ((prop_chartype == ucp_Lu ||
4744: prop_chartype == ucp_Ll ||
4745: prop_chartype == ucp_Lt) == prop_fail_result)
4746: break;
4747: eptr+= len;
4748: }
4749: break;
4750:
4751: case PT_GC:
4752: for (i = min; i < max; i++)
4753: {
4754: int len = 1;
1.4 ! misha 4755: if (eptr >= md->end_subject)
! 4756: {
! 4757: SCHECK_PARTIAL();
! 4758: break;
! 4759: }
! 4760: GETCHARLENTEST(c, eptr, len);
1.2 misha 4761: prop_category = UCD_CATEGORY(c);
1.1 misha 4762: if ((prop_category == prop_value) == prop_fail_result)
4763: break;
4764: eptr+= len;
4765: }
4766: break;
4767:
4768: case PT_PC:
4769: for (i = min; i < max; i++)
4770: {
4771: int len = 1;
1.4 ! misha 4772: if (eptr >= md->end_subject)
! 4773: {
! 4774: SCHECK_PARTIAL();
! 4775: break;
! 4776: }
! 4777: GETCHARLENTEST(c, eptr, len);
1.2 misha 4778: prop_chartype = UCD_CHARTYPE(c);
1.1 misha 4779: if ((prop_chartype == prop_value) == prop_fail_result)
4780: break;
4781: eptr+= len;
4782: }
4783: break;
4784:
4785: case PT_SC:
4786: for (i = min; i < max; i++)
4787: {
4788: int len = 1;
1.4 ! misha 4789: if (eptr >= md->end_subject)
! 4790: {
! 4791: SCHECK_PARTIAL();
! 4792: break;
! 4793: }
! 4794: GETCHARLENTEST(c, eptr, len);
1.2 misha 4795: prop_script = UCD_SCRIPT(c);
1.1 misha 4796: if ((prop_script == prop_value) == prop_fail_result)
4797: break;
4798: eptr+= len;
4799: }
4800: break;
1.4 ! misha 4801:
! 4802: case PT_ALNUM:
! 4803: for (i = min; i < max; i++)
! 4804: {
! 4805: int len = 1;
! 4806: if (eptr >= md->end_subject)
! 4807: {
! 4808: SCHECK_PARTIAL();
! 4809: break;
! 4810: }
! 4811: GETCHARLENTEST(c, eptr, len);
! 4812: prop_category = UCD_CATEGORY(c);
! 4813: if ((prop_category == ucp_L || prop_category == ucp_N)
! 4814: == prop_fail_result)
! 4815: break;
! 4816: eptr+= len;
! 4817: }
! 4818: break;
! 4819:
! 4820: case PT_SPACE: /* Perl space */
! 4821: for (i = min; i < max; i++)
! 4822: {
! 4823: int len = 1;
! 4824: if (eptr >= md->end_subject)
! 4825: {
! 4826: SCHECK_PARTIAL();
! 4827: break;
! 4828: }
! 4829: GETCHARLENTEST(c, eptr, len);
! 4830: prop_category = UCD_CATEGORY(c);
! 4831: if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
! 4832: c == CHAR_FF || c == CHAR_CR)
! 4833: == prop_fail_result)
! 4834: break;
! 4835: eptr+= len;
! 4836: }
! 4837: break;
! 4838:
! 4839: case PT_PXSPACE: /* POSIX space */
! 4840: for (i = min; i < max; i++)
! 4841: {
! 4842: int len = 1;
! 4843: if (eptr >= md->end_subject)
! 4844: {
! 4845: SCHECK_PARTIAL();
! 4846: break;
! 4847: }
! 4848: GETCHARLENTEST(c, eptr, len);
! 4849: prop_category = UCD_CATEGORY(c);
! 4850: if ((prop_category == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
! 4851: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
! 4852: == prop_fail_result)
! 4853: break;
! 4854: eptr+= len;
! 4855: }
! 4856: break;
! 4857:
! 4858: case PT_WORD:
! 4859: for (i = min; i < max; i++)
! 4860: {
! 4861: int len = 1;
! 4862: if (eptr >= md->end_subject)
! 4863: {
! 4864: SCHECK_PARTIAL();
! 4865: break;
! 4866: }
! 4867: GETCHARLENTEST(c, eptr, len);
! 4868: prop_category = UCD_CATEGORY(c);
! 4869: if ((prop_category == ucp_L || prop_category == ucp_N ||
! 4870: c == CHAR_UNDERSCORE) == prop_fail_result)
! 4871: break;
! 4872: eptr+= len;
! 4873: }
! 4874: break;
! 4875:
! 4876: default:
! 4877: RRETURN(PCRE_ERROR_INTERNAL);
1.1 misha 4878: }
4879:
4880: /* eptr is now past the end of the maximum run */
4881:
4882: if (possessive) continue;
4883: for(;;)
4884: {
4885: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
4886: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4887: if (eptr-- == pp) break; /* Stop if tried at original pos */
4888: if (utf8) BACKCHAR(eptr);
4889: }
4890: }
4891:
4892: /* Match extended Unicode sequences. We will get here only if the
4893: support is in the binary; otherwise a compile-time error occurs. */
4894:
4895: else if (ctype == OP_EXTUNI)
4896: {
4897: for (i = min; i < max; i++)
4898: {
1.4 ! misha 4899: if (eptr >= md->end_subject)
! 4900: {
! 4901: SCHECK_PARTIAL();
! 4902: break;
! 4903: }
1.1 misha 4904: GETCHARINCTEST(c, eptr);
1.2 misha 4905: prop_category = UCD_CATEGORY(c);
1.1 misha 4906: if (prop_category == ucp_M) break;
4907: while (eptr < md->end_subject)
4908: {
4909: int len = 1;
4910: if (!utf8) c = *eptr; else
4911: {
4912: GETCHARLEN(c, eptr, len);
4913: }
1.2 misha 4914: prop_category = UCD_CATEGORY(c);
1.1 misha 4915: if (prop_category != ucp_M) break;
4916: eptr += len;
4917: }
4918: }
4919:
4920: /* eptr is now past the end of the maximum run */
4921:
4922: if (possessive) continue;
1.4 ! misha 4923:
1.1 misha 4924: for(;;)
4925: {
4926: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
4927: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4928: if (eptr-- == pp) break; /* Stop if tried at original pos */
4929: for (;;) /* Move back over one extended */
4930: {
4931: int len = 1;
4932: if (!utf8) c = *eptr; else
4933: {
4934: BACKCHAR(eptr);
4935: GETCHARLEN(c, eptr, len);
4936: }
1.2 misha 4937: prop_category = UCD_CATEGORY(c);
1.1 misha 4938: if (prop_category != ucp_M) break;
4939: eptr--;
4940: }
4941: }
4942: }
4943:
4944: else
4945: #endif /* SUPPORT_UCP */
4946:
4947: #ifdef SUPPORT_UTF8
4948: /* UTF-8 mode */
4949:
4950: if (utf8)
4951: {
4952: switch(ctype)
4953: {
4954: case OP_ANY:
4955: if (max < INT_MAX)
4956: {
4957: for (i = min; i < max; i++)
4958: {
1.4 ! misha 4959: if (eptr >= md->end_subject)
! 4960: {
! 4961: SCHECK_PARTIAL();
! 4962: break;
! 4963: }
! 4964: if (IS_NEWLINE(eptr)) break;
1.1 misha 4965: eptr++;
4966: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4967: }
4968: }
4969:
4970: /* Handle unlimited UTF-8 repeat */
4971:
4972: else
4973: {
4974: for (i = min; i < max; i++)
4975: {
1.4 ! misha 4976: if (eptr >= md->end_subject)
! 4977: {
! 4978: SCHECK_PARTIAL();
! 4979: break;
! 4980: }
! 4981: if (IS_NEWLINE(eptr)) break;
1.1 misha 4982: eptr++;
4983: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4984: }
4985: }
4986: break;
4987:
4988: case OP_ALLANY:
4989: if (max < INT_MAX)
4990: {
4991: for (i = min; i < max; i++)
4992: {
1.4 ! misha 4993: if (eptr >= md->end_subject)
! 4994: {
! 4995: SCHECK_PARTIAL();
! 4996: break;
! 4997: }
1.1 misha 4998: eptr++;
4999: while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
5000: }
5001: }
5002: else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
5003: break;
5004:
5005: /* The byte case is the same as non-UTF8 */
5006:
5007: case OP_ANYBYTE:
5008: c = max - min;
5009: if (c > (unsigned int)(md->end_subject - eptr))
1.4 ! misha 5010: {
! 5011: eptr = md->end_subject;
! 5012: SCHECK_PARTIAL();
! 5013: }
! 5014: else eptr += c;
1.1 misha 5015: break;
5016:
5017: case OP_ANYNL:
5018: for (i = min; i < max; i++)
5019: {
5020: int len = 1;
1.4 ! misha 5021: if (eptr >= md->end_subject)
! 5022: {
! 5023: SCHECK_PARTIAL();
! 5024: break;
! 5025: }
1.1 misha 5026: GETCHARLEN(c, eptr, len);
5027: if (c == 0x000d)
5028: {
5029: if (++eptr >= md->end_subject) break;
5030: if (*eptr == 0x000a) eptr++;
5031: }
5032: else
5033: {
5034: if (c != 0x000a &&
5035: (md->bsr_anycrlf ||
5036: (c != 0x000b && c != 0x000c &&
5037: c != 0x0085 && c != 0x2028 && c != 0x2029)))
5038: break;
5039: eptr += len;
5040: }
5041: }
5042: break;
5043:
5044: case OP_NOT_HSPACE:
5045: case OP_HSPACE:
5046: for (i = min; i < max; i++)
5047: {
5048: BOOL gotspace;
5049: int len = 1;
1.4 ! misha 5050: if (eptr >= md->end_subject)
! 5051: {
! 5052: SCHECK_PARTIAL();
! 5053: break;
! 5054: }
1.1 misha 5055: GETCHARLEN(c, eptr, len);
5056: switch(c)
5057: {
5058: default: gotspace = FALSE; break;
5059: case 0x09: /* HT */
5060: case 0x20: /* SPACE */
5061: case 0xa0: /* NBSP */
5062: case 0x1680: /* OGHAM SPACE MARK */
5063: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5064: case 0x2000: /* EN QUAD */
5065: case 0x2001: /* EM QUAD */
5066: case 0x2002: /* EN SPACE */
5067: case 0x2003: /* EM SPACE */
5068: case 0x2004: /* THREE-PER-EM SPACE */
5069: case 0x2005: /* FOUR-PER-EM SPACE */
5070: case 0x2006: /* SIX-PER-EM SPACE */
5071: case 0x2007: /* FIGURE SPACE */
5072: case 0x2008: /* PUNCTUATION SPACE */
5073: case 0x2009: /* THIN SPACE */
5074: case 0x200A: /* HAIR SPACE */
5075: case 0x202f: /* NARROW NO-BREAK SPACE */
5076: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5077: case 0x3000: /* IDEOGRAPHIC SPACE */
5078: gotspace = TRUE;
5079: break;
5080: }
5081: if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5082: eptr += len;
5083: }
5084: break;
5085:
5086: case OP_NOT_VSPACE:
5087: case OP_VSPACE:
5088: for (i = min; i < max; i++)
5089: {
5090: BOOL gotspace;
5091: int len = 1;
1.4 ! misha 5092: if (eptr >= md->end_subject)
! 5093: {
! 5094: SCHECK_PARTIAL();
! 5095: break;
! 5096: }
1.1 misha 5097: GETCHARLEN(c, eptr, len);
5098: switch(c)
5099: {
5100: default: gotspace = FALSE; break;
5101: case 0x0a: /* LF */
5102: case 0x0b: /* VT */
5103: case 0x0c: /* FF */
5104: case 0x0d: /* CR */
5105: case 0x85: /* NEL */
5106: case 0x2028: /* LINE SEPARATOR */
5107: case 0x2029: /* PARAGRAPH SEPARATOR */
5108: gotspace = TRUE;
5109: break;
5110: }
5111: if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5112: eptr += len;
5113: }
5114: break;
5115:
5116: case OP_NOT_DIGIT:
5117: for (i = min; i < max; i++)
5118: {
5119: int len = 1;
1.4 ! misha 5120: if (eptr >= md->end_subject)
! 5121: {
! 5122: SCHECK_PARTIAL();
! 5123: break;
! 5124: }
1.1 misha 5125: GETCHARLEN(c, eptr, len);
5126: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5127: eptr+= len;
5128: }
5129: break;
5130:
5131: case OP_DIGIT:
5132: for (i = min; i < max; i++)
5133: {
5134: int len = 1;
1.4 ! misha 5135: if (eptr >= md->end_subject)
! 5136: {
! 5137: SCHECK_PARTIAL();
! 5138: break;
! 5139: }
1.1 misha 5140: GETCHARLEN(c, eptr, len);
5141: if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5142: eptr+= len;
5143: }
5144: break;
5145:
5146: case OP_NOT_WHITESPACE:
5147: for (i = min; i < max; i++)
5148: {
5149: int len = 1;
1.4 ! misha 5150: if (eptr >= md->end_subject)
! 5151: {
! 5152: SCHECK_PARTIAL();
! 5153: break;
! 5154: }
1.1 misha 5155: GETCHARLEN(c, eptr, len);
5156: if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5157: eptr+= len;
5158: }
5159: break;
5160:
5161: case OP_WHITESPACE:
5162: for (i = min; i < max; i++)
5163: {
5164: int len = 1;
1.4 ! misha 5165: if (eptr >= md->end_subject)
! 5166: {
! 5167: SCHECK_PARTIAL();
! 5168: break;
! 5169: }
1.1 misha 5170: GETCHARLEN(c, eptr, len);
5171: if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5172: eptr+= len;
5173: }
5174: break;
5175:
5176: case OP_NOT_WORDCHAR:
5177: for (i = min; i < max; i++)
5178: {
5179: int len = 1;
1.4 ! misha 5180: if (eptr >= md->end_subject)
! 5181: {
! 5182: SCHECK_PARTIAL();
! 5183: break;
! 5184: }
1.1 misha 5185: GETCHARLEN(c, eptr, len);
5186: if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5187: eptr+= len;
5188: }
5189: break;
5190:
5191: case OP_WORDCHAR:
5192: for (i = min; i < max; i++)
5193: {
5194: int len = 1;
1.4 ! misha 5195: if (eptr >= md->end_subject)
! 5196: {
! 5197: SCHECK_PARTIAL();
! 5198: break;
! 5199: }
1.1 misha 5200: GETCHARLEN(c, eptr, len);
5201: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5202: eptr+= len;
5203: }
5204: break;
5205:
5206: default:
5207: RRETURN(PCRE_ERROR_INTERNAL);
5208: }
5209:
5210: /* eptr is now past the end of the maximum run */
5211:
5212: if (possessive) continue;
5213: for(;;)
5214: {
5215: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
5216: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5217: if (eptr-- == pp) break; /* Stop if tried at original pos */
5218: BACKCHAR(eptr);
5219: }
5220: }
5221: else
5222: #endif /* SUPPORT_UTF8 */
5223:
5224: /* Not UTF-8 mode */
5225: {
5226: switch(ctype)
5227: {
5228: case OP_ANY:
5229: for (i = min; i < max; i++)
5230: {
1.4 ! misha 5231: if (eptr >= md->end_subject)
! 5232: {
! 5233: SCHECK_PARTIAL();
! 5234: break;
! 5235: }
! 5236: if (IS_NEWLINE(eptr)) break;
1.1 misha 5237: eptr++;
5238: }
5239: break;
5240:
5241: case OP_ALLANY:
5242: case OP_ANYBYTE:
5243: c = max - min;
5244: if (c > (unsigned int)(md->end_subject - eptr))
1.4 ! misha 5245: {
! 5246: eptr = md->end_subject;
! 5247: SCHECK_PARTIAL();
! 5248: }
! 5249: else eptr += c;
1.1 misha 5250: break;
5251:
5252: case OP_ANYNL:
5253: for (i = min; i < max; i++)
5254: {
1.4 ! misha 5255: if (eptr >= md->end_subject)
! 5256: {
! 5257: SCHECK_PARTIAL();
! 5258: break;
! 5259: }
1.1 misha 5260: c = *eptr;
5261: if (c == 0x000d)
5262: {
5263: if (++eptr >= md->end_subject) break;
5264: if (*eptr == 0x000a) eptr++;
5265: }
5266: else
5267: {
5268: if (c != 0x000a &&
5269: (md->bsr_anycrlf ||
5270: (c != 0x000b && c != 0x000c && c != 0x0085)))
5271: break;
5272: eptr++;
5273: }
5274: }
5275: break;
5276:
5277: case OP_NOT_HSPACE:
5278: for (i = min; i < max; i++)
5279: {
1.4 ! misha 5280: if (eptr >= md->end_subject)
! 5281: {
! 5282: SCHECK_PARTIAL();
! 5283: break;
! 5284: }
1.1 misha 5285: c = *eptr;
5286: if (c == 0x09 || c == 0x20 || c == 0xa0) break;
5287: eptr++;
5288: }
5289: break;
5290:
5291: case OP_HSPACE:
5292: for (i = min; i < max; i++)
5293: {
1.4 ! misha 5294: if (eptr >= md->end_subject)
! 5295: {
! 5296: SCHECK_PARTIAL();
! 5297: break;
! 5298: }
1.1 misha 5299: c = *eptr;
5300: if (c != 0x09 && c != 0x20 && c != 0xa0) break;
5301: eptr++;
5302: }
5303: break;
5304:
5305: case OP_NOT_VSPACE:
5306: for (i = min; i < max; i++)
5307: {
1.4 ! misha 5308: if (eptr >= md->end_subject)
! 5309: {
! 5310: SCHECK_PARTIAL();
! 5311: break;
! 5312: }
1.1 misha 5313: c = *eptr;
5314: if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
5315: break;
5316: eptr++;
5317: }
5318: break;
5319:
5320: case OP_VSPACE:
5321: for (i = min; i < max; i++)
5322: {
1.4 ! misha 5323: if (eptr >= md->end_subject)
! 5324: {
! 5325: SCHECK_PARTIAL();
! 5326: break;
! 5327: }
1.1 misha 5328: c = *eptr;
5329: if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
5330: break;
5331: eptr++;
5332: }
5333: break;
5334:
5335: case OP_NOT_DIGIT:
5336: for (i = min; i < max; i++)
5337: {
1.4 ! misha 5338: if (eptr >= md->end_subject)
! 5339: {
! 5340: SCHECK_PARTIAL();
1.1 misha 5341: break;
1.4 ! misha 5342: }
! 5343: if ((md->ctypes[*eptr] & ctype_digit) != 0) break;
1.1 misha 5344: eptr++;
5345: }
5346: break;
5347:
5348: case OP_DIGIT:
5349: for (i = min; i < max; i++)
5350: {
1.4 ! misha 5351: if (eptr >= md->end_subject)
! 5352: {
! 5353: SCHECK_PARTIAL();
1.1 misha 5354: break;
1.4 ! misha 5355: }
! 5356: if ((md->ctypes[*eptr] & ctype_digit) == 0) break;
1.1 misha 5357: eptr++;
5358: }
5359: break;
5360:
5361: case OP_NOT_WHITESPACE:
5362: for (i = min; i < max; i++)
5363: {
1.4 ! misha 5364: if (eptr >= md->end_subject)
! 5365: {
! 5366: SCHECK_PARTIAL();
1.1 misha 5367: break;
1.4 ! misha 5368: }
! 5369: if ((md->ctypes[*eptr] & ctype_space) != 0) break;
1.1 misha 5370: eptr++;
5371: }
5372: break;
5373:
5374: case OP_WHITESPACE:
5375: for (i = min; i < max; i++)
5376: {
1.4 ! misha 5377: if (eptr >= md->end_subject)
! 5378: {
! 5379: SCHECK_PARTIAL();
1.1 misha 5380: break;
1.4 ! misha 5381: }
! 5382: if ((md->ctypes[*eptr] & ctype_space) == 0) break;
1.1 misha 5383: eptr++;
5384: }
5385: break;
5386:
5387: case OP_NOT_WORDCHAR:
5388: for (i = min; i < max; i++)
5389: {
1.4 ! misha 5390: if (eptr >= md->end_subject)
! 5391: {
! 5392: SCHECK_PARTIAL();
1.1 misha 5393: break;
1.4 ! misha 5394: }
! 5395: if ((md->ctypes[*eptr] & ctype_word) != 0) break;
1.1 misha 5396: eptr++;
5397: }
5398: break;
5399:
5400: case OP_WORDCHAR:
5401: for (i = min; i < max; i++)
5402: {
1.4 ! misha 5403: if (eptr >= md->end_subject)
! 5404: {
! 5405: SCHECK_PARTIAL();
1.1 misha 5406: break;
1.4 ! misha 5407: }
! 5408: if ((md->ctypes[*eptr] & ctype_word) == 0) break;
1.1 misha 5409: eptr++;
5410: }
5411: break;
5412:
5413: default:
5414: RRETURN(PCRE_ERROR_INTERNAL);
5415: }
5416:
5417: /* eptr is now past the end of the maximum run */
5418:
5419: if (possessive) continue;
5420: while (eptr >= pp)
5421: {
5422: RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
5423: eptr--;
5424: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5425: }
5426: }
5427:
5428: /* Get here if we can't make it match with any permitted repetitions */
5429:
1.4 ! misha 5430: MRRETURN(MATCH_NOMATCH);
1.1 misha 5431: }
5432: /* Control never gets here */
5433:
5434: /* There's been some horrible disaster. Arrival here can only mean there is
5435: something seriously wrong in the code above or the OP_xxx definitions. */
5436:
5437: default:
5438: DPRINTF(("Unknown opcode %d\n", *ecode));
5439: RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
5440: }
5441:
5442: /* Do not stick any code in here without much thought; it is assumed
5443: that "continue" in the code above comes out to here to repeat the main
5444: loop. */
5445:
5446: } /* End of main loop */
5447: /* Control never reaches here */
5448:
5449:
5450: /* When compiling to use the heap rather than the stack for recursive calls to
5451: match(), the RRETURN() macro jumps here. The number that is saved in
5452: frame->Xwhere indicates which label we actually want to return to. */
5453:
5454: #ifdef NO_RECURSE
5455: #define LBL(val) case val: goto L_RM##val;
5456: HEAP_RETURN:
5457: switch (frame->Xwhere)
5458: {
5459: LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
5460: LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
5461: LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
5462: LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
1.4 ! misha 5463: LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58)
1.1 misha 5464: #ifdef SUPPORT_UTF8
5465: LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
5466: LBL(32) LBL(34) LBL(42) LBL(46)
5467: #ifdef SUPPORT_UCP
5468: LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
1.4 ! misha 5469: LBL(59) LBL(60) LBL(61) LBL(62)
1.1 misha 5470: #endif /* SUPPORT_UCP */
5471: #endif /* SUPPORT_UTF8 */
5472: default:
5473: DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
5474: return PCRE_ERROR_INTERNAL;
5475: }
5476: #undef LBL
5477: #endif /* NO_RECURSE */
5478: }
5479:
5480:
5481: /***************************************************************************
5482: ****************************************************************************
5483: RECURSION IN THE match() FUNCTION
5484:
5485: Undefine all the macros that were defined above to handle this. */
5486:
/* Per the banner comment above, the names below were defined as macros earlier
in the file to support match() when NO_RECURSE is set (those definitions are
outside this excerpt). pcre_exec(), which follows, uses several of the same
names (for example length, ims, flags and pp) as ordinary identifiers, so they
must no longer be macros by the time it is compiled. */
5487: #ifdef NO_RECURSE
5488: #undef eptr
5489: #undef ecode
5490: #undef mstart
5491: #undef offset_top
5492: #undef ims
5493: #undef eptrb
5494: #undef flags
5495:
5496: #undef callpat
5497: #undef charptr
5498: #undef data
5499: #undef next
5500: #undef pp
5501: #undef prev
5502: #undef saved_eptr
5503:
5504: #undef new_recursive
5505:
5506: #undef cur_is_word
5507: #undef condition
5508: #undef prev_is_word
5509:
5510: #undef original_ims
5511:
5512: #undef ctype
5513: #undef length
5514: #undef max
5515: #undef min
5516: #undef number
5517: #undef offset
5518: #undef op
5519: #undef save_capture_last
5520: #undef save_offset1
5521: #undef save_offset2
5522: #undef save_offset3
5523: #undef stacksave
5524:
5525: #undef newptrb
5526:
5527: #endif
5528:
5529: /* These two are defined as macros in both cases */
/* Unlike the list above, fc and fi are undefined unconditionally, i.e. whether
or not NO_RECURSE is set. */
5530:
5531: #undef fc
5532: #undef fi
5533:
5534: /***************************************************************************
5535: ***************************************************************************/
5536:
5537:
5538:
5539: /*************************************************
5540: * Execute a Regular Expression *
5541: *************************************************/
5542:
5543: /* This function applies a compiled re to a subject string and picks out
5544: portions of the string if it matches. Two elements in the vector are set for
5545: each substring: the offsets to the start and end of the substring.
5546:
5547: Arguments:
5548: argument_re points to the compiled expression
5549: extra_data points to extra data or is NULL
5550: subject points to the subject string
5551: length length of subject string (may contain binary zeros)
5552: start_offset where to start in the subject string
5553: options option bits
5554: offsets points to a vector of ints to be filled in with offsets
5555: offsetcount the number of elements in the vector
5556:
5557: Returns: > 0 => success; value is the number of elements filled in
5558: = 0 => success, but offsets is not big enough
5559: -1 => failed to match
5560: < -1 => some kind of unexpected problem
5561: */
5562:
1.2 misha 5563: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
1.1 misha 5564: pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
5565: PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
5566: int offsetcount)
5567: {
5568: int rc, resetcount, ocount;
5569: int first_byte = -1;
5570: int req_byte = -1;
5571: int req_byte2 = -1;
5572: int newline;
5573: unsigned long int ims;
5574: BOOL using_temporary_offsets = FALSE;
5575: BOOL anchored;
5576: BOOL startline;
5577: BOOL firstline;
5578: BOOL first_byte_caseless = FALSE;
5579: BOOL req_byte_caseless = FALSE;
5580: BOOL utf8;
5581: match_data match_block;
5582: match_data *md = &match_block;
5583: const uschar *tables;
5584: const uschar *start_bits = NULL;
5585: USPTR start_match = (USPTR)subject + start_offset;
5586: USPTR end_subject;
1.4 ! misha 5587: USPTR start_partial = NULL;
1.1 misha 5588: USPTR req_byte_ptr = start_match - 1;
5589:
5590: pcre_study_data internal_study;
5591: const pcre_study_data *study;
5592:
5593: real_pcre internal_re;
5594: const real_pcre *external_re = (const real_pcre *)argument_re;
5595: const real_pcre *re = external_re;
5596:
5597: /* Plausibility checks */
5598:
5599: if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
5600: if (re == NULL || subject == NULL ||
5601: (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
5602: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
5603:
1.4 ! misha 5604: /* This information is for finding all the numbers associated with a given
! 5605: name, for condition testing. */
! 5606:
! 5607: md->name_table = (uschar *)re + re->name_table_offset;
! 5608: md->name_count = re->name_count;
! 5609: md->name_entry_size = re->name_entry_size;
! 5610:
1.1 misha 5611: /* Fish out the optional data from the extra_data structure, first setting
5612: the default values. */
5613:
5614: study = NULL;
5615: md->match_limit = MATCH_LIMIT;
5616: md->match_limit_recursion = MATCH_LIMIT_RECURSION;
5617: md->callout_data = NULL;
5618:
5619: /* The table pointer is always in native byte order. */
5620:
5621: tables = external_re->tables;
5622:
5623: if (extra_data != NULL)
5624: {
5625: register unsigned int flags = extra_data->flags;
5626: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
5627: study = (const pcre_study_data *)extra_data->study_data;
5628: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
5629: md->match_limit = extra_data->match_limit;
5630: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
5631: md->match_limit_recursion = extra_data->match_limit_recursion;
5632: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
5633: md->callout_data = extra_data->callout_data;
5634: if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
5635: }
5636:
5637: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
5638: is a feature that makes it possible to save compiled regex and re-use them
5639: in other programs later. */
5640:
5641: if (tables == NULL) tables = _pcre_default_tables;
5642:
5643: /* Check that the first field in the block is the magic number. If it is not,
5644: test for a regex that was compiled on a host of opposite endianness. If this is
5645: the case, flipped values are put in internal_re and internal_study if there was
5646: study data too. */
5647:
5648: if (re->magic_number != MAGIC_NUMBER)
5649: {
5650: re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
5651: if (re == NULL) return PCRE_ERROR_BADMAGIC;
5652: if (study != NULL) study = &internal_study;
5653: }
5654:
5655: /* Set up other data */
5656:
5657: anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
5658: startline = (re->flags & PCRE_STARTLINE) != 0;
5659: firstline = (re->options & PCRE_FIRSTLINE) != 0;
5660:
5661: /* The code starts after the real_pcre block and the capture name table. */
5662:
5663: md->start_code = (const uschar *)external_re + re->name_table_offset +
5664: re->name_count * re->name_entry_size;
5665:
5666: md->start_subject = (USPTR)subject;
5667: md->start_offset = start_offset;
5668: md->end_subject = md->start_subject + length;
5669: end_subject = md->end_subject;
5670:
5671: md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
5672: utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
1.4 ! misha 5673: md->use_ucp = (re->options & PCRE_UCP) != 0;
1.1 misha 5674: md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
5675:
5676: md->notbol = (options & PCRE_NOTBOL) != 0;
5677: md->noteol = (options & PCRE_NOTEOL) != 0;
5678: md->notempty = (options & PCRE_NOTEMPTY) != 0;
1.4 ! misha 5679: md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
! 5680: md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
! 5681: ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
1.1 misha 5682: md->hitend = FALSE;
1.4 ! misha 5683: md->mark = NULL; /* In case never set */
1.1 misha 5684:
5685: md->recursive = NULL; /* No recursion at top level */
5686:
5687: md->lcc = tables + lcc_offset;
5688: md->ctypes = tables + ctypes_offset;
5689:
5690: /* Handle different \R options. */
5691:
5692: switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
5693: {
5694: case 0:
5695: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
5696: md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
5697: else
5698: #ifdef BSR_ANYCRLF
5699: md->bsr_anycrlf = TRUE;
5700: #else
5701: md->bsr_anycrlf = FALSE;
5702: #endif
5703: break;
5704:
5705: case PCRE_BSR_ANYCRLF:
5706: md->bsr_anycrlf = TRUE;
5707: break;
5708:
5709: case PCRE_BSR_UNICODE:
5710: md->bsr_anycrlf = FALSE;
5711: break;
5712:
5713: default: return PCRE_ERROR_BADNEWLINE;
5714: }
5715:
5716: /* Handle different types of newline. The three bits give eight cases. If
5717: nothing is set at run time, whatever was used at compile time applies. */
5718:
5719: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
5720: (pcre_uint32)options) & PCRE_NEWLINE_BITS)
5721: {
5722: case 0: newline = NEWLINE; break; /* Compile-time default */
1.3 misha 5723: case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
5724: case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
1.1 misha 5725: case PCRE_NEWLINE_CR+
1.3 misha 5726: PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
1.1 misha 5727: case PCRE_NEWLINE_ANY: newline = -1; break;
5728: case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
5729: default: return PCRE_ERROR_BADNEWLINE;
5730: }
5731:
5732: if (newline == -2)
5733: {
5734: md->nltype = NLTYPE_ANYCRLF;
5735: }
5736: else if (newline < 0)
5737: {
5738: md->nltype = NLTYPE_ANY;
5739: }
5740: else
5741: {
5742: md->nltype = NLTYPE_FIXED;
5743: if (newline > 255)
5744: {
5745: md->nllen = 2;
5746: md->nl[0] = (newline >> 8) & 255;
5747: md->nl[1] = newline & 255;
5748: }
5749: else
5750: {
5751: md->nllen = 1;
5752: md->nl[0] = newline;
5753: }
5754: }
5755:
1.4 ! misha 5756: /* Partial matching was originally supported only for a restricted set of
! 5757: regexes; from release 8.00 there are no restrictions, but the bits are still
! 5758: defined (though never set). So there's no harm in leaving this code. */
1.1 misha 5759:
5760: if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
5761: return PCRE_ERROR_BADPARTIAL;
5762:
5763: /* Check a UTF-8 string if required. Unfortunately there's no way of passing
5764: back the character offset. */
5765:
5766: #ifdef SUPPORT_UTF8
5767: if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
5768: {
1.3 misha 5769: if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
1.1 misha 5770: return PCRE_ERROR_BADUTF8;
5771: if (start_offset > 0 && start_offset < length)
5772: {
1.3 misha 5773: int tb = ((USPTR)subject)[start_offset];
1.1 misha 5774: if (tb > 127)
5775: {
5776: tb &= 0xc0;
5777: if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
5778: }
5779: }
5780: }
5781: #endif
5782:
5783: /* The ims options can vary during the matching as a result of the presence
5784: of (?ims) items in the pattern. They are kept in a local variable so that
5785: restoring at the exit of a group is easy. */
5786:
5787: ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
5788:
5789: /* If the expression has got more back references than the offsets supplied can
5790: hold, we get a temporary chunk of working store to use during the matching.
5791: Otherwise, we can use the vector supplied, rounding down its size to a multiple
5792: of 3. */
5793:
5794: ocount = offsetcount - (offsetcount % 3);
5795:
5796: if (re->top_backref > 0 && re->top_backref >= ocount/3)
5797: {
5798: ocount = re->top_backref * 3 + 3;
5799: md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
5800: if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
5801: using_temporary_offsets = TRUE;
5802: DPRINTF(("Got memory to hold back references\n"));
5803: }
5804: else md->offset_vector = offsets;
5805:
5806: md->offset_end = ocount;
5807: md->offset_max = (2*ocount)/3;
5808: md->offset_overflow = FALSE;
5809: md->capture_last = -1;
5810:
5811: /* Compute the minimum number of offsets that we need to reset each time. Doing
5812: this makes a huge difference to execution time when there aren't many brackets
5813: in the pattern. */
5814:
5815: resetcount = 2 + re->top_bracket * 2;
5816: if (resetcount > offsetcount) resetcount = ocount;
5817:
5818: /* Reset the working variable associated with each extraction. These should
5819: never be used unless previously set, but they get saved and restored, and so we
5820: initialize them to avoid reading uninitialized locations. */
5821:
5822: if (md->offset_vector != NULL)
5823: {
5824: register int *iptr = md->offset_vector + ocount;
5825: register int *iend = iptr - resetcount/2 + 1;
5826: while (--iptr >= iend) *iptr = -1;
5827: }
5828:
5829: /* Set up the first character to match, if available. The first_byte value is
5830: never set for an anchored regular expression, but the anchoring may be forced
5831: at run time, so we have to test for anchoring. The first char may be unset for
5832: an unanchored pattern, of course. If there's no first char and the pattern was
5833: studied, there may be a bitmap of possible first characters. */
5834:
5835: if (!anchored)
5836: {
5837: if ((re->flags & PCRE_FIRSTSET) != 0)
5838: {
5839: first_byte = re->first_byte & 255;
5840: if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
5841: first_byte = md->lcc[first_byte];
5842: }
5843: else
5844: if (!startline && study != NULL &&
1.4 ! misha 5845: (study->flags & PCRE_STUDY_MAPPED) != 0)
1.1 misha 5846: start_bits = study->start_bits;
5847: }
5848:
5849: /* For anchored or unanchored matches, there may be a "last known required
5850: character" set. */
5851:
5852: if ((re->flags & PCRE_REQCHSET) != 0)
5853: {
5854: req_byte = re->req_byte & 255;
5855: req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
5856: req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
5857: }
5858:
5859:
5860: /* ==========================================================================*/
5861:
5862: /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5863: the loop runs just once. */
5864:
5865: for(;;)
5866: {
5867: USPTR save_end_subject = end_subject;
5868: USPTR new_start_match;
5869:
5870: /* Reset the maximum number of extractions we might see. */
5871:
5872: if (md->offset_vector != NULL)
5873: {
5874: register int *iptr = md->offset_vector;
5875: register int *iend = iptr + resetcount;
5876: while (iptr < iend) *iptr++ = -1;
5877: }
5878:
1.3 misha 5879: /* If firstline is TRUE, the start of the match is constrained to the first
5880: line of a multiline string. That is, the match must be before or at the first
5881: newline. Implement this by temporarily adjusting end_subject so that we stop
5882: scanning at a newline. If the match fails at the newline, later code breaks
5883: this loop. */
1.1 misha 5884:
5885: if (firstline)
5886: {
5887: USPTR t = start_match;
1.2 misha 5888: #ifdef SUPPORT_UTF8
5889: if (utf8)
5890: {
5891: while (t < md->end_subject && !IS_NEWLINE(t))
5892: {
5893: t++;
5894: while (t < end_subject && (*t & 0xc0) == 0x80) t++;
5895: }
5896: }
5897: else
5898: #endif
1.1 misha 5899: while (t < md->end_subject && !IS_NEWLINE(t)) t++;
5900: end_subject = t;
5901: }
5902:
1.3 misha 5903: /* There are some optimizations that avoid running the match if a known
5904: starting point is not found, or if a known later character is not present.
5905: However, there is an option that disables these, for testing and for ensuring
5906: that all callouts do actually occur. */
1.1 misha 5907:
1.3 misha 5908: if ((options & PCRE_NO_START_OPTIMIZE) == 0)
1.1 misha 5909: {
1.3 misha 5910: /* Advance to a unique first byte if there is one. */
5911:
5912: if (first_byte >= 0)
5913: {
5914: if (first_byte_caseless)
5915: while (start_match < end_subject && md->lcc[*start_match] != first_byte)
5916: start_match++;
5917: else
5918: while (start_match < end_subject && *start_match != first_byte)
5919: start_match++;
5920: }
1.1 misha 5921:
1.3 misha 5922: /* Or to just after a linebreak for a multiline match */
1.1 misha 5923:
1.3 misha 5924: else if (startline)
1.1 misha 5925: {
1.3 misha 5926: if (start_match > md->start_subject + start_offset)
5927: {
1.2 misha 5928: #ifdef SUPPORT_UTF8
1.3 misha 5929: if (utf8)
1.2 misha 5930: {
1.3 misha 5931: while (start_match < end_subject && !WAS_NEWLINE(start_match))
5932: {
1.2 misha 5933: start_match++;
1.3 misha 5934: while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
5935: start_match++;
5936: }
1.2 misha 5937: }
1.3 misha 5938: else
1.2 misha 5939: #endif
1.3 misha 5940: while (start_match < end_subject && !WAS_NEWLINE(start_match))
5941: start_match++;
1.1 misha 5942:
1.3 misha 5943: /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
5944: and we are now at a LF, advance the match position by one more character.
5945: */
5946:
5947: if (start_match[-1] == CHAR_CR &&
5948: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
5949: start_match < end_subject &&
5950: *start_match == CHAR_NL)
5951: start_match++;
5952: }
1.1 misha 5953: }
5954:
1.3 misha 5955: /* Or to a non-unique first byte after study */
1.1 misha 5956:
1.3 misha 5957: else if (start_bits != NULL)
1.1 misha 5958: {
1.3 misha 5959: while (start_match < end_subject)
5960: {
5961: register unsigned int c = *start_match;
1.4 ! misha 5962: if ((start_bits[c/8] & (1 << (c&7))) == 0)
! 5963: {
! 5964: start_match++;
! 5965: #ifdef SUPPORT_UTF8
! 5966: if (utf8)
! 5967: while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
! 5968: start_match++;
! 5969: #endif
! 5970: }
! 5971: else break;
1.3 misha 5972: }
1.1 misha 5973: }
1.3 misha 5974: } /* Starting optimizations */
1.1 misha 5975:
5976: /* Restore fudged end_subject */
5977:
5978: end_subject = save_end_subject;
5979:
1.4 ! misha 5980: /* The following two optimizations are disabled for partial matching or if
! 5981: disabling is explicitly requested. */
1.1 misha 5982:
1.4 ! misha 5983: if ((options & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
! 5984: {
! 5985: /* If the pattern was studied, a minimum subject length may be set. This is
! 5986: a lower bound; no actual string of that length may actually match the
! 5987: pattern. Although the value is, strictly, in characters, we treat it as
! 5988: bytes to avoid spending too much time in this optimization. */
1.1 misha 5989:
1.4 ! misha 5990: if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
! 5991: (pcre_uint32)(end_subject - start_match) < study->minlength)
! 5992: {
! 5993: rc = MATCH_NOMATCH;
! 5994: break;
! 5995: }
1.1 misha 5996:
1.4 ! misha 5997: /* If req_byte is set, we know that that character must appear in the
! 5998: subject for the match to succeed. If the first character is set, req_byte
! 5999: must be later in the subject; otherwise the test starts at the match point.
! 6000: This optimization can save a huge amount of backtracking in patterns with
! 6001: nested unlimited repeats that aren't going to match. Writing separate code
! 6002: for cased/caseless versions makes it go faster, as does using an
! 6003: autoincrement and backing off on a match.
1.1 misha 6004:
1.4 ! misha 6005: HOWEVER: when the subject string is very, very long, searching to its end
! 6006: can take a long time, and give bad performance on quite ordinary patterns.
! 6007: This showed up when somebody was matching something like /^\d+C/ on a
! 6008: 32-megabyte string... so we don't do this when the string is sufficiently
! 6009: long. */
1.1 misha 6010:
1.4 ! misha 6011: if (req_byte >= 0 && end_subject - start_match < REQ_BYTE_MAX)
1.1 misha 6012: {
1.4 ! misha 6013: register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
! 6014:
! 6015: /* We don't need to repeat the search if we haven't yet reached the
! 6016: place we found it at last time. */
! 6017:
! 6018: if (p > req_byte_ptr)
1.1 misha 6019: {
1.4 ! misha 6020: if (req_byte_caseless)
1.1 misha 6021: {
1.4 ! misha 6022: while (p < end_subject)
! 6023: {
! 6024: register int pp = *p++;
! 6025: if (pp == req_byte || pp == req_byte2) { p--; break; }
! 6026: }
1.1 misha 6027: }
1.4 ! misha 6028: else
1.1 misha 6029: {
1.4 ! misha 6030: while (p < end_subject)
! 6031: {
! 6032: if (*p++ == req_byte) { p--; break; }
! 6033: }
1.1 misha 6034: }
6035:
1.4 ! misha 6036: /* If we can't find the required character, break the matching loop,
! 6037: forcing a match failure. */
1.1 misha 6038:
1.4 ! misha 6039: if (p >= end_subject)
! 6040: {
! 6041: rc = MATCH_NOMATCH;
! 6042: break;
! 6043: }
1.1 misha 6044:
1.4 ! misha 6045: /* If we have found the required character, save the point where we
! 6046: found it, so that we don't search again next time round the loop if
! 6047: the start hasn't passed this character yet. */
1.1 misha 6048:
1.4 ! misha 6049: req_byte_ptr = p;
! 6050: }
1.1 misha 6051: }
6052: }
6053:
1.4 ! misha 6054: #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
! 6055: printf(">>>> Match against: ");
! 6056: pchars(start_match, end_subject - start_match, TRUE, md);
! 6057: printf("\n");
! 6058: #endif
! 6059:
! 6060: /* OK, we can now run the match. If "hitend" is set afterwards, remember the
! 6061: first starting point for which a partial match was found. */
1.1 misha 6062:
6063: md->start_match_ptr = start_match;
1.4 ! misha 6064: md->start_used_ptr = start_match;
1.1 misha 6065: md->match_call_count = 0;
1.4 ! misha 6066: rc = match(start_match, md->start_code, start_match, NULL, 2, md, ims, NULL,
! 6067: 0, 0);
! 6068: if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
1.1 misha 6069:
6070: switch(rc)
6071: {
1.4 ! misha 6072: /* SKIP passes back the next starting point explicitly, but if it is the
! 6073: same as the match we have just done, treat it as NOMATCH. */
! 6074:
! 6075: case MATCH_SKIP:
! 6076: if (md->start_match_ptr != start_match)
! 6077: {
! 6078: new_start_match = md->start_match_ptr;
! 6079: break;
! 6080: }
! 6081: /* Fall through */
! 6082:
! 6083: /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
! 6084: the SKIP's arg was not found. We also treat this as NOMATCH. */
! 6085:
! 6086: case MATCH_SKIP_ARG:
! 6087: /* Fall through */
! 6088:
1.1 misha 6089: /* NOMATCH and PRUNE advance by one character. THEN at this level acts
6090: exactly like PRUNE. */
6091:
6092: case MATCH_NOMATCH:
6093: case MATCH_PRUNE:
6094: case MATCH_THEN:
6095: new_start_match = start_match + 1;
6096: #ifdef SUPPORT_UTF8
6097: if (utf8)
6098: while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
6099: new_start_match++;
6100: #endif
6101: break;
6102:
6103: /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6104:
6105: case MATCH_COMMIT:
6106: rc = MATCH_NOMATCH;
6107: goto ENDLOOP;
6108:
1.4 ! misha 6109: /* Any other return is either a match, or some kind of error. */
1.1 misha 6110:
6111: default:
6112: goto ENDLOOP;
6113: }
6114:
6115: /* Control reaches here for the various types of "no match at this point"
6116: result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6117:
6118: rc = MATCH_NOMATCH;
6119:
6120: /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6121: newline in the subject (though it may continue over the newline). Therefore,
6122: if we have just failed to match, starting at a newline, do not continue. */
6123:
6124: if (firstline && IS_NEWLINE(start_match)) break;
6125:
6126: /* Advance to new matching position */
6127:
6128: start_match = new_start_match;
6129:
6130: /* Break the loop if the pattern is anchored or if we have passed the end of
6131: the subject. */
6132:
6133: if (anchored || start_match > end_subject) break;
6134:
6135: /* If we have just passed a CR and we are now at a LF, and the pattern does
6136: not contain any explicit matches for \r or \n, and the newline option is CRLF
6137: or ANY or ANYCRLF, advance the match position by one more character. */
6138:
1.3 misha 6139: if (start_match[-1] == CHAR_CR &&
1.1 misha 6140: start_match < end_subject &&
1.3 misha 6141: *start_match == CHAR_NL &&
1.1 misha 6142: (re->flags & PCRE_HASCRORLF) == 0 &&
6143: (md->nltype == NLTYPE_ANY ||
6144: md->nltype == NLTYPE_ANYCRLF ||
6145: md->nllen == 2))
6146: start_match++;
6147:
1.4 ! misha 6148: md->mark = NULL; /* Reset for start of next match attempt */
! 6149: } /* End of for(;;) "bumpalong" loop */
1.1 misha 6150:
6151: /* ==========================================================================*/
6152:
6153: /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6154: conditions is true:
6155:
6156: (1) The pattern is anchored or the match was failed by (*COMMIT);
6157:
6158: (2) We are past the end of the subject;
6159:
6160: (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6161: this option requests that a match occur at or before the first newline in
6162: the subject.
6163:
6164: When we have a match and the offset vector is big enough to deal with any
6165: backreferences, captured substring offsets will already be set up. In the case
6166: where we had to get some local store to hold offsets for backreference
6167: processing, copy those that we can. In this case there need not be overflow if
6168: certain parts of the pattern were not used, even though there are more
6169: capturing parentheses than vector slots. */
6170:
6171: ENDLOOP:
6172:
1.4 ! misha 6173: if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
1.1 misha 6174: {
6175: if (using_temporary_offsets)
6176: {
6177: if (offsetcount >= 4)
6178: {
6179: memcpy(offsets + 2, md->offset_vector + 2,
6180: (offsetcount - 2) * sizeof(int));
6181: DPRINTF(("Copied offsets from temporary memory\n"));
6182: }
6183: if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
6184: DPRINTF(("Freeing temporary memory\n"));
6185: (pcre_free)(md->offset_vector);
6186: }
6187:
6188: /* Set the return code to the number of captured strings, or 0 if there are
6189: too many to fit into the vector. */
6190:
6191: rc = md->offset_overflow? 0 : md->end_offset_top/2;
6192:
6193: /* If there is space, set up the whole thing as substring 0. The value of
6194: md->start_match_ptr might be modified if \K was encountered on the success
6195: matching path. */
6196:
6197: if (offsetcount < 2) rc = 0; else
6198: {
1.4 ! misha 6199: offsets[0] = (int)(md->start_match_ptr - md->start_subject);
! 6200: offsets[1] = (int)(md->end_match_ptr - md->start_subject);
1.1 misha 6201: }
6202:
6203: DPRINTF((">>>> returning %d\n", rc));
1.4 ! misha 6204: goto RETURN_MARK;
1.1 misha 6205: }
6206:
6207: /* Control gets here if there has been an error, or if the overall match
6208: attempt has failed at all permitted starting positions. */
6209:
6210: if (using_temporary_offsets)
6211: {
6212: DPRINTF(("Freeing temporary memory\n"));
6213: (pcre_free)(md->offset_vector);
6214: }
6215:
1.4 ! misha 6216: /* For anything other than nomatch or partial match, just return the code. */
! 6217:
! 6218: if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
1.1 misha 6219: {
6220: DPRINTF((">>>> error: returning %d\n", rc));
6221: return rc;
6222: }
1.4 ! misha 6223:
! 6224: /* Handle partial matches - disable any mark data */
! 6225:
! 6226: if (start_partial != NULL)
1.1 misha 6227: {
6228: DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
1.4 ! misha 6229: md->mark = NULL;
! 6230: if (offsetcount > 1)
! 6231: {
! 6232: offsets[0] = (int)(start_partial - (USPTR)subject);
! 6233: offsets[1] = (int)(end_subject - (USPTR)subject);
! 6234: }
! 6235: rc = PCRE_ERROR_PARTIAL;
1.1 misha 6236: }
1.4 ! misha 6237:
! 6238: /* This is the classic nomatch case */
! 6239:
1.1 misha 6240: else
6241: {
6242: DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
1.4 ! misha 6243: rc = PCRE_ERROR_NOMATCH;
1.1 misha 6244: }
1.4 ! misha 6245:
! 6246: /* Return the MARK data if it has been requested. */
! 6247:
! 6248: RETURN_MARK:
! 6249:
! 6250: if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
! 6251: *(extra_data->mark) = (unsigned char *)(md->mark);
! 6252: return rc;
1.1 misha 6253: }
6254:
6255: /* End of pcre_exec.c */
E-mail: