Annotation of win32/pcre/pcre_exec.c, revision 1.6
1.1 misha 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
1.6 ! misha 9: Copyright (c) 1997-2012 University of Cambridge
1.1 misha 10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40:
41: /* This module contains pcre_exec(), the externally visible function that does
42: pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43: possible. There are also some static supporting functions. */
44:
45: #ifdef HAVE_CONFIG_H
46: #include "config.h"
47: #endif
48:
49: #define NLBLOCK md /* Block containing newline information */
50: #define PSSTART start_subject /* Field containing processed string start */
51: #define PSEND end_subject /* Field containing processed string end */
52:
53: #include "pcre_internal.h"
54:
55: /* Undefine some potentially clashing cpp symbols */
56:
57: #undef min
58: #undef max
59:
1.6 ! misha 60: /* Values for setting in md->match_function_type to indicate two special types
! 61: of call to match(). We do it this way to save on using another stack variable,
! 62: as stack usage is to be discouraged. */
1.1 misha 63:
1.6 ! misha 64: #define MATCH_CONDASSERT 1 /* Called to check a condition assertion */
! 65: #define MATCH_CBEGROUP 2 /* Could-be-empty unlimited repeat group */
1.1 misha 66:
67: /* Non-error returns from the match() function. Error returns are externally
68: defined PCRE_ERROR_xxx codes, which are all negative. */
69:
70: #define MATCH_MATCH 1
71: #define MATCH_NOMATCH 0
72:
73: /* Special internal returns from the match() function. Make them sufficiently
74: negative to avoid the external error codes. */
75:
1.4 misha 76: #define MATCH_ACCEPT (-999)
77: #define MATCH_COMMIT (-998)
1.6 ! misha 78: #define MATCH_KETRPOS (-997)
! 79: #define MATCH_ONCE (-996)
! 80: #define MATCH_PRUNE (-995)
! 81: #define MATCH_SKIP (-994)
! 82: #define MATCH_SKIP_ARG (-993)
! 83: #define MATCH_THEN (-992)
1.1 misha 84:
85: /* Maximum number of ints of offset to save on the stack for recursive calls.
86: If the offset vector is bigger, malloc is used. This should be a multiple of 3,
87: because the offset vector is always a multiple of 3 long. */
88:
89: #define REC_STACK_SAVE_MAX 30
90:
91: /* Min and max values for the common repeats; for the maxima, 0 => infinity */
92:
93: static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
94: static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
95:
96:
97:
1.4 misha 98: #ifdef PCRE_DEBUG
1.1 misha 99: /*************************************************
100: * Debugging function to print chars *
101: *************************************************/
102:
103: /* Print a sequence of chars in printable format, stopping at the end of the
104: subject if the requested.
105:
106: Arguments:
107: p points to characters
108: length number to print
109: is_subject TRUE if printing from within md->start_subject
110: md pointer to matching data block, if is_subject is TRUE
111:
112: Returns: nothing
113: */
114:
115: static void
1.6 ! misha 116: pchars(const pcre_uchar *p, int length, BOOL is_subject, match_data *md)
1.1 misha 117: {
118: unsigned int c;
119: if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
120: while (length-- > 0)
121: if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
122: }
123: #endif
124:
125:
126:
127: /*************************************************
128: * Match a back-reference *
129: *************************************************/
130:
1.6 ! misha 131: /* Normally, if a back reference hasn't been set, the length that is passed is
! 132: negative, so the match always fails. However, in JavaScript compatibility mode,
! 133: the length passed is zero. Note that in caseless UTF-8 mode, the number of
! 134: subject bytes matched may be different to the number of reference bytes.
1.1 misha 135:
136: Arguments:
137: offset index into the offset vector
1.6 ! misha 138: eptr pointer into the subject
! 139: length length of reference to be matched (number of bytes)
1.1 misha 140: md points to match data block
1.6 ! misha 141: caseless TRUE if caseless
1.1 misha 142:
1.6 ! misha 143: Returns: < 0 if not matched, otherwise the number of subject bytes matched
1.1 misha 144: */
145:
1.6 ! misha 146: static int
! 147: match_ref(int offset, register PCRE_PUCHAR eptr, int length, match_data *md,
! 148: BOOL caseless)
1.1 misha 149: {
1.6 ! misha 150: PCRE_PUCHAR eptr_start = eptr;
! 151: register PCRE_PUCHAR p = md->start_subject + md->offset_vector[offset];
1.1 misha 152:
1.4 misha 153: #ifdef PCRE_DEBUG
1.1 misha 154: if (eptr >= md->end_subject)
155: printf("matching subject <null>");
156: else
157: {
158: printf("matching subject ");
159: pchars(eptr, length, TRUE, md);
160: }
161: printf(" against backref ");
162: pchars(p, length, FALSE, md);
163: printf("\n");
164: #endif
165:
1.6 ! misha 166: /* Always fail if reference not set (and not JavaScript compatible). */
1.1 misha 167:
1.6 ! misha 168: if (length < 0) return -1;
1.1 misha 169:
1.2 misha 170: /* Separate the caseless case for speed. In UTF-8 mode we can only do this
171: properly if Unicode properties are supported. Otherwise, we can check only
172: ASCII characters. */
1.1 misha 173:
1.6 ! misha 174: if (caseless)
1.1 misha 175: {
1.6 ! misha 176: #ifdef SUPPORT_UTF
1.2 misha 177: #ifdef SUPPORT_UCP
1.6 ! misha 178: if (md->utf)
1.2 misha 179: {
1.6 ! misha 180: /* Match characters up to the end of the reference. NOTE: the number of
! 181: bytes matched may differ, because there are some characters whose upper and
! 182: lower case versions code as different numbers of bytes. For example, U+023A
! 183: (2 bytes in UTF-8) is the upper case version of U+2C65 (3 bytes in UTF-8);
! 184: a sequence of 3 of the former uses 6 bytes, as does a sequence of two of
! 185: the latter. It is important, therefore, to check the length along the
! 186: reference, not along the subject (earlier code did this wrong). */
! 187:
! 188: PCRE_PUCHAR endptr = p + length;
! 189: while (p < endptr)
1.2 misha 190: {
191: int c, d;
1.6 ! misha 192: if (eptr >= md->end_subject) return -1;
1.2 misha 193: GETCHARINC(c, eptr);
194: GETCHARINC(d, p);
1.6 ! misha 195: if (c != d && c != UCD_OTHERCASE(d)) return -1;
1.2 misha 196: }
197: }
198: else
199: #endif
200: #endif
201:
202: /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
203: is no UCP support. */
1.6 ! misha 204: {
! 205: if (eptr + length > md->end_subject) return -1;
! 206: while (length-- > 0)
! 207: {
! 208: if (TABLE_GET(*p, md->lcc, *p) != TABLE_GET(*eptr, md->lcc, *eptr)) return -1;
! 209: p++;
! 210: eptr++;
! 211: }
! 212: }
1.1 misha 213: }
1.2 misha 214:
215: /* In the caseful case, we can just compare the bytes, whether or not we
216: are in UTF-8 mode. */
217:
1.1 misha 218: else
1.6 ! misha 219: {
! 220: if (eptr + length > md->end_subject) return -1;
! 221: while (length-- > 0) if (*p++ != *eptr++) return -1;
! 222: }
1.1 misha 223:
1.6 ! misha 224: return (int)(eptr - eptr_start);
1.1 misha 225: }
226:
227:
228:
229: /***************************************************************************
230: ****************************************************************************
231: RECURSION IN THE match() FUNCTION
232:
233: The match() function is highly recursive, though not every recursive call
234: increases the recursive depth. Nevertheless, some regular expressions can cause
235: it to recurse to a great depth. I was writing for Unix, so I just let it call
236: itself recursively. This uses the stack for saving everything that has to be
237: saved for a recursive call. On Unix, the stack can be large, and this works
238: fine.
239:
240: It turns out that on some non-Unix-like systems there are problems with
241: programs that use a lot of stack. (This despite the fact that every last chip
242: has oodles of memory these days, and techniques for extending the stack have
243: been known for decades.) So....
244:
245: There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
246: calls by keeping local variables that need to be preserved in blocks of memory
247: obtained from malloc() instead of on the stack. Macros are used to
248: achieve this so that the actual code doesn't look very different to what it
249: always used to.
250:
251: The original heap-recursive code used longjmp(). However, it seems that this
252: can be very slow on some operating systems. Following a suggestion from Stan
253: Switzer, the use of longjmp() has been abolished, at the cost of having to
254: provide a unique number for each call to RMATCH. There is no way of generating
255: a sequence of numbers at compile time in C. I have given them names, to make
256: them stand out more clearly.
257:
258: Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
259: FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
260: tests. Furthermore, not using longjmp() means that local dynamic variables
261: don't have indeterminate values; this has meant that the frame size can be
262: reduced because the result can be "passed back" by straight setting of the
263: variable instead of being passed in the frame.
264: ****************************************************************************
265: ***************************************************************************/
266:
267: /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
268: below must be updated in sync. */
269:
270: enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
271: RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20,
272: RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30,
273: RM31, RM32, RM33, RM34, RM35, RM36, RM37, RM38, RM39, RM40,
274: RM41, RM42, RM43, RM44, RM45, RM46, RM47, RM48, RM49, RM50,
1.4 misha 275: RM51, RM52, RM53, RM54, RM55, RM56, RM57, RM58, RM59, RM60,
1.6 ! misha 276: RM61, RM62, RM63, RM64, RM65, RM66 };
1.1 misha 277:
278: /* These versions of the macros use the stack, as normal. There are debugging
279: versions and production versions. Note that the "rw" argument of RMATCH isn't
1.4 misha 280: actually used in this definition. */
1.1 misha 281:
282: #ifndef NO_RECURSE
283: #define REGISTER register
284:
1.4 misha 285: #ifdef PCRE_DEBUG
1.6 ! misha 286: #define RMATCH(ra,rb,rc,rd,re,rw) \
1.1 misha 287: { \
288: printf("match() called in line %d\n", __LINE__); \
1.6 ! misha 289: rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1); \
1.1 misha 290: printf("to line %d\n", __LINE__); \
291: }
292: #define RRETURN(ra) \
293: { \
294: printf("match() returned %d from line %d ", ra, __LINE__); \
295: return ra; \
296: }
297: #else
1.6 ! misha 298: #define RMATCH(ra,rb,rc,rd,re,rw) \
! 299: rrc = match(ra,rb,mstart,rc,rd,re,rdepth+1)
1.1 misha 300: #define RRETURN(ra) return ra
301: #endif
302:
303: #else
304:
305:
306: /* These versions of the macros manage a private stack on the heap. Note that
307: the "rd" argument of RMATCH isn't actually used in this definition. It's the md
308: argument of match(), which never changes. */
309:
310: #define REGISTER
311:
1.6 ! misha 312: #define RMATCH(ra,rb,rc,rd,re,rw)\
1.1 misha 313: {\
1.6 ! misha 314: heapframe *newframe = (heapframe *)(PUBL(stack_malloc))(sizeof(heapframe));\
1.4 misha 315: if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
1.1 misha 316: frame->Xwhere = rw; \
317: newframe->Xeptr = ra;\
318: newframe->Xecode = rb;\
319: newframe->Xmstart = mstart;\
320: newframe->Xoffset_top = rc;\
1.6 ! misha 321: newframe->Xeptrb = re;\
1.1 misha 322: newframe->Xrdepth = frame->Xrdepth + 1;\
323: newframe->Xprevframe = frame;\
324: frame = newframe;\
325: DPRINTF(("restarting from line %d\n", __LINE__));\
326: goto HEAP_RECURSE;\
327: L_##rw:\
328: DPRINTF(("jumped back to line %d\n", __LINE__));\
329: }
330:
331: #define RRETURN(ra)\
332: {\
1.4 misha 333: heapframe *oldframe = frame;\
334: frame = oldframe->Xprevframe;\
1.6 ! misha 335: if (oldframe != &frame_zero) (PUBL(stack_free))(oldframe);\
1.1 misha 336: if (frame != NULL)\
337: {\
338: rrc = ra;\
339: goto HEAP_RETURN;\
340: }\
341: return ra;\
342: }
343:
344:
345: /* Structure for remembering the local variables in a private frame. One of these holds the state of a single match() "incarnation" when NO_RECURSE is defined: RMATCH obtains a new one via stack_malloc and fills in the argument fields, and RRETURN releases it again (except for the top-level frame, which match() declares as a local). */
346:
347: typedef struct heapframe {
348: struct heapframe *Xprevframe; /* Frame that was current when RMATCH created this one; NULL marks the top level */
349:
350: /* Function arguments that may change */
351:
1.6 ! misha 352: PCRE_PUCHAR Xeptr;
! 353: const pcre_uchar *Xecode;
! 354: PCRE_PUCHAR Xmstart;
1.1 misha 355: int Xoffset_top;
356: eptrblock *Xeptrb;
357: unsigned int Xrdepth; /* RMATCH sets this to the previous frame's depth plus one */
358:
359: /* Function local variables */
360:
1.6 ! misha 361: PCRE_PUCHAR Xcallpat;
! 362: #ifdef SUPPORT_UTF
! 363: PCRE_PUCHAR Xcharptr;
! 364: #endif
! 365: PCRE_PUCHAR Xdata;
! 366: PCRE_PUCHAR Xnext;
! 367: PCRE_PUCHAR Xpp;
! 368: PCRE_PUCHAR Xprev;
! 369: PCRE_PUCHAR Xsaved_eptr;
1.1 misha 370:
371: recursion_info Xnew_recursive;
372:
373: BOOL Xcur_is_word;
374: BOOL Xcondition;
375: BOOL Xprev_is_word;
376:
377: #ifdef SUPPORT_UCP
378: int Xprop_type;
379: int Xprop_value;
380: int Xprop_fail_result;
381: int Xoclength;
1.6 ! misha 382: pcre_uchar Xocchars[6];
1.1 misha 383: #endif
384:
1.3 misha 385: int Xcodelink;
1.1 misha 386: int Xctype;
387: unsigned int Xfc; /* Separate field here; when NO_RECURSE is not defined, fc is simply another name for c */
388: int Xfi; /* Separate field here; when NO_RECURSE is not defined, fi is simply another name for i */
389: int Xlength;
390: int Xmax;
391: int Xmin;
392: int Xnumber;
393: int Xoffset;
394: int Xop;
395: int Xsave_capture_last;
396: int Xsave_offset1, Xsave_offset2, Xsave_offset3;
397: int Xstacksave[REC_STACK_SAVE_MAX];
398:
399: eptrblock Xnewptrb;
400:
401: /* Where to jump back to: RMATCH stores its "rw" argument here in the calling frame before switching to the new frame */
402:
403: int Xwhere;
404:
405: } heapframe;
406:
407: #endif
408:
409:
410: /***************************************************************************
411: ***************************************************************************/
412:
413:
414:
415: /*************************************************
416: * Match from current position *
417: *************************************************/
418:
419: /* This function is called recursively in many circumstances. Whenever it
420: returns a negative (error) response, the outer incarnation must also return the
1.4 misha 421: same response. */
422:
423: /* These macros pack up tests that are used for partial matching, and which
1.6 ! misha 424: appear several times in the code. We set the "hit end" flag if the pointer is
1.4 misha 425: at the end of the subject and also past the start of the subject (i.e.
426: something has been matched). For hard partial matching, we then return
427: immediately. The second one is used when we already know we are past the end of
428: the subject. */
429:
430: #define CHECK_PARTIAL()\
1.5 misha 431: if (md->partial != 0 && eptr >= md->end_subject && \
432: eptr > md->start_used_ptr) \
433: { \
434: md->hitend = TRUE; \
1.6 ! misha 435: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
1.4 misha 436: }
1.1 misha 437:
1.4 misha 438: #define SCHECK_PARTIAL()\
1.5 misha 439: if (md->partial != 0 && eptr > md->start_used_ptr) \
440: { \
441: md->hitend = TRUE; \
1.6 ! misha 442: if (md->partial > 1) RRETURN(PCRE_ERROR_PARTIAL); \
1.4 misha 443: }
444:
445:
446: /* Performance note: It might be tempting to extract commonly used fields from
1.6 ! misha 447: the md structure (e.g. utf, end_subject) into individual variables to improve
1.1 misha 448: performance. Tests using gcc on a SPARC disproved this; in the first case, it
449: made performance worse.
450:
451: Arguments:
452: eptr pointer to current character in subject
453: ecode pointer to current position in compiled code
454: mstart pointer to the current match start position (can be modified
455: by encountering \K)
456: offset_top current top pointer
457: md pointer to "static" info for the match
458: eptrb pointer to chain of blocks containing eptr at start of
459: brackets - for testing for empty matches
460: rdepth the recursion depth
461:
462: Returns: MATCH_MATCH if matched ) these values are >= 0
463: MATCH_NOMATCH if failed to match )
1.4 misha 464: a negative MATCH_xxx value for PRUNE, SKIP, etc
1.1 misha 465: a negative PCRE_ERROR_xxx value if aborted by an error condition
466: (e.g. stopped by repeated call or recursion limit)
467: */
468:
469: static int
1.6 ! misha 470: match(REGISTER PCRE_PUCHAR eptr, REGISTER const pcre_uchar *ecode,
! 471: PCRE_PUCHAR mstart, int offset_top, match_data *md, eptrblock *eptrb,
! 472: unsigned int rdepth)
1.1 misha 473: {
474: /* These variables do not need to be preserved over recursion in this function,
475: so they can be ordinary variables in all cases. Mark some of them with
476: "register" because they are used a lot in loops. */
477:
478: register int rrc; /* Returns from recursive calls */
479: register int i; /* Used for loops not involving calls to RMATCH() */
480: register unsigned int c; /* Character values not kept over RMATCH() calls */
1.6 ! misha 481: register BOOL utf; /* Local copy of UTF flag for speed */
1.1 misha 482:
483: BOOL minimize, possessive; /* Quantifier options */
1.6 ! misha 484: BOOL caseless;
1.3 misha 485: int condcode;
1.1 misha 486:
487: /* When recursion is not being used, all "local" variables that have to be
1.6 ! misha 488: preserved over calls to RMATCH() are part of a "frame". We set up the top-level
! 489: frame on the stack here; subsequent instantiations are obtained from the heap
! 490: whenever RMATCH() does a "recursion". See the macro definitions above. Putting
! 491: the top-level on the stack rather than malloc-ing them all gives a performance
! 492: boost in many cases where there is not much "recursion". */
1.1 misha 493:
494: #ifdef NO_RECURSE
1.6 ! misha 495: heapframe frame_zero;
! 496: heapframe *frame = &frame_zero;
1.1 misha 497: frame->Xprevframe = NULL; /* Marks the top level */
498:
499: /* Copy in the original argument variables */
500:
501: frame->Xeptr = eptr;
502: frame->Xecode = ecode;
503: frame->Xmstart = mstart;
504: frame->Xoffset_top = offset_top;
505: frame->Xeptrb = eptrb;
506: frame->Xrdepth = rdepth;
507:
508: /* This is where control jumps back to to effect "recursion" */
509:
510: HEAP_RECURSE:
511:
512: /* Macros make the argument variables come from the current frame */
513:
514: #define eptr frame->Xeptr
515: #define ecode frame->Xecode
516: #define mstart frame->Xmstart
517: #define offset_top frame->Xoffset_top
518: #define eptrb frame->Xeptrb
519: #define rdepth frame->Xrdepth
520:
521: /* Ditto for the local variables */
522:
1.6 ! misha 523: #ifdef SUPPORT_UTF
1.1 misha 524: #define charptr frame->Xcharptr
525: #endif
526: #define callpat frame->Xcallpat
1.3 misha 527: #define codelink frame->Xcodelink
1.1 misha 528: #define data frame->Xdata
529: #define next frame->Xnext
530: #define pp frame->Xpp
531: #define prev frame->Xprev
532: #define saved_eptr frame->Xsaved_eptr
533:
534: #define new_recursive frame->Xnew_recursive
535:
536: #define cur_is_word frame->Xcur_is_word
537: #define condition frame->Xcondition
538: #define prev_is_word frame->Xprev_is_word
539:
540: #ifdef SUPPORT_UCP
541: #define prop_type frame->Xprop_type
542: #define prop_value frame->Xprop_value
543: #define prop_fail_result frame->Xprop_fail_result
544: #define oclength frame->Xoclength
545: #define occhars frame->Xocchars
546: #endif
547:
548: #define ctype frame->Xctype
549: #define fc frame->Xfc
550: #define fi frame->Xfi
551: #define length frame->Xlength
552: #define max frame->Xmax
553: #define min frame->Xmin
554: #define number frame->Xnumber
555: #define offset frame->Xoffset
556: #define op frame->Xop
557: #define save_capture_last frame->Xsave_capture_last
558: #define save_offset1 frame->Xsave_offset1
559: #define save_offset2 frame->Xsave_offset2
560: #define save_offset3 frame->Xsave_offset3
561: #define stacksave frame->Xstacksave
562:
563: #define newptrb frame->Xnewptrb
564:
565: /* When recursion is being used, local variables are allocated on the stack and
566: get preserved during recursion in the normal way. In this environment, fi and
567: i, and fc and c, can be the same variables. */
568:
569: #else /* NO_RECURSE not defined */
570: #define fi i
571: #define fc c
572:
1.6 ! misha 573: /* Many of the following variables are used only in small blocks of the code.
! 574: My normal style of coding would have declared them within each of those blocks.
! 575: However, in order to accommodate the version of this code that uses an external
! 576: "stack" implemented on the heap, it is easier to declare them all here, so the
! 577: declarations can be cut out in a block. The only declarations within blocks
! 578: below are for variables that do not have to be preserved over a recursive call
! 579: to RMATCH(). */
! 580:
! 581: #ifdef SUPPORT_UTF
! 582: const pcre_uchar *charptr;
! 583: #endif
! 584: const pcre_uchar *callpat;
! 585: const pcre_uchar *data;
! 586: const pcre_uchar *next;
! 587: PCRE_PUCHAR pp;
! 588: const pcre_uchar *prev;
! 589: PCRE_PUCHAR saved_eptr;
! 590:
! 591: recursion_info new_recursive;
1.1 misha 592:
1.6 ! misha 593: BOOL cur_is_word;
1.1 misha 594: BOOL condition;
595: BOOL prev_is_word;
596:
597: #ifdef SUPPORT_UCP
598: int prop_type;
599: int prop_value;
600: int prop_fail_result;
601: int oclength;
1.6 ! misha 602: pcre_uchar occhars[6];
1.1 misha 603: #endif
604:
1.3 misha 605: int codelink;
1.1 misha 606: int ctype;
607: int length;
608: int max;
609: int min;
610: int number;
611: int offset;
612: int op;
613: int save_capture_last;
614: int save_offset1, save_offset2, save_offset3;
615: int stacksave[REC_STACK_SAVE_MAX];
616:
617: eptrblock newptrb;
1.6 ! misha 618:
! 619: /* There is a special fudge for calling match() in a way that causes it to
! 620: measure the size of its basic stack frame when the stack is being used for
! 621: recursion. The second argument (ecode) being NULL triggers this behaviour. It
! 622: cannot normally ever be NULL. The return is the negated value of the frame
! 623: size. */
! 624:
! 625: if (ecode == NULL)
! 626: {
! 627: if (rdepth == 0)
! 628: return match((PCRE_PUCHAR)&rdepth, NULL, NULL, 0, NULL, NULL, 1);
! 629: else
! 630: {
! 631: int len = (char *)&rdepth - (char *)eptr;
! 632: return (len > 0)? -len : len;
! 633: }
! 634: }
1.1 misha 635: #endif /* NO_RECURSE */
636:
1.6 ! misha 637: /* To save space on the stack and in the heap frame, I have doubled up on some
! 638: of the local variables that are used only in localised parts of the code, but
! 639: still need to be preserved over recursive calls of match(). These macros define
! 640: the alternative names that are used. */
! 641:
! 642: #define allow_zero cur_is_word
! 643: #define cbegroup condition
! 644: #define code_offset codelink
! 645: #define condassert condition
! 646: #define matched_once prev_is_word
! 647: #define foc number
! 648: #define save_mark data
! 649:
1.1 misha 650: /* These statements are here to stop the compiler complaining about uninitialized
651: variables. */
652:
653: #ifdef SUPPORT_UCP
654: prop_value = 0;
655: prop_fail_result = 0;
656: #endif
657:
658:
659: /* This label is used for tail recursion, which is used in a few cases even
660: when NO_RECURSE is not defined, in order to reduce the amount of stack that is
661: used. Thanks to Ian Taylor for noticing this possibility and sending the
662: original patch. */
663:
664: TAIL_RECURSE:
665:
666: /* OK, now we can get on with the real code of the function. Recursive calls
667: are specified by the macro RMATCH and RRETURN is used to return. When
668: NO_RECURSE is *not* defined, these just turn into a recursive call to match()
1.4 misha 669: and a "return", respectively (possibly with some debugging if PCRE_DEBUG is
1.1 misha 670: defined). However, RMATCH isn't like a function call because it's quite a
671: complicated macro. It has to be used in one particular way. This shouldn't,
672: however, impact performance when true recursion is being used. */
673:
1.6 ! misha 674: #ifdef SUPPORT_UTF
! 675: utf = md->utf; /* Local copy of the flag */
1.1 misha 676: #else
1.6 ! misha 677: utf = FALSE;
1.1 misha 678: #endif
679:
680: /* First check that we haven't called match() too many times, or that we
681: haven't exceeded the recursive call limit. */
682:
683: if (md->match_call_count++ >= md->match_limit) RRETURN(PCRE_ERROR_MATCHLIMIT);
684: if (rdepth >= md->match_limit_recursion) RRETURN(PCRE_ERROR_RECURSIONLIMIT);
685:
686: /* At the start of a group with an unlimited repeat that may match an empty
1.6 ! misha 687: string, the variable md->match_function_type is set to MATCH_CBEGROUP. It is
! 688: done this way to save having to use another function argument, which would take
! 689: up space on the stack. See also MATCH_CONDASSERT below.
! 690:
! 691: When MATCH_CBEGROUP is set, add the current subject pointer to the chain of
! 692: such remembered pointers, to be checked when we hit the closing ket, in order
! 693: to break infinite loops that match no characters. When match() is called in
! 694: other circumstances, don't add to the chain. The MATCH_CBEGROUP feature must
! 695: NOT be used with tail recursion, because the memory block that is used is on
! 696: the stack, so a new one may be required for each match(). */
1.1 misha 697:
1.6 ! misha 698: if (md->match_function_type == MATCH_CBEGROUP)
1.1 misha 699: {
700: newptrb.epb_saved_eptr = eptr;
701: newptrb.epb_prev = eptrb;
702: eptrb = &newptrb;
1.6 ! misha 703: md->match_function_type = 0;
1.1 misha 704: }
705:
706: /* Now start processing the opcodes. */
707:
708: for (;;)
709: {
710: minimize = possessive = FALSE;
711: op = *ecode;
712:
1.4 misha 713: switch(op)
714: {
715: case OP_MARK:
1.6 ! misha 716: md->nomatch_mark = ecode + 2;
! 717: md->mark = NULL; /* In case previously set by assertion */
! 718: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
! 719: eptrb, RM55);
! 720: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
! 721: md->mark == NULL) md->mark = ecode + 2;
1.4 misha 722:
723: /* A return of MATCH_SKIP_ARG means that matching failed at SKIP with an
724: argument, and we must check whether that argument matches this MARK's
725: argument. It is passed back in md->start_match_ptr (an overloading of that
726: variable). If it does match, we reset that variable to the current subject
727: position and return MATCH_SKIP. Otherwise, pass back the return code
728: unaltered. */
729:
1.6 ! misha 730: else if (rrc == MATCH_SKIP_ARG &&
! 731: STRCMP_UC_UC(ecode + 2, md->start_match_ptr) == 0)
1.4 misha 732: {
733: md->start_match_ptr = eptr;
734: RRETURN(MATCH_SKIP);
735: }
736: RRETURN(rrc);
1.1 misha 737:
738: case OP_FAIL:
1.6 ! misha 739: RRETURN(MATCH_NOMATCH);
1.4 misha 740:
1.5 misha 741: /* COMMIT overrides PRUNE, SKIP, and THEN */
742:
1.4 misha 743: case OP_COMMIT:
1.6 ! misha 744: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
! 745: eptrb, RM52);
1.5 misha 746: if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE &&
747: rrc != MATCH_SKIP && rrc != MATCH_SKIP_ARG &&
748: rrc != MATCH_THEN)
749: RRETURN(rrc);
1.6 ! misha 750: RRETURN(MATCH_COMMIT);
1.1 misha 751:
1.5 misha 752: /* PRUNE overrides THEN */
753:
1.1 misha 754: case OP_PRUNE:
1.6 ! misha 755: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
! 756: eptrb, RM51);
1.5 misha 757: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1.6 ! misha 758: RRETURN(MATCH_PRUNE);
1.1 misha 759:
1.4 misha 760: case OP_PRUNE_ARG:
1.6 ! misha 761: md->nomatch_mark = ecode + 2;
! 762: md->mark = NULL; /* In case previously set by assertion */
! 763: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
! 764: eptrb, RM56);
! 765: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
! 766: md->mark == NULL) md->mark = ecode + 2;
1.5 misha 767: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1.4 misha 768: RRETURN(MATCH_PRUNE);
1.1 misha 769:
1.5 misha 770: /* SKIP overrides PRUNE and THEN */
771:
1.1 misha 772: case OP_SKIP:
1.6 ! misha 773: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
! 774: eptrb, RM53);
1.5 misha 775: if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
776: RRETURN(rrc);
1.1 misha 777: md->start_match_ptr = eptr; /* Pass back current position */
1.6 ! misha 778: RRETURN(MATCH_SKIP);
! 779:
! 780: /* Note that, for Perl compatibility, SKIP with an argument does NOT set
! 781: nomatch_mark. There is a flag that disables this opcode when re-matching a
! 782: pattern that ended with a SKIP for which there was not a matching MARK. */
1.4 misha 783:
784: case OP_SKIP_ARG:
1.6 ! misha 785: if (md->ignore_skip_arg)
! 786: {
! 787: ecode += PRIV(OP_lengths)[*ecode] + ecode[1];
! 788: break;
! 789: }
! 790: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top, md,
! 791: eptrb, RM57);
1.5 misha 792: if (rrc != MATCH_NOMATCH && rrc != MATCH_PRUNE && rrc != MATCH_THEN)
793: RRETURN(rrc);
1.4 misha 794:
795: /* Pass back the current skip name by overloading md->start_match_ptr and
796: returning the special MATCH_SKIP_ARG return code. This will either be
1.6 ! misha 797: caught by a matching MARK, or get to the top, where it causes a rematch
! 798: with the md->ignore_skip_arg flag set. */
1.4 misha 799:
800: md->start_match_ptr = ecode + 2;
801: RRETURN(MATCH_SKIP_ARG);
1.1 misha 802:
1.6 ! misha 803: /* For THEN (and THEN_ARG) we pass back the address of the opcode, so that
! 804: the branch in which it occurs can be determined. Overload the start of
! 805: match pointer to do this. */
1.5 misha 806:
1.1 misha 807: case OP_THEN:
1.6 ! misha 808: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
! 809: eptrb, RM54);
1.1 misha 810: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 811: md->start_match_ptr = ecode;
! 812: RRETURN(MATCH_THEN);
1.4 misha 813:
814: case OP_THEN_ARG:
1.6 ! misha 815: md->nomatch_mark = ecode + 2;
! 816: md->mark = NULL; /* In case previously set by assertion */
! 817: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode] + ecode[1], offset_top,
! 818: md, eptrb, RM58);
! 819: if ((rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) &&
! 820: md->mark == NULL) md->mark = ecode + 2;
1.4 misha 821: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 822: md->start_match_ptr = ecode;
1.1 misha 823: RRETURN(MATCH_THEN);
824:
1.6 ! misha 825: /* Handle an atomic group that does not contain any capturing parentheses.
! 826: This can be handled like an assertion. Prior to 8.13, all atomic groups
! 827: were handled this way. In 8.13, the code was changed as below for ONCE, so
! 828: that backups pass through the group and thereby reset captured values.
! 829: However, this uses a lot more stack, so in 8.20, atomic groups that do not
! 830: contain any captures generate OP_ONCE_NC, which can be handled in the old,
! 831: less stack intensive way.
! 832:
! 833: Check the alternative branches in turn - the matching won't pass the KET
! 834: for this kind of subpattern. If any one branch matches, we carry on as at
! 835: the end of a normal bracket, leaving the subject pointer, but resetting
! 836: the start-of-match value in case it was changed by \K. */
! 837:
! 838: case OP_ONCE_NC:
! 839: prev = ecode;
! 840: saved_eptr = eptr;
! 841: save_mark = md->mark;
! 842: do
! 843: {
! 844: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM64);
! 845: if (rrc == MATCH_MATCH) /* Note: _not_ MATCH_ACCEPT */
! 846: {
! 847: mstart = md->start_match_ptr;
! 848: break;
! 849: }
! 850: if (rrc == MATCH_THEN)
! 851: {
! 852: next = ecode + GET(ecode,1);
! 853: if (md->start_match_ptr < next &&
! 854: (*ecode == OP_ALT || *next == OP_ALT))
! 855: rrc = MATCH_NOMATCH;
! 856: }
! 857:
! 858: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 859: ecode += GET(ecode,1);
! 860: md->mark = save_mark;
! 861: }
! 862: while (*ecode == OP_ALT);
! 863:
! 864: /* If hit the end of the group (which could be repeated), fail */
! 865:
! 866: if (*ecode != OP_ONCE_NC && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
! 867:
! 868: /* Continue as from after the group, updating the offsets high water
! 869: mark, since extracts may have been taken. */
! 870:
! 871: do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
! 872:
! 873: offset_top = md->end_offset_top;
! 874: eptr = md->end_match_ptr;
! 875:
! 876: /* For a non-repeating ket, just continue at this level. This also
! 877: happens for a repeating ket if no characters were matched in the group.
! 878: This is the forcible breaking of infinite loops as implemented in Perl
! 879: 5.005. */
! 880:
! 881: if (*ecode == OP_KET || eptr == saved_eptr)
! 882: {
! 883: ecode += 1+LINK_SIZE;
! 884: break;
! 885: }
! 886:
! 887: /* The repeating kets try the rest of the pattern or restart from the
! 888: preceding bracket, in the appropriate order. The second "call" of match()
! 889: uses tail recursion, to avoid using another stack frame. */
! 890:
! 891: if (*ecode == OP_KETRMIN)
! 892: {
! 893: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM65);
! 894: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 895: ecode = prev;
! 896: goto TAIL_RECURSE;
! 897: }
! 898: else /* OP_KETRMAX */
! 899: {
! 900: md->match_function_type = MATCH_CBEGROUP;
! 901: RMATCH(eptr, prev, offset_top, md, eptrb, RM66);
! 902: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 903: ecode += 1 + LINK_SIZE;
! 904: goto TAIL_RECURSE;
! 905: }
! 906: /* Control never gets here */
! 907:
! 908: /* Handle a capturing bracket, other than those that are possessive with an
! 909: unlimited repeat. If there is space in the offset vector, save the current
! 910: subject position in the working slot at the top of the vector. We mustn't
! 911: change the current values of the data slot, because they may be set from a
! 912: previous iteration of this group, and be referred to by a reference inside
! 913: the group. A failure to match might occur after the group has succeeded,
! 914: if something later on doesn't match. For this reason, we need to restore
! 915: the working value and also the values of the final offsets, in case they
! 916: were set by a previous iteration of the same bracket.
1.1 misha 917:
918: If there isn't enough space in the offset vector, treat this as if it were
919: a non-capturing bracket. Don't worry about setting the flag for the error
920: case here; that is handled in the code for KET. */
921:
922: case OP_CBRA:
923: case OP_SCBRA:
924: number = GET2(ecode, 1+LINK_SIZE);
925: offset = number << 1;
926:
1.4 misha 927: #ifdef PCRE_DEBUG
1.1 misha 928: printf("start bracket %d\n", number);
929: printf("subject=");
930: pchars(eptr, 16, TRUE, md);
931: printf("\n");
932: #endif
933:
934: if (offset < md->offset_max)
935: {
936: save_offset1 = md->offset_vector[offset];
937: save_offset2 = md->offset_vector[offset+1];
938: save_offset3 = md->offset_vector[md->offset_end - number];
939: save_capture_last = md->capture_last;
1.6 ! misha 940: save_mark = md->mark;
1.1 misha 941:
942: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
1.4 misha 943: md->offset_vector[md->offset_end - number] =
944: (int)(eptr - md->start_subject);
1.1 misha 945:
1.6 ! misha 946: for (;;)
1.1 misha 947: {
1.6 ! misha 948: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
! 949: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
! 950: eptrb, RM1);
! 951: if (rrc == MATCH_ONCE) break; /* Backing up through an atomic group */
! 952:
! 953: /* If we backed up to a THEN, check whether it is within the current
! 954: branch by comparing the address of the THEN that is passed back with
! 955: the end of the branch. If it is within the current branch, and the
! 956: branch is one of two or more alternatives (it either starts or ends
! 957: with OP_ALT), we have reached the limit of THEN's action, so convert
! 958: the return code to NOMATCH, which will cause normal backtracking to
! 959: happen from now on. Otherwise, THEN is passed back to an outer
! 960: alternative. This implements Perl's treatment of parenthesized groups,
! 961: where a group not containing | does not affect the current alternative,
! 962: that is, (X) is NOT the same as (X|(*F)). */
! 963:
! 964: if (rrc == MATCH_THEN)
! 965: {
! 966: next = ecode + GET(ecode,1);
! 967: if (md->start_match_ptr < next &&
! 968: (*ecode == OP_ALT || *next == OP_ALT))
! 969: rrc = MATCH_NOMATCH;
! 970: }
! 971:
! 972: /* Anything other than NOMATCH is passed back. */
! 973:
! 974: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misha 975: md->capture_last = save_capture_last;
976: ecode += GET(ecode, 1);
1.6 ! misha 977: md->mark = save_mark;
! 978: if (*ecode != OP_ALT) break;
1.1 misha 979: }
980:
981: DPRINTF(("bracket %d failed\n", number));
982: md->offset_vector[offset] = save_offset1;
983: md->offset_vector[offset+1] = save_offset2;
984: md->offset_vector[md->offset_end - number] = save_offset3;
985:
1.6 ! misha 986: /* At this point, rrc will be one of MATCH_ONCE or MATCH_NOMATCH. */
! 987:
! 988: RRETURN(rrc);
1.1 misha 989: }
990:
991: /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
992: as a non-capturing bracket. */
993:
994: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
995: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
996:
997: DPRINTF(("insufficient capture room: treat as non-capturing\n"));
998:
999: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1000: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1001:
1.6 ! misha 1002: /* Non-capturing or atomic group, except for possessive with unlimited
! 1003: repeat and ONCE group with no captures. Loop for all the alternatives.
1.1 misha 1004:
1.6 ! misha 1005: When we get to the final alternative within the brackets, we used to return
! 1006: the result of a recursive call to match() whatever happened so it was
! 1007: possible to reduce stack usage by turning this into a tail recursion,
! 1008: except in the case of a possibly empty group. However, now that there is
! 1009: the possibility of (*THEN) occurring in the final alternative, this
! 1010: optimization is no longer always possible.
! 1011:
! 1012: We can optimize if we know there are no (*THEN)s in the pattern; at present
! 1013: this is the best that can be done.
! 1014:
! 1015: MATCH_ONCE is returned when the end of an atomic group is successfully
! 1016: reached, but subsequent matching fails. It passes back up the tree (causing
! 1017: captured values to be reset) until the original atomic group level is
! 1018: reached. This is tested by comparing md->once_target with the start of the
! 1019: group. At this point, the return is converted into MATCH_NOMATCH so that
! 1020: previous backup points can be taken. */
! 1021:
! 1022: case OP_ONCE:
1.1 misha 1023: case OP_BRA:
1024: case OP_SBRA:
1025: DPRINTF(("start non-capturing bracket\n"));
1.6 ! misha 1026:
1.1 misha 1027: for (;;)
1028: {
1.6 ! misha 1029: if (op >= OP_SBRA || op == OP_ONCE) md->match_function_type = MATCH_CBEGROUP;
! 1030:
! 1031: /* If this is not a possibly empty group, and there are no (*THEN)s in
! 1032: the pattern, and this is the final alternative, optimize as described
! 1033: above. */
! 1034:
! 1035: else if (!md->hasthen && ecode[GET(ecode, 1)] != OP_ALT)
1.1 misha 1036: {
1.6 ! misha 1037: ecode += PRIV(OP_lengths)[*ecode];
! 1038: goto TAIL_RECURSE;
! 1039: }
! 1040:
! 1041: /* In all other cases, we have to make another call to match(). */
! 1042:
! 1043: save_mark = md->mark;
! 1044: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md, eptrb,
! 1045: RM2);
! 1046:
! 1047: /* See comment in the code for capturing groups above about handling
! 1048: THEN. */
! 1049:
! 1050: if (rrc == MATCH_THEN)
! 1051: {
! 1052: next = ecode + GET(ecode,1);
! 1053: if (md->start_match_ptr < next &&
! 1054: (*ecode == OP_ALT || *next == OP_ALT))
! 1055: rrc = MATCH_NOMATCH;
! 1056: }
! 1057:
! 1058: if (rrc != MATCH_NOMATCH)
! 1059: {
! 1060: if (rrc == MATCH_ONCE)
1.1 misha 1061: {
1.6 ! misha 1062: const pcre_uchar *scode = ecode;
! 1063: if (*scode != OP_ONCE) /* If not at start, find it */
! 1064: {
! 1065: while (*scode == OP_ALT) scode += GET(scode, 1);
! 1066: scode -= GET(scode, 1);
! 1067: }
! 1068: if (md->once_target == scode) rrc = MATCH_NOMATCH;
1.1 misha 1069: }
1.6 ! misha 1070: RRETURN(rrc);
! 1071: }
! 1072: ecode += GET(ecode, 1);
! 1073: md->mark = save_mark;
! 1074: if (*ecode != OP_ALT) break;
! 1075: }
! 1076:
! 1077: RRETURN(MATCH_NOMATCH);
! 1078:
! 1079: /* Handle possessive capturing brackets with an unlimited repeat. We come
! 1080: here from BRAZERO with allow_zero set TRUE. The offset_vector values are
! 1081: handled similarly to the normal case above. However, the matching is
! 1082: different. The end of these brackets will always be OP_KETRPOS, which
! 1083: returns MATCH_KETRPOS without going further in the pattern. By this means
! 1084: we can handle the group by iteration rather than recursion, thereby
! 1085: reducing the amount of stack needed. */
1.1 misha 1086:
1.6 ! misha 1087: case OP_CBRAPOS:
! 1088: case OP_SCBRAPOS:
! 1089: allow_zero = FALSE;
1.1 misha 1090:
1.6 ! misha 1091: POSSESSIVE_CAPTURE:
! 1092: number = GET2(ecode, 1+LINK_SIZE);
! 1093: offset = number << 1;
! 1094:
! 1095: #ifdef PCRE_DEBUG
! 1096: printf("start possessive bracket %d\n", number);
! 1097: printf("subject=");
! 1098: pchars(eptr, 16, TRUE, md);
! 1099: printf("\n");
! 1100: #endif
! 1101:
! 1102: if (offset < md->offset_max)
! 1103: {
! 1104: matched_once = FALSE;
! 1105: code_offset = (int)(ecode - md->start_code);
! 1106:
! 1107: save_offset1 = md->offset_vector[offset];
! 1108: save_offset2 = md->offset_vector[offset+1];
! 1109: save_offset3 = md->offset_vector[md->offset_end - number];
! 1110: save_capture_last = md->capture_last;
! 1111:
! 1112: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
! 1113:
! 1114: /* Each time round the loop, save the current subject position for use
! 1115: when the group matches. For MATCH_MATCH, the group has matched, so we
! 1116: restart it with a new subject starting position, remembering that we had
! 1117: at least one match. For MATCH_NOMATCH, carry on with the alternatives, as
! 1118: usual. If we haven't matched any alternatives in any iteration, check to
! 1119: see if a previous iteration matched. If so, the group has matched;
! 1120: continue from afterwards. Otherwise it has failed; restore the previous
! 1121: capture values before returning NOMATCH. */
! 1122:
! 1123: for (;;)
! 1124: {
! 1125: md->offset_vector[md->offset_end - number] =
! 1126: (int)(eptr - md->start_subject);
! 1127: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
! 1128: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
! 1129: eptrb, RM63);
! 1130: if (rrc == MATCH_KETRPOS)
! 1131: {
! 1132: offset_top = md->end_offset_top;
! 1133: eptr = md->end_match_ptr;
! 1134: ecode = md->start_code + code_offset;
! 1135: save_capture_last = md->capture_last;
! 1136: matched_once = TRUE;
! 1137: continue;
! 1138: }
! 1139:
! 1140: /* See comment in the code for capturing groups above about handling
! 1141: THEN. */
! 1142:
! 1143: if (rrc == MATCH_THEN)
! 1144: {
! 1145: next = ecode + GET(ecode,1);
! 1146: if (md->start_match_ptr < next &&
! 1147: (*ecode == OP_ALT || *next == OP_ALT))
! 1148: rrc = MATCH_NOMATCH;
! 1149: }
! 1150:
! 1151: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 1152: md->capture_last = save_capture_last;
! 1153: ecode += GET(ecode, 1);
! 1154: if (*ecode != OP_ALT) break;
! 1155: }
! 1156:
! 1157: if (!matched_once)
! 1158: {
! 1159: md->offset_vector[offset] = save_offset1;
! 1160: md->offset_vector[offset+1] = save_offset2;
! 1161: md->offset_vector[md->offset_end - number] = save_offset3;
! 1162: }
! 1163:
! 1164: if (allow_zero || matched_once)
! 1165: {
! 1166: ecode += 1 + LINK_SIZE;
! 1167: break;
1.1 misha 1168: }
1169:
1.6 ! misha 1170: RRETURN(MATCH_NOMATCH);
! 1171: }
! 1172:
! 1173: /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
! 1174: as a non-capturing bracket. */
! 1175:
! 1176: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
! 1177: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
1.1 misha 1178:
1.6 ! misha 1179: DPRINTF(("insufficient capture room: treat as non-capturing\n"));
! 1180:
! 1181: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
! 1182: /* VVVVVVVVVVVVVVVVVVVVVVVVV */
! 1183:
! 1184: /* Non-capturing possessive bracket with unlimited repeat. We come here
! 1185: from BRAZERO with allow_zero = TRUE. The code is similar to the above,
! 1186: without the capturing complication. It is written out separately for speed
! 1187: and cleanliness. */
! 1188:
! 1189: case OP_BRAPOS:
! 1190: case OP_SBRAPOS:
! 1191: allow_zero = FALSE;
! 1192:
! 1193: POSSESSIVE_NON_CAPTURE:
! 1194: matched_once = FALSE;
! 1195: code_offset = (int)(ecode - md->start_code);
! 1196:
! 1197: for (;;)
! 1198: {
! 1199: if (op >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
! 1200: RMATCH(eptr, ecode + PRIV(OP_lengths)[*ecode], offset_top, md,
! 1201: eptrb, RM48);
! 1202: if (rrc == MATCH_KETRPOS)
! 1203: {
! 1204: offset_top = md->end_offset_top;
! 1205: eptr = md->end_match_ptr;
! 1206: ecode = md->start_code + code_offset;
! 1207: matched_once = TRUE;
! 1208: continue;
! 1209: }
! 1210:
! 1211: /* See comment in the code for capturing groups above about handling
! 1212: THEN. */
! 1213:
! 1214: if (rrc == MATCH_THEN)
! 1215: {
! 1216: next = ecode + GET(ecode,1);
! 1217: if (md->start_match_ptr < next &&
! 1218: (*ecode == OP_ALT || *next == OP_ALT))
! 1219: rrc = MATCH_NOMATCH;
! 1220: }
! 1221:
! 1222: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misha 1223: ecode += GET(ecode, 1);
1.6 ! misha 1224: if (*ecode != OP_ALT) break;
1.1 misha 1225: }
1.6 ! misha 1226:
! 1227: if (matched_once || allow_zero)
! 1228: {
! 1229: ecode += 1 + LINK_SIZE;
! 1230: break;
! 1231: }
! 1232: RRETURN(MATCH_NOMATCH);
! 1233:
1.1 misha 1234: /* Control never reaches here. */
1235:
1236: /* Conditional group: compilation checked that there are no more than
1237: two branches. If the condition is false, skipping the first branch takes us
1238: past the end if there is only one branch, but that's OK because that is
1.6 ! misha 1239: exactly what going to the ket would do. */
1.1 misha 1240:
1241: case OP_COND:
1242: case OP_SCOND:
1.6 ! misha 1243: codelink = GET(ecode, 1);
1.3 misha 1244:
1245: /* Because of the way auto-callout works during compile, a callout item is
1246: inserted between OP_COND and an assertion condition. */
1247:
1248: if (ecode[LINK_SIZE+1] == OP_CALLOUT)
1249: {
1.6 ! misha 1250: if (PUBL(callout) != NULL)
1.3 misha 1251: {
1.6 ! misha 1252: PUBL(callout_block) cb;
! 1253: cb.version = 2; /* Version 1 of the callout block */
1.3 misha 1254: cb.callout_number = ecode[LINK_SIZE+2];
1255: cb.offset_vector = md->offset_vector;
1.6 ! misha 1256: #ifdef COMPILE_PCRE8
1.3 misha 1257: cb.subject = (PCRE_SPTR)md->start_subject;
1.6 ! misha 1258: #else
! 1259: cb.subject = (PCRE_SPTR16)md->start_subject;
! 1260: #endif
1.4 misha 1261: cb.subject_length = (int)(md->end_subject - md->start_subject);
1262: cb.start_match = (int)(mstart - md->start_subject);
1263: cb.current_position = (int)(eptr - md->start_subject);
1.3 misha 1264: cb.pattern_position = GET(ecode, LINK_SIZE + 3);
1265: cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
1266: cb.capture_top = offset_top/2;
1267: cb.capture_last = md->capture_last;
1268: cb.callout_data = md->callout_data;
1.6 ! misha 1269: cb.mark = md->nomatch_mark;
! 1270: if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1.3 misha 1271: if (rrc < 0) RRETURN(rrc);
1272: }
1.6 ! misha 1273: ecode += PRIV(OP_lengths)[OP_CALLOUT];
1.3 misha 1274: }
1275:
1276: condcode = ecode[LINK_SIZE+1];
1277:
1278: /* Now see what the actual condition is */
1279:
1.4 misha 1280: if (condcode == OP_RREF || condcode == OP_NRREF) /* Recursion test */
1.1 misha 1281: {
1.4 misha 1282: if (md->recursive == NULL) /* Not recursing => FALSE */
1283: {
1284: condition = FALSE;
1285: ecode += GET(ecode, 1);
1286: }
1287: else
1288: {
1289: int recno = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
1.6 ! misha 1290: condition = (recno == RREF_ANY || recno == md->recursive->group_num);
1.4 misha 1291:
1292: /* If the test is for recursion into a specific subpattern, and it is
1293: false, but the test was set up by name, scan the table to see if the
1294: name refers to any other numbers, and test them. The condition is true
1295: if any one is set. */
1296:
1.6 ! misha 1297: if (!condition && condcode == OP_NRREF)
1.4 misha 1298: {
1.6 ! misha 1299: pcre_uchar *slotA = md->name_table;
1.4 misha 1300: for (i = 0; i < md->name_count; i++)
1301: {
1302: if (GET2(slotA, 0) == recno) break;
1303: slotA += md->name_entry_size;
1304: }
1305:
1306: /* Found a name for the number - there can be only one; duplicate
1307: names for different numbers are allowed, but not vice versa. First
1308: scan down for duplicates. */
1309:
1310: if (i < md->name_count)
1311: {
1.6 ! misha 1312: pcre_uchar *slotB = slotA;
1.4 misha 1313: while (slotB > md->name_table)
1314: {
1315: slotB -= md->name_entry_size;
1.6 ! misha 1316: if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1.4 misha 1317: {
1318: condition = GET2(slotB, 0) == md->recursive->group_num;
1319: if (condition) break;
1320: }
1321: else break;
1322: }
1323:
1324: /* Scan up for duplicates */
1325:
1326: if (!condition)
1327: {
1328: slotB = slotA;
1329: for (i++; i < md->name_count; i++)
1330: {
1331: slotB += md->name_entry_size;
1.6 ! misha 1332: if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1.4 misha 1333: {
1334: condition = GET2(slotB, 0) == md->recursive->group_num;
1335: if (condition) break;
1336: }
1337: else break;
1338: }
1339: }
1340: }
1341: }
1342:
1343: /* Choose branch according to the condition */
1344:
1.6 ! misha 1345: ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1.4 misha 1346: }
1.1 misha 1347: }
1348:
1.4 misha 1349: else if (condcode == OP_CREF || condcode == OP_NCREF) /* Group used test */
1.1 misha 1350: {
1351: offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
1352: condition = offset < offset_top && md->offset_vector[offset] >= 0;
1.4 misha 1353:
1354: /* If the numbered capture is unset, but the reference was by name,
1355: scan the table to see if the name refers to any other numbers, and test
1356: them. The condition is true if any one is set. This is tediously similar
1357: to the code above, but not close enough to try to amalgamate. */
1358:
1359: if (!condition && condcode == OP_NCREF)
1360: {
1361: int refno = offset >> 1;
1.6 ! misha 1362: pcre_uchar *slotA = md->name_table;
1.4 misha 1363:
1364: for (i = 0; i < md->name_count; i++)
1365: {
1366: if (GET2(slotA, 0) == refno) break;
1367: slotA += md->name_entry_size;
1368: }
1369:
1370: /* Found a name for the number - there can be only one; duplicate names
1371: for different numbers are allowed, but not vice versa. First scan down
1372: for duplicates. */
1373:
1374: if (i < md->name_count)
1375: {
1.6 ! misha 1376: pcre_uchar *slotB = slotA;
1.4 misha 1377: while (slotB > md->name_table)
1378: {
1379: slotB -= md->name_entry_size;
1.6 ! misha 1380: if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1.4 misha 1381: {
1382: offset = GET2(slotB, 0) << 1;
1383: condition = offset < offset_top &&
1384: md->offset_vector[offset] >= 0;
1385: if (condition) break;
1386: }
1387: else break;
1388: }
1389:
1390: /* Scan up for duplicates */
1391:
1392: if (!condition)
1393: {
1394: slotB = slotA;
1395: for (i++; i < md->name_count; i++)
1396: {
1397: slotB += md->name_entry_size;
1.6 ! misha 1398: if (STRCMP_UC_UC(slotA + IMM2_SIZE, slotB + IMM2_SIZE) == 0)
1.4 misha 1399: {
1400: offset = GET2(slotB, 0) << 1;
1401: condition = offset < offset_top &&
1402: md->offset_vector[offset] >= 0;
1403: if (condition) break;
1404: }
1405: else break;
1406: }
1407: }
1408: }
1409: }
1410:
1411: /* Choose branch according to the condition */
1412:
1.6 ! misha 1413: ecode += condition? 1 + IMM2_SIZE : GET(ecode, 1);
1.1 misha 1414: }
1415:
1.3 misha 1416: else if (condcode == OP_DEF) /* DEFINE - always false */
1.1 misha 1417: {
1418: condition = FALSE;
1419: ecode += GET(ecode, 1);
1420: }
1421:
1422: /* The condition is an assertion. Call match() to evaluate it - setting
1.6 ! misha 1423: md->match_function_type to MATCH_CONDASSERT causes it to stop at the end of
! 1424: an assertion. */
1.1 misha 1425:
1426: else
1427: {
1.6 ! misha 1428: md->match_function_type = MATCH_CONDASSERT;
! 1429: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM3);
1.1 misha 1430: if (rrc == MATCH_MATCH)
1431: {
1.6 ! misha 1432: if (md->end_offset_top > offset_top)
! 1433: offset_top = md->end_offset_top; /* Captures may have happened */
1.1 misha 1434: condition = TRUE;
1435: ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
1436: while (*ecode == OP_ALT) ecode += GET(ecode, 1);
1437: }
1.6 ! misha 1438:
! 1439: /* PCRE doesn't allow the effect of (*THEN) to escape beyond an
! 1440: assertion; it is therefore treated as NOMATCH. */
! 1441:
! 1442: else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1.1 misha 1443: {
1444: RRETURN(rrc); /* Need braces because of following else */
1445: }
1446: else
1447: {
1448: condition = FALSE;
1.3 misha 1449: ecode += codelink;
1.1 misha 1450: }
1451: }
1452:
1.6 ! misha 1453: /* We are now at the branch that is to be obeyed. As there is only one, can
! 1454: use tail recursion to avoid using another stack frame, except when there is
! 1455: unlimited repeat of a possibly empty group. In the latter case, a recursive
! 1456: call to match() is always required, unless the second alternative doesn't
! 1457: exist, in which case we can just plough on. Note that, for compatibility
! 1458: with Perl, the | in a conditional group is NOT treated as creating two
! 1459: alternatives. If a THEN is encountered in the branch, it propagates out to
! 1460: the enclosing alternative (unless nested in a deeper set of alternatives,
! 1461: of course). */
1.1 misha 1462:
1463: if (condition || *ecode == OP_ALT)
1464: {
1.6 ! misha 1465: if (op != OP_SCOND)
1.1 misha 1466: {
1.6 ! misha 1467: ecode += 1 + LINK_SIZE;
1.1 misha 1468: goto TAIL_RECURSE;
1469: }
1.6 ! misha 1470:
! 1471: md->match_function_type = MATCH_CBEGROUP;
! 1472: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM49);
! 1473: RRETURN(rrc);
1.1 misha 1474: }
1.6 ! misha 1475:
! 1476: /* Condition false & no alternative; continue after the group. */
! 1477:
! 1478: else
1.1 misha 1479: {
1480: ecode += 1 + LINK_SIZE;
1481: }
1482: break;
1483:
1484:
1.4 misha 1485: /* Before OP_ACCEPT there may be any number of OP_CLOSE opcodes,
1486: to close any currently open capturing brackets. */
1487:
1488: case OP_CLOSE:
1489: number = GET2(ecode, 1);
1490: offset = number << 1;
1491:
1492: #ifdef PCRE_DEBUG
1493: printf("end bracket %d at *ACCEPT", number);
1494: printf("\n");
1495: #endif
1496:
1497: md->capture_last = number;
1498: if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1499: {
1500: md->offset_vector[offset] =
1501: md->offset_vector[md->offset_end - number];
1502: md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1503: if (offset_top <= offset) offset_top = offset + 2;
1504: }
1.6 ! misha 1505: ecode += 1 + IMM2_SIZE;
1.4 misha 1506: break;
1507:
1508:
1.6 ! misha 1509: /* End of the pattern, either real or forced. */
1.1 misha 1510:
1.6 ! misha 1511: case OP_END:
1.1 misha 1512: case OP_ACCEPT:
1.6 ! misha 1513: case OP_ASSERT_ACCEPT:
1.1 misha 1514:
1.6 ! misha 1515: /* If we have matched an empty string, fail if not in an assertion and not
! 1516: in a recursion if either PCRE_NOTEMPTY is set, or if PCRE_NOTEMPTY_ATSTART
! 1517: is set and we have matched at the start of the subject. In both cases,
! 1518: backtracking will then try other alternatives, if any. */
! 1519:
! 1520: if (eptr == mstart && op != OP_ASSERT_ACCEPT &&
! 1521: md->recursive == NULL &&
! 1522: (md->notempty ||
! 1523: (md->notempty_atstart &&
! 1524: mstart == md->start_subject + md->start_offset)))
! 1525: RRETURN(MATCH_NOMATCH);
1.4 misha 1526:
1527: /* Otherwise, we have a match. */
1.1 misha 1528:
1529: md->end_match_ptr = eptr; /* Record where we ended */
1530: md->end_offset_top = offset_top; /* and how many extracts were taken */
1531: md->start_match_ptr = mstart; /* and the start (\K can modify) */
1.4 misha 1532:
1533: /* For some reason, the macros don't work properly if an expression is
1.6 ! misha 1534: given as the argument to RRETURN when the heap is in use. */
1.4 misha 1535:
1536: rrc = (op == OP_END)? MATCH_MATCH : MATCH_ACCEPT;
1.6 ! misha 1537: RRETURN(rrc);
1.1 misha 1538:
1539: /* Assertion brackets. Check the alternative branches in turn - the
1540: matching won't pass the KET for an assertion. If any one branch matches,
1541: the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
1542: start of each branch to move the current point backwards, so the code at
1.6 ! misha 1543: this level is identical to the lookahead case. When the assertion is part
! 1544: of a condition, we want to return immediately afterwards. The caller of
! 1545: this incarnation of the match() function will have set MATCH_CONDASSERT in
! 1546: md->match_function_type, and one of these opcodes will be the first opcode
! 1547: that is processed. We use a local variable that is preserved over calls to
! 1548: match() to remember this case. */
1.1 misha 1549:
1550: case OP_ASSERT:
1551: case OP_ASSERTBACK:
1.6 ! misha 1552: save_mark = md->mark;
! 1553: if (md->match_function_type == MATCH_CONDASSERT)
! 1554: {
! 1555: condassert = TRUE;
! 1556: md->match_function_type = 0;
! 1557: }
! 1558: else condassert = FALSE;
! 1559:
1.1 misha 1560: do
1561: {
1.6 ! misha 1562: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM4);
1.4 misha 1563: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1564: {
1565: mstart = md->start_match_ptr; /* In case \K reset it */
1566: break;
1567: }
1.6 ! misha 1568:
! 1569: /* PCRE does not allow THEN to escape beyond an assertion; it is treated
! 1570: as NOMATCH. */
! 1571:
! 1572: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1.1 misha 1573: ecode += GET(ecode, 1);
1.6 ! misha 1574: md->mark = save_mark;
1.1 misha 1575: }
1576: while (*ecode == OP_ALT);
1.6 ! misha 1577:
! 1578: if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
1.1 misha 1579:
1580: /* If checking an assertion for a condition, return MATCH_MATCH. */
1581:
1.6 ! misha 1582: if (condassert) RRETURN(MATCH_MATCH);
1.1 misha 1583:
1584: /* Continue from after the assertion, updating the offsets high water
1585: mark, since extracts may have been taken during the assertion. */
1586:
1587: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1588: ecode += 1 + LINK_SIZE;
1589: offset_top = md->end_offset_top;
1590: continue;
1591:
1.4 misha 1592: /* Negative assertion: all branches must fail to match. Encountering SKIP,
1593: PRUNE, or COMMIT means we must assume failure without checking subsequent
1594: branches. */
1.1 misha 1595:
1596: case OP_ASSERT_NOT:
1597: case OP_ASSERTBACK_NOT:
1.6 ! misha 1598: save_mark = md->mark;
! 1599: if (md->match_function_type == MATCH_CONDASSERT)
! 1600: {
! 1601: condassert = TRUE;
! 1602: md->match_function_type = 0;
! 1603: }
! 1604: else condassert = FALSE;
! 1605:
1.1 misha 1606: do
1607: {
1.6 ! misha 1608: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, NULL, RM5);
! 1609: md->mark = save_mark;
! 1610: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT) RRETURN(MATCH_NOMATCH);
1.4 misha 1611: if (rrc == MATCH_SKIP || rrc == MATCH_PRUNE || rrc == MATCH_COMMIT)
1612: {
1613: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1614: break;
1615: }
1.6 ! misha 1616:
! 1617: /* PCRE does not allow THEN to escape beyond an assertion; it is treated
! 1618: as NOMATCH. */
! 1619:
! 1620: if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1.1 misha 1621: ecode += GET(ecode,1);
1622: }
1623: while (*ecode == OP_ALT);
1624:
1.6 ! misha 1625: if (condassert) RRETURN(MATCH_MATCH); /* Condition assertion */
1.1 misha 1626:
1627: ecode += 1 + LINK_SIZE;
1628: continue;
1629:
1630: /* Move the subject pointer back. This occurs only at the start of
1631: each branch of a lookbehind assertion. If we are too close to the start to
1632: move back, this match function fails. When working with UTF-8 we move
1633: back a number of characters, not bytes. */
1634:
1635: case OP_REVERSE:
1.6 ! misha 1636: #ifdef SUPPORT_UTF
! 1637: if (utf)
1.1 misha 1638: {
1639: i = GET(ecode, 1);
1640: while (i-- > 0)
1641: {
1642: eptr--;
1.6 ! misha 1643: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1.1 misha 1644: BACKCHAR(eptr);
1645: }
1646: }
1647: else
1648: #endif
1649:
1650: /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1651:
1652: {
1653: eptr -= GET(ecode, 1);
1.6 ! misha 1654: if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1.1 misha 1655: }
1656:
1.4 misha 1657: /* Save the earliest consulted character, then skip to next op code */
1.1 misha 1658:
1.4 misha 1659: if (eptr < md->start_used_ptr) md->start_used_ptr = eptr;
1.1 misha 1660: ecode += 1 + LINK_SIZE;
1661: break;
1662:
1663: /* The callout item calls an external function, if one is provided, passing
1664: details of the match so far. This is mainly for debugging, though the
1665: function is able to force a failure. */
1666:
1667: case OP_CALLOUT:
1.6 ! misha 1668: if (PUBL(callout) != NULL)
1.1 misha 1669: {
1.6 ! misha 1670: PUBL(callout_block) cb;
! 1671: cb.version = 2; /* Version 2 of the callout block */
1.1 misha 1672: cb.callout_number = ecode[1];
1673: cb.offset_vector = md->offset_vector;
1.6 ! misha 1674: #ifdef COMPILE_PCRE8
1.1 misha 1675: cb.subject = (PCRE_SPTR)md->start_subject;
1.6 ! misha 1676: #else
! 1677: cb.subject = (PCRE_SPTR16)md->start_subject;
! 1678: #endif
1.4 misha 1679: cb.subject_length = (int)(md->end_subject - md->start_subject);
1680: cb.start_match = (int)(mstart - md->start_subject);
1681: cb.current_position = (int)(eptr - md->start_subject);
1.1 misha 1682: cb.pattern_position = GET(ecode, 2);
1683: cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1684: cb.capture_top = offset_top/2;
1685: cb.capture_last = md->capture_last;
1686: cb.callout_data = md->callout_data;
1.6 ! misha 1687: cb.mark = md->nomatch_mark;
! 1688: if ((rrc = (*PUBL(callout))(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1.1 misha 1689: if (rrc < 0) RRETURN(rrc);
1690: }
1691: ecode += 2 + 2*LINK_SIZE;
1692: break;
1693:
1694: /* Recursion either matches the current regex, or some subexpression. The
1695: offset data is the offset to the starting bracket from the start of the
1696: whole pattern. (This is so that it works from duplicated subpatterns.)
1697:
1.6 ! misha 1698: The state of the capturing groups is preserved over recursion, and
! 1699: re-instated afterwards. We don't know how many are started and not yet
! 1700: finished (offset_top records the completed total) so we just have to save
! 1701: all the potential data. There may be up to 65535 such values, which is too
! 1702: large to put on the stack, but using malloc for small numbers seems
! 1703: expensive. As a compromise, the stack is used when there are no more than
! 1704: REC_STACK_SAVE_MAX values to store; otherwise malloc is used.
1.1 misha 1705:
1706: There are also other values that have to be saved. We use a chained
1707: sequence of blocks that actually live on the stack. Thanks to Robin Houston
1.6 ! misha 1708: for the original version of this logic. It has, however, been hacked around
! 1709: a lot, so he is not to blame for the current way it works. */
1.1 misha 1710:
1711: case OP_RECURSE:
1712: {
1.6 ! misha 1713: recursion_info *ri;
! 1714: int recno;
! 1715:
1.1 misha 1716: callpat = md->start_code + GET(ecode, 1);
1.6 ! misha 1717: recno = (callpat == md->start_code)? 0 :
1.1 misha 1718: GET2(callpat, 1 + LINK_SIZE);
1719:
1.6 ! misha 1720: /* Check for repeating a recursion without advancing the subject pointer.
! 1721: This should catch convoluted mutual recursions. (Some simple cases are
! 1722: caught at compile time.) */
! 1723:
! 1724: for (ri = md->recursive; ri != NULL; ri = ri->prevrec)
! 1725: if (recno == ri->group_num && eptr == ri->subject_position)
! 1726: RRETURN(PCRE_ERROR_RECURSELOOP);
! 1727:
1.1 misha 1728: /* Add to "recursing stack" */
1729:
1.6 ! misha 1730: new_recursive.group_num = recno;
! 1731: new_recursive.subject_position = eptr;
1.1 misha 1732: new_recursive.prevrec = md->recursive;
1733: md->recursive = &new_recursive;
1734:
1.6 ! misha 1735: /* Where to continue from afterwards */
1.1 misha 1736:
1737: ecode += 1 + LINK_SIZE;
1738:
1.6 ! misha 1739: /* Now save the offset data */
1.1 misha 1740:
1741: new_recursive.saved_max = md->offset_end;
1742: if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1743: new_recursive.offset_save = stacksave;
1744: else
1745: {
1746: new_recursive.offset_save =
1.6 ! misha 1747: (int *)(PUBL(malloc))(new_recursive.saved_max * sizeof(int));
1.1 misha 1748: if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1749: }
1750: memcpy(new_recursive.offset_save, md->offset_vector,
1751: new_recursive.saved_max * sizeof(int));
1752:
1.6 ! misha 1753: /* OK, now we can do the recursion. After processing each alternative,
! 1754: restore the offset data. If there were nested recursions, md->recursive
! 1755: might be changed, so reset it before looping. */
1.1 misha 1756:
1757: DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1.6 ! misha 1758: cbegroup = (*callpat >= OP_SBRA);
1.1 misha 1759: do
1760: {
1.6 ! misha 1761: if (cbegroup) md->match_function_type = MATCH_CBEGROUP;
! 1762: RMATCH(eptr, callpat + PRIV(OP_lengths)[*callpat], offset_top,
! 1763: md, eptrb, RM6);
! 1764: memcpy(md->offset_vector, new_recursive.offset_save,
! 1765: new_recursive.saved_max * sizeof(int));
! 1766: md->recursive = new_recursive.prevrec;
1.4 misha 1767: if (rrc == MATCH_MATCH || rrc == MATCH_ACCEPT)
1.1 misha 1768: {
1769: DPRINTF(("Recursion matched\n"));
1770: if (new_recursive.offset_save != stacksave)
1.6 ! misha 1771: (PUBL(free))(new_recursive.offset_save);
! 1772:
! 1773: /* Set where we got to in the subject, and reset the start in case
! 1774: it was changed by \K. This *is* propagated back out of a recursion,
! 1775: for Perl compatibility. */
! 1776:
! 1777: eptr = md->end_match_ptr;
! 1778: mstart = md->start_match_ptr;
! 1779: goto RECURSION_MATCHED; /* Exit loop; end processing */
1.1 misha 1780: }
1.6 ! misha 1781:
! 1782: /* PCRE does not allow THEN to escape beyond a recursion; it is treated
! 1783: as NOMATCH. */
! 1784:
! 1785: else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1.1 misha 1786: {
1787: DPRINTF(("Recursion gave error %d\n", rrc));
1.3 misha 1788: if (new_recursive.offset_save != stacksave)
1.6 ! misha 1789: (PUBL(free))(new_recursive.offset_save);
1.1 misha 1790: RRETURN(rrc);
1791: }
1792:
1793: md->recursive = &new_recursive;
1794: callpat += GET(callpat, 1);
1795: }
1796: while (*callpat == OP_ALT);
1797:
1798: DPRINTF(("Recursion didn't match\n"));
1799: md->recursive = new_recursive.prevrec;
1800: if (new_recursive.offset_save != stacksave)
1.6 ! misha 1801: (PUBL(free))(new_recursive.offset_save);
! 1802: RRETURN(MATCH_NOMATCH);
1.1 misha 1803: }
1804:
1.6 ! misha 1805: RECURSION_MATCHED:
! 1806: break;
1.1 misha 1807:
1808: /* An alternation is the end of a branch; scan along to find the end of the
1809: bracketed group and go to there. */
1810:
1811: case OP_ALT:
1812: do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1813: break;
1814:
1815: /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1816: indicating that it may occur zero times. It may repeat infinitely, or not
1817: at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1818: with fixed upper repeat limits are compiled as a number of copies, with the
1819: optional ones preceded by BRAZERO or BRAMINZERO. */
1820:
1821: case OP_BRAZERO:
1.6 ! misha 1822: next = ecode + 1;
! 1823: RMATCH(eptr, next, offset_top, md, eptrb, RM10);
! 1824: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 1825: do next += GET(next, 1); while (*next == OP_ALT);
! 1826: ecode = next + 1 + LINK_SIZE;
1.1 misha 1827: break;
1828:
1829: case OP_BRAMINZERO:
1.6 ! misha 1830: next = ecode + 1;
! 1831: do next += GET(next, 1); while (*next == OP_ALT);
! 1832: RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, eptrb, RM11);
! 1833: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 1834: ecode++;
1.1 misha 1835: break;
1836:
1837: case OP_SKIPZERO:
1.6 ! misha 1838: next = ecode+1;
! 1839: do next += GET(next,1); while (*next == OP_ALT);
! 1840: ecode = next + 1 + LINK_SIZE;
1.1 misha 1841: break;
1842:
1.6 ! misha 1843: /* BRAPOSZERO occurs before a possessive bracket group. Don't do anything
! 1844: here; just jump to the group, with allow_zero set TRUE. */
! 1845:
! 1846: case OP_BRAPOSZERO:
! 1847: op = *(++ecode);
! 1848: allow_zero = TRUE;
! 1849: if (op == OP_CBRAPOS || op == OP_SCBRAPOS) goto POSSESSIVE_CAPTURE;
! 1850: goto POSSESSIVE_NON_CAPTURE;
! 1851:
1.1 misha 1852: /* End of a group, repeated or non-repeating. */
1853:
1854: case OP_KET:
1855: case OP_KETRMIN:
1856: case OP_KETRMAX:
1.6 ! misha 1857: case OP_KETRPOS:
1.1 misha 1858: prev = ecode - GET(ecode, 1);
1859:
1860: /* If this was a group that remembered the subject start, in order to break
1861: infinite repeats of empty string matches, retrieve the subject start from
1862: the chain. Otherwise, set it NULL. */
1863:
1.6 ! misha 1864: if (*prev >= OP_SBRA || *prev == OP_ONCE)
1.1 misha 1865: {
1866: saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1867: eptrb = eptrb->epb_prev; /* Backup to previous group */
1868: }
1869: else saved_eptr = NULL;
1870:
1.6 ! misha 1871: /* If we are at the end of an assertion group or a non-capturing atomic
! 1872: group, stop matching and return MATCH_MATCH, but record the current high
! 1873: water mark for use by positive assertions. We also need to record the match
! 1874: start in case it was changed by \K. */
! 1875:
! 1876: if ((*prev >= OP_ASSERT && *prev <= OP_ASSERTBACK_NOT) ||
! 1877: *prev == OP_ONCE_NC)
1.1 misha 1878: {
1.6 ! misha 1879: md->end_match_ptr = eptr; /* For ONCE_NC */
1.1 misha 1880: md->end_offset_top = offset_top;
1.4 misha 1881: md->start_match_ptr = mstart;
1.6 ! misha 1882: RRETURN(MATCH_MATCH); /* Sets md->mark */
1.1 misha 1883: }
1884:
1885: /* For capturing groups we have to check the group number back at the start
1886: and if necessary complete handling an extraction by setting the offsets and
1.6 ! misha 1887: bumping the high water mark. Whole-pattern recursion is coded as a recurse
! 1888: into group 0, so it won't be picked up here. Instead, we catch it when the
! 1889: OP_END is reached. Other recursion is handled here. We just have to record
! 1890: the current subject position and start match pointer and give a MATCH
! 1891: return. */
1.1 misha 1892:
1.6 ! misha 1893: if (*prev == OP_CBRA || *prev == OP_SCBRA ||
! 1894: *prev == OP_CBRAPOS || *prev == OP_SCBRAPOS)
1.1 misha 1895: {
1896: number = GET2(prev, 1+LINK_SIZE);
1897: offset = number << 1;
1898:
1.4 misha 1899: #ifdef PCRE_DEBUG
1.1 misha 1900: printf("end bracket %d", number);
1901: printf("\n");
1902: #endif
1903:
1.6 ! misha 1904: /* Handle a recursively called group. */
! 1905:
! 1906: if (md->recursive != NULL && md->recursive->group_num == number)
! 1907: {
! 1908: md->end_match_ptr = eptr;
! 1909: md->start_match_ptr = mstart;
! 1910: RRETURN(MATCH_MATCH);
! 1911: }
! 1912:
! 1913: /* Deal with capturing */
! 1914:
1.1 misha 1915: md->capture_last = number;
1916: if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1917: {
1.6 ! misha 1918: /* If offset is greater than offset_top, it means that we are
! 1919: "skipping" a capturing group, and that group's offsets must be marked
! 1920: unset. In earlier versions of PCRE, all the offsets were unset at the
! 1921: start of matching, but this doesn't work because atomic groups and
! 1922: assertions can cause a value to be set that should later be unset.
! 1923: Example: matching /(?>(a))b|(a)c/ against "ac". This sets group 1 as
! 1924: part of the atomic group, but this is not on the final matching path,
! 1925: so must be unset when 2 is set. (If there is no group 2, there is no
! 1926: problem, because offset_top will then be 2, indicating no capture.) */
! 1927:
! 1928: if (offset > offset_top)
! 1929: {
! 1930: register int *iptr = md->offset_vector + offset_top;
! 1931: register int *iend = md->offset_vector + offset;
! 1932: while (iptr < iend) *iptr++ = -1;
! 1933: }
! 1934:
! 1935: /* Now make the extraction */
! 1936:
1.1 misha 1937: md->offset_vector[offset] =
1938: md->offset_vector[md->offset_end - number];
1.4 misha 1939: md->offset_vector[offset+1] = (int)(eptr - md->start_subject);
1.1 misha 1940: if (offset_top <= offset) offset_top = offset + 2;
1941: }
1.6 ! misha 1942: }
1.1 misha 1943:
1.6 ! misha 1944: /* For an ordinary non-repeating ket, just continue at this level. This
! 1945: also happens for a repeating ket if no characters were matched in the
! 1946: group. This is the forcible breaking of infinite loops as implemented in
! 1947: Perl 5.005. For a non-repeating atomic group that includes captures,
! 1948: establish a backup point by processing the rest of the pattern at a lower
! 1949: level. If this results in a NOMATCH return, pass MATCH_ONCE back to the
! 1950: original OP_ONCE level, thereby bypassing intermediate backup points, but
! 1951: resetting any captures that happened along the way. */
1.1 misha 1952:
1.6 ! misha 1953: if (*ecode == OP_KET || eptr == saved_eptr)
! 1954: {
! 1955: if (*prev == OP_ONCE)
1.1 misha 1956: {
1.6 ! misha 1957: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM12);
! 1958: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 1959: md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
! 1960: RRETURN(MATCH_ONCE);
1.1 misha 1961: }
1.6 ! misha 1962: ecode += 1 + LINK_SIZE; /* Carry on at this level */
! 1963: break;
1.1 misha 1964: }
1965:
1.6 ! misha 1966: /* OP_KETRPOS is a possessive repeating ket. Remember the current position,
! 1967: and return the MATCH_KETRPOS. This makes it possible to do the repeats one
! 1968: at a time from the outer level, thus saving stack. */
1.1 misha 1969:
1.6 ! misha 1970: if (*ecode == OP_KETRPOS)
1.1 misha 1971: {
1.6 ! misha 1972: md->end_match_ptr = eptr;
! 1973: md->end_offset_top = offset_top;
! 1974: RRETURN(MATCH_KETRPOS);
1.1 misha 1975: }
1976:
1.6 ! misha 1977: /* The normal repeating kets try the rest of the pattern or restart from
! 1978: the preceding bracket, in the appropriate order. In the second case, we can
! 1979: use tail recursion to avoid using another stack frame, unless we have an
! 1980: atomic group or an unlimited repeat of a group that can match an empty
! 1981: string. */
1.1 misha 1982:
1983: if (*ecode == OP_KETRMIN)
1984: {
1.6 ! misha 1985: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM7);
1.1 misha 1986: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 1987: if (*prev == OP_ONCE)
1.1 misha 1988: {
1.6 ! misha 1989: RMATCH(eptr, prev, offset_top, md, eptrb, RM8);
! 1990: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 1991: md->once_target = prev; /* Level at which to change to MATCH_NOMATCH */
! 1992: RRETURN(MATCH_ONCE);
! 1993: }
! 1994: if (*prev >= OP_SBRA) /* Could match an empty string */
! 1995: {
! 1996: md->match_function_type = MATCH_CBEGROUP;
! 1997: RMATCH(eptr, prev, offset_top, md, eptrb, RM50);
1.1 misha 1998: RRETURN(rrc);
1999: }
2000: ecode = prev;
2001: goto TAIL_RECURSE;
2002: }
2003: else /* OP_KETRMAX */
2004: {
1.6 ! misha 2005: if (*prev >= OP_SBRA) md->match_function_type = MATCH_CBEGROUP;
! 2006: RMATCH(eptr, prev, offset_top, md, eptrb, RM13);
! 2007: if (rrc == MATCH_ONCE && md->once_target == prev) rrc = MATCH_NOMATCH;
1.1 misha 2008: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 2009: if (*prev == OP_ONCE)
! 2010: {
! 2011: RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, eptrb, RM9);
! 2012: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2013: md->once_target = prev;
! 2014: RRETURN(MATCH_ONCE);
! 2015: }
1.1 misha 2016: ecode += 1 + LINK_SIZE;
2017: goto TAIL_RECURSE;
2018: }
2019: /* Control never gets here */
2020:
1.6 ! misha 2021: /* Not multiline mode: start of subject assertion, unless notbol. */
1.1 misha 2022:
2023: case OP_CIRC:
1.6 ! misha 2024: if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1.1 misha 2025:
2026: /* Start of subject assertion */
2027:
2028: case OP_SOD:
1.6 ! misha 2029: if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
! 2030: ecode++;
! 2031: break;
! 2032:
! 2033: /* Multiline mode: start of subject unless notbol, or after any newline. */
! 2034:
! 2035: case OP_CIRCM:
! 2036: if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
! 2037: if (eptr != md->start_subject &&
! 2038: (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
! 2039: RRETURN(MATCH_NOMATCH);
1.1 misha 2040: ecode++;
2041: break;
2042:
2043: /* Start of match assertion */
2044:
2045: case OP_SOM:
1.6 ! misha 2046: if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1.1 misha 2047: ecode++;
2048: break;
2049:
2050: /* Reset the start of match point */
2051:
2052: case OP_SET_SOM:
2053: mstart = eptr;
2054: ecode++;
2055: break;
2056:
1.6 ! misha 2057: /* Multiline mode: assert before any newline, or before end of subject
! 2058: unless noteol is set. */
1.1 misha 2059:
1.6 ! misha 2060: case OP_DOLLM:
! 2061: if (eptr < md->end_subject)
! 2062: { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
! 2063: else
1.1 misha 2064: {
1.6 ! misha 2065: if (md->noteol) RRETURN(MATCH_NOMATCH);
! 2066: SCHECK_PARTIAL();
1.1 misha 2067: }
1.6 ! misha 2068: ecode++;
! 2069: break;
! 2070:
! 2071: /* Not multiline mode: assert before a terminating newline or before end of
! 2072: subject unless noteol is set. */
! 2073:
! 2074: case OP_DOLL:
! 2075: if (md->noteol) RRETURN(MATCH_NOMATCH);
! 2076: if (!md->endonly) goto ASSERT_NL_OR_EOS;
1.5 misha 2077:
1.1 misha 2078: /* ... else fall through for endonly */
2079:
2080: /* End of subject assertion (\z) */
2081:
2082: case OP_EOD:
1.6 ! misha 2083: if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1.5 misha 2084: SCHECK_PARTIAL();
1.1 misha 2085: ecode++;
2086: break;
2087:
2088: /* End of subject or ending \n assertion (\Z) */
2089:
2090: case OP_EODN:
1.5 misha 2091: ASSERT_NL_OR_EOS:
2092: if (eptr < md->end_subject &&
1.1 misha 2093: (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1.6 ! misha 2094: RRETURN(MATCH_NOMATCH);
1.5 misha 2095:
2096: /* Either at end of string or \n before end. */
2097:
2098: SCHECK_PARTIAL();
1.1 misha 2099: ecode++;
2100: break;
2101:
2102: /* Word boundary assertions */
2103:
2104: case OP_NOT_WORD_BOUNDARY:
2105: case OP_WORD_BOUNDARY:
2106: {
2107:
2108: /* Find out if the previous and current characters are "word" characters.
2109: It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1.4 misha 2110: be "non-word" characters. Remember the earliest consulted character for
2111: partial matching. */
1.1 misha 2112:
1.6 ! misha 2113: #ifdef SUPPORT_UTF
! 2114: if (utf)
1.1 misha 2115: {
1.4 misha 2116: /* Get status of previous character */
2117:
1.1 misha 2118: if (eptr == md->start_subject) prev_is_word = FALSE; else
2119: {
1.6 ! misha 2120: PCRE_PUCHAR lastptr = eptr - 1;
! 2121: BACKCHAR(lastptr);
1.4 misha 2122: if (lastptr < md->start_used_ptr) md->start_used_ptr = lastptr;
1.1 misha 2123: GETCHAR(c, lastptr);
1.4 misha 2124: #ifdef SUPPORT_UCP
2125: if (md->use_ucp)
2126: {
2127: if (c == '_') prev_is_word = TRUE; else
2128: {
2129: int cat = UCD_CATEGORY(c);
2130: prev_is_word = (cat == ucp_L || cat == ucp_N);
2131: }
2132: }
2133: else
2134: #endif
1.1 misha 2135: prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2136: }
1.4 misha 2137:
2138: /* Get status of next character */
2139:
2140: if (eptr >= md->end_subject)
2141: {
2142: SCHECK_PARTIAL();
2143: cur_is_word = FALSE;
2144: }
2145: else
1.1 misha 2146: {
2147: GETCHAR(c, eptr);
1.4 misha 2148: #ifdef SUPPORT_UCP
2149: if (md->use_ucp)
2150: {
2151: if (c == '_') cur_is_word = TRUE; else
2152: {
2153: int cat = UCD_CATEGORY(c);
2154: cur_is_word = (cat == ucp_L || cat == ucp_N);
2155: }
2156: }
2157: else
2158: #endif
1.1 misha 2159: cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
2160: }
2161: }
2162: else
2163: #endif
2164:
1.4 misha 2165: /* Not in UTF-8 mode, but we may still have PCRE_UCP set, and for
2166: consistency with the behaviour of \w we do use it in this case. */
1.1 misha 2167:
2168: {
1.4 misha 2169: /* Get status of previous character */
2170:
2171: if (eptr == md->start_subject) prev_is_word = FALSE; else
2172: {
2173: if (eptr <= md->start_used_ptr) md->start_used_ptr = eptr - 1;
2174: #ifdef SUPPORT_UCP
2175: if (md->use_ucp)
2176: {
2177: c = eptr[-1];
2178: if (c == '_') prev_is_word = TRUE; else
2179: {
2180: int cat = UCD_CATEGORY(c);
2181: prev_is_word = (cat == ucp_L || cat == ucp_N);
2182: }
2183: }
2184: else
2185: #endif
1.6 ! misha 2186: prev_is_word = MAX_255(eptr[-1])
! 2187: && ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1.4 misha 2188: }
2189:
2190: /* Get status of next character */
2191:
2192: if (eptr >= md->end_subject)
2193: {
2194: SCHECK_PARTIAL();
2195: cur_is_word = FALSE;
2196: }
2197: else
2198: #ifdef SUPPORT_UCP
2199: if (md->use_ucp)
2200: {
2201: c = *eptr;
2202: if (c == '_') cur_is_word = TRUE; else
2203: {
2204: int cat = UCD_CATEGORY(c);
2205: cur_is_word = (cat == ucp_L || cat == ucp_N);
2206: }
2207: }
2208: else
2209: #endif
1.6 ! misha 2210: cur_is_word = MAX_255(*eptr)
! 2211: && ((md->ctypes[*eptr] & ctype_word) != 0);
1.1 misha 2212: }
2213:
2214: /* Now see if the situation is what we want */
2215:
2216: if ((*ecode++ == OP_WORD_BOUNDARY)?
2217: cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1.6 ! misha 2218: RRETURN(MATCH_NOMATCH);
1.1 misha 2219: }
2220: break;
2221:
2222: /* Match a single character type; inline for speed */
2223:
2224: case OP_ANY:
1.6 ! misha 2225: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1.1 misha 2226: /* Fall through */
2227:
2228: case OP_ALLANY:
1.6 ! misha 2229: if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
! 2230: { /* not be updated before SCHECK_PARTIAL. */
1.4 misha 2231: SCHECK_PARTIAL();
1.6 ! misha 2232: RRETURN(MATCH_NOMATCH);
1.4 misha 2233: }
1.6 ! misha 2234: eptr++;
! 2235: #ifdef SUPPORT_UTF
! 2236: if (utf) ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
! 2237: #endif
1.1 misha 2238: ecode++;
2239: break;
2240:
2241: /* Match a single byte, even in UTF-8 mode. This opcode really does match
2242: any byte, even newline, independent of the setting of PCRE_DOTALL. */
2243:
2244: case OP_ANYBYTE:
1.6 ! misha 2245: if (eptr >= md->end_subject) /* DO NOT merge the eptr++ here; it must */
! 2246: { /* not be updated before SCHECK_PARTIAL. */
1.4 misha 2247: SCHECK_PARTIAL();
1.6 ! misha 2248: RRETURN(MATCH_NOMATCH);
1.4 misha 2249: }
1.6 ! misha 2250: eptr++;
1.1 misha 2251: ecode++;
2252: break;
2253:
2254: case OP_NOT_DIGIT:
1.4 misha 2255: if (eptr >= md->end_subject)
2256: {
2257: SCHECK_PARTIAL();
1.6 ! misha 2258: RRETURN(MATCH_NOMATCH);
1.4 misha 2259: }
1.1 misha 2260: GETCHARINCTEST(c, eptr);
2261: if (
1.6 ! misha 2262: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
1.1 misha 2263: c < 256 &&
2264: #endif
2265: (md->ctypes[c] & ctype_digit) != 0
2266: )
1.6 ! misha 2267: RRETURN(MATCH_NOMATCH);
1.1 misha 2268: ecode++;
2269: break;
2270:
2271: case OP_DIGIT:
1.4 misha 2272: if (eptr >= md->end_subject)
2273: {
2274: SCHECK_PARTIAL();
1.6 ! misha 2275: RRETURN(MATCH_NOMATCH);
1.4 misha 2276: }
1.1 misha 2277: GETCHARINCTEST(c, eptr);
2278: if (
1.6 ! misha 2279: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
! 2280: c > 255 ||
1.1 misha 2281: #endif
2282: (md->ctypes[c] & ctype_digit) == 0
2283: )
1.6 ! misha 2284: RRETURN(MATCH_NOMATCH);
1.1 misha 2285: ecode++;
2286: break;
2287:
2288: case OP_NOT_WHITESPACE:
1.4 misha 2289: if (eptr >= md->end_subject)
2290: {
2291: SCHECK_PARTIAL();
1.6 ! misha 2292: RRETURN(MATCH_NOMATCH);
1.4 misha 2293: }
1.1 misha 2294: GETCHARINCTEST(c, eptr);
2295: if (
1.6 ! misha 2296: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
1.1 misha 2297: c < 256 &&
2298: #endif
2299: (md->ctypes[c] & ctype_space) != 0
2300: )
1.6 ! misha 2301: RRETURN(MATCH_NOMATCH);
1.1 misha 2302: ecode++;
2303: break;
2304:
2305: case OP_WHITESPACE:
1.4 misha 2306: if (eptr >= md->end_subject)
2307: {
2308: SCHECK_PARTIAL();
1.6 ! misha 2309: RRETURN(MATCH_NOMATCH);
1.4 misha 2310: }
1.1 misha 2311: GETCHARINCTEST(c, eptr);
2312: if (
1.6 ! misha 2313: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
! 2314: c > 255 ||
1.1 misha 2315: #endif
2316: (md->ctypes[c] & ctype_space) == 0
2317: )
1.6 ! misha 2318: RRETURN(MATCH_NOMATCH);
1.1 misha 2319: ecode++;
2320: break;
2321:
2322: case OP_NOT_WORDCHAR:
1.4 misha 2323: if (eptr >= md->end_subject)
2324: {
2325: SCHECK_PARTIAL();
1.6 ! misha 2326: RRETURN(MATCH_NOMATCH);
1.4 misha 2327: }
1.1 misha 2328: GETCHARINCTEST(c, eptr);
2329: if (
1.6 ! misha 2330: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
1.1 misha 2331: c < 256 &&
2332: #endif
2333: (md->ctypes[c] & ctype_word) != 0
2334: )
1.6 ! misha 2335: RRETURN(MATCH_NOMATCH);
1.1 misha 2336: ecode++;
2337: break;
2338:
2339: case OP_WORDCHAR:
1.4 misha 2340: if (eptr >= md->end_subject)
2341: {
2342: SCHECK_PARTIAL();
1.6 ! misha 2343: RRETURN(MATCH_NOMATCH);
1.4 misha 2344: }
1.1 misha 2345: GETCHARINCTEST(c, eptr);
2346: if (
1.6 ! misha 2347: #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
! 2348: c > 255 ||
1.1 misha 2349: #endif
2350: (md->ctypes[c] & ctype_word) == 0
2351: )
1.6 ! misha 2352: RRETURN(MATCH_NOMATCH);
1.1 misha 2353: ecode++;
2354: break;
2355:
2356: case OP_ANYNL:
1.4 misha 2357: if (eptr >= md->end_subject)
2358: {
2359: SCHECK_PARTIAL();
1.6 ! misha 2360: RRETURN(MATCH_NOMATCH);
1.4 misha 2361: }
1.1 misha 2362: GETCHARINCTEST(c, eptr);
2363: switch(c)
2364: {
1.6 ! misha 2365: default: RRETURN(MATCH_NOMATCH);
! 2366:
1.1 misha 2367: case 0x000d:
2368: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
2369: break;
2370:
2371: case 0x000a:
2372: break;
2373:
2374: case 0x000b:
2375: case 0x000c:
2376: case 0x0085:
2377: case 0x2028:
2378: case 0x2029:
1.6 ! misha 2379: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1.1 misha 2380: break;
2381: }
2382: ecode++;
2383: break;
2384:
2385: case OP_NOT_HSPACE:
1.4 misha 2386: if (eptr >= md->end_subject)
2387: {
2388: SCHECK_PARTIAL();
1.6 ! misha 2389: RRETURN(MATCH_NOMATCH);
1.4 misha 2390: }
1.1 misha 2391: GETCHARINCTEST(c, eptr);
2392: switch(c)
2393: {
2394: default: break;
2395: case 0x09: /* HT */
2396: case 0x20: /* SPACE */
2397: case 0xa0: /* NBSP */
2398: case 0x1680: /* OGHAM SPACE MARK */
2399: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2400: case 0x2000: /* EN QUAD */
2401: case 0x2001: /* EM QUAD */
2402: case 0x2002: /* EN SPACE */
2403: case 0x2003: /* EM SPACE */
2404: case 0x2004: /* THREE-PER-EM SPACE */
2405: case 0x2005: /* FOUR-PER-EM SPACE */
2406: case 0x2006: /* SIX-PER-EM SPACE */
2407: case 0x2007: /* FIGURE SPACE */
2408: case 0x2008: /* PUNCTUATION SPACE */
2409: case 0x2009: /* THIN SPACE */
2410: case 0x200A: /* HAIR SPACE */
2411: case 0x202f: /* NARROW NO-BREAK SPACE */
2412: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2413: case 0x3000: /* IDEOGRAPHIC SPACE */
1.6 ! misha 2414: RRETURN(MATCH_NOMATCH);
1.1 misha 2415: }
2416: ecode++;
2417: break;
2418:
2419: case OP_HSPACE:
1.4 misha 2420: if (eptr >= md->end_subject)
2421: {
2422: SCHECK_PARTIAL();
1.6 ! misha 2423: RRETURN(MATCH_NOMATCH);
1.4 misha 2424: }
1.1 misha 2425: GETCHARINCTEST(c, eptr);
2426: switch(c)
2427: {
1.6 ! misha 2428: default: RRETURN(MATCH_NOMATCH);
1.1 misha 2429: case 0x09: /* HT */
2430: case 0x20: /* SPACE */
2431: case 0xa0: /* NBSP */
2432: case 0x1680: /* OGHAM SPACE MARK */
2433: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
2434: case 0x2000: /* EN QUAD */
2435: case 0x2001: /* EM QUAD */
2436: case 0x2002: /* EN SPACE */
2437: case 0x2003: /* EM SPACE */
2438: case 0x2004: /* THREE-PER-EM SPACE */
2439: case 0x2005: /* FOUR-PER-EM SPACE */
2440: case 0x2006: /* SIX-PER-EM SPACE */
2441: case 0x2007: /* FIGURE SPACE */
2442: case 0x2008: /* PUNCTUATION SPACE */
2443: case 0x2009: /* THIN SPACE */
2444: case 0x200A: /* HAIR SPACE */
2445: case 0x202f: /* NARROW NO-BREAK SPACE */
2446: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
2447: case 0x3000: /* IDEOGRAPHIC SPACE */
2448: break;
2449: }
2450: ecode++;
2451: break;
2452:
2453: case OP_NOT_VSPACE:
1.4 misha 2454: if (eptr >= md->end_subject)
2455: {
2456: SCHECK_PARTIAL();
1.6 ! misha 2457: RRETURN(MATCH_NOMATCH);
1.4 misha 2458: }
1.1 misha 2459: GETCHARINCTEST(c, eptr);
2460: switch(c)
2461: {
2462: default: break;
2463: case 0x0a: /* LF */
2464: case 0x0b: /* VT */
2465: case 0x0c: /* FF */
2466: case 0x0d: /* CR */
2467: case 0x85: /* NEL */
2468: case 0x2028: /* LINE SEPARATOR */
2469: case 0x2029: /* PARAGRAPH SEPARATOR */
1.6 ! misha 2470: RRETURN(MATCH_NOMATCH);
1.1 misha 2471: }
2472: ecode++;
2473: break;
2474:
2475: case OP_VSPACE:
1.4 misha 2476: if (eptr >= md->end_subject)
2477: {
2478: SCHECK_PARTIAL();
1.6 ! misha 2479: RRETURN(MATCH_NOMATCH);
1.4 misha 2480: }
1.1 misha 2481: GETCHARINCTEST(c, eptr);
2482: switch(c)
2483: {
1.6 ! misha 2484: default: RRETURN(MATCH_NOMATCH);
1.1 misha 2485: case 0x0a: /* LF */
2486: case 0x0b: /* VT */
2487: case 0x0c: /* FF */
2488: case 0x0d: /* CR */
2489: case 0x85: /* NEL */
2490: case 0x2028: /* LINE SEPARATOR */
2491: case 0x2029: /* PARAGRAPH SEPARATOR */
2492: break;
2493: }
2494: ecode++;
2495: break;
2496:
2497: #ifdef SUPPORT_UCP
2498: /* Check the next character by Unicode property. We will get here only
2499: if the support is in the binary; otherwise a compile-time error occurs. */
2500:
2501: case OP_PROP:
2502: case OP_NOTPROP:
1.4 misha 2503: if (eptr >= md->end_subject)
2504: {
2505: SCHECK_PARTIAL();
1.6 ! misha 2506: RRETURN(MATCH_NOMATCH);
1.4 misha 2507: }
1.1 misha 2508: GETCHARINCTEST(c, eptr);
2509: {
1.3 misha 2510: const ucd_record *prop = GET_UCD(c);
1.1 misha 2511:
2512: switch(ecode[1])
2513: {
2514: case PT_ANY:
1.6 ! misha 2515: if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1.1 misha 2516: break;
2517:
2518: case PT_LAMP:
1.2 misha 2519: if ((prop->chartype == ucp_Lu ||
2520: prop->chartype == ucp_Ll ||
2521: prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1.6 ! misha 2522: RRETURN(MATCH_NOMATCH);
1.4 misha 2523: break;
1.1 misha 2524:
2525: case PT_GC:
1.6 ! misha 2526: if ((ecode[2] != PRIV(ucp_gentype)[prop->chartype]) == (op == OP_PROP))
! 2527: RRETURN(MATCH_NOMATCH);
1.1 misha 2528: break;
2529:
2530: case PT_PC:
1.2 misha 2531: if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1.6 ! misha 2532: RRETURN(MATCH_NOMATCH);
1.1 misha 2533: break;
2534:
2535: case PT_SC:
1.2 misha 2536: if ((ecode[2] != prop->script) == (op == OP_PROP))
1.6 ! misha 2537: RRETURN(MATCH_NOMATCH);
1.4 misha 2538: break;
2539:
2540: /* These are specials */
2541:
2542: case PT_ALNUM:
1.6 ! misha 2543: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
! 2544: PRIV(ucp_gentype)[prop->chartype] == ucp_N) == (op == OP_NOTPROP))
! 2545: RRETURN(MATCH_NOMATCH);
1.4 misha 2546: break;
2547:
2548: case PT_SPACE: /* Perl space */
1.6 ! misha 2549: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.4 misha 2550: c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2551: == (op == OP_NOTPROP))
1.6 ! misha 2552: RRETURN(MATCH_NOMATCH);
1.4 misha 2553: break;
2554:
2555: case PT_PXSPACE: /* POSIX space */
1.6 ! misha 2556: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_Z ||
1.4 misha 2557: c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2558: c == CHAR_FF || c == CHAR_CR)
2559: == (op == OP_NOTPROP))
1.6 ! misha 2560: RRETURN(MATCH_NOMATCH);
1.4 misha 2561: break;
2562:
2563: case PT_WORD:
1.6 ! misha 2564: if ((PRIV(ucp_gentype)[prop->chartype] == ucp_L ||
! 2565: PRIV(ucp_gentype)[prop->chartype] == ucp_N ||
1.4 misha 2566: c == CHAR_UNDERSCORE) == (op == OP_NOTPROP))
1.6 ! misha 2567: RRETURN(MATCH_NOMATCH);
1.1 misha 2568: break;
2569:
1.4 misha 2570: /* This should never occur */
2571:
1.1 misha 2572: default:
2573: RRETURN(PCRE_ERROR_INTERNAL);
2574: }
2575:
2576: ecode += 3;
2577: }
2578: break;
2579:
2580: /* Match an extended Unicode sequence. We will get here only if the support
2581: is in the binary; otherwise a compile-time error occurs. */
2582:
2583: case OP_EXTUNI:
1.4 misha 2584: if (eptr >= md->end_subject)
2585: {
2586: SCHECK_PARTIAL();
1.6 ! misha 2587: RRETURN(MATCH_NOMATCH);
1.4 misha 2588: }
1.1 misha 2589: GETCHARINCTEST(c, eptr);
1.6 ! misha 2590: if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
! 2591: while (eptr < md->end_subject)
1.1 misha 2592: {
1.6 ! misha 2593: int len = 1;
! 2594: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
! 2595: if (UCD_CATEGORY(c) != ucp_M) break;
! 2596: eptr += len;
1.1 misha 2597: }
2598: ecode++;
2599: break;
2600: #endif
2601:
2602:
2603: /* Match a back reference, possibly repeatedly. Look past the end of the
2604: item to see if there is repeat information following. The code is similar
2605: to that for character classes, but repeated for efficiency. Then obey
2606: similar code to character type repeats - written out again for speed.
2607: However, if the referenced string is the empty string, always treat
2608: it as matched, any number of times (otherwise there could be infinite
2609: loops). */
2610:
2611: case OP_REF:
1.6 ! misha 2612: case OP_REFI:
! 2613: caseless = op == OP_REFI;
! 2614: offset = GET2(ecode, 1) << 1; /* Doubled ref number */
! 2615: ecode += 1 + IMM2_SIZE;
1.1 misha 2616:
1.6 ! misha 2617: /* If the reference is unset, there are two possibilities:
1.1 misha 2618:
1.6 ! misha 2619: (a) In the default, Perl-compatible state, set the length negative;
! 2620: this ensures that every attempt at a match fails. We can't just fail
! 2621: here, because of the possibility of quantifiers with zero minima.
1.1 misha 2622:
1.6 ! misha 2623: (b) If the JavaScript compatibility flag is set, set the length to zero
! 2624: so that the back reference matches an empty string.
1.1 misha 2625:
1.6 ! misha 2626: Otherwise, set the length to the length of what was matched by the
! 2627: referenced subpattern. */
1.1 misha 2628:
1.6 ! misha 2629: if (offset >= offset_top || md->offset_vector[offset] < 0)
! 2630: length = (md->jscript_compat)? 0 : -1;
! 2631: else
! 2632: length = md->offset_vector[offset+1] - md->offset_vector[offset];
1.1 misha 2633:
1.6 ! misha 2634: /* Set up for repetition, or handle the non-repeated case */
1.1 misha 2635:
1.6 ! misha 2636: switch (*ecode)
! 2637: {
! 2638: case OP_CRSTAR:
! 2639: case OP_CRMINSTAR:
! 2640: case OP_CRPLUS:
! 2641: case OP_CRMINPLUS:
! 2642: case OP_CRQUERY:
! 2643: case OP_CRMINQUERY:
! 2644: c = *ecode++ - OP_CRSTAR;
! 2645: minimize = (c & 1) != 0;
! 2646: min = rep_min[c]; /* Pick up values from tables; */
! 2647: max = rep_max[c]; /* zero for max => infinity */
! 2648: if (max == 0) max = INT_MAX;
! 2649: break;
1.1 misha 2650:
1.6 ! misha 2651: case OP_CRRANGE:
! 2652: case OP_CRMINRANGE:
! 2653: minimize = (*ecode == OP_CRMINRANGE);
! 2654: min = GET2(ecode, 1);
! 2655: max = GET2(ecode, 1 + IMM2_SIZE);
! 2656: if (max == 0) max = INT_MAX;
! 2657: ecode += 1 + 2 * IMM2_SIZE;
! 2658: break;
1.1 misha 2659:
1.6 ! misha 2660: default: /* No repeat follows */
! 2661: if ((length = match_ref(offset, eptr, length, md, caseless)) < 0)
! 2662: {
! 2663: CHECK_PARTIAL();
! 2664: RRETURN(MATCH_NOMATCH);
1.1 misha 2665: }
1.6 ! misha 2666: eptr += length;
! 2667: continue; /* With the main loop */
! 2668: }
1.1 misha 2669:
1.6 ! misha 2670: /* Handle repeated back references. If the length of the reference is
! 2671: zero, just continue with the main loop. If the length is negative, it
! 2672: means the reference is unset in non-JavaScript-compatible mode. If the minimum is
! 2673: zero, we can continue at the same level without recursion. For any other
! 2674: minimum, carrying on will result in NOMATCH. */
1.1 misha 2675:
1.6 ! misha 2676: if (length == 0) continue;
! 2677: if (length < 0 && min == 0) continue;
1.1 misha 2678:
1.6 ! misha 2679: /* First, ensure the minimum number of matches are present. We get back
! 2680: the length of the reference string explicitly rather than passing the
! 2681: address of eptr, so that eptr can be a register variable. */
1.1 misha 2682:
1.6 ! misha 2683: for (i = 1; i <= min; i++)
! 2684: {
! 2685: int slength;
! 2686: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
1.1 misha 2687: {
1.6 ! misha 2688: CHECK_PARTIAL();
! 2689: RRETURN(MATCH_NOMATCH);
1.1 misha 2690: }
1.6 ! misha 2691: eptr += slength;
! 2692: }
1.1 misha 2693:
1.6 ! misha 2694: /* If min = max, continue at the same level without recursion.
! 2695: They are not both allowed to be zero. */
1.1 misha 2696:
1.6 ! misha 2697: if (min == max) continue;
1.1 misha 2698:
1.6 ! misha 2699: /* If minimizing, keep trying and advancing the pointer */
1.1 misha 2700:
1.6 ! misha 2701: if (minimize)
! 2702: {
! 2703: for (fi = min;; fi++)
1.1 misha 2704: {
1.6 ! misha 2705: int slength;
! 2706: RMATCH(eptr, ecode, offset_top, md, eptrb, RM14);
! 2707: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2708: if (fi >= max) RRETURN(MATCH_NOMATCH);
! 2709: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
1.1 misha 2710: {
1.6 ! misha 2711: CHECK_PARTIAL();
! 2712: RRETURN(MATCH_NOMATCH);
1.1 misha 2713: }
1.6 ! misha 2714: eptr += slength;
1.1 misha 2715: }
1.6 ! misha 2716: /* Control never gets here */
! 2717: }
1.1 misha 2718:
1.6 ! misha 2719: /* If maximizing, find the longest string and work backwards */
1.1 misha 2720:
1.6 ! misha 2721: else
! 2722: {
! 2723: pp = eptr;
! 2724: for (i = min; i < max; i++)
1.1 misha 2725: {
1.6 ! misha 2726: int slength;
! 2727: if ((slength = match_ref(offset, eptr, length, md, caseless)) < 0)
1.1 misha 2728: {
1.6 ! misha 2729: CHECK_PARTIAL();
! 2730: break;
1.1 misha 2731: }
1.6 ! misha 2732: eptr += slength;
! 2733: }
! 2734: while (eptr >= pp)
! 2735: {
! 2736: RMATCH(eptr, ecode, offset_top, md, eptrb, RM15);
! 2737: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
! 2738: eptr -= length;
1.1 misha 2739: }
1.6 ! misha 2740: RRETURN(MATCH_NOMATCH);
1.1 misha 2741: }
2742: /* Control never gets here */
2743:
2744: /* Match a bit-mapped character class, possibly repeatedly. This op code is
2745: used when all the characters in the class have values in the range 0-255,
2746: and either the matching is caseful, or the characters are in the range
2747: 0-127 when UTF-8 processing is enabled. The only difference between
2748: OP_CLASS and OP_NCLASS occurs when a data character outside the range is
2749: encountered.
2750:
2751: First, look past the end of the item to see if there is repeat information
2752: following. Then obey similar code to character type repeats - written out
2753: again for speed. */
2754:
2755: case OP_NCLASS:
2756: case OP_CLASS:
2757: {
1.6 ! misha 2758: /* The data variable is saved across frames, so the byte map needs to
! 2759: be stored there. */
! 2760: #define BYTE_MAP ((pcre_uint8 *)data)
1.1 misha 2761: data = ecode + 1; /* Save for matching */
1.6 ! misha 2762: ecode += 1 + (32 / sizeof(pcre_uchar)); /* Advance past the item */
1.1 misha 2763:
2764: switch (*ecode)
2765: {
2766: case OP_CRSTAR:
2767: case OP_CRMINSTAR:
2768: case OP_CRPLUS:
2769: case OP_CRMINPLUS:
2770: case OP_CRQUERY:
2771: case OP_CRMINQUERY:
2772: c = *ecode++ - OP_CRSTAR;
2773: minimize = (c & 1) != 0;
2774: min = rep_min[c]; /* Pick up values from tables; */
2775: max = rep_max[c]; /* zero for max => infinity */
2776: if (max == 0) max = INT_MAX;
2777: break;
2778:
2779: case OP_CRRANGE:
2780: case OP_CRMINRANGE:
2781: minimize = (*ecode == OP_CRMINRANGE);
2782: min = GET2(ecode, 1);
1.6 ! misha 2783: max = GET2(ecode, 1 + IMM2_SIZE);
1.1 misha 2784: if (max == 0) max = INT_MAX;
1.6 ! misha 2785: ecode += 1 + 2 * IMM2_SIZE;
1.1 misha 2786: break;
2787:
2788: default: /* No repeat follows */
2789: min = max = 1;
2790: break;
2791: }
2792:
2793: /* First, ensure the minimum number of matches are present. */
2794:
1.6 ! misha 2795: #ifdef SUPPORT_UTF
! 2796: if (utf)
1.1 misha 2797: {
2798: for (i = 1; i <= min; i++)
2799: {
1.4 misha 2800: if (eptr >= md->end_subject)
2801: {
2802: SCHECK_PARTIAL();
1.6 ! misha 2803: RRETURN(MATCH_NOMATCH);
1.4 misha 2804: }
1.1 misha 2805: GETCHARINC(c, eptr);
2806: if (c > 255)
2807: {
1.6 ! misha 2808: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1.1 misha 2809: }
2810: else
1.6 ! misha 2811: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misha 2812: }
2813: }
2814: else
2815: #endif
1.6 ! misha 2816: /* Not UTF mode */
1.1 misha 2817: {
2818: for (i = 1; i <= min; i++)
2819: {
1.4 misha 2820: if (eptr >= md->end_subject)
2821: {
2822: SCHECK_PARTIAL();
1.6 ! misha 2823: RRETURN(MATCH_NOMATCH);
1.4 misha 2824: }
1.1 misha 2825: c = *eptr++;
1.6 ! misha 2826: #ifndef COMPILE_PCRE8
! 2827: if (c > 255)
! 2828: {
! 2829: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
! 2830: }
! 2831: else
! 2832: #endif
! 2833: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misha 2834: }
2835: }
2836:
2837: /* If max == min we can continue with the main loop without the
2838: need to recurse. */
2839:
2840: if (min == max) continue;
2841:
2842: /* If minimizing, keep testing the rest of the expression and advancing
2843: the pointer while it matches the class. */
2844:
2845: if (minimize)
2846: {
1.6 ! misha 2847: #ifdef SUPPORT_UTF
! 2848: if (utf)
1.1 misha 2849: {
2850: for (fi = min;; fi++)
2851: {
1.6 ! misha 2852: RMATCH(eptr, ecode, offset_top, md, eptrb, RM16);
1.1 misha 2853: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 2854: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 2855: if (eptr >= md->end_subject)
2856: {
2857: SCHECK_PARTIAL();
1.6 ! misha 2858: RRETURN(MATCH_NOMATCH);
1.4 misha 2859: }
1.1 misha 2860: GETCHARINC(c, eptr);
2861: if (c > 255)
2862: {
1.6 ! misha 2863: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1.1 misha 2864: }
2865: else
1.6 ! misha 2866: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misha 2867: }
2868: }
2869: else
2870: #endif
1.6 ! misha 2871: /* Not UTF mode */
1.1 misha 2872: {
2873: for (fi = min;; fi++)
2874: {
1.6 ! misha 2875: RMATCH(eptr, ecode, offset_top, md, eptrb, RM17);
1.1 misha 2876: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 2877: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 2878: if (eptr >= md->end_subject)
2879: {
2880: SCHECK_PARTIAL();
1.6 ! misha 2881: RRETURN(MATCH_NOMATCH);
1.4 misha 2882: }
1.1 misha 2883: c = *eptr++;
1.6 ! misha 2884: #ifndef COMPILE_PCRE8
! 2885: if (c > 255)
! 2886: {
! 2887: if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
! 2888: }
! 2889: else
! 2890: #endif
! 2891: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1.1 misha 2892: }
2893: }
2894: /* Control never gets here */
2895: }
2896:
2897: /* If maximizing, find the longest possible run, then work backwards. */
2898:
2899: else
2900: {
2901: pp = eptr;
2902:
1.6 ! misha 2903: #ifdef SUPPORT_UTF
! 2904: if (utf)
1.1 misha 2905: {
2906: for (i = min; i < max; i++)
2907: {
2908: int len = 1;
1.4 misha 2909: if (eptr >= md->end_subject)
2910: {
2911: SCHECK_PARTIAL();
2912: break;
2913: }
1.1 misha 2914: GETCHARLEN(c, eptr, len);
2915: if (c > 255)
2916: {
2917: if (op == OP_CLASS) break;
2918: }
2919: else
1.6 ! misha 2920: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
1.1 misha 2921: eptr += len;
2922: }
2923: for (;;)
2924: {
1.6 ! misha 2925: RMATCH(eptr, ecode, offset_top, md, eptrb, RM18);
1.1 misha 2926: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2927: if (eptr-- == pp) break; /* Stop if tried at original pos */
2928: BACKCHAR(eptr);
2929: }
2930: }
2931: else
2932: #endif
1.6 ! misha 2933: /* Not UTF mode */
1.1 misha 2934: {
2935: for (i = min; i < max; i++)
2936: {
1.4 misha 2937: if (eptr >= md->end_subject)
2938: {
2939: SCHECK_PARTIAL();
2940: break;
2941: }
1.1 misha 2942: c = *eptr;
1.6 ! misha 2943: #ifndef COMPILE_PCRE8
! 2944: if (c > 255)
! 2945: {
! 2946: if (op == OP_CLASS) break;
! 2947: }
! 2948: else
! 2949: #endif
! 2950: if ((BYTE_MAP[c/8] & (1 << (c&7))) == 0) break;
1.1 misha 2951: eptr++;
2952: }
2953: while (eptr >= pp)
2954: {
1.6 ! misha 2955: RMATCH(eptr, ecode, offset_top, md, eptrb, RM19);
1.1 misha 2956: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2957: eptr--;
2958: }
2959: }
2960:
1.6 ! misha 2961: RRETURN(MATCH_NOMATCH);
1.1 misha 2962: }
1.6 ! misha 2963: #undef BYTE_MAP
1.1 misha 2964: }
2965: /* Control never gets here */
2966:
2967:
2968: /* Match an extended character class. This opcode is encountered only
1.3 misha 2969: when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2970: mode, because Unicode properties are supported in non-UTF-8 mode. */
1.1 misha 2971:
1.6 ! misha 2972: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1.1 misha 2973: case OP_XCLASS:
2974: {
2975: data = ecode + 1 + LINK_SIZE; /* Save for matching */
2976: ecode += GET(ecode, 1); /* Advance past the item */
2977:
2978: switch (*ecode)
2979: {
2980: case OP_CRSTAR:
2981: case OP_CRMINSTAR:
2982: case OP_CRPLUS:
2983: case OP_CRMINPLUS:
2984: case OP_CRQUERY:
2985: case OP_CRMINQUERY:
2986: c = *ecode++ - OP_CRSTAR;
2987: minimize = (c & 1) != 0;
2988: min = rep_min[c]; /* Pick up values from tables; */
2989: max = rep_max[c]; /* zero for max => infinity */
2990: if (max == 0) max = INT_MAX;
2991: break;
2992:
2993: case OP_CRRANGE:
2994: case OP_CRMINRANGE:
2995: minimize = (*ecode == OP_CRMINRANGE);
2996: min = GET2(ecode, 1);
1.6 ! misha 2997: max = GET2(ecode, 1 + IMM2_SIZE);
1.1 misha 2998: if (max == 0) max = INT_MAX;
1.6 ! misha 2999: ecode += 1 + 2 * IMM2_SIZE;
1.1 misha 3000: break;
3001:
3002: default: /* No repeat follows */
3003: min = max = 1;
3004: break;
3005: }
3006:
3007: /* First, ensure the minimum number of matches are present. */
3008:
3009: for (i = 1; i <= min; i++)
3010: {
1.4 misha 3011: if (eptr >= md->end_subject)
3012: {
3013: SCHECK_PARTIAL();
1.6 ! misha 3014: RRETURN(MATCH_NOMATCH);
1.4 misha 3015: }
1.3 misha 3016: GETCHARINCTEST(c, eptr);
1.6 ! misha 3017: if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
1.1 misha 3018: }
3019:
3020: /* If max == min we can continue with the main loop without the
3021: need to recurse. */
3022:
3023: if (min == max) continue;
3024:
3025: /* If minimizing, keep testing the rest of the expression and advancing
3026: the pointer while it matches the class. */
3027:
3028: if (minimize)
3029: {
3030: for (fi = min;; fi++)
3031: {
1.6 ! misha 3032: RMATCH(eptr, ecode, offset_top, md, eptrb, RM20);
1.1 misha 3033: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 3034: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 3035: if (eptr >= md->end_subject)
3036: {
3037: SCHECK_PARTIAL();
1.6 ! misha 3038: RRETURN(MATCH_NOMATCH);
1.4 misha 3039: }
1.3 misha 3040: GETCHARINCTEST(c, eptr);
1.6 ! misha 3041: if (!PRIV(xclass)(c, data, utf)) RRETURN(MATCH_NOMATCH);
1.1 misha 3042: }
3043: /* Control never gets here */
3044: }
3045:
3046: /* If maximizing, find the longest possible run, then work backwards. */
3047:
3048: else
3049: {
3050: pp = eptr;
3051: for (i = min; i < max; i++)
3052: {
3053: int len = 1;
1.4 misha 3054: if (eptr >= md->end_subject)
3055: {
3056: SCHECK_PARTIAL();
3057: break;
3058: }
1.6 ! misha 3059: #ifdef SUPPORT_UTF
1.3 misha 3060: GETCHARLENTEST(c, eptr, len);
1.6 ! misha 3061: #else
! 3062: c = *eptr;
! 3063: #endif
! 3064: if (!PRIV(xclass)(c, data, utf)) break;
1.1 misha 3065: eptr += len;
3066: }
3067: for(;;)
3068: {
1.6 ! misha 3069: RMATCH(eptr, ecode, offset_top, md, eptrb, RM21);
1.1 misha 3070: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3071: if (eptr-- == pp) break; /* Stop if tried at original pos */
1.6 ! misha 3072: #ifdef SUPPORT_UTF
! 3073: if (utf) BACKCHAR(eptr);
! 3074: #endif
1.1 misha 3075: }
1.6 ! misha 3076: RRETURN(MATCH_NOMATCH);
1.1 misha 3077: }
3078:
3079: /* Control never gets here */
3080: }
3081: #endif /* End of XCLASS */
3082:
3083: /* Match a single character, casefully */
3084:
3085: case OP_CHAR:
1.6 ! misha 3086: #ifdef SUPPORT_UTF
! 3087: if (utf)
1.1 misha 3088: {
3089: length = 1;
3090: ecode++;
3091: GETCHARLEN(fc, ecode, length);
1.4 misha 3092: if (length > md->end_subject - eptr)
3093: {
3094: CHECK_PARTIAL(); /* Not SCHECK_PARTIAL() */
1.6 ! misha 3095: RRETURN(MATCH_NOMATCH);
1.4 misha 3096: }
1.6 ! misha 3097: while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
1.1 misha 3098: }
3099: else
3100: #endif
1.6 ! misha 3101: /* Not UTF mode */
1.1 misha 3102: {
1.4 misha 3103: if (md->end_subject - eptr < 1)
3104: {
3105: SCHECK_PARTIAL(); /* This one can use SCHECK_PARTIAL() */
1.6 ! misha 3106: RRETURN(MATCH_NOMATCH);
1.4 misha 3107: }
1.6 ! misha 3108: if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
1.1 misha 3109: ecode += 2;
3110: }
3111: break;
3112:
1.6 ! misha 3113: /* Match a single character, caselessly. If we are at the end of the
! 3114: subject, give up immediately. */
1.1 misha 3115:
1.6 ! misha 3116: case OP_CHARI:
! 3117: if (eptr >= md->end_subject)
! 3118: {
! 3119: SCHECK_PARTIAL();
! 3120: RRETURN(MATCH_NOMATCH);
! 3121: }
! 3122:
! 3123: #ifdef SUPPORT_UTF
! 3124: if (utf)
1.1 misha 3125: {
3126: length = 1;
3127: ecode++;
3128: GETCHARLEN(fc, ecode, length);
3129:
3130: /* If the pattern character's value is < 128, we have only one byte, and
1.6 ! misha 3131: we know that its other case must also be one byte long, so we can use the
! 3132: fast lookup table. We know that there is at least one byte left in the
! 3133: subject. */
1.1 misha 3134:
3135: if (fc < 128)
3136: {
1.6 ! misha 3137: if (md->lcc[fc]
! 3138: != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
! 3139: ecode++;
! 3140: eptr++;
1.1 misha 3141: }
3142:
1.6 ! misha 3143: /* Otherwise we must pick up the subject character. Note that we cannot
! 3144: use the value of "length" to check for sufficient bytes left, because the
! 3145: other case of the character may have more or fewer bytes. */
1.1 misha 3146:
3147: else
3148: {
3149: unsigned int dc;
3150: GETCHARINC(dc, eptr);
3151: ecode += length;
3152:
3153: /* If we have Unicode property support, we can use it to test the other
3154: case of the character, if there is one. */
3155:
3156: if (fc != dc)
3157: {
3158: #ifdef SUPPORT_UCP
1.2 misha 3159: if (dc != UCD_OTHERCASE(fc))
1.1 misha 3160: #endif
1.6 ! misha 3161: RRETURN(MATCH_NOMATCH);
1.1 misha 3162: }
3163: }
3164: }
3165: else
1.6 ! misha 3166: #endif /* SUPPORT_UTF */
1.1 misha 3167:
1.6 ! misha 3168: /* Not UTF mode */
1.1 misha 3169: {
1.6 ! misha 3170: if (TABLE_GET(ecode[1], md->lcc, ecode[1])
! 3171: != TABLE_GET(*eptr, md->lcc, *eptr)) RRETURN(MATCH_NOMATCH);
! 3172: eptr++;
1.1 misha 3173: ecode += 2;
3174: }
3175: break;
3176:
3177: /* Match a single character repeatedly. */
3178:
3179: case OP_EXACT:
1.6 ! misha 3180: case OP_EXACTI:
1.1 misha 3181: min = max = GET2(ecode, 1);
1.6 ! misha 3182: ecode += 1 + IMM2_SIZE;
1.1 misha 3183: goto REPEATCHAR;
3184:
3185: case OP_POSUPTO:
1.6 ! misha 3186: case OP_POSUPTOI:
1.1 misha 3187: possessive = TRUE;
3188: /* Fall through */
3189:
3190: case OP_UPTO:
1.6 ! misha 3191: case OP_UPTOI:
1.1 misha 3192: case OP_MINUPTO:
1.6 ! misha 3193: case OP_MINUPTOI:
1.1 misha 3194: min = 0;
3195: max = GET2(ecode, 1);
1.6 ! misha 3196: minimize = *ecode == OP_MINUPTO || *ecode == OP_MINUPTOI;
! 3197: ecode += 1 + IMM2_SIZE;
1.1 misha 3198: goto REPEATCHAR;
3199:
3200: case OP_POSSTAR:
1.6 ! misha 3201: case OP_POSSTARI:
1.1 misha 3202: possessive = TRUE;
3203: min = 0;
3204: max = INT_MAX;
3205: ecode++;
3206: goto REPEATCHAR;
3207:
3208: case OP_POSPLUS:
1.6 ! misha 3209: case OP_POSPLUSI:
1.1 misha 3210: possessive = TRUE;
3211: min = 1;
3212: max = INT_MAX;
3213: ecode++;
3214: goto REPEATCHAR;
3215:
3216: case OP_POSQUERY:
1.6 ! misha 3217: case OP_POSQUERYI:
1.1 misha 3218: possessive = TRUE;
3219: min = 0;
3220: max = 1;
3221: ecode++;
3222: goto REPEATCHAR;
3223:
3224: case OP_STAR:
1.6 ! misha 3225: case OP_STARI:
1.1 misha 3226: case OP_MINSTAR:
1.6 ! misha 3227: case OP_MINSTARI:
1.1 misha 3228: case OP_PLUS:
1.6 ! misha 3229: case OP_PLUSI:
1.1 misha 3230: case OP_MINPLUS:
1.6 ! misha 3231: case OP_MINPLUSI:
1.1 misha 3232: case OP_QUERY:
1.6 ! misha 3233: case OP_QUERYI:
1.1 misha 3234: case OP_MINQUERY:
1.6 ! misha 3235: case OP_MINQUERYI:
! 3236: c = *ecode++ - ((op < OP_STARI)? OP_STAR : OP_STARI);
1.1 misha 3237: minimize = (c & 1) != 0;
3238: min = rep_min[c]; /* Pick up values from tables; */
3239: max = rep_max[c]; /* zero for max => infinity */
3240: if (max == 0) max = INT_MAX;
3241:
1.4 misha 3242: /* Common code for all repeated single-character matches. */
1.1 misha 3243:
3244: REPEATCHAR:
1.6 ! misha 3245: #ifdef SUPPORT_UTF
! 3246: if (utf)
1.1 misha 3247: {
3248: length = 1;
3249: charptr = ecode;
3250: GETCHARLEN(fc, ecode, length);
3251: ecode += length;
3252:
3253: /* Handle multibyte character matching specially here. There is
3254: support for caseless matching if UCP support is present. */
3255:
3256: if (length > 1)
3257: {
3258: #ifdef SUPPORT_UCP
3259: unsigned int othercase;
1.6 ! misha 3260: if (op >= OP_STARI && /* Caseless */
1.2 misha 3261: (othercase = UCD_OTHERCASE(fc)) != fc)
1.6 ! misha 3262: oclength = PRIV(ord2utf)(othercase, occhars);
1.1 misha 3263: else oclength = 0;
3264: #endif /* SUPPORT_UCP */
3265:
3266: for (i = 1; i <= min; i++)
3267: {
1.4 misha 3268: if (eptr <= md->end_subject - length &&
1.6 ! misha 3269: memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
1.1 misha 3270: #ifdef SUPPORT_UCP
1.4 misha 3271: else if (oclength > 0 &&
3272: eptr <= md->end_subject - oclength &&
1.6 ! misha 3273: memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
1.4 misha 3274: #endif /* SUPPORT_UCP */
1.1 misha 3275: else
3276: {
1.4 misha 3277: CHECK_PARTIAL();
1.6 ! misha 3278: RRETURN(MATCH_NOMATCH);
1.1 misha 3279: }
3280: }
3281:
3282: if (min == max) continue;
3283:
3284: if (minimize)
3285: {
3286: for (fi = min;; fi++)
3287: {
1.6 ! misha 3288: RMATCH(eptr, ecode, offset_top, md, eptrb, RM22);
1.1 misha 3289: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 3290: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 3291: if (eptr <= md->end_subject - length &&
1.6 ! misha 3292: memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
1.1 misha 3293: #ifdef SUPPORT_UCP
1.4 misha 3294: else if (oclength > 0 &&
3295: eptr <= md->end_subject - oclength &&
1.6 ! misha 3296: memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
1.4 misha 3297: #endif /* SUPPORT_UCP */
1.1 misha 3298: else
3299: {
1.4 misha 3300: CHECK_PARTIAL();
1.6 ! misha 3301: RRETURN(MATCH_NOMATCH);
1.1 misha 3302: }
3303: }
3304: /* Control never gets here */
3305: }
3306:
3307: else /* Maximize */
3308: {
3309: pp = eptr;
3310: for (i = min; i < max; i++)
3311: {
1.4 misha 3312: if (eptr <= md->end_subject - length &&
1.6 ! misha 3313: memcmp(eptr, charptr, IN_UCHARS(length)) == 0) eptr += length;
1.1 misha 3314: #ifdef SUPPORT_UCP
1.4 misha 3315: else if (oclength > 0 &&
3316: eptr <= md->end_subject - oclength &&
1.6 ! misha 3317: memcmp(eptr, occhars, IN_UCHARS(oclength)) == 0) eptr += oclength;
1.4 misha 3318: #endif /* SUPPORT_UCP */
1.1 misha 3319: else
3320: {
1.4 misha 3321: CHECK_PARTIAL();
3322: break;
1.1 misha 3323: }
3324: }
3325:
3326: if (possessive) continue;
1.4 misha 3327:
1.1 misha 3328: for(;;)
1.4 misha 3329: {
1.6 ! misha 3330: RMATCH(eptr, ecode, offset_top, md, eptrb, RM23);
1.4 misha 3331: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 3332: if (eptr == pp) { RRETURN(MATCH_NOMATCH); }
1.1 misha 3333: #ifdef SUPPORT_UCP
1.4 misha 3334: eptr--;
3335: BACKCHAR(eptr);
1.1 misha 3336: #else /* without SUPPORT_UCP */
1.4 misha 3337: eptr -= length;
1.1 misha 3338: #endif /* SUPPORT_UCP */
1.4 misha 3339: }
1.1 misha 3340: }
3341: /* Control never gets here */
3342: }
3343:
3344: /* If the length of a UTF-8 character is 1, we fall through here, and
3345: obey the code as for non-UTF-8 characters below, though in this case the
3346: value of fc will always be < 128. */
3347: }
3348: else
1.6 ! misha 3349: #endif /* SUPPORT_UTF */
! 3350: /* When not in UTF-8 mode, load a single-byte character. */
! 3351: fc = *ecode++;
1.1 misha 3352:
1.6 ! misha 3353: /* The value of fc at this point is always one character, though we may
! 3354: or may not be in UTF mode. The code is duplicated for the caseless and
1.1 misha 3355: caseful cases, for speed, since matching characters is likely to be quite
3356: common. First, ensure the minimum number of matches are present. If min =
3357: max, continue at the same level without recursing. Otherwise, if
3358: minimizing, keep trying the rest of the expression and advancing one
3359: matching character if failing, up to the maximum. Alternatively, if
3360: maximizing, find the maximum number of characters and work backwards. */
3361:
3362: DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3363: max, eptr));
3364:
1.6 ! misha 3365: if (op >= OP_STARI) /* Caseless */
1.1 misha 3366: {
1.6 ! misha 3367: #ifdef COMPILE_PCRE8
! 3368: /* fc must be < 128 if UTF is enabled. */
! 3369: foc = md->fcc[fc];
! 3370: #else
! 3371: #ifdef SUPPORT_UTF
! 3372: #ifdef SUPPORT_UCP
! 3373: if (utf && fc > 127)
! 3374: foc = UCD_OTHERCASE(fc);
! 3375: #else
! 3376: if (utf && fc > 127)
! 3377: foc = fc;
! 3378: #endif /* SUPPORT_UCP */
! 3379: else
! 3380: #endif /* SUPPORT_UTF */
! 3381: foc = TABLE_GET(fc, md->fcc, fc);
! 3382: #endif /* COMPILE_PCRE8 */
! 3383:
1.1 misha 3384: for (i = 1; i <= min; i++)
1.4 misha 3385: {
3386: if (eptr >= md->end_subject)
3387: {
3388: SCHECK_PARTIAL();
1.6 ! misha 3389: RRETURN(MATCH_NOMATCH);
1.4 misha 3390: }
1.6 ! misha 3391: if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
! 3392: eptr++;
1.4 misha 3393: }
1.1 misha 3394: if (min == max) continue;
3395: if (minimize)
3396: {
3397: for (fi = min;; fi++)
3398: {
1.6 ! misha 3399: RMATCH(eptr, ecode, offset_top, md, eptrb, RM24);
1.1 misha 3400: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 3401: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 3402: if (eptr >= md->end_subject)
3403: {
3404: SCHECK_PARTIAL();
1.6 ! misha 3405: RRETURN(MATCH_NOMATCH);
1.4 misha 3406: }
1.6 ! misha 3407: if (fc != *eptr && foc != *eptr) RRETURN(MATCH_NOMATCH);
! 3408: eptr++;
1.1 misha 3409: }
3410: /* Control never gets here */
3411: }
3412: else /* Maximize */
3413: {
3414: pp = eptr;
3415: for (i = min; i < max; i++)
3416: {
1.4 misha 3417: if (eptr >= md->end_subject)
3418: {
3419: SCHECK_PARTIAL();
3420: break;
3421: }
1.6 ! misha 3422: if (fc != *eptr && foc != *eptr) break;
1.1 misha 3423: eptr++;
3424: }
1.4 misha 3425:
1.1 misha 3426: if (possessive) continue;
1.4 misha 3427:
1.1 misha 3428: while (eptr >= pp)
3429: {
1.6 ! misha 3430: RMATCH(eptr, ecode, offset_top, md, eptrb, RM25);
1.1 misha 3431: eptr--;
3432: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3433: }
1.6 ! misha 3434: RRETURN(MATCH_NOMATCH);
1.1 misha 3435: }
3436: /* Control never gets here */
3437: }
3438:
3439: /* Caseful comparisons (includes all multi-byte characters) */
3440:
3441: else
3442: {
1.4 misha 3443: for (i = 1; i <= min; i++)
3444: {
3445: if (eptr >= md->end_subject)
3446: {
3447: SCHECK_PARTIAL();
1.6 ! misha 3448: RRETURN(MATCH_NOMATCH);
1.4 misha 3449: }
1.6 ! misha 3450: if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
1.4 misha 3451: }
3452:
1.1 misha 3453: if (min == max) continue;
1.4 misha 3454:
1.1 misha 3455: if (minimize)
3456: {
3457: for (fi = min;; fi++)
3458: {
1.6 ! misha 3459: RMATCH(eptr, ecode, offset_top, md, eptrb, RM26);
1.1 misha 3460: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 3461: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 3462: if (eptr >= md->end_subject)
3463: {
3464: SCHECK_PARTIAL();
1.6 ! misha 3465: RRETURN(MATCH_NOMATCH);
1.4 misha 3466: }
1.6 ! misha 3467: if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
1.1 misha 3468: }
3469: /* Control never gets here */
3470: }
3471: else /* Maximize */
3472: {
3473: pp = eptr;
3474: for (i = min; i < max; i++)
3475: {
1.4 misha 3476: if (eptr >= md->end_subject)
3477: {
3478: SCHECK_PARTIAL();
3479: break;
3480: }
3481: if (fc != *eptr) break;
1.1 misha 3482: eptr++;
3483: }
3484: if (possessive) continue;
1.4 misha 3485:
1.1 misha 3486: while (eptr >= pp)
3487: {
1.6 ! misha 3488: RMATCH(eptr, ecode, offset_top, md, eptrb, RM27);
1.1 misha 3489: eptr--;
3490: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3491: }
1.6 ! misha 3492: RRETURN(MATCH_NOMATCH);
1.1 misha 3493: }
3494: }
3495: /* Control never gets here */
3496:
3497: /* Match a negated single one-byte character. The character we are
3498: checking can be multibyte. */
3499:
3500: case OP_NOT:
1.6 ! misha 3501: case OP_NOTI:
1.4 misha 3502: if (eptr >= md->end_subject)
3503: {
3504: SCHECK_PARTIAL();
1.6 ! misha 3505: RRETURN(MATCH_NOMATCH);
1.4 misha 3506: }
1.1 misha 3507: ecode++;
3508: GETCHARINCTEST(c, eptr);
1.6 ! misha 3509: if (op == OP_NOTI) /* The caseless case */
1.1 misha 3510: {
1.6 ! misha 3511: register unsigned int ch, och;
! 3512: ch = *ecode++;
! 3513: #ifdef COMPILE_PCRE8
! 3514: /* ch must be < 128 if UTF is enabled. */
! 3515: och = md->fcc[ch];
! 3516: #else
! 3517: #ifdef SUPPORT_UTF
! 3518: #ifdef SUPPORT_UCP
! 3519: if (utf && ch > 127)
! 3520: och = UCD_OTHERCASE(ch);
! 3521: #else
! 3522: if (utf && ch > 127)
! 3523: och = ch;
! 3524: #endif /* SUPPORT_UCP */
! 3525: else
! 3526: #endif /* SUPPORT_UTF */
! 3527: och = TABLE_GET(ch, md->fcc, ch);
! 3528: #endif /* COMPILE_PCRE8 */
! 3529: if (ch == c || och == c) RRETURN(MATCH_NOMATCH);
1.1 misha 3530: }
1.6 ! misha 3531: else /* Caseful */
1.1 misha 3532: {
1.6 ! misha 3533: if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
1.1 misha 3534: }
3535: break;
3536:
3537: /* Match a negated single one-byte character repeatedly. This is almost a
3538: repeat of the code for a repeated single character, but I haven't found a
3539: nice way of commoning these up that doesn't require a test of the
3540: positive/negative option for each character match. Maybe that wouldn't add
3541: very much to the time taken, but character matching *is* what this is all
3542: about... */
3543:
3544: case OP_NOTEXACT:
1.6 ! misha 3545: case OP_NOTEXACTI:
1.1 misha 3546: min = max = GET2(ecode, 1);
1.6 ! misha 3547: ecode += 1 + IMM2_SIZE;
1.1 misha 3548: goto REPEATNOTCHAR;
3549:
3550: case OP_NOTUPTO:
1.6 ! misha 3551: case OP_NOTUPTOI:
1.1 misha 3552: case OP_NOTMINUPTO:
1.6 ! misha 3553: case OP_NOTMINUPTOI:
1.1 misha 3554: min = 0;
3555: max = GET2(ecode, 1);
1.6 ! misha 3556: minimize = *ecode == OP_NOTMINUPTO || *ecode == OP_NOTMINUPTOI;
! 3557: ecode += 1 + IMM2_SIZE;
1.1 misha 3558: goto REPEATNOTCHAR;
3559:
3560: case OP_NOTPOSSTAR:
1.6 ! misha 3561: case OP_NOTPOSSTARI:
1.1 misha 3562: possessive = TRUE;
3563: min = 0;
3564: max = INT_MAX;
3565: ecode++;
3566: goto REPEATNOTCHAR;
3567:
3568: case OP_NOTPOSPLUS:
1.6 ! misha 3569: case OP_NOTPOSPLUSI:
1.1 misha 3570: possessive = TRUE;
3571: min = 1;
3572: max = INT_MAX;
3573: ecode++;
3574: goto REPEATNOTCHAR;
3575:
3576: case OP_NOTPOSQUERY:
1.6 ! misha 3577: case OP_NOTPOSQUERYI:
1.1 misha 3578: possessive = TRUE;
3579: min = 0;
3580: max = 1;
3581: ecode++;
3582: goto REPEATNOTCHAR;
3583:
3584: case OP_NOTPOSUPTO:
1.6 ! misha 3585: case OP_NOTPOSUPTOI:
1.1 misha 3586: possessive = TRUE;
3587: min = 0;
3588: max = GET2(ecode, 1);
1.6 ! misha 3589: ecode += 1 + IMM2_SIZE;
1.1 misha 3590: goto REPEATNOTCHAR;
3591:
3592: case OP_NOTSTAR:
1.6 ! misha 3593: case OP_NOTSTARI:
1.1 misha 3594: case OP_NOTMINSTAR:
1.6 ! misha 3595: case OP_NOTMINSTARI:
1.1 misha 3596: case OP_NOTPLUS:
1.6 ! misha 3597: case OP_NOTPLUSI:
1.1 misha 3598: case OP_NOTMINPLUS:
1.6 ! misha 3599: case OP_NOTMINPLUSI:
1.1 misha 3600: case OP_NOTQUERY:
1.6 ! misha 3601: case OP_NOTQUERYI:
1.1 misha 3602: case OP_NOTMINQUERY:
1.6 ! misha 3603: case OP_NOTMINQUERYI:
! 3604: c = *ecode++ - ((op >= OP_NOTSTARI)? OP_NOTSTARI: OP_NOTSTAR);
1.1 misha 3605: minimize = (c & 1) != 0;
3606: min = rep_min[c]; /* Pick up values from tables; */
3607: max = rep_max[c]; /* zero for max => infinity */
3608: if (max == 0) max = INT_MAX;
3609:
1.4 misha 3610: /* Common code for all repeated single-byte matches. */
1.1 misha 3611:
3612: REPEATNOTCHAR:
3613: fc = *ecode++;
3614:
3615: /* The code is duplicated for the caseless and caseful cases, for speed,
3616: since matching characters is likely to be quite common. First, ensure the
3617: minimum number of matches are present. If min = max, continue at the same
3618: level without recursing. Otherwise, if minimizing, keep trying the rest of
3619: the expression and advancing one matching character if failing, up to the
3620: maximum. Alternatively, if maximizing, find the maximum number of
3621: characters and work backwards. */
3622:
3623: DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
3624: max, eptr));
3625:
1.6 ! misha 3626: if (op >= OP_NOTSTARI) /* Caseless */
1.1 misha 3627: {
1.6 ! misha 3628: #ifdef COMPILE_PCRE8
! 3629: /* fc must be < 128 if UTF is enabled. */
! 3630: foc = md->fcc[fc];
! 3631: #else
! 3632: #ifdef SUPPORT_UTF
! 3633: #ifdef SUPPORT_UCP
! 3634: if (utf && fc > 127)
! 3635: foc = UCD_OTHERCASE(fc);
! 3636: #else
! 3637: if (utf && fc > 127)
! 3638: foc = fc;
! 3639: #endif /* SUPPORT_UCP */
! 3640: else
! 3641: #endif /* SUPPORT_UTF */
! 3642: foc = TABLE_GET(fc, md->fcc, fc);
! 3643: #endif /* COMPILE_PCRE8 */
1.1 misha 3644:
1.6 ! misha 3645: #ifdef SUPPORT_UTF
! 3646: if (utf)
1.1 misha 3647: {
3648: register unsigned int d;
3649: for (i = 1; i <= min; i++)
3650: {
1.4 misha 3651: if (eptr >= md->end_subject)
3652: {
3653: SCHECK_PARTIAL();
1.6 ! misha 3654: RRETURN(MATCH_NOMATCH);
1.4 misha 3655: }
1.1 misha 3656: GETCHARINC(d, eptr);
1.6 ! misha 3657: if (fc == d || (unsigned int) foc == d) RRETURN(MATCH_NOMATCH);
1.1 misha 3658: }
3659: }
3660: else
3661: #endif
1.6 ! misha 3662: /* Not UTF mode */
1.1 misha 3663: {
3664: for (i = 1; i <= min; i++)
1.4 misha 3665: {
3666: if (eptr >= md->end_subject)
3667: {
3668: SCHECK_PARTIAL();
1.6 ! misha 3669: RRETURN(MATCH_NOMATCH);
1.4 misha 3670: }
1.6 ! misha 3671: if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
! 3672: eptr++;
1.4 misha 3673: }
1.1 misha 3674: }
3675:
3676: if (min == max) continue;
3677:
3678: if (minimize)
3679: {
1.6 ! misha 3680: #ifdef SUPPORT_UTF
! 3681: if (utf)
1.1 misha 3682: {
3683: register unsigned int d;
3684: for (fi = min;; fi++)
3685: {
1.6 ! misha 3686: RMATCH(eptr, ecode, offset_top, md, eptrb, RM28);
1.1 misha 3687: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 3688: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 3689: if (eptr >= md->end_subject)
3690: {
3691: SCHECK_PARTIAL();
1.6 ! misha 3692: RRETURN(MATCH_NOMATCH);
1.4 misha 3693: }
1.1 misha 3694: GETCHARINC(d, eptr);
1.6 ! misha 3695: if (fc == d || (unsigned int)foc == d) RRETURN(MATCH_NOMATCH);
1.1 misha 3696: }
3697: }
3698: else
3699: #endif
1.6 ! misha 3700: /* Not UTF mode */
1.1 misha 3701: {
3702: for (fi = min;; fi++)
3703: {
1.6 ! misha 3704: RMATCH(eptr, ecode, offset_top, md, eptrb, RM29);
1.1 misha 3705: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 3706: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 3707: if (eptr >= md->end_subject)
3708: {
3709: SCHECK_PARTIAL();
1.6 ! misha 3710: RRETURN(MATCH_NOMATCH);
1.4 misha 3711: }
1.6 ! misha 3712: if (fc == *eptr || foc == *eptr) RRETURN(MATCH_NOMATCH);
! 3713: eptr++;
1.1 misha 3714: }
3715: }
3716: /* Control never gets here */
3717: }
3718:
3719: /* Maximize case */
3720:
3721: else
3722: {
3723: pp = eptr;
3724:
1.6 ! misha 3725: #ifdef SUPPORT_UTF
! 3726: if (utf)
1.1 misha 3727: {
3728: register unsigned int d;
3729: for (i = min; i < max; i++)
3730: {
3731: int len = 1;
1.4 misha 3732: if (eptr >= md->end_subject)
3733: {
3734: SCHECK_PARTIAL();
3735: break;
3736: }
1.1 misha 3737: GETCHARLEN(d, eptr, len);
1.6 ! misha 3738: if (fc == d || (unsigned int)foc == d) break;
1.1 misha 3739: eptr += len;
3740: }
1.6 ! misha 3741: if (possessive) continue;
! 3742: for(;;)
1.1 misha 3743: {
1.6 ! misha 3744: RMATCH(eptr, ecode, offset_top, md, eptrb, RM30);
1.1 misha 3745: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3746: if (eptr-- == pp) break; /* Stop if tried at original pos */
3747: BACKCHAR(eptr);
3748: }
3749: }
3750: else
3751: #endif
1.6 ! misha 3752: /* Not UTF mode */
1.1 misha 3753: {
3754: for (i = min; i < max; i++)
3755: {
1.4 misha 3756: if (eptr >= md->end_subject)
3757: {
3758: SCHECK_PARTIAL();
3759: break;
3760: }
1.6 ! misha 3761: if (fc == *eptr || foc == *eptr) break;
1.1 misha 3762: eptr++;
3763: }
3764: if (possessive) continue;
3765: while (eptr >= pp)
3766: {
1.6 ! misha 3767: RMATCH(eptr, ecode, offset_top, md, eptrb, RM31);
1.1 misha 3768: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3769: eptr--;
3770: }
3771: }
3772:
1.6 ! misha 3773: RRETURN(MATCH_NOMATCH);
1.1 misha 3774: }
3775: /* Control never gets here */
3776: }
3777:
3778: /* Caseful comparisons */
3779:
3780: else
3781: {
1.6 ! misha 3782: #ifdef SUPPORT_UTF
! 3783: if (utf)
1.1 misha 3784: {
3785: register unsigned int d;
3786: for (i = 1; i <= min; i++)
3787: {
1.4 misha 3788: if (eptr >= md->end_subject)
3789: {
3790: SCHECK_PARTIAL();
1.6 ! misha 3791: RRETURN(MATCH_NOMATCH);
1.4 misha 3792: }
1.1 misha 3793: GETCHARINC(d, eptr);
1.6 ! misha 3794: if (fc == d) RRETURN(MATCH_NOMATCH);
1.1 misha 3795: }
3796: }
3797: else
3798: #endif
1.6 ! misha 3799: /* Not UTF mode */
1.1 misha 3800: {
3801: for (i = 1; i <= min; i++)
1.4 misha 3802: {
3803: if (eptr >= md->end_subject)
3804: {
3805: SCHECK_PARTIAL();
1.6 ! misha 3806: RRETURN(MATCH_NOMATCH);
1.4 misha 3807: }
1.6 ! misha 3808: if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
1.4 misha 3809: }
1.1 misha 3810: }
3811:
3812: if (min == max) continue;
3813:
3814: if (minimize)
3815: {
1.6 ! misha 3816: #ifdef SUPPORT_UTF
! 3817: if (utf)
1.1 misha 3818: {
3819: register unsigned int d;
3820: for (fi = min;; fi++)
3821: {
1.6 ! misha 3822: RMATCH(eptr, ecode, offset_top, md, eptrb, RM32);
1.1 misha 3823: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 3824: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 3825: if (eptr >= md->end_subject)
3826: {
3827: SCHECK_PARTIAL();
1.6 ! misha 3828: RRETURN(MATCH_NOMATCH);
1.4 misha 3829: }
1.1 misha 3830: GETCHARINC(d, eptr);
1.6 ! misha 3831: if (fc == d) RRETURN(MATCH_NOMATCH);
1.1 misha 3832: }
3833: }
3834: else
3835: #endif
1.6 ! misha 3836: /* Not UTF mode */
1.1 misha 3837: {
3838: for (fi = min;; fi++)
3839: {
1.6 ! misha 3840: RMATCH(eptr, ecode, offset_top, md, eptrb, RM33);
1.1 misha 3841: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 3842: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 3843: if (eptr >= md->end_subject)
3844: {
3845: SCHECK_PARTIAL();
1.6 ! misha 3846: RRETURN(MATCH_NOMATCH);
1.4 misha 3847: }
1.6 ! misha 3848: if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
1.1 misha 3849: }
3850: }
3851: /* Control never gets here */
3852: }
3853:
3854: /* Maximize case */
3855:
3856: else
3857: {
3858: pp = eptr;
3859:
1.6 ! misha 3860: #ifdef SUPPORT_UTF
! 3861: if (utf)
1.1 misha 3862: {
3863: register unsigned int d;
3864: for (i = min; i < max; i++)
3865: {
3866: int len = 1;
1.4 misha 3867: if (eptr >= md->end_subject)
3868: {
3869: SCHECK_PARTIAL();
3870: break;
3871: }
1.1 misha 3872: GETCHARLEN(d, eptr, len);
3873: if (fc == d) break;
3874: eptr += len;
3875: }
3876: if (possessive) continue;
3877: for(;;)
3878: {
1.6 ! misha 3879: RMATCH(eptr, ecode, offset_top, md, eptrb, RM34);
1.1 misha 3880: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3881: if (eptr-- == pp) break; /* Stop if tried at original pos */
3882: BACKCHAR(eptr);
3883: }
3884: }
3885: else
3886: #endif
1.6 ! misha 3887: /* Not UTF mode */
1.1 misha 3888: {
3889: for (i = min; i < max; i++)
3890: {
1.4 misha 3891: if (eptr >= md->end_subject)
3892: {
3893: SCHECK_PARTIAL();
3894: break;
3895: }
3896: if (fc == *eptr) break;
1.1 misha 3897: eptr++;
3898: }
3899: if (possessive) continue;
3900: while (eptr >= pp)
3901: {
1.6 ! misha 3902: RMATCH(eptr, ecode, offset_top, md, eptrb, RM35);
1.1 misha 3903: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3904: eptr--;
3905: }
3906: }
3907:
1.6 ! misha 3908: RRETURN(MATCH_NOMATCH);
1.1 misha 3909: }
3910: }
3911: /* Control never gets here */
3912:
3913: /* Match a single character type repeatedly; several different opcodes
3914: share code. This is very similar to the code for single characters, but we
3915: repeat it in the interests of efficiency. */
3916:
3917: case OP_TYPEEXACT:
3918: min = max = GET2(ecode, 1);
3919: minimize = TRUE;
1.6 ! misha 3920: ecode += 1 + IMM2_SIZE;
1.1 misha 3921: goto REPEATTYPE;
3922:
3923: case OP_TYPEUPTO:
3924: case OP_TYPEMINUPTO:
3925: min = 0;
3926: max = GET2(ecode, 1);
3927: minimize = *ecode == OP_TYPEMINUPTO;
1.6 ! misha 3928: ecode += 1 + IMM2_SIZE;
1.1 misha 3929: goto REPEATTYPE;
3930:
3931: case OP_TYPEPOSSTAR:
3932: possessive = TRUE;
3933: min = 0;
3934: max = INT_MAX;
3935: ecode++;
3936: goto REPEATTYPE;
3937:
3938: case OP_TYPEPOSPLUS:
3939: possessive = TRUE;
3940: min = 1;
3941: max = INT_MAX;
3942: ecode++;
3943: goto REPEATTYPE;
3944:
3945: case OP_TYPEPOSQUERY:
3946: possessive = TRUE;
3947: min = 0;
3948: max = 1;
3949: ecode++;
3950: goto REPEATTYPE;
3951:
3952: case OP_TYPEPOSUPTO:
3953: possessive = TRUE;
3954: min = 0;
3955: max = GET2(ecode, 1);
1.6 ! misha 3956: ecode += 1 + IMM2_SIZE;
1.1 misha 3957: goto REPEATTYPE;
3958:
3959: case OP_TYPESTAR:
3960: case OP_TYPEMINSTAR:
3961: case OP_TYPEPLUS:
3962: case OP_TYPEMINPLUS:
3963: case OP_TYPEQUERY:
3964: case OP_TYPEMINQUERY:
3965: c = *ecode++ - OP_TYPESTAR;
3966: minimize = (c & 1) != 0;
3967: min = rep_min[c]; /* Pick up values from tables; */
3968: max = rep_max[c]; /* zero for max => infinity */
3969: if (max == 0) max = INT_MAX;
3970:
3971: /* Common code for all repeated single character type matches. Note that
3972: in UTF-8 mode, '.' and most other types can match multi-byte characters, but
3973: OP_DIGIT, OP_WHITESPACE and OP_WORDCHAR match only one-byte characters. */
3974:
3975: REPEATTYPE:
3976: ctype = *ecode++; /* Code for the character type */
3977:
3978: #ifdef SUPPORT_UCP
3979: if (ctype == OP_PROP || ctype == OP_NOTPROP)
3980: {
3981: prop_fail_result = ctype == OP_NOTPROP;
3982: prop_type = *ecode++;
3983: prop_value = *ecode++;
3984: }
3985: else prop_type = -1;
3986: #endif
3987:
3988: /* First, ensure the minimum number of matches are present. Use inline
3989: code for maximizing the speed, and do the type test once at the start
1.4 misha 3990: (i.e. keep it out of the loop). Separate the UTF-8 code completely as that
1.1 misha 3991: is tidier. Also separate the UCP code, which can be the same for both UTF-8
3992: and single-bytes. */
3993:
3994: if (min > 0)
3995: {
3996: #ifdef SUPPORT_UCP
3997: if (prop_type >= 0)
3998: {
3999: switch(prop_type)
4000: {
4001: case PT_ANY:
1.6 ! misha 4002: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
1.1 misha 4003: for (i = 1; i <= min; i++)
4004: {
1.4 misha 4005: if (eptr >= md->end_subject)
4006: {
4007: SCHECK_PARTIAL();
1.6 ! misha 4008: RRETURN(MATCH_NOMATCH);
1.4 misha 4009: }
1.1 misha 4010: GETCHARINCTEST(c, eptr);
4011: }
4012: break;
4013:
4014: case PT_LAMP:
4015: for (i = 1; i <= min; i++)
4016: {
1.6 ! misha 4017: int chartype;
1.4 misha 4018: if (eptr >= md->end_subject)
4019: {
4020: SCHECK_PARTIAL();
1.6 ! misha 4021: RRETURN(MATCH_NOMATCH);
1.4 misha 4022: }
1.1 misha 4023: GETCHARINCTEST(c, eptr);
1.6 ! misha 4024: chartype = UCD_CHARTYPE(c);
! 4025: if ((chartype == ucp_Lu ||
! 4026: chartype == ucp_Ll ||
! 4027: chartype == ucp_Lt) == prop_fail_result)
! 4028: RRETURN(MATCH_NOMATCH);
1.1 misha 4029: }
4030: break;
4031:
4032: case PT_GC:
4033: for (i = 1; i <= min; i++)
4034: {
1.4 misha 4035: if (eptr >= md->end_subject)
4036: {
4037: SCHECK_PARTIAL();
1.6 ! misha 4038: RRETURN(MATCH_NOMATCH);
1.4 misha 4039: }
1.1 misha 4040: GETCHARINCTEST(c, eptr);
1.6 ! misha 4041: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
! 4042: RRETURN(MATCH_NOMATCH);
1.1 misha 4043: }
4044: break;
4045:
4046: case PT_PC:
4047: for (i = 1; i <= min; i++)
4048: {
1.4 misha 4049: if (eptr >= md->end_subject)
4050: {
4051: SCHECK_PARTIAL();
1.6 ! misha 4052: RRETURN(MATCH_NOMATCH);
1.4 misha 4053: }
1.1 misha 4054: GETCHARINCTEST(c, eptr);
1.6 ! misha 4055: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
! 4056: RRETURN(MATCH_NOMATCH);
1.1 misha 4057: }
4058: break;
4059:
4060: case PT_SC:
4061: for (i = 1; i <= min; i++)
4062: {
1.4 misha 4063: if (eptr >= md->end_subject)
4064: {
4065: SCHECK_PARTIAL();
1.6 ! misha 4066: RRETURN(MATCH_NOMATCH);
1.4 misha 4067: }
1.1 misha 4068: GETCHARINCTEST(c, eptr);
1.6 ! misha 4069: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
! 4070: RRETURN(MATCH_NOMATCH);
1.4 misha 4071: }
4072: break;
4073:
4074: case PT_ALNUM:
4075: for (i = 1; i <= min; i++)
4076: {
1.6 ! misha 4077: int category;
1.4 misha 4078: if (eptr >= md->end_subject)
4079: {
4080: SCHECK_PARTIAL();
1.6 ! misha 4081: RRETURN(MATCH_NOMATCH);
1.4 misha 4082: }
4083: GETCHARINCTEST(c, eptr);
1.6 ! misha 4084: category = UCD_CATEGORY(c);
! 4085: if ((category == ucp_L || category == ucp_N) == prop_fail_result)
! 4086: RRETURN(MATCH_NOMATCH);
1.4 misha 4087: }
4088: break;
4089:
4090: case PT_SPACE: /* Perl space */
4091: for (i = 1; i <= min; i++)
4092: {
4093: if (eptr >= md->end_subject)
4094: {
4095: SCHECK_PARTIAL();
1.6 ! misha 4096: RRETURN(MATCH_NOMATCH);
1.4 misha 4097: }
4098: GETCHARINCTEST(c, eptr);
1.6 ! misha 4099: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
1.4 misha 4100: c == CHAR_FF || c == CHAR_CR)
4101: == prop_fail_result)
1.6 ! misha 4102: RRETURN(MATCH_NOMATCH);
1.1 misha 4103: }
4104: break;
4105:
1.4 misha 4106: case PT_PXSPACE: /* POSIX space */
4107: for (i = 1; i <= min; i++)
4108: {
4109: if (eptr >= md->end_subject)
4110: {
4111: SCHECK_PARTIAL();
1.6 ! misha 4112: RRETURN(MATCH_NOMATCH);
1.4 misha 4113: }
4114: GETCHARINCTEST(c, eptr);
1.6 ! misha 4115: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
1.4 misha 4116: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4117: == prop_fail_result)
1.6 ! misha 4118: RRETURN(MATCH_NOMATCH);
1.4 misha 4119: }
4120: break;
4121:
4122: case PT_WORD:
4123: for (i = 1; i <= min; i++)
4124: {
1.6 ! misha 4125: int category;
1.4 misha 4126: if (eptr >= md->end_subject)
4127: {
4128: SCHECK_PARTIAL();
1.6 ! misha 4129: RRETURN(MATCH_NOMATCH);
1.4 misha 4130: }
4131: GETCHARINCTEST(c, eptr);
1.6 ! misha 4132: category = UCD_CATEGORY(c);
! 4133: if ((category == ucp_L || category == ucp_N || c == CHAR_UNDERSCORE)
1.4 misha 4134: == prop_fail_result)
1.6 ! misha 4135: RRETURN(MATCH_NOMATCH);
1.4 misha 4136: }
4137: break;
4138:
4139: /* This should not occur */
4140:
1.1 misha 4141: default:
4142: RRETURN(PCRE_ERROR_INTERNAL);
4143: }
4144: }
4145:
4146: /* Match extended Unicode sequences. We will get here only if the
4147: support is in the binary; otherwise a compile-time error occurs. */
4148:
4149: else if (ctype == OP_EXTUNI)
4150: {
4151: for (i = 1; i <= min; i++)
4152: {
1.4 misha 4153: if (eptr >= md->end_subject)
4154: {
4155: SCHECK_PARTIAL();
1.6 ! misha 4156: RRETURN(MATCH_NOMATCH);
1.4 misha 4157: }
1.1 misha 4158: GETCHARINCTEST(c, eptr);
1.6 ! misha 4159: if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
1.1 misha 4160: while (eptr < md->end_subject)
4161: {
4162: int len = 1;
1.6 ! misha 4163: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
! 4164: if (UCD_CATEGORY(c) != ucp_M) break;
1.1 misha 4165: eptr += len;
4166: }
4167: }
4168: }
4169:
4170: else
4171: #endif /* SUPPORT_UCP */
4172:
4173: /* Handle all other cases when the coding is UTF-8 */
4174:
1.6 ! misha 4175: #ifdef SUPPORT_UTF
! 4176: if (utf) switch(ctype)
1.1 misha 4177: {
4178: case OP_ANY:
4179: for (i = 1; i <= min; i++)
4180: {
1.4 misha 4181: if (eptr >= md->end_subject)
4182: {
4183: SCHECK_PARTIAL();
1.6 ! misha 4184: RRETURN(MATCH_NOMATCH);
1.4 misha 4185: }
1.6 ! misha 4186: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1.1 misha 4187: eptr++;
1.6 ! misha 4188: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misha 4189: }
4190: break;
4191:
4192: case OP_ALLANY:
4193: for (i = 1; i <= min; i++)
4194: {
1.4 misha 4195: if (eptr >= md->end_subject)
4196: {
4197: SCHECK_PARTIAL();
1.6 ! misha 4198: RRETURN(MATCH_NOMATCH);
1.4 misha 4199: }
1.1 misha 4200: eptr++;
1.6 ! misha 4201: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misha 4202: }
4203: break;
4204:
4205: case OP_ANYBYTE:
1.6 ! misha 4206: if (eptr > md->end_subject - min) RRETURN(MATCH_NOMATCH);
1.1 misha 4207: eptr += min;
4208: break;
4209:
4210: case OP_ANYNL:
4211: for (i = 1; i <= min; i++)
4212: {
1.4 misha 4213: if (eptr >= md->end_subject)
4214: {
4215: SCHECK_PARTIAL();
1.6 ! misha 4216: RRETURN(MATCH_NOMATCH);
1.4 misha 4217: }
1.1 misha 4218: GETCHARINC(c, eptr);
4219: switch(c)
4220: {
1.6 ! misha 4221: default: RRETURN(MATCH_NOMATCH);
! 4222:
1.1 misha 4223: case 0x000d:
4224: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4225: break;
4226:
4227: case 0x000a:
4228: break;
4229:
4230: case 0x000b:
4231: case 0x000c:
4232: case 0x0085:
4233: case 0x2028:
4234: case 0x2029:
1.6 ! misha 4235: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1.1 misha 4236: break;
4237: }
4238: }
4239: break;
4240:
4241: case OP_NOT_HSPACE:
4242: for (i = 1; i <= min; i++)
4243: {
1.4 misha 4244: if (eptr >= md->end_subject)
4245: {
4246: SCHECK_PARTIAL();
1.6 ! misha 4247: RRETURN(MATCH_NOMATCH);
1.4 misha 4248: }
1.1 misha 4249: GETCHARINC(c, eptr);
4250: switch(c)
4251: {
4252: default: break;
4253: case 0x09: /* HT */
4254: case 0x20: /* SPACE */
4255: case 0xa0: /* NBSP */
4256: case 0x1680: /* OGHAM SPACE MARK */
4257: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4258: case 0x2000: /* EN QUAD */
4259: case 0x2001: /* EM QUAD */
4260: case 0x2002: /* EN SPACE */
4261: case 0x2003: /* EM SPACE */
4262: case 0x2004: /* THREE-PER-EM SPACE */
4263: case 0x2005: /* FOUR-PER-EM SPACE */
4264: case 0x2006: /* SIX-PER-EM SPACE */
4265: case 0x2007: /* FIGURE SPACE */
4266: case 0x2008: /* PUNCTUATION SPACE */
4267: case 0x2009: /* THIN SPACE */
4268: case 0x200A: /* HAIR SPACE */
4269: case 0x202f: /* NARROW NO-BREAK SPACE */
4270: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4271: case 0x3000: /* IDEOGRAPHIC SPACE */
1.6 ! misha 4272: RRETURN(MATCH_NOMATCH);
1.1 misha 4273: }
4274: }
4275: break;
4276:
4277: case OP_HSPACE:
4278: for (i = 1; i <= min; i++)
4279: {
1.4 misha 4280: if (eptr >= md->end_subject)
4281: {
4282: SCHECK_PARTIAL();
1.6 ! misha 4283: RRETURN(MATCH_NOMATCH);
1.4 misha 4284: }
1.1 misha 4285: GETCHARINC(c, eptr);
4286: switch(c)
4287: {
1.6 ! misha 4288: default: RRETURN(MATCH_NOMATCH);
1.1 misha 4289: case 0x09: /* HT */
4290: case 0x20: /* SPACE */
4291: case 0xa0: /* NBSP */
4292: case 0x1680: /* OGHAM SPACE MARK */
4293: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4294: case 0x2000: /* EN QUAD */
4295: case 0x2001: /* EM QUAD */
4296: case 0x2002: /* EN SPACE */
4297: case 0x2003: /* EM SPACE */
4298: case 0x2004: /* THREE-PER-EM SPACE */
4299: case 0x2005: /* FOUR-PER-EM SPACE */
4300: case 0x2006: /* SIX-PER-EM SPACE */
4301: case 0x2007: /* FIGURE SPACE */
4302: case 0x2008: /* PUNCTUATION SPACE */
4303: case 0x2009: /* THIN SPACE */
4304: case 0x200A: /* HAIR SPACE */
4305: case 0x202f: /* NARROW NO-BREAK SPACE */
4306: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4307: case 0x3000: /* IDEOGRAPHIC SPACE */
4308: break;
4309: }
4310: }
4311: break;
4312:
4313: case OP_NOT_VSPACE:
4314: for (i = 1; i <= min; i++)
4315: {
1.4 misha 4316: if (eptr >= md->end_subject)
4317: {
4318: SCHECK_PARTIAL();
1.6 ! misha 4319: RRETURN(MATCH_NOMATCH);
1.4 misha 4320: }
1.1 misha 4321: GETCHARINC(c, eptr);
4322: switch(c)
4323: {
4324: default: break;
4325: case 0x0a: /* LF */
4326: case 0x0b: /* VT */
4327: case 0x0c: /* FF */
4328: case 0x0d: /* CR */
4329: case 0x85: /* NEL */
4330: case 0x2028: /* LINE SEPARATOR */
4331: case 0x2029: /* PARAGRAPH SEPARATOR */
1.6 ! misha 4332: RRETURN(MATCH_NOMATCH);
1.1 misha 4333: }
4334: }
4335: break;
4336:
4337: case OP_VSPACE:
4338: for (i = 1; i <= min; i++)
4339: {
1.4 misha 4340: if (eptr >= md->end_subject)
4341: {
4342: SCHECK_PARTIAL();
1.6 ! misha 4343: RRETURN(MATCH_NOMATCH);
1.4 misha 4344: }
1.1 misha 4345: GETCHARINC(c, eptr);
4346: switch(c)
4347: {
1.6 ! misha 4348: default: RRETURN(MATCH_NOMATCH);
1.1 misha 4349: case 0x0a: /* LF */
4350: case 0x0b: /* VT */
4351: case 0x0c: /* FF */
4352: case 0x0d: /* CR */
4353: case 0x85: /* NEL */
4354: case 0x2028: /* LINE SEPARATOR */
4355: case 0x2029: /* PARAGRAPH SEPARATOR */
4356: break;
4357: }
4358: }
4359: break;
4360:
4361: case OP_NOT_DIGIT:
4362: for (i = 1; i <= min; i++)
4363: {
1.4 misha 4364: if (eptr >= md->end_subject)
4365: {
4366: SCHECK_PARTIAL();
1.6 ! misha 4367: RRETURN(MATCH_NOMATCH);
1.4 misha 4368: }
1.1 misha 4369: GETCHARINC(c, eptr);
4370: if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
1.6 ! misha 4371: RRETURN(MATCH_NOMATCH);
1.1 misha 4372: }
4373: break;
4374:
4375: case OP_DIGIT:
4376: for (i = 1; i <= min; i++)
4377: {
1.4 misha 4378: if (eptr >= md->end_subject)
4379: {
4380: SCHECK_PARTIAL();
1.6 ! misha 4381: RRETURN(MATCH_NOMATCH);
1.4 misha 4382: }
1.6 ! misha 4383: if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_digit) == 0)
! 4384: RRETURN(MATCH_NOMATCH);
! 4385: eptr++;
1.1 misha 4386: /* No need to skip more bytes - we know it's a 1-byte character */
4387: }
4388: break;
4389:
4390: case OP_NOT_WHITESPACE:
4391: for (i = 1; i <= min; i++)
4392: {
1.4 misha 4393: if (eptr >= md->end_subject)
4394: {
4395: SCHECK_PARTIAL();
1.6 ! misha 4396: RRETURN(MATCH_NOMATCH);
1.4 misha 4397: }
4398: if (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0)
1.6 ! misha 4399: RRETURN(MATCH_NOMATCH);
! 4400: eptr++;
! 4401: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misha 4402: }
4403: break;
4404:
4405: case OP_WHITESPACE:
4406: for (i = 1; i <= min; i++)
4407: {
1.4 misha 4408: if (eptr >= md->end_subject)
4409: {
4410: SCHECK_PARTIAL();
1.6 ! misha 4411: RRETURN(MATCH_NOMATCH);
1.4 misha 4412: }
1.6 ! misha 4413: if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_space) == 0)
! 4414: RRETURN(MATCH_NOMATCH);
! 4415: eptr++;
1.1 misha 4416: /* No need to skip more bytes - we know it's a 1-byte character */
4417: }
4418: break;
4419:
4420: case OP_NOT_WORDCHAR:
4421: for (i = 1; i <= min; i++)
4422: {
1.4 misha 4423: if (eptr >= md->end_subject)
4424: {
4425: SCHECK_PARTIAL();
1.6 ! misha 4426: RRETURN(MATCH_NOMATCH);
1.4 misha 4427: }
4428: if (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0)
1.6 ! misha 4429: RRETURN(MATCH_NOMATCH);
! 4430: eptr++;
! 4431: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misha 4432: }
4433: break;
4434:
4435: case OP_WORDCHAR:
4436: for (i = 1; i <= min; i++)
4437: {
1.4 misha 4438: if (eptr >= md->end_subject)
4439: {
4440: SCHECK_PARTIAL();
1.6 ! misha 4441: RRETURN(MATCH_NOMATCH);
1.4 misha 4442: }
1.6 ! misha 4443: if (*eptr >= 128 || (md->ctypes[*eptr] & ctype_word) == 0)
! 4444: RRETURN(MATCH_NOMATCH);
! 4445: eptr++;
1.1 misha 4446: /* No need to skip more bytes - we know it's a 1-byte character */
4447: }
4448: break;
4449:
4450: default:
4451: RRETURN(PCRE_ERROR_INTERNAL);
4452: } /* End switch(ctype) */
4453:
4454: else
1.6 ! misha 4455: #endif /* SUPPORT_UTF */
1.1 misha 4456:
4457: /* Code for the non-UTF-8 case for minimum matching of operators other
1.4 misha 4458: than OP_PROP and OP_NOTPROP. */
1.1 misha 4459:
4460: switch(ctype)
4461: {
4462: case OP_ANY:
4463: for (i = 1; i <= min; i++)
4464: {
1.4 misha 4465: if (eptr >= md->end_subject)
4466: {
4467: SCHECK_PARTIAL();
1.6 ! misha 4468: RRETURN(MATCH_NOMATCH);
1.4 misha 4469: }
1.6 ! misha 4470: if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1.1 misha 4471: eptr++;
4472: }
4473: break;
4474:
4475: case OP_ALLANY:
1.4 misha 4476: if (eptr > md->end_subject - min)
4477: {
4478: SCHECK_PARTIAL();
1.6 ! misha 4479: RRETURN(MATCH_NOMATCH);
1.4 misha 4480: }
1.1 misha 4481: eptr += min;
4482: break;
4483:
4484: case OP_ANYBYTE:
1.4 misha 4485: if (eptr > md->end_subject - min)
4486: {
4487: SCHECK_PARTIAL();
1.6 ! misha 4488: RRETURN(MATCH_NOMATCH);
1.4 misha 4489: }
1.1 misha 4490: eptr += min;
4491: break;
4492:
4493: case OP_ANYNL:
4494: for (i = 1; i <= min; i++)
4495: {
1.4 misha 4496: if (eptr >= md->end_subject)
4497: {
4498: SCHECK_PARTIAL();
1.6 ! misha 4499: RRETURN(MATCH_NOMATCH);
1.4 misha 4500: }
1.1 misha 4501: switch(*eptr++)
4502: {
1.6 ! misha 4503: default: RRETURN(MATCH_NOMATCH);
! 4504:
1.1 misha 4505: case 0x000d:
4506: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4507: break;
1.6 ! misha 4508:
1.1 misha 4509: case 0x000a:
4510: break;
4511:
4512: case 0x000b:
4513: case 0x000c:
4514: case 0x0085:
1.6 ! misha 4515: #ifdef COMPILE_PCRE16
! 4516: case 0x2028:
! 4517: case 0x2029:
! 4518: #endif
! 4519: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1.1 misha 4520: break;
4521: }
4522: }
4523: break;
4524:
4525: case OP_NOT_HSPACE:
4526: for (i = 1; i <= min; i++)
4527: {
1.4 misha 4528: if (eptr >= md->end_subject)
4529: {
4530: SCHECK_PARTIAL();
1.6 ! misha 4531: RRETURN(MATCH_NOMATCH);
1.4 misha 4532: }
1.1 misha 4533: switch(*eptr++)
4534: {
4535: default: break;
4536: case 0x09: /* HT */
4537: case 0x20: /* SPACE */
4538: case 0xa0: /* NBSP */
1.6 ! misha 4539: #ifdef COMPILE_PCRE16
! 4540: case 0x1680: /* OGHAM SPACE MARK */
! 4541: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
! 4542: case 0x2000: /* EN QUAD */
! 4543: case 0x2001: /* EM QUAD */
! 4544: case 0x2002: /* EN SPACE */
! 4545: case 0x2003: /* EM SPACE */
! 4546: case 0x2004: /* THREE-PER-EM SPACE */
! 4547: case 0x2005: /* FOUR-PER-EM SPACE */
! 4548: case 0x2006: /* SIX-PER-EM SPACE */
! 4549: case 0x2007: /* FIGURE SPACE */
! 4550: case 0x2008: /* PUNCTUATION SPACE */
! 4551: case 0x2009: /* THIN SPACE */
! 4552: case 0x200A: /* HAIR SPACE */
! 4553: case 0x202f: /* NARROW NO-BREAK SPACE */
! 4554: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
! 4555: case 0x3000: /* IDEOGRAPHIC SPACE */
! 4556: #endif
! 4557: RRETURN(MATCH_NOMATCH);
1.1 misha 4558: }
4559: }
4560: break;
4561:
4562: case OP_HSPACE:
4563: for (i = 1; i <= min; i++)
4564: {
1.4 misha 4565: if (eptr >= md->end_subject)
4566: {
4567: SCHECK_PARTIAL();
1.6 ! misha 4568: RRETURN(MATCH_NOMATCH);
1.4 misha 4569: }
1.1 misha 4570: switch(*eptr++)
4571: {
1.6 ! misha 4572: default: RRETURN(MATCH_NOMATCH);
1.1 misha 4573: case 0x09: /* HT */
4574: case 0x20: /* SPACE */
4575: case 0xa0: /* NBSP */
1.6 ! misha 4576: #ifdef COMPILE_PCRE16
! 4577: case 0x1680: /* OGHAM SPACE MARK */
! 4578: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
! 4579: case 0x2000: /* EN QUAD */
! 4580: case 0x2001: /* EM QUAD */
! 4581: case 0x2002: /* EN SPACE */
! 4582: case 0x2003: /* EM SPACE */
! 4583: case 0x2004: /* THREE-PER-EM SPACE */
! 4584: case 0x2005: /* FOUR-PER-EM SPACE */
! 4585: case 0x2006: /* SIX-PER-EM SPACE */
! 4586: case 0x2007: /* FIGURE SPACE */
! 4587: case 0x2008: /* PUNCTUATION SPACE */
! 4588: case 0x2009: /* THIN SPACE */
! 4589: case 0x200A: /* HAIR SPACE */
! 4590: case 0x202f: /* NARROW NO-BREAK SPACE */
! 4591: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
! 4592: case 0x3000: /* IDEOGRAPHIC SPACE */
! 4593: #endif
1.1 misha 4594: break;
4595: }
4596: }
4597: break;
4598:
4599: case OP_NOT_VSPACE:
4600: for (i = 1; i <= min; i++)
4601: {
1.4 misha 4602: if (eptr >= md->end_subject)
4603: {
4604: SCHECK_PARTIAL();
1.6 ! misha 4605: RRETURN(MATCH_NOMATCH);
1.4 misha 4606: }
1.1 misha 4607: switch(*eptr++)
4608: {
4609: default: break;
4610: case 0x0a: /* LF */
4611: case 0x0b: /* VT */
4612: case 0x0c: /* FF */
4613: case 0x0d: /* CR */
4614: case 0x85: /* NEL */
1.6 ! misha 4615: #ifdef COMPILE_PCRE16
! 4616: case 0x2028: /* LINE SEPARATOR */
! 4617: case 0x2029: /* PARAGRAPH SEPARATOR */
! 4618: #endif
! 4619: RRETURN(MATCH_NOMATCH);
1.1 misha 4620: }
4621: }
4622: break;
4623:
4624: case OP_VSPACE:
4625: for (i = 1; i <= min; i++)
4626: {
1.4 misha 4627: if (eptr >= md->end_subject)
4628: {
4629: SCHECK_PARTIAL();
1.6 ! misha 4630: RRETURN(MATCH_NOMATCH);
1.4 misha 4631: }
1.1 misha 4632: switch(*eptr++)
4633: {
1.6 ! misha 4634: default: RRETURN(MATCH_NOMATCH);
1.1 misha 4635: case 0x0a: /* LF */
4636: case 0x0b: /* VT */
4637: case 0x0c: /* FF */
4638: case 0x0d: /* CR */
4639: case 0x85: /* NEL */
1.6 ! misha 4640: #ifdef COMPILE_PCRE16
! 4641: case 0x2028: /* LINE SEPARATOR */
! 4642: case 0x2029: /* PARAGRAPH SEPARATOR */
! 4643: #endif
1.1 misha 4644: break;
4645: }
4646: }
4647: break;
4648:
4649: case OP_NOT_DIGIT:
4650: for (i = 1; i <= min; i++)
1.4 misha 4651: {
4652: if (eptr >= md->end_subject)
4653: {
4654: SCHECK_PARTIAL();
1.6 ! misha 4655: RRETURN(MATCH_NOMATCH);
1.4 misha 4656: }
1.6 ! misha 4657: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0)
! 4658: RRETURN(MATCH_NOMATCH);
! 4659: eptr++;
1.4 misha 4660: }
1.1 misha 4661: break;
4662:
4663: case OP_DIGIT:
4664: for (i = 1; i <= min; i++)
1.4 misha 4665: {
4666: if (eptr >= md->end_subject)
4667: {
4668: SCHECK_PARTIAL();
1.6 ! misha 4669: RRETURN(MATCH_NOMATCH);
1.4 misha 4670: }
1.6 ! misha 4671: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0)
! 4672: RRETURN(MATCH_NOMATCH);
! 4673: eptr++;
1.4 misha 4674: }
1.1 misha 4675: break;
4676:
4677: case OP_NOT_WHITESPACE:
4678: for (i = 1; i <= min; i++)
1.4 misha 4679: {
4680: if (eptr >= md->end_subject)
4681: {
4682: SCHECK_PARTIAL();
1.6 ! misha 4683: RRETURN(MATCH_NOMATCH);
1.4 misha 4684: }
1.6 ! misha 4685: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0)
! 4686: RRETURN(MATCH_NOMATCH);
! 4687: eptr++;
1.4 misha 4688: }
1.1 misha 4689: break;
4690:
4691: case OP_WHITESPACE:
4692: for (i = 1; i <= min; i++)
1.4 misha 4693: {
4694: if (eptr >= md->end_subject)
4695: {
4696: SCHECK_PARTIAL();
1.6 ! misha 4697: RRETURN(MATCH_NOMATCH);
1.4 misha 4698: }
1.6 ! misha 4699: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0)
! 4700: RRETURN(MATCH_NOMATCH);
! 4701: eptr++;
1.4 misha 4702: }
1.1 misha 4703: break;
4704:
4705: case OP_NOT_WORDCHAR:
4706: for (i = 1; i <= min; i++)
1.4 misha 4707: {
4708: if (eptr >= md->end_subject)
4709: {
4710: SCHECK_PARTIAL();
1.6 ! misha 4711: RRETURN(MATCH_NOMATCH);
1.4 misha 4712: }
1.6 ! misha 4713: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0)
! 4714: RRETURN(MATCH_NOMATCH);
! 4715: eptr++;
1.4 misha 4716: }
1.1 misha 4717: break;
4718:
4719: case OP_WORDCHAR:
4720: for (i = 1; i <= min; i++)
1.4 misha 4721: {
4722: if (eptr >= md->end_subject)
4723: {
4724: SCHECK_PARTIAL();
1.6 ! misha 4725: RRETURN(MATCH_NOMATCH);
1.4 misha 4726: }
1.6 ! misha 4727: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0)
! 4728: RRETURN(MATCH_NOMATCH);
! 4729: eptr++;
1.4 misha 4730: }
1.1 misha 4731: break;
4732:
4733: default:
4734: RRETURN(PCRE_ERROR_INTERNAL);
4735: }
4736: }
4737:
4738: /* If min = max, continue at the same level without recursing */
4739:
4740: if (min == max) continue;
4741:
4742: /* If minimizing, we have to test the rest of the pattern before each
4743: subsequent match. Again, separate the UTF-8 case for speed, and also
4744: separate the UCP cases. */
4745:
4746: if (minimize)
4747: {
4748: #ifdef SUPPORT_UCP
4749: if (prop_type >= 0)
4750: {
4751: switch(prop_type)
4752: {
4753: case PT_ANY:
4754: for (fi = min;; fi++)
4755: {
1.6 ! misha 4756: RMATCH(eptr, ecode, offset_top, md, eptrb, RM36);
1.1 misha 4757: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 4758: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 4759: if (eptr >= md->end_subject)
4760: {
4761: SCHECK_PARTIAL();
1.6 ! misha 4762: RRETURN(MATCH_NOMATCH);
1.4 misha 4763: }
4764: GETCHARINCTEST(c, eptr);
1.6 ! misha 4765: if (prop_fail_result) RRETURN(MATCH_NOMATCH);
1.1 misha 4766: }
4767: /* Control never gets here */
4768:
4769: case PT_LAMP:
4770: for (fi = min;; fi++)
4771: {
1.6 ! misha 4772: int chartype;
! 4773: RMATCH(eptr, ecode, offset_top, md, eptrb, RM37);
1.1 misha 4774: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 4775: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 4776: if (eptr >= md->end_subject)
4777: {
4778: SCHECK_PARTIAL();
1.6 ! misha 4779: RRETURN(MATCH_NOMATCH);
1.4 misha 4780: }
4781: GETCHARINCTEST(c, eptr);
1.6 ! misha 4782: chartype = UCD_CHARTYPE(c);
! 4783: if ((chartype == ucp_Lu ||
! 4784: chartype == ucp_Ll ||
! 4785: chartype == ucp_Lt) == prop_fail_result)
! 4786: RRETURN(MATCH_NOMATCH);
1.1 misha 4787: }
4788: /* Control never gets here */
4789:
4790: case PT_GC:
4791: for (fi = min;; fi++)
4792: {
1.6 ! misha 4793: RMATCH(eptr, ecode, offset_top, md, eptrb, RM38);
1.1 misha 4794: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 4795: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 4796: if (eptr >= md->end_subject)
4797: {
4798: SCHECK_PARTIAL();
1.6 ! misha 4799: RRETURN(MATCH_NOMATCH);
1.4 misha 4800: }
4801: GETCHARINCTEST(c, eptr);
1.6 ! misha 4802: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result)
! 4803: RRETURN(MATCH_NOMATCH);
1.1 misha 4804: }
4805: /* Control never gets here */
4806:
4807: case PT_PC:
4808: for (fi = min;; fi++)
4809: {
1.6 ! misha 4810: RMATCH(eptr, ecode, offset_top, md, eptrb, RM39);
1.1 misha 4811: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 4812: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 4813: if (eptr >= md->end_subject)
4814: {
4815: SCHECK_PARTIAL();
1.6 ! misha 4816: RRETURN(MATCH_NOMATCH);
1.4 misha 4817: }
4818: GETCHARINCTEST(c, eptr);
1.6 ! misha 4819: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result)
! 4820: RRETURN(MATCH_NOMATCH);
1.1 misha 4821: }
4822: /* Control never gets here */
4823:
4824: case PT_SC:
4825: for (fi = min;; fi++)
4826: {
1.6 ! misha 4827: RMATCH(eptr, ecode, offset_top, md, eptrb, RM40);
1.1 misha 4828: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 4829: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 4830: if (eptr >= md->end_subject)
4831: {
4832: SCHECK_PARTIAL();
1.6 ! misha 4833: RRETURN(MATCH_NOMATCH);
1.4 misha 4834: }
4835: GETCHARINCTEST(c, eptr);
1.6 ! misha 4836: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result)
! 4837: RRETURN(MATCH_NOMATCH);
1.4 misha 4838: }
4839: /* Control never gets here */
4840:
4841: case PT_ALNUM:
4842: for (fi = min;; fi++)
4843: {
1.6 ! misha 4844: int category;
! 4845: RMATCH(eptr, ecode, offset_top, md, eptrb, RM59);
1.4 misha 4846: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 4847: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 4848: if (eptr >= md->end_subject)
4849: {
4850: SCHECK_PARTIAL();
1.6 ! misha 4851: RRETURN(MATCH_NOMATCH);
1.4 misha 4852: }
4853: GETCHARINCTEST(c, eptr);
1.6 ! misha 4854: category = UCD_CATEGORY(c);
! 4855: if ((category == ucp_L || category == ucp_N) == prop_fail_result)
! 4856: RRETURN(MATCH_NOMATCH);
1.4 misha 4857: }
4858: /* Control never gets here */
4859:
4860: case PT_SPACE: /* Perl space */
4861: for (fi = min;; fi++)
4862: {
1.6 ! misha 4863: RMATCH(eptr, ecode, offset_top, md, eptrb, RM60);
1.4 misha 4864: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 4865: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 4866: if (eptr >= md->end_subject)
4867: {
4868: SCHECK_PARTIAL();
1.6 ! misha 4869: RRETURN(MATCH_NOMATCH);
1.4 misha 4870: }
4871: GETCHARINCTEST(c, eptr);
1.6 ! misha 4872: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
1.4 misha 4873: c == CHAR_FF || c == CHAR_CR)
4874: == prop_fail_result)
1.6 ! misha 4875: RRETURN(MATCH_NOMATCH);
1.4 misha 4876: }
4877: /* Control never gets here */
4878:
4879: case PT_PXSPACE: /* POSIX space */
4880: for (fi = min;; fi++)
4881: {
1.6 ! misha 4882: RMATCH(eptr, ecode, offset_top, md, eptrb, RM61);
1.4 misha 4883: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 4884: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 4885: if (eptr >= md->end_subject)
4886: {
4887: SCHECK_PARTIAL();
1.6 ! misha 4888: RRETURN(MATCH_NOMATCH);
1.4 misha 4889: }
4890: GETCHARINCTEST(c, eptr);
1.6 ! misha 4891: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
1.4 misha 4892: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
4893: == prop_fail_result)
1.6 ! misha 4894: RRETURN(MATCH_NOMATCH);
1.1 misha 4895: }
4896: /* Control never gets here */
4897:
1.4 misha 4898: case PT_WORD:
4899: for (fi = min;; fi++)
4900: {
1.6 ! misha 4901: int category;
! 4902: RMATCH(eptr, ecode, offset_top, md, eptrb, RM62);
1.4 misha 4903: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 4904: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 4905: if (eptr >= md->end_subject)
4906: {
4907: SCHECK_PARTIAL();
1.6 ! misha 4908: RRETURN(MATCH_NOMATCH);
1.4 misha 4909: }
4910: GETCHARINCTEST(c, eptr);
1.6 ! misha 4911: category = UCD_CATEGORY(c);
! 4912: if ((category == ucp_L ||
! 4913: category == ucp_N ||
1.4 misha 4914: c == CHAR_UNDERSCORE)
4915: == prop_fail_result)
1.6 ! misha 4916: RRETURN(MATCH_NOMATCH);
1.4 misha 4917: }
4918: /* Control never gets here */
4919:
4920: /* This should never occur */
4921:
1.1 misha 4922: default:
4923: RRETURN(PCRE_ERROR_INTERNAL);
4924: }
4925: }
4926:
4927: /* Match extended Unicode sequences. We will get here only if the
4928: support is in the binary; otherwise a compile-time error occurs. */
4929:
4930: else if (ctype == OP_EXTUNI)
4931: {
4932: for (fi = min;; fi++)
4933: {
1.6 ! misha 4934: RMATCH(eptr, ecode, offset_top, md, eptrb, RM41);
1.1 misha 4935: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 4936: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 4937: if (eptr >= md->end_subject)
4938: {
4939: SCHECK_PARTIAL();
1.6 ! misha 4940: RRETURN(MATCH_NOMATCH);
1.4 misha 4941: }
1.1 misha 4942: GETCHARINCTEST(c, eptr);
1.6 ! misha 4943: if (UCD_CATEGORY(c) == ucp_M) RRETURN(MATCH_NOMATCH);
1.1 misha 4944: while (eptr < md->end_subject)
4945: {
4946: int len = 1;
1.6 ! misha 4947: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
! 4948: if (UCD_CATEGORY(c) != ucp_M) break;
1.1 misha 4949: eptr += len;
4950: }
4951: }
4952: }
4953: else
4954: #endif /* SUPPORT_UCP */
4955:
1.6 ! misha 4956: #ifdef SUPPORT_UTF
! 4957: if (utf)
1.1 misha 4958: {
4959: for (fi = min;; fi++)
4960: {
1.6 ! misha 4961: RMATCH(eptr, ecode, offset_top, md, eptrb, RM42);
1.1 misha 4962: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 4963: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 4964: if (eptr >= md->end_subject)
4965: {
4966: SCHECK_PARTIAL();
1.6 ! misha 4967: RRETURN(MATCH_NOMATCH);
1.4 misha 4968: }
4969: if (ctype == OP_ANY && IS_NEWLINE(eptr))
1.6 ! misha 4970: RRETURN(MATCH_NOMATCH);
1.1 misha 4971: GETCHARINC(c, eptr);
4972: switch(ctype)
4973: {
4974: case OP_ANY: /* This is the non-NL case */
4975: case OP_ALLANY:
4976: case OP_ANYBYTE:
4977: break;
4978:
4979: case OP_ANYNL:
4980: switch(c)
4981: {
1.6 ! misha 4982: default: RRETURN(MATCH_NOMATCH);
1.1 misha 4983: case 0x000d:
4984: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
4985: break;
4986: case 0x000a:
4987: break;
4988:
4989: case 0x000b:
4990: case 0x000c:
4991: case 0x0085:
4992: case 0x2028:
4993: case 0x2029:
1.6 ! misha 4994: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1.1 misha 4995: break;
4996: }
4997: break;
4998:
4999: case OP_NOT_HSPACE:
5000: switch(c)
5001: {
5002: default: break;
5003: case 0x09: /* HT */
5004: case 0x20: /* SPACE */
5005: case 0xa0: /* NBSP */
5006: case 0x1680: /* OGHAM SPACE MARK */
5007: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5008: case 0x2000: /* EN QUAD */
5009: case 0x2001: /* EM QUAD */
5010: case 0x2002: /* EN SPACE */
5011: case 0x2003: /* EM SPACE */
5012: case 0x2004: /* THREE-PER-EM SPACE */
5013: case 0x2005: /* FOUR-PER-EM SPACE */
5014: case 0x2006: /* SIX-PER-EM SPACE */
5015: case 0x2007: /* FIGURE SPACE */
5016: case 0x2008: /* PUNCTUATION SPACE */
5017: case 0x2009: /* THIN SPACE */
5018: case 0x200A: /* HAIR SPACE */
5019: case 0x202f: /* NARROW NO-BREAK SPACE */
5020: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5021: case 0x3000: /* IDEOGRAPHIC SPACE */
1.6 ! misha 5022: RRETURN(MATCH_NOMATCH);
1.1 misha 5023: }
5024: break;
5025:
5026: case OP_HSPACE:
5027: switch(c)
5028: {
1.6 ! misha 5029: default: RRETURN(MATCH_NOMATCH);
1.1 misha 5030: case 0x09: /* HT */
5031: case 0x20: /* SPACE */
5032: case 0xa0: /* NBSP */
5033: case 0x1680: /* OGHAM SPACE MARK */
5034: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5035: case 0x2000: /* EN QUAD */
5036: case 0x2001: /* EM QUAD */
5037: case 0x2002: /* EN SPACE */
5038: case 0x2003: /* EM SPACE */
5039: case 0x2004: /* THREE-PER-EM SPACE */
5040: case 0x2005: /* FOUR-PER-EM SPACE */
5041: case 0x2006: /* SIX-PER-EM SPACE */
5042: case 0x2007: /* FIGURE SPACE */
5043: case 0x2008: /* PUNCTUATION SPACE */
5044: case 0x2009: /* THIN SPACE */
5045: case 0x200A: /* HAIR SPACE */
5046: case 0x202f: /* NARROW NO-BREAK SPACE */
5047: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5048: case 0x3000: /* IDEOGRAPHIC SPACE */
5049: break;
5050: }
5051: break;
5052:
5053: case OP_NOT_VSPACE:
5054: switch(c)
5055: {
5056: default: break;
5057: case 0x0a: /* LF */
5058: case 0x0b: /* VT */
5059: case 0x0c: /* FF */
5060: case 0x0d: /* CR */
5061: case 0x85: /* NEL */
5062: case 0x2028: /* LINE SEPARATOR */
5063: case 0x2029: /* PARAGRAPH SEPARATOR */
1.6 ! misha 5064: RRETURN(MATCH_NOMATCH);
1.1 misha 5065: }
5066: break;
5067:
5068: case OP_VSPACE:
5069: switch(c)
5070: {
1.6 ! misha 5071: default: RRETURN(MATCH_NOMATCH);
1.1 misha 5072: case 0x0a: /* LF */
5073: case 0x0b: /* VT */
5074: case 0x0c: /* FF */
5075: case 0x0d: /* CR */
5076: case 0x85: /* NEL */
5077: case 0x2028: /* LINE SEPARATOR */
5078: case 0x2029: /* PARAGRAPH SEPARATOR */
5079: break;
5080: }
5081: break;
5082:
5083: case OP_NOT_DIGIT:
5084: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
1.6 ! misha 5085: RRETURN(MATCH_NOMATCH);
1.1 misha 5086: break;
5087:
5088: case OP_DIGIT:
5089: if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
1.6 ! misha 5090: RRETURN(MATCH_NOMATCH);
1.1 misha 5091: break;
5092:
5093: case OP_NOT_WHITESPACE:
5094: if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
1.6 ! misha 5095: RRETURN(MATCH_NOMATCH);
1.1 misha 5096: break;
5097:
5098: case OP_WHITESPACE:
1.6 ! misha 5099: if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
! 5100: RRETURN(MATCH_NOMATCH);
1.1 misha 5101: break;
5102:
5103: case OP_NOT_WORDCHAR:
5104: if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
1.6 ! misha 5105: RRETURN(MATCH_NOMATCH);
1.1 misha 5106: break;
5107:
5108: case OP_WORDCHAR:
5109: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
1.6 ! misha 5110: RRETURN(MATCH_NOMATCH);
1.1 misha 5111: break;
5112:
5113: default:
5114: RRETURN(PCRE_ERROR_INTERNAL);
5115: }
5116: }
5117: }
5118: else
5119: #endif
1.6 ! misha 5120: /* Not UTF mode */
1.1 misha 5121: {
5122: for (fi = min;; fi++)
5123: {
1.6 ! misha 5124: RMATCH(eptr, ecode, offset_top, md, eptrb, RM43);
1.1 misha 5125: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.6 ! misha 5126: if (fi >= max) RRETURN(MATCH_NOMATCH);
1.4 misha 5127: if (eptr >= md->end_subject)
5128: {
5129: SCHECK_PARTIAL();
1.6 ! misha 5130: RRETURN(MATCH_NOMATCH);
1.4 misha 5131: }
5132: if (ctype == OP_ANY && IS_NEWLINE(eptr))
1.6 ! misha 5133: RRETURN(MATCH_NOMATCH);
1.1 misha 5134: c = *eptr++;
5135: switch(ctype)
5136: {
5137: case OP_ANY: /* This is the non-NL case */
5138: case OP_ALLANY:
5139: case OP_ANYBYTE:
5140: break;
5141:
5142: case OP_ANYNL:
5143: switch(c)
5144: {
1.6 ! misha 5145: default: RRETURN(MATCH_NOMATCH);
1.1 misha 5146: case 0x000d:
5147: if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
5148: break;
5149:
5150: case 0x000a:
5151: break;
5152:
5153: case 0x000b:
5154: case 0x000c:
5155: case 0x0085:
1.6 ! misha 5156: #ifdef COMPILE_PCRE16
! 5157: case 0x2028:
! 5158: case 0x2029:
! 5159: #endif
! 5160: if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1.1 misha 5161: break;
5162: }
5163: break;
5164:
5165: case OP_NOT_HSPACE:
5166: switch(c)
5167: {
5168: default: break;
5169: case 0x09: /* HT */
5170: case 0x20: /* SPACE */
5171: case 0xa0: /* NBSP */
1.6 ! misha 5172: #ifdef COMPILE_PCRE16
! 5173: case 0x1680: /* OGHAM SPACE MARK */
! 5174: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
! 5175: case 0x2000: /* EN QUAD */
! 5176: case 0x2001: /* EM QUAD */
! 5177: case 0x2002: /* EN SPACE */
! 5178: case 0x2003: /* EM SPACE */
! 5179: case 0x2004: /* THREE-PER-EM SPACE */
! 5180: case 0x2005: /* FOUR-PER-EM SPACE */
! 5181: case 0x2006: /* SIX-PER-EM SPACE */
! 5182: case 0x2007: /* FIGURE SPACE */
! 5183: case 0x2008: /* PUNCTUATION SPACE */
! 5184: case 0x2009: /* THIN SPACE */
! 5185: case 0x200A: /* HAIR SPACE */
! 5186: case 0x202f: /* NARROW NO-BREAK SPACE */
! 5187: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
! 5188: case 0x3000: /* IDEOGRAPHIC SPACE */
! 5189: #endif
! 5190: RRETURN(MATCH_NOMATCH);
1.1 misha 5191: }
5192: break;
5193:
5194: case OP_HSPACE:
5195: switch(c)
5196: {
1.6 ! misha 5197: default: RRETURN(MATCH_NOMATCH);
1.1 misha 5198: case 0x09: /* HT */
5199: case 0x20: /* SPACE */
5200: case 0xa0: /* NBSP */
1.6 ! misha 5201: #ifdef COMPILE_PCRE16
! 5202: case 0x1680: /* OGHAM SPACE MARK */
! 5203: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
! 5204: case 0x2000: /* EN QUAD */
! 5205: case 0x2001: /* EM QUAD */
! 5206: case 0x2002: /* EN SPACE */
! 5207: case 0x2003: /* EM SPACE */
! 5208: case 0x2004: /* THREE-PER-EM SPACE */
! 5209: case 0x2005: /* FOUR-PER-EM SPACE */
! 5210: case 0x2006: /* SIX-PER-EM SPACE */
! 5211: case 0x2007: /* FIGURE SPACE */
! 5212: case 0x2008: /* PUNCTUATION SPACE */
! 5213: case 0x2009: /* THIN SPACE */
! 5214: case 0x200A: /* HAIR SPACE */
! 5215: case 0x202f: /* NARROW NO-BREAK SPACE */
! 5216: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
! 5217: case 0x3000: /* IDEOGRAPHIC SPACE */
! 5218: #endif
1.1 misha 5219: break;
5220: }
5221: break;
5222:
5223: case OP_NOT_VSPACE:
5224: switch(c)
5225: {
5226: default: break;
5227: case 0x0a: /* LF */
5228: case 0x0b: /* VT */
5229: case 0x0c: /* FF */
5230: case 0x0d: /* CR */
5231: case 0x85: /* NEL */
1.6 ! misha 5232: #ifdef COMPILE_PCRE16
! 5233: case 0x2028: /* LINE SEPARATOR */
! 5234: case 0x2029: /* PARAGRAPH SEPARATOR */
! 5235: #endif
! 5236: RRETURN(MATCH_NOMATCH);
1.1 misha 5237: }
5238: break;
5239:
5240: case OP_VSPACE:
5241: switch(c)
5242: {
1.6 ! misha 5243: default: RRETURN(MATCH_NOMATCH);
1.1 misha 5244: case 0x0a: /* LF */
5245: case 0x0b: /* VT */
5246: case 0x0c: /* FF */
5247: case 0x0d: /* CR */
5248: case 0x85: /* NEL */
1.6 ! misha 5249: #ifdef COMPILE_PCRE16
! 5250: case 0x2028: /* LINE SEPARATOR */
! 5251: case 0x2029: /* PARAGRAPH SEPARATOR */
! 5252: #endif
1.1 misha 5253: break;
5254: }
5255: break;
5256:
5257: case OP_NOT_DIGIT:
1.6 ! misha 5258: if (MAX_255(c) && (md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
1.1 misha 5259: break;
5260:
5261: case OP_DIGIT:
1.6 ! misha 5262: if (!MAX_255(c) || (md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
1.1 misha 5263: break;
5264:
5265: case OP_NOT_WHITESPACE:
1.6 ! misha 5266: if (MAX_255(c) && (md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
1.1 misha 5267: break;
5268:
5269: case OP_WHITESPACE:
1.6 ! misha 5270: if (!MAX_255(c) || (md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
1.1 misha 5271: break;
5272:
5273: case OP_NOT_WORDCHAR:
1.6 ! misha 5274: if (MAX_255(c) && (md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
1.1 misha 5275: break;
5276:
5277: case OP_WORDCHAR:
1.6 ! misha 5278: if (!MAX_255(c) || (md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
1.1 misha 5279: break;
5280:
5281: default:
5282: RRETURN(PCRE_ERROR_INTERNAL);
5283: }
5284: }
5285: }
5286: /* Control never gets here */
5287: }
5288:
5289: /* If maximizing, it is worth using inline code for speed, doing the type
5290: test once at the start (i.e. keep it out of the loop). Again, keep the
5291: UTF-8 and UCP stuff separate. */
5292:
5293: else
5294: {
5295: pp = eptr; /* Remember where we started */
5296:
5297: #ifdef SUPPORT_UCP
5298: if (prop_type >= 0)
5299: {
5300: switch(prop_type)
5301: {
5302: case PT_ANY:
5303: for (i = min; i < max; i++)
5304: {
5305: int len = 1;
1.4 misha 5306: if (eptr >= md->end_subject)
5307: {
5308: SCHECK_PARTIAL();
5309: break;
5310: }
5311: GETCHARLENTEST(c, eptr, len);
1.1 misha 5312: if (prop_fail_result) break;
5313: eptr+= len;
5314: }
5315: break;
5316:
5317: case PT_LAMP:
5318: for (i = min; i < max; i++)
5319: {
1.6 ! misha 5320: int chartype;
1.1 misha 5321: int len = 1;
1.4 misha 5322: if (eptr >= md->end_subject)
5323: {
5324: SCHECK_PARTIAL();
5325: break;
5326: }
5327: GETCHARLENTEST(c, eptr, len);
1.6 ! misha 5328: chartype = UCD_CHARTYPE(c);
! 5329: if ((chartype == ucp_Lu ||
! 5330: chartype == ucp_Ll ||
! 5331: chartype == ucp_Lt) == prop_fail_result)
1.1 misha 5332: break;
5333: eptr+= len;
5334: }
5335: break;
5336:
5337: case PT_GC:
5338: for (i = min; i < max; i++)
5339: {
5340: int len = 1;
1.4 misha 5341: if (eptr >= md->end_subject)
5342: {
5343: SCHECK_PARTIAL();
5344: break;
5345: }
5346: GETCHARLENTEST(c, eptr, len);
1.6 ! misha 5347: if ((UCD_CATEGORY(c) == prop_value) == prop_fail_result) break;
1.1 misha 5348: eptr+= len;
5349: }
5350: break;
5351:
5352: case PT_PC:
5353: for (i = min; i < max; i++)
5354: {
5355: int len = 1;
1.4 misha 5356: if (eptr >= md->end_subject)
5357: {
5358: SCHECK_PARTIAL();
5359: break;
5360: }
5361: GETCHARLENTEST(c, eptr, len);
1.6 ! misha 5362: if ((UCD_CHARTYPE(c) == prop_value) == prop_fail_result) break;
1.1 misha 5363: eptr+= len;
5364: }
5365: break;
5366:
5367: case PT_SC:
5368: for (i = min; i < max; i++)
5369: {
5370: int len = 1;
1.4 misha 5371: if (eptr >= md->end_subject)
5372: {
5373: SCHECK_PARTIAL();
5374: break;
5375: }
5376: GETCHARLENTEST(c, eptr, len);
1.6 ! misha 5377: if ((UCD_SCRIPT(c) == prop_value) == prop_fail_result) break;
1.1 misha 5378: eptr+= len;
5379: }
5380: break;
1.4 misha 5381:
5382: case PT_ALNUM:
5383: for (i = min; i < max; i++)
5384: {
1.6 ! misha 5385: int category;
1.4 misha 5386: int len = 1;
5387: if (eptr >= md->end_subject)
5388: {
5389: SCHECK_PARTIAL();
5390: break;
5391: }
5392: GETCHARLENTEST(c, eptr, len);
1.6 ! misha 5393: category = UCD_CATEGORY(c);
! 5394: if ((category == ucp_L || category == ucp_N) == prop_fail_result)
1.4 misha 5395: break;
5396: eptr+= len;
5397: }
5398: break;
5399:
5400: case PT_SPACE: /* Perl space */
5401: for (i = min; i < max; i++)
5402: {
5403: int len = 1;
5404: if (eptr >= md->end_subject)
5405: {
5406: SCHECK_PARTIAL();
5407: break;
5408: }
5409: GETCHARLENTEST(c, eptr, len);
1.6 ! misha 5410: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
1.4 misha 5411: c == CHAR_FF || c == CHAR_CR)
5412: == prop_fail_result)
5413: break;
5414: eptr+= len;
5415: }
5416: break;
5417:
5418: case PT_PXSPACE: /* POSIX space */
5419: for (i = min; i < max; i++)
5420: {
5421: int len = 1;
5422: if (eptr >= md->end_subject)
5423: {
5424: SCHECK_PARTIAL();
5425: break;
5426: }
5427: GETCHARLENTEST(c, eptr, len);
1.6 ! misha 5428: if ((UCD_CATEGORY(c) == ucp_Z || c == CHAR_HT || c == CHAR_NL ||
1.4 misha 5429: c == CHAR_VT || c == CHAR_FF || c == CHAR_CR)
5430: == prop_fail_result)
5431: break;
5432: eptr+= len;
5433: }
5434: break;
5435:
5436: case PT_WORD:
5437: for (i = min; i < max; i++)
5438: {
1.6 ! misha 5439: int category;
1.4 misha 5440: int len = 1;
5441: if (eptr >= md->end_subject)
5442: {
5443: SCHECK_PARTIAL();
5444: break;
5445: }
5446: GETCHARLENTEST(c, eptr, len);
1.6 ! misha 5447: category = UCD_CATEGORY(c);
! 5448: if ((category == ucp_L || category == ucp_N ||
1.4 misha 5449: c == CHAR_UNDERSCORE) == prop_fail_result)
5450: break;
5451: eptr+= len;
5452: }
5453: break;
5454:
5455: default:
5456: RRETURN(PCRE_ERROR_INTERNAL);
1.1 misha 5457: }
5458:
5459: /* eptr is now past the end of the maximum run */
5460:
5461: if (possessive) continue;
5462: for(;;)
5463: {
1.6 ! misha 5464: RMATCH(eptr, ecode, offset_top, md, eptrb, RM44);
1.1 misha 5465: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5466: if (eptr-- == pp) break; /* Stop if tried at original pos */
1.6 ! misha 5467: if (utf) BACKCHAR(eptr);
1.1 misha 5468: }
5469: }
5470:
5471: /* Match extended Unicode sequences. We will get here only if the
5472: support is in the binary; otherwise a compile-time error occurs. */
5473:
5474: else if (ctype == OP_EXTUNI)
5475: {
5476: for (i = min; i < max; i++)
5477: {
1.6 ! misha 5478: int len = 1;
1.4 misha 5479: if (eptr >= md->end_subject)
5480: {
5481: SCHECK_PARTIAL();
5482: break;
5483: }
1.6 ! misha 5484: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
! 5485: if (UCD_CATEGORY(c) == ucp_M) break;
! 5486: eptr += len;
1.1 misha 5487: while (eptr < md->end_subject)
5488: {
1.6 ! misha 5489: len = 1;
! 5490: if (!utf) c = *eptr; else { GETCHARLEN(c, eptr, len); }
! 5491: if (UCD_CATEGORY(c) != ucp_M) break;
1.1 misha 5492: eptr += len;
5493: }
5494: }
5495:
5496: /* eptr is now past the end of the maximum run */
5497:
5498: if (possessive) continue;
1.4 misha 5499:
1.1 misha 5500: for(;;)
5501: {
1.6 ! misha 5502: RMATCH(eptr, ecode, offset_top, md, eptrb, RM45);
1.1 misha 5503: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5504: if (eptr-- == pp) break; /* Stop if tried at original pos */
5505: for (;;) /* Move back over one extended */
5506: {
1.6 ! misha 5507: if (!utf) c = *eptr; else
1.1 misha 5508: {
5509: BACKCHAR(eptr);
1.6 ! misha 5510: GETCHAR(c, eptr);
1.1 misha 5511: }
1.6 ! misha 5512: if (UCD_CATEGORY(c) != ucp_M) break;
1.1 misha 5513: eptr--;
5514: }
5515: }
5516: }
5517:
5518: else
5519: #endif /* SUPPORT_UCP */
5520:
1.6 ! misha 5521: #ifdef SUPPORT_UTF
! 5522: if (utf)
1.1 misha 5523: {
5524: switch(ctype)
5525: {
5526: case OP_ANY:
5527: if (max < INT_MAX)
5528: {
5529: for (i = min; i < max; i++)
5530: {
1.4 misha 5531: if (eptr >= md->end_subject)
5532: {
5533: SCHECK_PARTIAL();
5534: break;
5535: }
5536: if (IS_NEWLINE(eptr)) break;
1.1 misha 5537: eptr++;
1.6 ! misha 5538: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misha 5539: }
5540: }
5541:
5542: /* Handle unlimited UTF-8 repeat */
5543:
5544: else
5545: {
5546: for (i = min; i < max; i++)
5547: {
1.4 misha 5548: if (eptr >= md->end_subject)
5549: {
5550: SCHECK_PARTIAL();
5551: break;
5552: }
5553: if (IS_NEWLINE(eptr)) break;
1.1 misha 5554: eptr++;
1.6 ! misha 5555: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misha 5556: }
5557: }
5558: break;
5559:
5560: case OP_ALLANY:
5561: if (max < INT_MAX)
5562: {
5563: for (i = min; i < max; i++)
5564: {
1.4 misha 5565: if (eptr >= md->end_subject)
5566: {
5567: SCHECK_PARTIAL();
5568: break;
5569: }
1.1 misha 5570: eptr++;
1.6 ! misha 5571: ACROSSCHAR(eptr < md->end_subject, *eptr, eptr++);
1.1 misha 5572: }
5573: }
1.6 ! misha 5574: else
! 5575: {
! 5576: eptr = md->end_subject; /* Unlimited UTF-8 repeat */
! 5577: SCHECK_PARTIAL();
! 5578: }
1.1 misha 5579: break;
5580:
5581: /* The byte case is the same as non-UTF8 */
5582:
5583: case OP_ANYBYTE:
5584: c = max - min;
5585: if (c > (unsigned int)(md->end_subject - eptr))
1.4 misha 5586: {
5587: eptr = md->end_subject;
5588: SCHECK_PARTIAL();
5589: }
5590: else eptr += c;
1.1 misha 5591: break;
5592:
5593: case OP_ANYNL:
5594: for (i = min; i < max; i++)
5595: {
5596: int len = 1;
1.4 misha 5597: if (eptr >= md->end_subject)
5598: {
5599: SCHECK_PARTIAL();
5600: break;
5601: }
1.1 misha 5602: GETCHARLEN(c, eptr, len);
5603: if (c == 0x000d)
5604: {
5605: if (++eptr >= md->end_subject) break;
5606: if (*eptr == 0x000a) eptr++;
5607: }
5608: else
5609: {
5610: if (c != 0x000a &&
5611: (md->bsr_anycrlf ||
5612: (c != 0x000b && c != 0x000c &&
5613: c != 0x0085 && c != 0x2028 && c != 0x2029)))
5614: break;
5615: eptr += len;
5616: }
5617: }
5618: break;
5619:
5620: case OP_NOT_HSPACE:
5621: case OP_HSPACE:
5622: for (i = min; i < max; i++)
5623: {
5624: BOOL gotspace;
5625: int len = 1;
1.4 misha 5626: if (eptr >= md->end_subject)
5627: {
5628: SCHECK_PARTIAL();
5629: break;
5630: }
1.1 misha 5631: GETCHARLEN(c, eptr, len);
5632: switch(c)
5633: {
5634: default: gotspace = FALSE; break;
5635: case 0x09: /* HT */
5636: case 0x20: /* SPACE */
5637: case 0xa0: /* NBSP */
5638: case 0x1680: /* OGHAM SPACE MARK */
5639: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
5640: case 0x2000: /* EN QUAD */
5641: case 0x2001: /* EM QUAD */
5642: case 0x2002: /* EN SPACE */
5643: case 0x2003: /* EM SPACE */
5644: case 0x2004: /* THREE-PER-EM SPACE */
5645: case 0x2005: /* FOUR-PER-EM SPACE */
5646: case 0x2006: /* SIX-PER-EM SPACE */
5647: case 0x2007: /* FIGURE SPACE */
5648: case 0x2008: /* PUNCTUATION SPACE */
5649: case 0x2009: /* THIN SPACE */
5650: case 0x200A: /* HAIR SPACE */
5651: case 0x202f: /* NARROW NO-BREAK SPACE */
5652: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
5653: case 0x3000: /* IDEOGRAPHIC SPACE */
5654: gotspace = TRUE;
5655: break;
5656: }
5657: if (gotspace == (ctype == OP_NOT_HSPACE)) break;
5658: eptr += len;
5659: }
5660: break;
5661:
5662: case OP_NOT_VSPACE:
5663: case OP_VSPACE:
5664: for (i = min; i < max; i++)
5665: {
5666: BOOL gotspace;
5667: int len = 1;
1.4 misha 5668: if (eptr >= md->end_subject)
5669: {
5670: SCHECK_PARTIAL();
5671: break;
5672: }
1.1 misha 5673: GETCHARLEN(c, eptr, len);
5674: switch(c)
5675: {
5676: default: gotspace = FALSE; break;
5677: case 0x0a: /* LF */
5678: case 0x0b: /* VT */
5679: case 0x0c: /* FF */
5680: case 0x0d: /* CR */
5681: case 0x85: /* NEL */
5682: case 0x2028: /* LINE SEPARATOR */
5683: case 0x2029: /* PARAGRAPH SEPARATOR */
5684: gotspace = TRUE;
5685: break;
5686: }
5687: if (gotspace == (ctype == OP_NOT_VSPACE)) break;
5688: eptr += len;
5689: }
5690: break;
5691:
5692: case OP_NOT_DIGIT:
5693: for (i = min; i < max; i++)
5694: {
5695: int len = 1;
1.4 misha 5696: if (eptr >= md->end_subject)
5697: {
5698: SCHECK_PARTIAL();
5699: break;
5700: }
1.1 misha 5701: GETCHARLEN(c, eptr, len);
5702: if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
5703: eptr+= len;
5704: }
5705: break;
5706:
5707: case OP_DIGIT:
5708: for (i = min; i < max; i++)
5709: {
5710: int len = 1;
1.4 misha 5711: if (eptr >= md->end_subject)
5712: {
5713: SCHECK_PARTIAL();
5714: break;
5715: }
1.1 misha 5716: GETCHARLEN(c, eptr, len);
5717: if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
5718: eptr+= len;
5719: }
5720: break;
5721:
5722: case OP_NOT_WHITESPACE:
5723: for (i = min; i < max; i++)
5724: {
5725: int len = 1;
1.4 misha 5726: if (eptr >= md->end_subject)
5727: {
5728: SCHECK_PARTIAL();
5729: break;
5730: }
1.1 misha 5731: GETCHARLEN(c, eptr, len);
5732: if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
5733: eptr+= len;
5734: }
5735: break;
5736:
5737: case OP_WHITESPACE:
5738: for (i = min; i < max; i++)
5739: {
5740: int len = 1;
1.4 misha 5741: if (eptr >= md->end_subject)
5742: {
5743: SCHECK_PARTIAL();
5744: break;
5745: }
1.1 misha 5746: GETCHARLEN(c, eptr, len);
5747: if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
5748: eptr+= len;
5749: }
5750: break;
5751:
5752: case OP_NOT_WORDCHAR:
5753: for (i = min; i < max; i++)
5754: {
5755: int len = 1;
1.4 misha 5756: if (eptr >= md->end_subject)
5757: {
5758: SCHECK_PARTIAL();
5759: break;
5760: }
1.1 misha 5761: GETCHARLEN(c, eptr, len);
5762: if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
5763: eptr+= len;
5764: }
5765: break;
5766:
5767: case OP_WORDCHAR:
5768: for (i = min; i < max; i++)
5769: {
5770: int len = 1;
1.4 misha 5771: if (eptr >= md->end_subject)
5772: {
5773: SCHECK_PARTIAL();
5774: break;
5775: }
1.1 misha 5776: GETCHARLEN(c, eptr, len);
5777: if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
5778: eptr+= len;
5779: }
5780: break;
5781:
5782: default:
5783: RRETURN(PCRE_ERROR_INTERNAL);
5784: }
5785:
1.6 ! misha 5786: /* eptr is now past the end of the maximum run. If possessive, we are
! 5787: done (no backing up). Otherwise, match at this position; anything other
! 5788: than no match is immediately returned. For nomatch, back up one
! 5789: character, unless we are matching \R and the last thing matched was
! 5790: \r\n, in which case, back up two bytes. */
1.1 misha 5791:
5792: if (possessive) continue;
5793: for(;;)
5794: {
1.6 ! misha 5795: RMATCH(eptr, ecode, offset_top, md, eptrb, RM46);
1.1 misha 5796: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
5797: if (eptr-- == pp) break; /* Stop if tried at original pos */
5798: BACKCHAR(eptr);
1.6 ! misha 5799: if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
! 5800: eptr[-1] == '\r') eptr--;
1.1 misha 5801: }
5802: }
5803: else
1.6 ! misha 5804: #endif /* SUPPORT_UTF */
! 5805: /* Not UTF mode */
1.1 misha 5806: {
5807: switch(ctype)
5808: {
5809: case OP_ANY:
5810: for (i = min; i < max; i++)
5811: {
1.4 misha 5812: if (eptr >= md->end_subject)
5813: {
5814: SCHECK_PARTIAL();
5815: break;
5816: }
5817: if (IS_NEWLINE(eptr)) break;
1.1 misha 5818: eptr++;
5819: }
5820: break;
5821:
5822: case OP_ALLANY:
5823: case OP_ANYBYTE:
5824: c = max - min;
5825: if (c > (unsigned int)(md->end_subject - eptr))
1.4 misha 5826: {
5827: eptr = md->end_subject;
5828: SCHECK_PARTIAL();
5829: }
5830: else eptr += c;
1.1 misha 5831: break;
5832:
5833: case OP_ANYNL:
5834: for (i = min; i < max; i++)
5835: {
1.4 misha 5836: if (eptr >= md->end_subject)
5837: {
5838: SCHECK_PARTIAL();
5839: break;
5840: }
1.1 misha 5841: c = *eptr;
5842: if (c == 0x000d)
5843: {
5844: if (++eptr >= md->end_subject) break;
5845: if (*eptr == 0x000a) eptr++;
5846: }
5847: else
5848: {
1.6 ! misha 5849: if (c != 0x000a && (md->bsr_anycrlf ||
! 5850: (c != 0x000b && c != 0x000c && c != 0x0085
! 5851: #ifdef COMPILE_PCRE16
! 5852: && c != 0x2028 && c != 0x2029
! 5853: #endif
! 5854: ))) break;
1.1 misha 5855: eptr++;
5856: }
5857: }
5858: break;
5859:
5860: case OP_NOT_HSPACE:
5861: for (i = min; i < max; i++)
5862: {
1.4 misha 5863: if (eptr >= md->end_subject)
5864: {
5865: SCHECK_PARTIAL();
5866: break;
5867: }
1.1 misha 5868: c = *eptr;
1.6 ! misha 5869: if (c == 0x09 || c == 0x20 || c == 0xa0
! 5870: #ifdef COMPILE_PCRE16
! 5871: || c == 0x1680 || c == 0x180e || (c >= 0x2000 && c <= 0x200A)
! 5872: || c == 0x202f || c == 0x205f || c == 0x3000
! 5873: #endif
! 5874: ) break;
1.1 misha 5875: eptr++;
5876: }
5877: break;
5878:
5879: case OP_HSPACE:
5880: for (i = min; i < max; i++)
5881: {
1.4 misha 5882: if (eptr >= md->end_subject)
5883: {
5884: SCHECK_PARTIAL();
5885: break;
5886: }
1.1 misha 5887: c = *eptr;
1.6 ! misha 5888: if (c != 0x09 && c != 0x20 && c != 0xa0
! 5889: #ifdef COMPILE_PCRE16
! 5890: && c != 0x1680 && c != 0x180e && (c < 0x2000 || c > 0x200A)
! 5891: && c != 0x202f && c != 0x205f && c != 0x3000
! 5892: #endif
! 5893: ) break;
1.1 misha 5894: eptr++;
5895: }
5896: break;
5897:
5898: case OP_NOT_VSPACE:
5899: for (i = min; i < max; i++)
5900: {
1.4 misha 5901: if (eptr >= md->end_subject)
5902: {
5903: SCHECK_PARTIAL();
5904: break;
5905: }
1.1 misha 5906: c = *eptr;
1.6 ! misha 5907: if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85
! 5908: #ifdef COMPILE_PCRE16
! 5909: || c == 0x2028 || c == 0x2029
! 5910: #endif
! 5911: ) break;
1.1 misha 5912: eptr++;
5913: }
5914: break;
5915:
5916: case OP_VSPACE:
5917: for (i = min; i < max; i++)
5918: {
1.4 misha 5919: if (eptr >= md->end_subject)
5920: {
5921: SCHECK_PARTIAL();
5922: break;
5923: }
1.1 misha 5924: c = *eptr;
1.6 ! misha 5925: if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85
! 5926: #ifdef COMPILE_PCRE16
! 5927: && c != 0x2028 && c != 0x2029
! 5928: #endif
! 5929: ) break;
1.1 misha 5930: eptr++;
5931: }
5932: break;
5933:
5934: case OP_NOT_DIGIT:
5935: for (i = min; i < max; i++)
5936: {
1.4 misha 5937: if (eptr >= md->end_subject)
5938: {
5939: SCHECK_PARTIAL();
1.1 misha 5940: break;
1.4 misha 5941: }
1.6 ! misha 5942: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_digit) != 0) break;
1.1 misha 5943: eptr++;
5944: }
5945: break;
5946:
5947: case OP_DIGIT:
5948: for (i = min; i < max; i++)
5949: {
1.4 misha 5950: if (eptr >= md->end_subject)
5951: {
5952: SCHECK_PARTIAL();
1.1 misha 5953: break;
1.4 misha 5954: }
1.6 ! misha 5955: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_digit) == 0) break;
1.1 misha 5956: eptr++;
5957: }
5958: break;
5959:
5960: case OP_NOT_WHITESPACE:
5961: for (i = min; i < max; i++)
5962: {
1.4 misha 5963: if (eptr >= md->end_subject)
5964: {
5965: SCHECK_PARTIAL();
1.1 misha 5966: break;
1.4 misha 5967: }
1.6 ! misha 5968: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_space) != 0) break;
1.1 misha 5969: eptr++;
5970: }
5971: break;
5972:
5973: case OP_WHITESPACE:
5974: for (i = min; i < max; i++)
5975: {
1.4 misha 5976: if (eptr >= md->end_subject)
5977: {
5978: SCHECK_PARTIAL();
1.1 misha 5979: break;
1.4 misha 5980: }
1.6 ! misha 5981: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_space) == 0) break;
1.1 misha 5982: eptr++;
5983: }
5984: break;
5985:
5986: case OP_NOT_WORDCHAR:
5987: for (i = min; i < max; i++)
5988: {
1.4 misha 5989: if (eptr >= md->end_subject)
5990: {
5991: SCHECK_PARTIAL();
1.1 misha 5992: break;
1.4 misha 5993: }
1.6 ! misha 5994: if (MAX_255(*eptr) && (md->ctypes[*eptr] & ctype_word) != 0) break;
1.1 misha 5995: eptr++;
5996: }
5997: break;
5998:
5999: case OP_WORDCHAR:
6000: for (i = min; i < max; i++)
6001: {
1.4 misha 6002: if (eptr >= md->end_subject)
6003: {
6004: SCHECK_PARTIAL();
1.1 misha 6005: break;
1.4 misha 6006: }
1.6 ! misha 6007: if (!MAX_255(*eptr) || (md->ctypes[*eptr] & ctype_word) == 0) break;
1.1 misha 6008: eptr++;
6009: }
6010: break;
6011:
6012: default:
6013: RRETURN(PCRE_ERROR_INTERNAL);
6014: }
6015:
1.6 ! misha 6016: /* eptr is now past the end of the maximum run. If possessive, we are
! 6017: done (no backing up). Otherwise, match at this position; anything other
! 6018: than no match is immediately returned. For nomatch, back up one
! 6019: character (byte), unless we are matching \R and the last thing matched
! 6020: was \r\n, in which case, back up two bytes. */
1.1 misha 6021:
6022: if (possessive) continue;
6023: while (eptr >= pp)
6024: {
1.6 ! misha 6025: RMATCH(eptr, ecode, offset_top, md, eptrb, RM47);
! 6026: if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1.1 misha 6027: eptr--;
1.6 ! misha 6028: if (ctype == OP_ANYNL && eptr > pp && *eptr == '\n' &&
! 6029: eptr[-1] == '\r') eptr--;
1.1 misha 6030: }
6031: }
6032:
6033: /* Get here if we can't make it match with any permitted repetitions */
6034:
1.6 ! misha 6035: RRETURN(MATCH_NOMATCH);
1.1 misha 6036: }
6037: /* Control never gets here */
6038:
6039: /* There's been some horrible disaster. Arrival here can only mean there is
6040: something seriously wrong in the code above or the OP_xxx definitions. */
6041:
6042: default:
6043: DPRINTF(("Unknown opcode %d\n", *ecode));
6044: RRETURN(PCRE_ERROR_UNKNOWN_OPCODE);
6045: }
6046:
6047: /* Do not stick any code in here without much thought; it is assumed
6048: that "continue" in the code above comes out to here to repeat the main
6049: loop. */
6050:
6051: } /* End of main loop */
6052: /* Control never reaches here */
6053:
6054:
6055: /* When compiling to use the heap rather than the stack for recursive calls to
6056: match(), the RRETURN() macro jumps here. The number that is saved in
6057: frame->Xwhere indicates which label we actually want to return to. */
6058:
6059: #ifdef NO_RECURSE
6060: #define LBL(val) case val: goto L_RM##val;
6061: HEAP_RETURN:
6062: switch (frame->Xwhere)
6063: {
6064: LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
6065: LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
6066: LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
6067: LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
1.6 ! misha 6068: LBL(53) LBL(54) LBL(55) LBL(56) LBL(57) LBL(58) LBL(63) LBL(64)
! 6069: LBL(65) LBL(66)
! 6070: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
! 6071: LBL(21)
! 6072: #endif
! 6073: #ifdef SUPPORT_UTF
! 6074: LBL(16) LBL(18) LBL(20)
! 6075: LBL(22) LBL(23) LBL(28) LBL(30)
1.1 misha 6076: LBL(32) LBL(34) LBL(42) LBL(46)
6077: #ifdef SUPPORT_UCP
6078: LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
1.4 misha 6079: LBL(59) LBL(60) LBL(61) LBL(62)
1.1 misha 6080: #endif /* SUPPORT_UCP */
1.6 ! misha 6081: #endif /* SUPPORT_UTF */
1.1 misha 6082: default:
6083: DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
1.6 ! misha 6084:
! 6085: printf("+++jump error in pcre match: label %d non-existent\n", frame->Xwhere);
! 6086:
1.1 misha 6087: return PCRE_ERROR_INTERNAL;
6088: }
6089: #undef LBL
6090: #endif /* NO_RECURSE */
6091: }
6092:
6093:
6094: /***************************************************************************
6095: ****************************************************************************
6096: RECURSION IN THE match() FUNCTION
6097:
6098: Undefine all the macros that were defined above to handle this. */
6099:
6100: #ifdef NO_RECURSE /* heap-recursion build: drop the macro names set up above for match() */
6101: #undef eptr
6102: #undef ecode
6103: #undef mstart
6104: #undef offset_top
6105: #undef eptrb
6106: #undef flags
6107:
6108: #undef callpat
6109: #undef charptr
6110: #undef data
6111: #undef next
6112: #undef pp
6113: #undef prev
6114: #undef saved_eptr
6115:
6116: #undef new_recursive
6117:
6118: #undef cur_is_word
6119: #undef condition
6120: #undef prev_is_word
6121:
6122: #undef ctype
6123: #undef length
6124: #undef max
6125: #undef min
6126: #undef number
6127: #undef offset
6128: #undef op
6129: #undef save_capture_last
6130: #undef save_offset1
6131: #undef save_offset2
6132: #undef save_offset3
6133: #undef stacksave
6134:
6135: #undef newptrb
6136:
6137: #endif /* NO_RECURSE */
6138:
6139: /* fc and fi are defined as macros whether or not NO_RECURSE is set, so they are always undefined here */
6140:
6141: #undef fc
6142: #undef fi
6143:
6144: /***************************************************************************
6145: ***************************************************************************/
6146:
6147:
6148:
6149: /*************************************************
6150: * Execute a Regular Expression *
6151: *************************************************/
6152:
6153: /* This function applies a compiled re to a subject string and picks out
6154: portions of the string if it matches. Two elements in the vector are set for
6155: each substring: the offsets to the start and end of the substring.
6156:
6157: Arguments:
6158: argument_re points to the compiled expression
6159: extra_data points to extra data or is NULL
6160: subject points to the subject string
6161: length length of subject string (may contain binary zeros)
6162: start_offset where to start in the subject string
6163: options option bits
6164: offsets points to a vector of ints to be filled in with offsets
6165: offsetcount the number of elements in the vector
6166:
6167: Returns: > 0 => success; value is the number of elements filled in
6168: = 0 => success, but offsets is not big enough
6169: -1 => failed to match
6170: < -1 => some kind of unexpected problem
6171: */
6172:
1.6 ! misha 6173: #ifdef COMPILE_PCRE8
1.2 misha 6174: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
1.1 misha 6175: pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
6176: PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
6177: int offsetcount)
1.6 ! misha 6178: #else
! 6179: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
! 6180: pcre16_exec(const pcre16 *argument_re, const pcre16_extra *extra_data,
! 6181: PCRE_SPTR16 subject, int length, int start_offset, int options, int *offsets,
! 6182: int offsetcount)
! 6183: #endif
1.1 misha 6184: {
1.6 ! misha 6185: int rc, ocount, arg_offset_max;
1.1 misha 6186: int newline;
6187: BOOL using_temporary_offsets = FALSE;
6188: BOOL anchored;
6189: BOOL startline;
6190: BOOL firstline;
1.6 ! misha 6191: BOOL utf;
! 6192: BOOL has_first_char = FALSE;
! 6193: BOOL has_req_char = FALSE;
! 6194: pcre_uchar first_char = 0;
! 6195: pcre_uchar first_char2 = 0;
! 6196: pcre_uchar req_char = 0;
! 6197: pcre_uchar req_char2 = 0;
1.1 misha 6198: match_data match_block;
6199: match_data *md = &match_block;
1.6 ! misha 6200: const pcre_uint8 *tables;
! 6201: const pcre_uint8 *start_bits = NULL;
! 6202: PCRE_PUCHAR start_match = (PCRE_PUCHAR)subject + start_offset;
! 6203: PCRE_PUCHAR end_subject;
! 6204: PCRE_PUCHAR start_partial = NULL;
! 6205: PCRE_PUCHAR req_char_ptr = start_match - 1;
1.1 misha 6206:
6207: const pcre_study_data *study;
1.6 ! misha 6208: const REAL_PCRE *re = (const REAL_PCRE *)argument_re;
! 6209:
! 6210: /* Check for the special magic call that measures the size of the stack used
! 6211: per recursive call of match(). */
1.1 misha 6212:
1.6 ! misha 6213: if (re == NULL && extra_data == NULL && subject == NULL && length == -999 &&
! 6214: start_offset == -999)
! 6215: #ifdef NO_RECURSE
! 6216: return -sizeof(heapframe);
! 6217: #else
! 6218: return match(NULL, NULL, NULL, 0, NULL, NULL, 0);
! 6219: #endif
1.1 misha 6220:
6221: /* Plausibility checks */
6222:
6223: if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
1.6 ! misha 6224: if (re == NULL || subject == NULL || (offsets == NULL && offsetcount > 0))
! 6225: return PCRE_ERROR_NULL;
1.1 misha 6226: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
1.5 misha 6227: if (start_offset < 0 || start_offset > length) return PCRE_ERROR_BADOFFSET;
1.1 misha 6228:
1.6 ! misha 6229: /* Check that the first field in the block is the magic number. If it is not,
! 6230: return with PCRE_ERROR_BADMAGIC. However, if the magic number is equal to
! 6231: REVERSED_MAGIC_NUMBER we return with PCRE_ERROR_BADENDIANNESS, which
! 6232: means that the pattern is likely compiled with different endianness. */
! 6233:
! 6234: if (re->magic_number != MAGIC_NUMBER)
! 6235: return re->magic_number == REVERSED_MAGIC_NUMBER?
! 6236: PCRE_ERROR_BADENDIANNESS:PCRE_ERROR_BADMAGIC;
! 6237: if ((re->flags & PCRE_MODE) == 0) return PCRE_ERROR_BADMODE;
! 6238:
! 6239: /* These two settings are used in the code for checking a UTF-8 string that
! 6240: follows immediately afterwards. Other values in the md block are used only
! 6241: during "normal" pcre_exec() processing, not when the JIT support is in use,
! 6242: so they are set up later. */
! 6243:
! 6244: /* PCRE_UTF16 has the same value as PCRE_UTF8. */
! 6245: utf = md->utf = (re->options & PCRE_UTF8) != 0;
! 6246: md->partial = ((options & PCRE_PARTIAL_HARD) != 0)? 2 :
! 6247: ((options & PCRE_PARTIAL_SOFT) != 0)? 1 : 0;
! 6248:
! 6249: /* Check a UTF-8 string if required. Pass back the character offset and error
! 6250: code for an invalid string if a results vector is available. */
! 6251:
! 6252: #ifdef SUPPORT_UTF
! 6253: if (utf && (options & PCRE_NO_UTF8_CHECK) == 0)
! 6254: {
! 6255: int erroroffset;
! 6256: int errorcode = PRIV(valid_utf)((PCRE_PUCHAR)subject, length, &erroroffset);
! 6257: if (errorcode != 0)
! 6258: {
! 6259: if (offsetcount >= 2)
! 6260: {
! 6261: offsets[0] = erroroffset;
! 6262: offsets[1] = errorcode;
! 6263: }
! 6264: #ifdef COMPILE_PCRE16
! 6265: return (errorcode <= PCRE_UTF16_ERR1 && md->partial > 1)?
! 6266: PCRE_ERROR_SHORTUTF16 : PCRE_ERROR_BADUTF16;
! 6267: #else
! 6268: return (errorcode <= PCRE_UTF8_ERR5 && md->partial > 1)?
! 6269: PCRE_ERROR_SHORTUTF8 : PCRE_ERROR_BADUTF8;
! 6270: #endif
! 6271: }
! 6272:
! 6273: /* Check that a start_offset points to the start of a UTF character. */
! 6274: if (start_offset > 0 && start_offset < length &&
! 6275: NOT_FIRSTCHAR(((PCRE_PUCHAR)subject)[start_offset]))
! 6276: return PCRE_ERROR_BADUTF8_OFFSET;
! 6277: }
! 6278: #endif
! 6279:
! 6280: /* If the pattern was successfully studied with JIT support, run the JIT
! 6281: executable instead of the rest of this function. Most options must be set at
! 6282: compile time for the JIT code to be usable. Fall back to the normal code path if
! 6283: an unsupported flag is set. In particular, JIT does not support partial
! 6284: matching. */
! 6285:
! 6286: #ifdef SUPPORT_JIT
! 6287: if (extra_data != NULL
! 6288: && (extra_data->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0
! 6289: && extra_data->executable_jit != NULL
! 6290: && (extra_data->flags & PCRE_EXTRA_TABLES) == 0
! 6291: && (options & ~(PCRE_NO_UTF8_CHECK | PCRE_NOTBOL | PCRE_NOTEOL |
! 6292: PCRE_NOTEMPTY | PCRE_NOTEMPTY_ATSTART)) == 0)
! 6293: return PRIV(jit_exec)(re, extra_data->executable_jit,
! 6294: (const pcre_uchar *)subject, length, start_offset, options,
! 6295: ((extra_data->flags & PCRE_EXTRA_MATCH_LIMIT) == 0)
! 6296: ? MATCH_LIMIT : extra_data->match_limit, offsets, offsetcount);
! 6297: #endif
1.4 misha 6298:
1.6 ! misha 6299: /* Carry on with non-JIT matching. This information is for finding all the
! 6300: numbers associated with a given name, for condition testing. */
! 6301:
! 6302: md->name_table = (pcre_uchar *)re + re->name_table_offset;
1.4 misha 6303: md->name_count = re->name_count;
6304: md->name_entry_size = re->name_entry_size;
6305:
1.1 misha 6306: /* Fish out the optional data from the extra_data structure, first setting
6307: the default values. */
6308:
6309: study = NULL;
6310: md->match_limit = MATCH_LIMIT;
6311: md->match_limit_recursion = MATCH_LIMIT_RECURSION;
6312: md->callout_data = NULL;
6313:
6314: /* The table pointer is always in native byte order. */
6315:
1.6 ! misha 6316: tables = re->tables;
1.1 misha 6317:
6318: if (extra_data != NULL)
6319: {
6320: register unsigned int flags = extra_data->flags;
6321: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
6322: study = (const pcre_study_data *)extra_data->study_data;
6323: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
6324: md->match_limit = extra_data->match_limit;
6325: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
6326: md->match_limit_recursion = extra_data->match_limit_recursion;
6327: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
6328: md->callout_data = extra_data->callout_data;
6329: if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
6330: }
6331:
6332: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
6333: is a feature that makes it possible to save compiled regex and re-use them
6334: in other programs later. */
6335:
1.6 ! misha 6336: if (tables == NULL) tables = PRIV(default_tables);
1.1 misha 6337:
6338: /* Set up other data */
6339:
6340: anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
6341: startline = (re->flags & PCRE_STARTLINE) != 0;
6342: firstline = (re->options & PCRE_FIRSTLINE) != 0;
6343:
6344: /* The code starts after the real_pcre block and the capture name table. */
6345:
1.6 ! misha 6346: md->start_code = (const pcre_uchar *)re + re->name_table_offset +
1.1 misha 6347: re->name_count * re->name_entry_size;
6348:
1.6 ! misha 6349: md->start_subject = (PCRE_PUCHAR)subject;
1.1 misha 6350: md->start_offset = start_offset;
6351: md->end_subject = md->start_subject + length;
6352: end_subject = md->end_subject;
6353:
6354: md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
1.4 misha 6355: md->use_ucp = (re->options & PCRE_UCP) != 0;
1.1 misha 6356: md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
1.6 ! misha 6357: md->ignore_skip_arg = FALSE;
! 6358:
! 6359: /* Some options are unpacked into BOOL variables in the hope that testing
! 6360: them will be faster than individual option bits. */
1.1 misha 6361:
6362: md->notbol = (options & PCRE_NOTBOL) != 0;
6363: md->noteol = (options & PCRE_NOTEOL) != 0;
6364: md->notempty = (options & PCRE_NOTEMPTY) != 0;
1.4 misha 6365: md->notempty_atstart = (options & PCRE_NOTEMPTY_ATSTART) != 0;
1.6 ! misha 6366:
1.1 misha 6367: md->hitend = FALSE;
1.6 ! misha 6368: md->mark = md->nomatch_mark = NULL; /* In case never set */
1.1 misha 6369:
6370: md->recursive = NULL; /* No recursion at top level */
1.6 ! misha 6371: md->hasthen = (re->flags & PCRE_HASTHEN) != 0;
1.1 misha 6372:
6373: md->lcc = tables + lcc_offset;
1.6 ! misha 6374: md->fcc = tables + fcc_offset;
1.1 misha 6375: md->ctypes = tables + ctypes_offset;
6376:
6377: /* Handle different \R options. */
6378:
6379: switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6380: {
6381: case 0:
6382: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
6383: md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
6384: else
6385: #ifdef BSR_ANYCRLF
6386: md->bsr_anycrlf = TRUE;
6387: #else
6388: md->bsr_anycrlf = FALSE;
6389: #endif
6390: break;
6391:
6392: case PCRE_BSR_ANYCRLF:
6393: md->bsr_anycrlf = TRUE;
6394: break;
6395:
6396: case PCRE_BSR_UNICODE:
6397: md->bsr_anycrlf = FALSE;
6398: break;
6399:
6400: default: return PCRE_ERROR_BADNEWLINE;
6401: }
6402:
6403: /* Handle different types of newline. The three bits give eight cases. If
6404: nothing is set at run time, whatever was used at compile time applies. */
6405:
6406: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
6407: (pcre_uint32)options) & PCRE_NEWLINE_BITS)
6408: {
6409: case 0: newline = NEWLINE; break; /* Compile-time default */
1.3 misha 6410: case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6411: case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
1.1 misha 6412: case PCRE_NEWLINE_CR+
1.3 misha 6413: PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
1.1 misha 6414: case PCRE_NEWLINE_ANY: newline = -1; break;
6415: case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6416: default: return PCRE_ERROR_BADNEWLINE;
6417: }
6418:
6419: if (newline == -2)
6420: {
6421: md->nltype = NLTYPE_ANYCRLF;
6422: }
6423: else if (newline < 0)
6424: {
6425: md->nltype = NLTYPE_ANY;
6426: }
6427: else
6428: {
6429: md->nltype = NLTYPE_FIXED;
6430: if (newline > 255)
6431: {
6432: md->nllen = 2;
6433: md->nl[0] = (newline >> 8) & 255;
6434: md->nl[1] = newline & 255;
6435: }
6436: else
6437: {
6438: md->nllen = 1;
6439: md->nl[0] = newline;
6440: }
6441: }
6442:
1.4 misha 6443: /* Partial matching was originally supported only for a restricted set of
6444: regexes; from release 8.00 there are no restrictions, but the bits are still
6445: defined (though never set). So there's no harm in leaving this code. */
1.1 misha 6446:
6447: if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
6448: return PCRE_ERROR_BADPARTIAL;
6449:
6450: /* If the expression has got more back references than the offsets supplied can
6451: hold, we get a temporary chunk of working store to use during the matching.
6452: Otherwise, we can use the vector supplied, rounding down its size to a multiple
6453: of 3. */
6454:
6455: ocount = offsetcount - (offsetcount % 3);
1.6 ! misha 6456: arg_offset_max = (2*ocount)/3;
1.1 misha 6457:
6458: if (re->top_backref > 0 && re->top_backref >= ocount/3)
6459: {
6460: ocount = re->top_backref * 3 + 3;
1.6 ! misha 6461: md->offset_vector = (int *)(PUBL(malloc))(ocount * sizeof(int));
1.1 misha 6462: if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
6463: using_temporary_offsets = TRUE;
6464: DPRINTF(("Got memory to hold back references\n"));
6465: }
6466: else md->offset_vector = offsets;
6467:
6468: md->offset_end = ocount;
6469: md->offset_max = (2*ocount)/3;
6470: md->offset_overflow = FALSE;
6471: md->capture_last = -1;
6472:
6473: /* Reset the working variable associated with each extraction. These should
6474: never be used unless previously set, but they get saved and restored, and so we
1.6 ! misha 6475: initialize them to avoid reading uninitialized locations. Also, unset the
! 6476: offsets for the matched string. This is really just for tidiness with callouts,
! 6477: in case they inspect these fields. */
1.1 misha 6478:
6479: if (md->offset_vector != NULL)
6480: {
6481: register int *iptr = md->offset_vector + ocount;
1.6 ! misha 6482: register int *iend = iptr - re->top_bracket;
! 6483: if (iend < md->offset_vector + 2) iend = md->offset_vector + 2;
1.1 misha 6484: while (--iptr >= iend) *iptr = -1;
1.6 ! misha 6485: md->offset_vector[0] = md->offset_vector[1] = -1;
1.1 misha 6486: }
6487:
1.6 ! misha 6488: /* Set up the first character to match, if available. The first_char value is
1.1 misha 6489: never set for an anchored regular expression, but the anchoring may be forced
6490: at run time, so we have to test for anchoring. The first char may be unset for
6491: an unanchored pattern, of course. If there's no first char and the pattern was
6492: studied, there may be a bitmap of possible first characters. */
6493:
6494: if (!anchored)
6495: {
6496: if ((re->flags & PCRE_FIRSTSET) != 0)
6497: {
1.6 ! misha 6498: has_first_char = TRUE;
! 6499: first_char = first_char2 = (pcre_uchar)(re->first_char);
! 6500: if ((re->flags & PCRE_FCH_CASELESS) != 0)
! 6501: {
! 6502: first_char2 = TABLE_GET(first_char, md->fcc, first_char);
! 6503: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
! 6504: if (utf && first_char > 127)
! 6505: first_char2 = UCD_OTHERCASE(first_char);
! 6506: #endif
! 6507: }
1.1 misha 6508: }
6509: else
6510: if (!startline && study != NULL &&
1.4 misha 6511: (study->flags & PCRE_STUDY_MAPPED) != 0)
1.1 misha 6512: start_bits = study->start_bits;
6513: }
6514:
6515: /* For anchored or unanchored matches, there may be a "last known required
6516: character" set. */
6517:
6518: if ((re->flags & PCRE_REQCHSET) != 0)
6519: {
1.6 ! misha 6520: has_req_char = TRUE;
! 6521: req_char = req_char2 = (pcre_uchar)(re->req_char);
! 6522: if ((re->flags & PCRE_RCH_CASELESS) != 0)
! 6523: {
! 6524: req_char2 = TABLE_GET(req_char, md->fcc, req_char);
! 6525: #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
! 6526: if (utf && req_char > 127)
! 6527: req_char2 = UCD_OTHERCASE(req_char);
! 6528: #endif
! 6529: }
1.1 misha 6530: }
6531:
6532:
6533: /* ==========================================================================*/
6534:
6535: /* Loop for handling unanchored repeated matching attempts; for anchored regexes
6536: the loop runs just once. */
6537:
6538: for(;;)
6539: {
1.6 ! misha 6540: PCRE_PUCHAR save_end_subject = end_subject;
! 6541: PCRE_PUCHAR new_start_match;
1.1 misha 6542:
1.3 misha 6543: /* If firstline is TRUE, the start of the match is constrained to the first
6544: line of a multiline string. That is, the match must be before or at the first
6545: newline. Implement this by temporarily adjusting end_subject so that we stop
6546: scanning at a newline. If the match fails at the newline, later code breaks
6547: this loop. */
1.1 misha 6548:
6549: if (firstline)
6550: {
1.6 ! misha 6551: PCRE_PUCHAR t = start_match;
! 6552: #ifdef SUPPORT_UTF
! 6553: if (utf)
1.2 misha 6554: {
6555: while (t < md->end_subject && !IS_NEWLINE(t))
6556: {
6557: t++;
1.6 ! misha 6558: ACROSSCHAR(t < end_subject, *t, t++);
1.2 misha 6559: }
6560: }
6561: else
6562: #endif
1.1 misha 6563: while (t < md->end_subject && !IS_NEWLINE(t)) t++;
6564: end_subject = t;
6565: }
6566:
1.3 misha 6567: /* There are some optimizations that avoid running the match if a known
6568: starting point is not found, or if a known later character is not present.
6569: However, there is an option that disables these, for testing and for ensuring
1.5 misha 6570: that all callouts do actually occur. The option can be set in the regex by
6571: (*NO_START_OPT) or passed in match-time options. */
1.1 misha 6572:
1.5 misha 6573: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0)
1.1 misha 6574: {
1.6 ! misha 6575: /* Advance to a unique first char if there is one. */
1.3 misha 6576:
1.6 ! misha 6577: if (has_first_char)
1.3 misha 6578: {
1.6 ! misha 6579: if (first_char != first_char2)
! 6580: while (start_match < end_subject &&
! 6581: *start_match != first_char && *start_match != first_char2)
1.3 misha 6582: start_match++;
6583: else
1.6 ! misha 6584: while (start_match < end_subject && *start_match != first_char)
1.3 misha 6585: start_match++;
6586: }
1.1 misha 6587:
1.3 misha 6588: /* Or to just after a linebreak for a multiline match */
1.1 misha 6589:
1.3 misha 6590: else if (startline)
1.1 misha 6591: {
1.3 misha 6592: if (start_match > md->start_subject + start_offset)
6593: {
1.6 ! misha 6594: #ifdef SUPPORT_UTF
! 6595: if (utf)
1.2 misha 6596: {
1.3 misha 6597: while (start_match < end_subject && !WAS_NEWLINE(start_match))
6598: {
1.2 misha 6599: start_match++;
1.6 ! misha 6600: ACROSSCHAR(start_match < end_subject, *start_match,
! 6601: start_match++);
1.3 misha 6602: }
1.2 misha 6603: }
1.3 misha 6604: else
1.2 misha 6605: #endif
1.3 misha 6606: while (start_match < end_subject && !WAS_NEWLINE(start_match))
6607: start_match++;
1.1 misha 6608:
1.3 misha 6609: /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
6610: and we are now at a LF, advance the match position by one more character.
6611: */
6612:
6613: if (start_match[-1] == CHAR_CR &&
6614: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
6615: start_match < end_subject &&
6616: *start_match == CHAR_NL)
6617: start_match++;
6618: }
1.1 misha 6619: }
6620:
1.3 misha 6621: /* Or to a non-unique first byte after study */
1.1 misha 6622:
1.3 misha 6623: else if (start_bits != NULL)
1.1 misha 6624: {
1.3 misha 6625: while (start_match < end_subject)
6626: {
6627: register unsigned int c = *start_match;
1.6 ! misha 6628: #ifndef COMPILE_PCRE8
! 6629: if (c > 255) c = 255;
! 6630: #endif
1.4 misha 6631: if ((start_bits[c/8] & (1 << (c&7))) == 0)
6632: {
6633: start_match++;
1.6 ! misha 6634: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
! 6635: /* In non 8-bit mode, the iteration will stop for
! 6636: characters > 255 at the beginning or not stop at all. */
! 6637: if (utf)
! 6638: ACROSSCHAR(start_match < end_subject, *start_match,
! 6639: start_match++);
1.4 misha 6640: #endif
6641: }
6642: else break;
1.3 misha 6643: }
1.1 misha 6644: }
1.3 misha 6645: } /* Starting optimizations */
1.1 misha 6646:
6647: /* Restore fudged end_subject */
6648:
6649: end_subject = save_end_subject;
6650:
1.4 misha 6651: /* The following two optimizations are disabled for partial matching or if
6652: disabling is explicitly requested. */
1.1 misha 6653:
1.6 ! misha 6654: if (((options | re->options) & PCRE_NO_START_OPTIMIZE) == 0 && !md->partial)
1.4 misha 6655: {
6656: /* If the pattern was studied, a minimum subject length may be set. This is
6657: a lower bound; no actual string of that length may actually match the
6658: pattern. Although the value is, strictly, in characters, we treat it as
6659: bytes to avoid spending too much time in this optimization. */
1.1 misha 6660:
1.4 misha 6661: if (study != NULL && (study->flags & PCRE_STUDY_MINLEN) != 0 &&
6662: (pcre_uint32)(end_subject - start_match) < study->minlength)
6663: {
6664: rc = MATCH_NOMATCH;
6665: break;
6666: }
1.1 misha 6667:
1.6 ! misha 6668: /* If req_char is set, we know that that character must appear in the
! 6669: subject for the match to succeed. If the first character is set, req_char
1.4 misha 6670: must be later in the subject; otherwise the test starts at the match point.
6671: This optimization can save a huge amount of backtracking in patterns with
6672: nested unlimited repeats that aren't going to match. Writing separate code
6673: for cased/caseless versions makes it go faster, as does using an
6674: autoincrement and backing off on a match.
1.1 misha 6675:
1.4 misha 6676: HOWEVER: when the subject string is very, very long, searching to its end
6677: can take a long time, and give bad performance on quite ordinary patterns.
6678: This showed up when somebody was matching something like /^\d+C/ on a
6679: 32-megabyte string... so we don't do this when the string is sufficiently
6680: long. */
1.1 misha 6681:
1.6 ! misha 6682: if (has_req_char && end_subject - start_match < REQ_BYTE_MAX)
1.1 misha 6683: {
1.6 ! misha 6684: register PCRE_PUCHAR p = start_match + (has_first_char? 1:0);
1.4 misha 6685:
6686: /* We don't need to repeat the search if we haven't yet reached the
6687: place we found it at last time. */
6688:
1.6 ! misha 6689: if (p > req_char_ptr)
1.1 misha 6690: {
1.6 ! misha 6691: if (req_char != req_char2)
1.1 misha 6692: {
1.4 misha 6693: while (p < end_subject)
6694: {
6695: register int pp = *p++;
1.6 ! misha 6696: if (pp == req_char || pp == req_char2) { p--; break; }
1.4 misha 6697: }
1.1 misha 6698: }
1.4 misha 6699: else
1.1 misha 6700: {
1.4 misha 6701: while (p < end_subject)
6702: {
1.6 ! misha 6703: if (*p++ == req_char) { p--; break; }
1.4 misha 6704: }
1.1 misha 6705: }
6706:
1.4 misha 6707: /* If we can't find the required character, break the matching loop,
6708: forcing a match failure. */
1.1 misha 6709:
1.4 misha 6710: if (p >= end_subject)
6711: {
6712: rc = MATCH_NOMATCH;
6713: break;
6714: }
1.1 misha 6715:
1.4 misha 6716: /* If we have found the required character, save the point where we
6717: found it, so that we don't search again next time round the loop if
6718: the start hasn't passed this character yet. */
1.1 misha 6719:
1.6 ! misha 6720: req_char_ptr = p;
1.4 misha 6721: }
1.1 misha 6722: }
6723: }
6724:
1.4 misha 6725: #ifdef PCRE_DEBUG /* Sigh. Some compilers never learn. */
6726: printf(">>>> Match against: ");
6727: pchars(start_match, end_subject - start_match, TRUE, md);
6728: printf("\n");
6729: #endif
6730:
6731: /* OK, we can now run the match. If "hitend" is set afterwards, remember the
6732: first starting point for which a partial match was found. */
1.1 misha 6733:
6734: md->start_match_ptr = start_match;
1.4 misha 6735: md->start_used_ptr = start_match;
1.1 misha 6736: md->match_call_count = 0;
1.6 ! misha 6737: md->match_function_type = 0;
! 6738: md->end_offset_top = 0;
! 6739: rc = match(start_match, md->start_code, start_match, 2, md, NULL, 0);
1.4 misha 6740: if (md->hitend && start_partial == NULL) start_partial = md->start_used_ptr;
1.1 misha 6741:
6742: switch(rc)
6743: {
1.6 ! misha 6744: /* If MATCH_SKIP_ARG reaches this level it means that a MARK that matched
! 6745: the SKIP's arg was not found. In this circumstance, Perl ignores the SKIP
! 6746: entirely. The only way we can do that is to re-do the match at the same
! 6747: point, with a flag to force SKIP with an argument to be ignored. Just
! 6748: treating this case as NOMATCH does not work because it does not check other
! 6749: alternatives in patterns such as A(*SKIP:A)B|AC when the subject is AC. */
! 6750:
! 6751: case MATCH_SKIP_ARG:
! 6752: new_start_match = start_match;
! 6753: md->ignore_skip_arg = TRUE;
! 6754: break;
! 6755:
1.4 misha 6756: /* SKIP passes back the next starting point explicitly, but if it is the
6757: same as the match we have just done, treat it as NOMATCH. */
6758:
6759: case MATCH_SKIP:
6760: if (md->start_match_ptr != start_match)
6761: {
6762: new_start_match = md->start_match_ptr;
6763: break;
6764: }
6765: /* Fall through */
6766:
1.1 misha 6767: /* NOMATCH and PRUNE advance by one character. THEN at this level acts
1.6 ! misha 6768: exactly like PRUNE. Unset the ignore SKIP-with-argument flag. */
1.1 misha 6769:
6770: case MATCH_NOMATCH:
6771: case MATCH_PRUNE:
6772: case MATCH_THEN:
1.6 ! misha 6773: md->ignore_skip_arg = FALSE;
1.1 misha 6774: new_start_match = start_match + 1;
1.6 ! misha 6775: #ifdef SUPPORT_UTF
! 6776: if (utf)
! 6777: ACROSSCHAR(new_start_match < end_subject, *new_start_match,
! 6778: new_start_match++);
1.1 misha 6779: #endif
6780: break;
6781:
6782: /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
6783:
6784: case MATCH_COMMIT:
6785: rc = MATCH_NOMATCH;
6786: goto ENDLOOP;
6787:
1.4 misha 6788: /* Any other return is either a match, or some kind of error. */
1.1 misha 6789:
6790: default:
6791: goto ENDLOOP;
6792: }
6793:
6794: /* Control reaches here for the various types of "no match at this point"
6795: result. Reset the code to MATCH_NOMATCH for subsequent checking. */
6796:
6797: rc = MATCH_NOMATCH;
6798:
6799: /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
6800: newline in the subject (though it may continue over the newline). Therefore,
6801: if we have just failed to match, starting at a newline, do not continue. */
6802:
6803: if (firstline && IS_NEWLINE(start_match)) break;
6804:
6805: /* Advance to new matching position */
6806:
6807: start_match = new_start_match;
6808:
6809: /* Break the loop if the pattern is anchored or if we have passed the end of
6810: the subject. */
6811:
6812: if (anchored || start_match > end_subject) break;
6813:
6814: /* If we have just passed a CR and we are now at a LF, and the pattern does
6815: not contain any explicit matches for \r or \n, and the newline option is CRLF
1.6 ! misha 6816: or ANY or ANYCRLF, advance the match position by one more character. In
! 6817: normal matching start_match will always be greater than the first position at
! 6818: this stage, but a failed *SKIP can cause a return at the same point, which is
! 6819: why the first test exists. */
1.1 misha 6820:
1.6 ! misha 6821: if (start_match > (PCRE_PUCHAR)subject + start_offset &&
! 6822: start_match[-1] == CHAR_CR &&
1.1 misha 6823: start_match < end_subject &&
1.3 misha 6824: *start_match == CHAR_NL &&
1.1 misha 6825: (re->flags & PCRE_HASCRORLF) == 0 &&
6826: (md->nltype == NLTYPE_ANY ||
6827: md->nltype == NLTYPE_ANYCRLF ||
6828: md->nllen == 2))
6829: start_match++;
6830:
1.4 misha 6831: md->mark = NULL; /* Reset for start of next match attempt */
6832: } /* End of for(;;) "bumpalong" loop */
1.1 misha 6833:
6834: /* ==========================================================================*/
6835:
6836: /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
6837: conditions is true:
6838:
6839: (1) The pattern is anchored or the match was failed by (*COMMIT);
6840:
6841: (2) We are past the end of the subject;
6842:
6843: (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
6844: this option requests that a match occur at or before the first newline in
6845: the subject.
6846:
6847: When we have a match and the offset vector is big enough to deal with any
6848: backreferences, captured substring offsets will already be set up. In the case
6849: where we had to get some local store to hold offsets for backreference
6850: processing, copy those that we can. In this case there need not be overflow if
6851: certain parts of the pattern were not used, even though there are more
6852: capturing parentheses than vector slots. */
6853:
6854: ENDLOOP:
6855:
1.4 misha 6856: if (rc == MATCH_MATCH || rc == MATCH_ACCEPT)
1.1 misha 6857: {
6858: if (using_temporary_offsets)
6859: {
1.6 ! misha 6860: if (arg_offset_max >= 4)
1.1 misha 6861: {
6862: memcpy(offsets + 2, md->offset_vector + 2,
1.6 ! misha 6863: (arg_offset_max - 2) * sizeof(int));
1.1 misha 6864: DPRINTF(("Copied offsets from temporary memory\n"));
6865: }
1.6 ! misha 6866: if (md->end_offset_top > arg_offset_max) md->offset_overflow = TRUE;
1.1 misha 6867: DPRINTF(("Freeing temporary memory\n"));
1.6 ! misha 6868: (PUBL(free))(md->offset_vector);
1.1 misha 6869: }
6870:
1.6 ! misha 6871: /* Set the return code to the number of captured strings, or 0 if there were
1.1 misha 6872: too many to fit into the vector. */
6873:
1.6 ! misha 6874: rc = (md->offset_overflow && md->end_offset_top >= arg_offset_max)?
! 6875: 0 : md->end_offset_top/2;
! 6876:
! 6877: /* If there is space in the offset vector, set any unused pairs at the end of
! 6878: the pattern to -1 for backwards compatibility. It is documented that this
! 6879: happens. In earlier versions, the whole set of potential capturing offsets
! 6880: was set to -1 each time round the loop, but this is handled differently now.
! 6881: "Gaps" are set to -1 dynamically instead (this fixes a bug). Thus, it is only
! 6882: those at the end that need unsetting here. We can't just unset them all at
! 6883: the start of the whole thing because they may get set in one branch that is
! 6884: not the final matching branch. */
! 6885:
! 6886: if (md->end_offset_top/2 <= re->top_bracket && offsets != NULL)
! 6887: {
! 6888: register int *iptr, *iend;
! 6889: int resetcount = 2 + re->top_bracket * 2;
! 6890: if (resetcount > offsetcount) resetcount = ocount;
! 6891: iptr = offsets + md->end_offset_top;
! 6892: iend = offsets + resetcount;
! 6893: while (iptr < iend) *iptr++ = -1;
! 6894: }
1.1 misha 6895:
6896: /* If there is space, set up the whole thing as substring 0. The value of
6897: md->start_match_ptr might be modified if \K was encountered on the successful
6898: matching path. */
6899:
6900: if (offsetcount < 2) rc = 0; else
6901: {
1.4 misha 6902: offsets[0] = (int)(md->start_match_ptr - md->start_subject);
6903: offsets[1] = (int)(md->end_match_ptr - md->start_subject);
1.1 misha 6904: }
6905:
1.6 ! misha 6906: /* Return MARK data if requested */
! 6907:
! 6908: if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
! 6909: *(extra_data->mark) = (pcre_uchar *)md->mark;
1.1 misha 6910: DPRINTF((">>>> returning %d\n", rc));
1.6 ! misha 6911: return rc;
1.1 misha 6912: }
6913:
6914: /* Control gets here if there has been an error, or if the overall match
6915: attempt has failed at all permitted starting positions. */
6916:
6917: if (using_temporary_offsets)
6918: {
6919: DPRINTF(("Freeing temporary memory\n"));
1.6 ! misha 6920: (PUBL(free))(md->offset_vector);
1.1 misha 6921: }
6922:
1.4 misha 6923: /* For anything other than nomatch or partial match, just return the code. */
6924:
6925: if (rc != MATCH_NOMATCH && rc != PCRE_ERROR_PARTIAL)
1.1 misha 6926: {
6927: DPRINTF((">>>> error: returning %d\n", rc));
6928: return rc;
6929: }
1.4 misha 6930:
6931: /* Handle partial matches - disable any mark data */
6932:
6933: if (start_partial != NULL)
1.1 misha 6934: {
6935: DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
1.4 misha 6936: md->mark = NULL;
6937: if (offsetcount > 1)
6938: {
1.6 ! misha 6939: offsets[0] = (int)(start_partial - (PCRE_PUCHAR)subject);
! 6940: offsets[1] = (int)(end_subject - (PCRE_PUCHAR)subject);
1.4 misha 6941: }
6942: rc = PCRE_ERROR_PARTIAL;
1.1 misha 6943: }
1.4 misha 6944:
6945: /* This is the classic nomatch case */
6946:
1.1 misha 6947: else
6948: {
6949: DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
1.4 misha 6950: rc = PCRE_ERROR_NOMATCH;
1.1 misha 6951: }
1.4 misha 6952:
6953: /* Return the MARK data if it has been requested. */
6954:
6955: if (extra_data != NULL && (extra_data->flags & PCRE_EXTRA_MARK) != 0)
1.6 ! misha 6956: *(extra_data->mark) = (pcre_uchar *)md->nomatch_mark;
1.4 misha 6957: return rc;
1.1 misha 6958: }
6959:
6960: /* End of pcre_exec.c */
E-mail: