Annotation of parser3/src/pcre/pcre.c, revision 1.2
1.1 paf 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /*
6: This is a library of functions to support regular expressions whose syntax
7: and semantics are as close as possible to those of the Perl 5 language. See
8: the file Tech.Notes for some information on the internals.
9:
10: Written by: Philip Hazel <ph10@cam.ac.uk>
11:
12: Copyright (c) 1997-1999 University of Cambridge
13:
14: -----------------------------------------------------------------------------
15: Permission is granted to anyone to use this software for any purpose on any
16: computer system, and to redistribute it freely, subject to the following
17: restrictions:
18:
19: 1. This software is distributed in the hope that it will be useful,
20: but WITHOUT ANY WARRANTY; without even the implied warranty of
21: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22:
23: 2. The origin of this software must not be misrepresented, either by
24: explicit claim or by omission.
25:
26: 3. Altered versions must be plainly marked as such, and must not be
27: misrepresented as being the original software.
28:
29: 4. If PCRE is embedded in any software that is released under the GNU
30: General Purpose Licence (GPL), then the terms of that licence shall
31: supersede any condition above with which it is incompatible.
32: -----------------------------------------------------------------------------
33: */
34:
35:
36: /* Define DEBUG to get debugging output on stdout. */
37:
38: /* #define DEBUG */
39:
40: /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
41: inline, and there are *still* stupid compilers about that don't like indented
42: pre-processor statements. I suppose it's only been 10 years... */
43:
44: #ifdef DEBUG
45: #define DPRINTF(p) printf p
46: #else
47: #define DPRINTF(p) /*nothing*/
48: #endif
49:
50: /* Include the internals header, which itself includes Standard C headers plus
51: the external pcre header. */
52:
53: #include "internal.h"
54:
55:
56: /* Allow compilation as C++ source code, should anybody want to do that. */
57:
58: #ifdef __cplusplus
59: #define class pcre_class
60: #endif
61:
62:
63: /* Number of items on the nested bracket stacks at compile time. This should
64: not be set greater than 200. */
65:
66: #define BRASTACK_SIZE 200
67:
68:
/* Minimum and maximum repeat counts for the six common repeat operators. The
entries appear to follow the order  *  *?  +  +?  ?  ??  in which those
operators are listed in OP_names. In rep_max a value of 0 means "no upper
limit". */

static const char rep_min[] = {
  0,    /* *  */
  0,    /* *? */
  1,    /* +  */
  1,    /* +? */
  0,    /* ?  */
  0     /* ?? */
};

static const char rep_max[] = {
  0,    /* *  : unlimited */
  0,    /* *? : unlimited */
  0,    /* +  : unlimited */
  0,    /* +? : unlimited */
  1,    /* ?  */
  1     /* ?? */
};
73:
/* Printable names for the OP_ values, compiled only when DEBUG is defined.
The table is positional, so the order of the strings must not be changed.
Not every entry is actually used. */

#ifdef DEBUG
static const char *OP_names[] = {
  /* End marker, simple escapes and assertions */
  "End",
  "\\A", "\\B", "\\b", "\\D", "\\d", "\\S",
  "\\s", "\\W", "\\w", "\\Z", "\\z",

  /* Option setting, anchors, any-char, literal strings, negated char */
  "Opt", "^", "$", "Any", "chars", "not",

  /* Three groups of nine repeat opcodes */
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",

  /* One group of eight repeat opcodes */
  "*", "*?", "+", "+?", "?", "??", "{", "{",

  /* Character class and back reference */
  "class", "Ref",

  /* Bracket structure and assertions */
  "Alt", "Ket", "KetRmax", "KetRmin",
  "Assert", "Assert not", "AssertB", "AssertB not",
  "Reverse", "Once", "Cond", "Cref",
  "Brazero", "Braminzero", "Bra"
};
#endif
91:
92: /* Table for handling escaped characters in the range '0'-'z'. Positive returns
93: are simple data values; negative values are for special things like \d and so
94: on. Zero means further processing is needed (for things like \x), or the escape
95: is invalid. */
96:
97: static const short int escapes[] = {
98: 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
99: 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
100: '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
101: 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
102: 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
103: 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
104: '`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
105: 0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
106: 0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
107: 0, 0, -ESC_z /* x - z */
108: };
109:
110: /* Definition to allow mutual recursion */
111:
112: static BOOL
113: compile_regex(int, int, int *, uschar **, const uschar **, const char **,
114: BOOL, int, int *, int *, compile_data *);
115:
116:
117:
118: /*************************************************
119: * Global variables *
120: *************************************************/
121:
/* PCRE is thread-clean and keeps no global state in the usual sense. The one
exception is this pair of function pointers, through which all memory
allocation and freeing is done. The caller may point them at different
functions, but they are shared between all threads. */

void *(*pcre_malloc)(size_t) = malloc;
void  (*pcre_free)(void *) = free;
129:
130:
131:
132:
133: /*************************************************
134: * Default character tables *
135: *************************************************/
136:
137: /* A default set of character tables is included in the PCRE binary. Its source
138: is built by the maketables auxiliary program, which uses the default C ctypes
139: functions, and put in the file chartables.c. These tables are used by PCRE
140: whenever the caller of pcre_compile() does not provide an alternate set of
141: tables. */
142:
143: #include "chartables.c"
144:
145:
146:
147: /*************************************************
148: * Return version string *
149: *************************************************/
150:
/* Two-level stringification: XSTRING() expands its argument first and then
STRING() turns the result into a string literal. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* Return a pointer to a static string of the form "<major>.<minor> <date>",
assembled at compile time from PCRE_MAJOR, PCRE_MINOR and PCRE_DATE. */

const char *
pcre_version(void)
{
  static const char version_text[] =
    XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
  return version_text;
}
159:
160:
161:
162:
163: /*************************************************
164: * Return info about a compiled pattern *
165: *************************************************/
166:
167: /* This function picks potentially useful data out of the private
168: structure. The public options are passed back in an int - though the
169: re->options field has been expanded to a long int, all the public options
170: at the low end of it, and so even on 16-bit systems this will still be OK.
171: Therefore, I haven't changed the API for pcre_info().
172:
173: Arguments:
174: external_re points to compiled code
175: optptr where to pass back the options
176: first_char where to pass back the first character,
177: or -1 if multiline and all branches start ^,
178: or -2 otherwise
179:
180: Returns: number of identifying extraction brackets
181: or negative values on error
182: */
183:
184: int
185: pcre_info(const pcre *external_re, int *optptr, int *first_char)
186: {
187: const real_pcre *re = (const real_pcre *)external_re;
188: if (re == NULL) return PCRE_ERROR_NULL;
189: if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
190: if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
191: if (first_char != NULL)
192: *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
193: ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
194: return re->top_bracket;
195: }
196:
197:
198:
199:
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Write a run of characters to stdout in printable form: printable characters
appear as themselves and everything else as \xhh. When printing from the
subject string, the run is cut short at the end of the subject.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:      nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
  int remaining = length;

  /* md is consulted only when is_subject is TRUE. */

  if (is_subject && remaining > md->end_subject - p)
    remaining = md->end_subject - p;

  for (; remaining > 0; remaining--)
  {
    int ch = *p++;
    if (isprint(ch))
      printf("%c", ch);
    else
      printf("\\x%02x", ch);
  }
}
#endif
226:
227:
228:
229:
230: /*************************************************
231: * Handle escapes *
232: *************************************************/
233:
234: /* This function is called when a \ has been encountered. It either returns a
235: positive value for a simple escape such as \n, or a negative value which
236: encodes one of the more complicated things such as \d. On entry, ptr is
237: pointing at the \. On exit, it is on the final character of the escape
238: sequence.
239:
240: Arguments:
241: ptrptr points to the pattern position pointer
242: errorptr points to the pointer to the error message
243: bracount number of previous extracting brackets
244: options the options bits
245: isclass TRUE if inside a character class
246: cd pointer to char tables block
247:
248: Returns: zero or positive => a data character
249: negative => a special escape sequence
250: on error, errorptr is set
251: */
252:
253: static int
254: check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
255: int options, BOOL isclass, compile_data *cd)
256: {
257: const uschar *ptr = *ptrptr;
258: int c = *(++ptr) & 255; /* Ensure > 0 on signed-char systems */
259: int i;
260:
261: if (c == 0) *errorptr = ERR1;
262:
263: /* Digits or letters may have special meaning; all others are literals. */
264:
265: else if (c < '0' || c > 'z') {}
266:
267: /* Do an initial lookup in a table. A non-zero result is something that can be
268: returned immediately. Otherwise further processing may be required. */
269:
270: else if ((i = escapes[c - '0']) != 0) c = i;
271:
272: /* Escapes that need further processing, or are illegal. */
273:
274: else
275: {
276: const uschar *oldptr;
277: switch (c)
278: {
279: /* The handling of escape sequences consisting of a string of digits
280: starting with one that is not zero is not straightforward. By experiment,
281: the way Perl works seems to be as follows:
282:
283: Outside a character class, the digits are read as a decimal number. If the
284: number is less than 10, or if there are that many previous extracting
285: left brackets, then it is a back reference. Otherwise, up to three octal
286: digits are read to form an escaped byte. Thus \123 is likely to be octal
287: 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
288: value is greater than 377, the least significant 8 bits are taken. Inside a
289: character class, \ followed by a digit is always an octal number. */
290:
291: case '1': case '2': case '3': case '4': case '5':
292: case '6': case '7': case '8': case '9':
293:
294: if (!isclass)
295: {
296: oldptr = ptr;
297: c -= '0';
298: while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
299: c = c * 10 + *(++ptr) - '0';
300: if (c < 10 || c <= bracount)
301: {
302: c = -(ESC_REF + c);
303: break;
304: }
305: ptr = oldptr; /* Put the pointer back and fall through */
306: }
307:
308: /* Handle an octal number following \. If the first digit is 8 or 9, Perl
309: generates a binary zero byte and treats the digit as a following literal.
310: Thus we have to pull back the pointer by one. */
311:
312: if ((c = *ptr) >= '8')
313: {
314: ptr--;
315: c = 0;
316: break;
317: }
318:
319: /* \0 always starts an octal number, but we may drop through to here with a
320: larger first octal digit */
321:
322: case '0':
323: c -= '0';
324: while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
325: ptr[1] != '8' && ptr[1] != '9')
326: c = c * 8 + *(++ptr) - '0';
327: break;
328:
329: /* Special escapes not starting with a digit are straightforward */
330:
331: case 'x':
332: c = 0;
333: while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
334: {
335: ptr++;
336: c = c * 16 + cd->lcc[*ptr] -
337: (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
338: }
339: break;
340:
341: case 'c':
342: c = *(++ptr);
343: if (c == 0)
344: {
345: *errorptr = ERR2;
346: return 0;
347: }
348:
349: /* A letter is upper-cased; then the 0x40 bit is flipped */
350:
351: if (c >= 'a' && c <= 'z') c = cd->fcc[c];
352: c ^= 0x40;
353: break;
354:
355: /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
356: other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
357: for Perl compatibility, it is a literal. This code looks a bit odd, but
358: there used to be some cases other than the default, and there may be again
359: in future, so I haven't "optimized" it. */
360:
361: default:
362: if ((options & PCRE_EXTRA) != 0) switch(c)
363: {
364: default:
365: *errorptr = ERR3;
366: break;
367: }
368: break;
369: }
370: }
371:
372: *ptrptr = ptr;
373: return c;
374: }
375:
376:
377:
378: /*************************************************
379: * Check for counted repeat *
380: *************************************************/
381:
382: /* This function is called when a '{' is encountered in a place where it might
383: start a quantifier. It looks ahead to see if it really is a quantifier or not.
384: It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
385: where the ddds are digits.
386:
387: Arguments:
388: p pointer to the first char after '{'
389: cd pointer to char tables block
390:
391: Returns: TRUE or FALSE
392: */
393:
394: static BOOL
395: is_counted_repeat(const uschar *p, compile_data *cd)
396: {
397: if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
398: while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
399: if (*p == '}') return TRUE;
400:
401: if (*p++ != ',') return FALSE;
402: if (*p == '}') return TRUE;
403:
404: if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
405: while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
406: return (*p == '}');
407: }
408:
409:
410:
411: /*************************************************
412: * Read repeat counts *
413: *************************************************/
414:
415: /* Read an item of the form {n,m} and return the values. This is called only
416: after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
417: so the syntax is guaranteed to be correct, but we need to check the values.
418:
419: Arguments:
420: p pointer to first char after '{'
421: minp pointer to int for min
422: maxp pointer to int for max
423: returned as -1 if no max
424: errorptr points to pointer to error message
425: cd pointer to character tables clock
426:
427: Returns: pointer to '}' on success;
428: current ptr on error, with errorptr set
429: */
430:
431: static const uschar *
432: read_repeat_counts(const uschar *p, int *minp, int *maxp,
433: const char **errorptr, compile_data *cd)
434: {
435: int min = 0;
436: int max = -1;
437:
438: while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
439:
440: if (*p == '}') max = min; else
441: {
442: if (*(++p) != '}')
443: {
444: max = 0;
445: while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
446: if (max < min)
447: {
448: *errorptr = ERR4;
449: return p;
450: }
451: }
452: }
453:
454: /* Do paranoid checks, then fill in the required variables, and pass back the
455: pointer to the terminating '}'. */
456:
457: if (min > 65535 || max > 65535)
458: *errorptr = ERR5;
459: else
460: {
461: *minp = min;
462: *maxp = max;
463: }
464: return p;
465: }
466:
467:
468:
469: /*************************************************
470: * Find the fixed length of a pattern *
471: *************************************************/
472:
473: /* Scan a pattern and compute the fixed length of subject that will match it,
474: if the length is fixed. This is needed for dealing with backward assertions.
475:
476: Arguments:
477: code points to the start of the pattern (the bracket)
478:
479: Returns: the fixed length, or -1 if there is no fixed length
480: */
481:
482: static int
483: find_fixedlength(uschar *code)
484: {
485: int length = -1;
486:
487: register int branchlength = 0;
488: register uschar *cc = code + 3;
489:
490: /* Scan along the opcodes for this branch. If we get to the end of the
491: branch, check the length against that of the other branches. */
492:
493: for (;;)
494: {
495: int d;
496: register int op = *cc;
497: if (op >= OP_BRA) op = OP_BRA;
498:
499: switch (op)
500: {
501: case OP_BRA:
502: case OP_ONCE:
503: case OP_COND:
504: d = find_fixedlength(cc);
505: if (d < 0) return -1;
506: branchlength += d;
507: do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
508: cc += 3;
509: break;
510:
511: /* Reached end of a branch; if it's a ket it is the end of a nested
512: call. If it's ALT it is an alternation in a nested call. If it is
513: END it's the end of the outer call. All can be handled by the same code. */
514:
515: case OP_ALT:
516: case OP_KET:
517: case OP_KETRMAX:
518: case OP_KETRMIN:
519: case OP_END:
520: if (length < 0) length = branchlength;
521: else if (length != branchlength) return -1;
522: if (*cc != OP_ALT) return length;
523: cc += 3;
524: branchlength = 0;
525: break;
526:
527: /* Skip over assertive subpatterns */
528:
529: case OP_ASSERT:
530: case OP_ASSERT_NOT:
531: case OP_ASSERTBACK:
532: case OP_ASSERTBACK_NOT:
533: do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
534: cc += 3;
535: break;
536:
537: /* Skip over things that don't match chars */
538:
539: case OP_REVERSE:
540: cc++;
541: /* Fall through */
542:
543: case OP_CREF:
544: case OP_OPT:
545: cc++;
546: /* Fall through */
547:
548: case OP_SOD:
549: case OP_EOD:
550: case OP_EODN:
551: case OP_CIRC:
552: case OP_DOLL:
553: case OP_NOT_WORD_BOUNDARY:
554: case OP_WORD_BOUNDARY:
555: cc++;
556: break;
557:
558: /* Handle char strings */
559:
560: case OP_CHARS:
561: branchlength += *(++cc);
562: cc += *cc + 1;
563: break;
564:
565: /* Handle exact repetitions */
566:
567: case OP_EXACT:
568: case OP_TYPEEXACT:
569: branchlength += (cc[1] << 8) + cc[2];
570: cc += 4;
571: break;
572:
573: /* Handle single-char matchers */
574:
575: case OP_NOT_DIGIT:
576: case OP_DIGIT:
577: case OP_NOT_WHITESPACE:
578: case OP_WHITESPACE:
579: case OP_NOT_WORDCHAR:
580: case OP_WORDCHAR:
581: case OP_ANY:
582: branchlength++;
583: cc++;
584: break;
585:
586:
587: /* Check a class for variable quantification */
588:
589: case OP_CLASS:
590: cc += (*cc == OP_REF)? 2 : 33;
591:
592: switch (*cc)
593: {
594: case OP_CRSTAR:
595: case OP_CRMINSTAR:
596: case OP_CRQUERY:
597: case OP_CRMINQUERY:
598: return -1;
599:
600: case OP_CRRANGE:
601: case OP_CRMINRANGE:
602: if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
603: branchlength += (cc[1] << 8) + cc[2];
604: cc += 5;
605: break;
606:
607: default:
608: branchlength++;
609: }
610: break;
611:
612: /* Anything else is variable length */
613:
614: default:
615: return -1;
616: }
617: }
618: /* Control never gets here */
619: }
620:
621:
622:
623:
624: /*************************************************
625: * Compile one branch *
626: *************************************************/
627:
628: /* Scan the pattern, compiling it into the code vector.
629:
630: Arguments:
631: options the option bits
632: brackets points to number of brackets used
633: code points to the pointer to the current code point
634: ptrptr points to the current pattern pointer
635: errorptr points to pointer to error message
636: optchanged set to the value of the last OP_OPT item compiled
637: reqchar set to the last literal character required, else -1
638: countlits set to count of mandatory literal characters
639: cd contains pointers to tables
640:
641: Returns: TRUE on success
642: FALSE, with *errorptr set on error
643: */
644:
645: static BOOL
646: compile_branch(int options, int *brackets, uschar **codeptr,
647: const uschar **ptrptr, const char **errorptr, int *optchanged,
648: int *reqchar, int *countlits, compile_data *cd)
649: {
650: int repeat_type, op_type;
651: int repeat_min, repeat_max;
652: int bravalue, length;
653: int greedy_default, greedy_non_default;
654: int prevreqchar;
655: int condcount = 0;
656: int subcountlits = 0;
657: register int c;
658: register uschar *code = *codeptr;
659: uschar *tempcode;
660: const uschar *ptr = *ptrptr;
661: const uschar *tempptr;
662: uschar *previous = NULL;
663: uschar class[32];
664:
665: /* Set up the default and non-default settings for greediness */
666:
667: greedy_default = ((options & PCRE_UNGREEDY) != 0);
668: greedy_non_default = greedy_default ^ 1;
669:
670: /* Initialize no required char, and count of literals */
671:
672: *reqchar = prevreqchar = -1;
673: *countlits = 0;
674:
675: /* Switch on next character until the end of the branch */
676:
677: for (;; ptr++)
678: {
679: BOOL negate_class;
680: int class_charcount;
681: int class_lastchar;
682: int newoptions;
683: int condref;
684: int subreqchar;
685:
686: c = *ptr;
687: if ((options & PCRE_EXTENDED) != 0)
688: {
689: if ((cd->ctypes[c] & ctype_space) != 0) continue;
690: if (c == '#')
691: {
692: while ((c = *(++ptr)) != 0 && c != '\n');
693: continue;
694: }
695: }
696:
697: switch(c)
698: {
699: /* The branch terminates at end of string, |, or ). */
700:
701: case 0:
702: case '|':
703: case ')':
704: *codeptr = code;
705: *ptrptr = ptr;
706: return TRUE;
707:
708: /* Handle single-character metacharacters */
709:
710: case '^':
711: previous = NULL;
712: *code++ = OP_CIRC;
713: break;
714:
715: case '$':
716: previous = NULL;
717: *code++ = OP_DOLL;
718: break;
719:
720: case '.':
721: previous = code;
722: *code++ = OP_ANY;
723: break;
724:
725: /* Character classes. These always build a 32-byte bitmap of the permitted
726: characters, except in the special case where there is only one character.
727: For negated classes, we build the map as usual, then invert it at the end.
728: */
729:
730: case '[':
731: previous = code;
732: *code++ = OP_CLASS;
733:
734: /* If the first character is '^', set the negation flag and skip it. */
735:
736: if ((c = *(++ptr)) == '^')
737: {
738: negate_class = TRUE;
739: c = *(++ptr);
740: }
741: else negate_class = FALSE;
742:
743: /* Keep a count of chars so that we can optimize the case of just a single
744: character. */
745:
746: class_charcount = 0;
747: class_lastchar = -1;
748:
749: /* Initialize the 32-char bit map to all zeros. We have to build the
750: map in a temporary bit of store, in case the class contains only 1
751: character, because in that case the compiled code doesn't use the
752: bit map. */
753:
754: memset(class, 0, 32 * sizeof(uschar));
755:
756: /* Process characters until ] is reached. By writing this as a "do" it
757: means that an initial ] is taken as a data character. */
758:
759: do
760: {
761: if (c == 0)
762: {
763: *errorptr = ERR6;
764: goto FAILED;
765: }
766:
767: /* Backslash may introduce a single character, or it may introduce one
768: of the specials, which just set a flag. Escaped items are checked for
769: validity in the pre-compiling pass. The sequence \b is a special case.
770: Inside a class (and only there) it is treated as backspace. Elsewhere
771: it marks a word boundary. Other escapes have preset maps ready to
772: or into the one we are building. We assume they have more than one
773: character in them, so set class_count bigger than one. */
774:
775: if (c == '\\')
776: {
777: c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
778: if (-c == ESC_b) c = '\b';
779: else if (c < 0)
780: {
781: register const uschar *cbits = cd->cbits;
782: class_charcount = 10;
783: switch (-c)
784: {
785: case ESC_d:
786: for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
787: continue;
788:
789: case ESC_D:
790: for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
791: continue;
792:
793: case ESC_w:
794: for (c = 0; c < 32; c++)
795: class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);
796: continue;
797:
798: case ESC_W:
799: for (c = 0; c < 32; c++)
800: class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);
801: continue;
802:
803: case ESC_s:
804: for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
805: continue;
806:
807: case ESC_S:
808: for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
809: continue;
810:
811: default:
812: *errorptr = ERR7;
813: goto FAILED;
814: }
815: }
816: /* Fall through if single character */
817: }
818:
819: /* A single character may be followed by '-' to form a range. However,
820: Perl does not permit ']' to be the end of the range. A '-' character
821: here is treated as a literal. */
822:
823: if (ptr[1] == '-' && ptr[2] != ']')
824: {
825: int d;
826: ptr += 2;
827: d = *ptr;
828:
829: if (d == 0)
830: {
831: *errorptr = ERR6;
832: goto FAILED;
833: }
834:
835: /* The second part of a range can be a single-character escape, but
836: not any of the other escapes. */
837:
838: if (d == '\\')
839: {
840: d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
841: if (d < 0)
842: {
843: if (d == -ESC_b) d = '\b'; else
844: {
845: *errorptr = ERR7;
846: goto FAILED;
847: }
848: }
849: }
850:
851: if (d < c)
852: {
853: *errorptr = ERR8;
854: goto FAILED;
855: }
856:
857: for (; c <= d; c++)
858: {
859: class[c/8] |= (1 << (c&7));
860: if ((options & PCRE_CASELESS) != 0)
861: {
862: int uc = cd->fcc[c]; /* flip case */
863: class[uc/8] |= (1 << (uc&7));
864: }
865: class_charcount++; /* in case a one-char range */
866: class_lastchar = c;
867: }
868: continue; /* Go get the next char in the class */
869: }
870:
871: /* Handle a lone single character - we can get here for a normal
872: non-escape char, or after \ that introduces a single character. */
873:
874: class [c/8] |= (1 << (c&7));
875: if ((options & PCRE_CASELESS) != 0)
876: {
877: c = cd->fcc[c]; /* flip case */
878: class[c/8] |= (1 << (c&7));
879: }
880: class_charcount++;
881: class_lastchar = c;
882: }
883:
884: /* Loop until ']' reached; the check for end of string happens inside the
885: loop. This "while" is the end of the "do" above. */
886:
887: while ((c = *(++ptr)) != ']');
888:
889: /* If class_charcount is 1 and class_lastchar is not negative, we saw
890: precisely one character. This doesn't need the whole 32-byte bit map.
891: We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
892: it's negative. */
893:
894: if (class_charcount == 1 && class_lastchar >= 0)
895: {
896: if (negate_class)
897: {
898: code[-1] = OP_NOT;
899: }
900: else
901: {
902: code[-1] = OP_CHARS;
903: *code++ = 1;
904: }
905: *code++ = class_lastchar;
906: }
907:
908: /* Otherwise, negate the 32-byte map if necessary, and copy it into
909: the code vector. */
910:
911: else
912: {
913: if (negate_class)
914: for (c = 0; c < 32; c++) code[c] = ~class[c];
915: else
916: memcpy(code, class, 32);
917: code += 32;
918: }
919: break;
920:
921: /* Various kinds of repeat */
922:
923: case '{':
924: if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
925: ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
926: if (*errorptr != NULL) goto FAILED;
927: goto REPEAT;
928:
929: case '*':
930: repeat_min = 0;
931: repeat_max = -1;
932: goto REPEAT;
933:
934: case '+':
935: repeat_min = 1;
936: repeat_max = -1;
937: goto REPEAT;
938:
939: case '?':
940: repeat_min = 0;
941: repeat_max = 1;
942:
943: REPEAT:
944: if (previous == NULL)
945: {
946: *errorptr = ERR9;
947: goto FAILED;
948: }
949:
950: /* If the next character is '?' this is a minimizing repeat, by default,
951: but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
952: next character. */
953:
954: if (ptr[1] == '?')
955: { repeat_type = greedy_non_default; ptr++; }
956: else repeat_type = greedy_default;
957:
958: /* If previous was a string of characters, chop off the last one and use it
959: as the subject of the repeat. If there was only one character, we can
960: abolish the previous item altogether. A repeat with a zero minimum wipes
961: out any reqchar setting, backing up to the previous value. We must also
962: adjust the countlits value. */
963:
964: if (*previous == OP_CHARS)
965: {
966: int len = previous[1];
967:
968: if (repeat_min == 0) *reqchar = prevreqchar;
969: *countlits += repeat_min - 1;
970:
971: if (len == 1)
972: {
973: c = previous[2];
974: code = previous;
975: }
976: else
977: {
978: c = previous[len+1];
979: previous[1]--;
980: code--;
981: }
982: op_type = 0; /* Use single-char op codes */
983: goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
984: }
985:
986: /* If previous was a single negated character ([^a] or similar), we use
987: one of the special opcodes, replacing it. The code is shared with single-
988: character repeats by adding a suitable offset into repeat_type. */
989:
990: else if ((int)*previous == OP_NOT)
991: {
992: op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
993: c = previous[1];
994: code = previous;
995: goto OUTPUT_SINGLE_REPEAT;
996: }
997:
998: /* If previous was a character type match (\d or similar), abolish it and
999: create a suitable repeat item. The code is shared with single-character
1000: repeats by adding a suitable offset into repeat_type. */
1001:
1002: else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1003: {
1004: op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1005: c = *previous;
1006: code = previous;
1007:
1008: OUTPUT_SINGLE_REPEAT:
1009:
1010: /* If the maximum is zero then the minimum must also be zero; Perl allows
1011: this case, so we do too - by simply omitting the item altogether. */
1012:
1013: if (repeat_max == 0) goto END_REPEAT;
1014:
1015: /* Combine the op_type with the repeat_type */
1016:
1017: repeat_type += op_type;
1018:
1019: /* A minimum of zero is handled either as the special case * or ?, or as
1020: an UPTO, with the maximum given. */
1021:
1022: if (repeat_min == 0)
1023: {
1024: if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1025: else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1026: else
1027: {
1028: *code++ = OP_UPTO + repeat_type;
1029: *code++ = repeat_max >> 8;
1030: *code++ = (repeat_max & 255);
1031: }
1032: }
1033:
1034: /* The case {1,} is handled as the special case + */
1035:
1036: else if (repeat_min == 1 && repeat_max == -1)
1037: *code++ = OP_PLUS + repeat_type;
1038:
1039: /* The case {n,n} is just an EXACT, while the general case {n,m} is
1040: handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1041:
1042: else
1043: {
1044: if (repeat_min != 1)
1045: {
1046: *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1047: *code++ = repeat_min >> 8;
1048: *code++ = (repeat_min & 255);
1049: }
1050:
1051: /* If the mininum is 1 and the previous item was a character string,
1052: we either have to put back the item that got cancelled if the string
1053: length was 1, or add the character back onto the end of a longer
1054: string. For a character type nothing need be done; it will just get
1055: put back naturally. Note that the final character is always going to
1056: get added below. */
1057:
1058: else if (*previous == OP_CHARS)
1059: {
1060: if (code == previous) code += 2; else previous[1]++;
1061: }
1062:
1063: /* For a single negated character we also have to put back the
1064: item that got cancelled. */
1065:
1066: else if (*previous == OP_NOT) code++;
1067:
1068: /* If the maximum is unlimited, insert an OP_STAR. */
1069:
1070: if (repeat_max < 0)
1071: {
1072: *code++ = c;
1073: *code++ = OP_STAR + repeat_type;
1074: }
1075:
1076: /* Else insert an UPTO if the max is greater than the min. */
1077:
1078: else if (repeat_max != repeat_min)
1079: {
1080: *code++ = c;
1081: repeat_max -= repeat_min;
1082: *code++ = OP_UPTO + repeat_type;
1083: *code++ = repeat_max >> 8;
1084: *code++ = (repeat_max & 255);
1085: }
1086: }
1087:
1088: /* The character or character type itself comes last in all cases. */
1089:
1090: *code++ = c;
1091: }
1092:
1093: /* If previous was a character class or a back reference, we put the repeat
1094: stuff after it, but just skip the item if the repeat was {0,0}. */
1095:
1096: else if (*previous == OP_CLASS || *previous == OP_REF)
1097: {
1098: if (repeat_max == 0)
1099: {
1100: code = previous;
1101: goto END_REPEAT;
1102: }
1103: if (repeat_min == 0 && repeat_max == -1)
1104: *code++ = OP_CRSTAR + repeat_type;
1105: else if (repeat_min == 1 && repeat_max == -1)
1106: *code++ = OP_CRPLUS + repeat_type;
1107: else if (repeat_min == 0 && repeat_max == 1)
1108: *code++ = OP_CRQUERY + repeat_type;
1109: else
1110: {
1111: *code++ = OP_CRRANGE + repeat_type;
1112: *code++ = repeat_min >> 8;
1113: *code++ = repeat_min & 255;
1114: if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1115: *code++ = repeat_max >> 8;
1116: *code++ = repeat_max & 255;
1117: }
1118: }
1119:
1120: /* If previous was a bracket group, we may have to replicate it in certain
1121: cases. */
1122:
1123: else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1124: (int)*previous == OP_COND)
1125: {
1126: register int i;
1127: int ketoffset = 0;
1128: int len = code - previous;
1129: uschar *bralink = NULL;
1130:
1131: /* If the maximum repeat count is unlimited, find the end of the bracket
1132: by scanning through from the start, and compute the offset back to it
1133: from the current code pointer. There may be an OP_OPT setting following
1134: the final KET, so we can't find the end just by going back from the code
1135: pointer. */
1136:
1137: if (repeat_max == -1)
1138: {
1139: register uschar *ket = previous;
1140: do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1141: ketoffset = code - ket;
1142: }
1143:
1144: /* The case of a zero minimum is special because of the need to stick
1145: OP_BRAZERO in front of it, and because the group appears once in the
1146: data, whereas in other cases it appears the minimum number of times. For
1147: this reason, it is simplest to treat this case separately, as otherwise
1148: the code gets far too messy. There are several special subcases when the
1149: minimum is zero. */
1150:
1151: if (repeat_min == 0)
1152: {
1153: /* If we set up a required char from the bracket, we must back off
1154: to the previous value and reset the countlits value too. */
1155:
1156: if (subcountlits > 0)
1157: {
1158: *reqchar = prevreqchar;
1159: *countlits -= subcountlits;
1160: }
1161:
1162: /* If the maximum is also zero, we just omit the group from the output
1163: altogether. */
1164:
1165: if (repeat_max == 0)
1166: {
1167: code = previous;
1168: goto END_REPEAT;
1169: }
1170:
1171: /* If the maximum is 1 or unlimited, we just have to stick in the
1172: BRAZERO and do no more at this point. */
1173:
1174: if (repeat_max <= 1)
1175: {
1176: memmove(previous+1, previous, len);
1177: code++;
1178: *previous++ = OP_BRAZERO + repeat_type;
1179: }
1180:
1181: /* If the maximum is greater than 1 and limited, we have to replicate
1182: in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1183: The first one has to be handled carefully because it's the original
1184: copy, which has to be moved up. The remainder can be handled by code
1185: that is common with the non-zero minimum case below. We just have to
1186: adjust the value of repeat_max, since one less copy is required. */
1187:
1188: else
1189: {
1190: int offset;
1191: memmove(previous+4, previous, len);
1192: code += 4;
1193: *previous++ = OP_BRAZERO + repeat_type;
1194: *previous++ = OP_BRA;
1195:
1196: /* We chain together the bracket offset fields that have to be
1197: filled in later when the ends of the brackets are reached. */
1198:
1199: offset = (bralink == NULL)? 0 : previous - bralink;
1200: bralink = previous;
1201: *previous++ = offset >> 8;
1202: *previous++ = offset & 255;
1203: }
1204:
1205: repeat_max--;
1206: }
1207:
1208: /* If the minimum is greater than zero, replicate the group as many
1209: times as necessary, and adjust the maximum to the number of subsequent
1210: copies that we need. */
1211:
1212: else
1213: {
1214: for (i = 1; i < repeat_min; i++)
1215: {
1216: memcpy(code, previous, len);
1217: code += len;
1218: }
1219: if (repeat_max > 0) repeat_max -= repeat_min;
1220: }
1221:
1222: /* This code is common to both the zero and non-zero minimum cases. If
1223: the maximum is limited, it replicates the group in a nested fashion,
1224: remembering the bracket starts on a stack. In the case of a zero minimum,
1225: the first one was set up above. In all cases the repeat_max now specifies
1226: the number of additional copies needed. */
1227:
1228: if (repeat_max >= 0)
1229: {
1230: for (i = repeat_max - 1; i >= 0; i--)
1231: {
1232: *code++ = OP_BRAZERO + repeat_type;
1233:
1234: /* All but the final copy start a new nesting, maintaining the
1235: chain of brackets outstanding. */
1236:
1237: if (i != 0)
1238: {
1239: int offset;
1240: *code++ = OP_BRA;
1241: offset = (bralink == NULL)? 0 : code - bralink;
1242: bralink = code;
1243: *code++ = offset >> 8;
1244: *code++ = offset & 255;
1245: }
1246:
1247: memcpy(code, previous, len);
1248: code += len;
1249: }
1250:
1251: /* Now chain through the pending brackets, and fill in their length
1252: fields (which are holding the chain links pro tem). */
1253:
1254: while (bralink != NULL)
1255: {
1256: int oldlinkoffset;
1257: int offset = code - bralink + 1;
1258: uschar *bra = code - offset;
1259: oldlinkoffset = (bra[1] << 8) + bra[2];
1260: bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1261: *code++ = OP_KET;
1262: *code++ = bra[1] = offset >> 8;
1263: *code++ = bra[2] = (offset & 255);
1264: }
1265: }
1266:
1267: /* If the maximum is unlimited, set a repeater in the final copy. We
1268: can't just offset backwards from the current code point, because we
1269: don't know if there's been an options resetting after the ket. The
1270: correct offset was computed above. */
1271:
1272: else code[-ketoffset] = OP_KETRMAX + repeat_type;
1273: }
1274:
1275: /* Else there's some kind of shambles */
1276:
1277: else
1278: {
1279: *errorptr = ERR11;
1280: goto FAILED;
1281: }
1282:
1283: /* In all cases we no longer have a previous item. */
1284:
1285: END_REPEAT:
1286: previous = NULL;
1287: break;
1288:
1289:
1290: /* Start of nested bracket sub-expression, or comment or lookahead or
1291: lookbehind or option setting or condition. First deal with special things
1292: that can come after a bracket; all are introduced by ?, and the appearance
1293: of any of them means that this is not a referencing group. They were
1294: checked for validity in the first pass over the string, so we don't have to
1295: check for syntax errors here. */
1296:
1297: case '(':
1298: newoptions = options;
1299: condref = -1;
1300:
1301: if (*(++ptr) == '?')
1302: {
1303: int set, unset;
1304: int *optset;
1305:
1306: switch (*(++ptr))
1307: {
1308: case '#': /* Comment; skip to ket */
1309: ptr++;
1310: while (*ptr != ')') ptr++;
1311: continue;
1312:
1313: case ':': /* Non-extracting bracket */
1314: bravalue = OP_BRA;
1315: ptr++;
1316: break;
1317:
1318: case '(':
1319: bravalue = OP_COND; /* Conditional group */
1320: if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1321: {
1322: condref = *ptr - '0';
1323: while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1324: ptr++;
1325: }
1326: else ptr--;
1327: break;
1328:
1329: case '=': /* Positive lookahead */
1330: bravalue = OP_ASSERT;
1331: ptr++;
1332: break;
1333:
1334: case '!': /* Negative lookahead */
1335: bravalue = OP_ASSERT_NOT;
1336: ptr++;
1337: break;
1338:
1339: case '<': /* Lookbehinds */
1340: switch (*(++ptr))
1341: {
1342: case '=': /* Positive lookbehind */
1343: bravalue = OP_ASSERTBACK;
1344: ptr++;
1345: break;
1346:
1347: case '!': /* Negative lookbehind */
1348: bravalue = OP_ASSERTBACK_NOT;
1349: ptr++;
1350: break;
1351:
1352: default: /* Syntax error */
1353: *errorptr = ERR24;
1354: goto FAILED;
1355: }
1356: break;
1357:
1358: case '>': /* One-time brackets */
1359: bravalue = OP_ONCE;
1360: ptr++;
1361: break;
1362:
1363: default: /* Option setting */
1364: set = unset = 0;
1365: optset = &set;
1366:
1367: while (*ptr != ')' && *ptr != ':')
1368: {
1369: switch (*ptr++)
1370: {
1371: case '-': optset = &unset; break;
1372:
1373: case 'i': *optset |= PCRE_CASELESS; break;
1374: case 'm': *optset |= PCRE_MULTILINE; break;
1375: case 's': *optset |= PCRE_DOTALL; break;
1376: case 'x': *optset |= PCRE_EXTENDED; break;
1377: case 'U': *optset |= PCRE_UNGREEDY; break;
1378: case 'X': *optset |= PCRE_EXTRA; break;
1379:
1380: default:
1381: *errorptr = ERR12;
1382: goto FAILED;
1383: }
1384: }
1385:
1386: /* Set up the changed option bits, but don't change anything yet. */
1387:
1388: newoptions = (options | set) & (~unset);
1389:
1390: /* If the options ended with ')' this is not the start of a nested
1391: group with option changes, so the options change at this level. At top
1392: level there is nothing else to be done (the options will in fact have
1393: been set from the start of compiling as a result of the first pass) but
1394: at an inner level we must compile code to change the ims options if
1395: necessary, and pass the new setting back so that it can be put at the
1396: start of any following branches, and when this group ends, a resetting
1397: item can be compiled. */
1398:
1399: if (*ptr == ')')
1400: {
1401: if ((options & PCRE_INGROUP) != 0 &&
1402: (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1403: {
1404: *code++ = OP_OPT;
1405: *code++ = *optchanged = newoptions & PCRE_IMS;
1406: }
1407: options = newoptions; /* Change options at this level */
1408: previous = NULL; /* This item can't be repeated */
1409: continue; /* It is complete */
1410: }
1411:
1412: /* If the options ended with ':' we are heading into a nested group
1413: with possible change of options. Such groups are non-capturing and are
1414: not assertions of any kind. All we need to do is skip over the ':';
1415: the newoptions value is handled below. */
1416:
1417: bravalue = OP_BRA;
1418: ptr++;
1419: }
1420: }
1421:
1422: /* Else we have a referencing group; adjust the opcode. */
1423:
1424: else
1425: {
1426: if (++(*brackets) > EXTRACT_MAX)
1427: {
1428: *errorptr = ERR13;
1429: goto FAILED;
1430: }
1431: bravalue = OP_BRA + *brackets;
1432: }
1433:
1434: /* Process nested bracketed re. Assertions may not be repeated, but other
1435: kinds can be. We copy code into a non-register variable in order to be able
1436: to pass its address because some compilers complain otherwise. Pass in a
1437: new setting for the ims options if they have changed. */
1438:
1439: previous = (bravalue >= OP_ONCE)? code : NULL;
1440: *code = bravalue;
1441: tempcode = code;
1442:
1443: if (!compile_regex(
1444: options | PCRE_INGROUP, /* Set for all nested groups */
1445: ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1446: newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1447: brackets, /* Bracket level */
1448: &tempcode, /* Where to put code (updated) */
1449: &ptr, /* Input pointer (updated) */
1450: errorptr, /* Where to put an error message */
1451: (bravalue == OP_ASSERTBACK ||
1452: bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1453: condref, /* Condition reference number */
1454: &subreqchar, /* For possible last char */
1455: &subcountlits, /* For literal count */
1456: cd)) /* Tables block */
1457: goto FAILED;
1458:
1459: /* At the end of compiling, code is still pointing to the start of the
1460: group, while tempcode has been updated to point past the end of the group
1461: and any option resetting that may follow it. The pattern pointer (ptr)
1462: is on the bracket. */
1463:
1464: /* If this is a conditional bracket, check that there are no more than
1465: two branches in the group. */
1466:
1467: if (bravalue == OP_COND)
1468: {
1469: uschar *tc = code;
1470: condcount = 0;
1471:
1472: do {
1473: condcount++;
1474: tc += (tc[1] << 8) | tc[2];
1475: }
1476: while (*tc != OP_KET);
1477:
1478: if (condcount > 2)
1479: {
1480: *errorptr = ERR27;
1481: goto FAILED;
1482: }
1483: }
1484:
1485: /* Handle updating of the required character. If the subpattern didn't
1486: set one, leave it as it was. Otherwise, update it for normal brackets of
1487: all kinds, forward assertions, and conditions with two branches. Don't
1488: update the literal count for forward assertions, however. If the bracket
1489: is followed by a quantifier with zero repeat, we have to back off. Hence
1490: the definition of prevreqchar and subcountlits outside the main loop so
1491: that they can be accessed for the back off. */
1492:
1493: if (subreqchar > 0 &&
1494: (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1495: (bravalue == OP_COND && condcount == 2)))
1496: {
1497: prevreqchar = *reqchar;
1498: *reqchar = subreqchar;
1499: if (bravalue != OP_ASSERT) *countlits += subcountlits;
1500: }
1501:
1502: /* Now update the main code pointer to the end of the group. */
1503:
1504: code = tempcode;
1505:
1506: /* Error if hit end of pattern */
1507:
1508: if (*ptr != ')')
1509: {
1510: *errorptr = ERR14;
1511: goto FAILED;
1512: }
1513: break;
1514:
1515: /* Check \ for being a real metacharacter; if not, fall through and handle
1516: it as a data character at the start of a string. Escape items are checked
1517: for validity in the pre-compiling pass. */
1518:
1519: case '\\':
1520: tempptr = ptr;
1521: c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1522:
1523: /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1524: are arranged to be the negation of the corresponding OP_values. For the
1525: back references, the values are ESC_REF plus the reference number. Only
1526: back references and those types that consume a character may be repeated.
1527: We can test for values between ESC_b and ESC_Z for the latter; this may
1528: have to change if any new ones are ever created. */
1529:
1530: if (c < 0)
1531: {
1532: if (-c >= ESC_REF)
1533: {
1534: previous = code;
1535: *code++ = OP_REF;
1536: *code++ = -c - ESC_REF;
1537: }
1538: else
1539: {
1540: previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1541: *code++ = -c;
1542: }
1543: continue;
1544: }
1545:
1546: /* Data character: reset and fall through */
1547:
1548: ptr = tempptr;
1549: c = '\\';
1550:
1551: /* Handle a run of data characters until a metacharacter is encountered.
1552: The first character is guaranteed not to be whitespace or # when the
1553: extended flag is set. */
1554:
1555: NORMAL_CHAR:
1556: default:
1557: previous = code;
1558: *code = OP_CHARS;
1559: code += 2;
1560: length = 0;
1561:
1562: do
1563: {
1564: if ((options & PCRE_EXTENDED) != 0)
1565: {
1566: if ((cd->ctypes[c] & ctype_space) != 0) continue;
1567: if (c == '#')
1568: {
1569: while ((c = *(++ptr)) != 0 && c != '\n');
1570: if (c == 0) break;
1571: continue;
1572: }
1573: }
1574:
1575: /* Backslash may introduce a data char or a metacharacter. Escaped items
1576: are checked for validity in the pre-compiling pass. Stop the string
1577: before a metaitem. */
1578:
1579: if (c == '\\')
1580: {
1581: tempptr = ptr;
1582: c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1583: if (c < 0) { ptr = tempptr; break; }
1584: }
1585:
1586: /* Ordinary character or single-char escape */
1587:
1588: *code++ = c;
1589: length++;
1590: }
1591:
1592: /* This "while" is the end of the "do" above. */
1593:
1594: while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
1595:
1596: /* Update the last character and the count of literals */
1597:
1598: prevreqchar = (length > 1)? code[-2] : *reqchar;
1599: *reqchar = code[-1];
1600: *countlits += length;
1601:
1602: /* Compute the length and set it in the data vector, and advance to
1603: the next state. */
1604:
1605: previous[1] = length;
1606: if (length < 255) ptr--;
1607: break;
1608: }
1609: } /* end of big loop */
1610:
1611: /* Control never reaches here by falling through, only by a goto for all the
1612: error states. Pass back the position in the pattern so that it can be displayed
1613: to the user for diagnosing the error. */
1614:
1615: FAILED:
1616: *ptrptr = ptr;
1617: return FALSE;
1618: }
1619:
1620:
1621:
1622:
1623: /*************************************************
1624: * Compile sequence of alternatives *
1625: *************************************************/
1626:
1627: /* On entry, ptr is pointing past the bracket character, but on return
1628: it points to the closing bracket, or vertical bar, or end of string.
1629: The code variable is pointing at the byte into which the BRA operator has been
1630: stored. If the ims options are changed at the start (for a (?ims: group) or
1631: during any branch, we need to insert an OP_OPT item at the start of every
1632: following branch to ensure they get set correctly at run time, and also pass
1633: the new options into every subsequent branch compile.
1634:
1635: Argument:
1636: options the option bits
1637: optchanged new ims options to set as if (?ims) were at the start, or -1
1638: for no change
1639: brackets -> int containing the number of extracting brackets used
1640: codeptr -> the address of the current code pointer
1641: ptrptr -> the address of the current pattern pointer
1642: errorptr -> pointer to error message
1643: lookbehind TRUE if this is a lookbehind assertion
1644: condref > 0 for OPT_CREF setting at start of conditional group
1645: reqchar -> place to put the last required character, or a negative number
1646: countlits -> place to put the shortest literal count of any branch
1647: cd points to the data block with tables pointers
1648:
1649: Returns: TRUE on success
1650: */
1651:
1652: static BOOL
1653: compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
1654: const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
1655: int *reqchar, int *countlits, compile_data *cd)
1656: {
1657: const uschar *ptr = *ptrptr;
1658: uschar *code = *codeptr;
1659: uschar *last_branch = code;
1660: uschar *start_bracket = code;
1661: uschar *reverse_count = NULL;
1662: int oldoptions = options & PCRE_IMS;
1663: int branchreqchar, branchcountlits;
1664:
1665: *reqchar = -1;
1.2 ! paf 1666: *countlits = PCRE_MAX_POS;
1.1 paf 1667: code += 3;
1668:
1669: /* At the start of a reference-based conditional group, insert the reference
1670: number as an OP_CREF item. */
1671:
1672: if (condref > 0)
1673: {
1674: *code++ = OP_CREF;
1675: *code++ = condref;
1676: }
1677:
1678: /* Loop for each alternative branch */
1679:
1680: for (;;)
1681: {
1682: int length;
1683:
1684: /* Handle change of options */
1685:
1686: if (optchanged >= 0)
1687: {
1688: *code++ = OP_OPT;
1689: *code++ = optchanged;
1690: options = (options & ~PCRE_IMS) | optchanged;
1691: }
1692:
1693: /* Set up dummy OP_REVERSE if lookbehind assertion */
1694:
1695: if (lookbehind)
1696: {
1697: *code++ = OP_REVERSE;
1698: reverse_count = code;
1699: *code++ = 0;
1700: *code++ = 0;
1701: }
1702:
1703: /* Now compile the branch */
1704:
1705: if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
1706: &branchreqchar, &branchcountlits, cd))
1707: {
1708: *ptrptr = ptr;
1709: return FALSE;
1710: }
1711:
1712: /* Fill in the length of the last branch */
1713:
1714: length = code - last_branch;
1715: last_branch[1] = length >> 8;
1716: last_branch[2] = length & 255;
1717:
1718: /* Save the last required character if all branches have the same; a current
1719: value of -1 means unset, while -2 means "previous branch had no last required
1720: char". */
1721:
1722: if (*reqchar != -2)
1723: {
1724: if (branchreqchar >= 0)
1725: {
1726: if (*reqchar == -1) *reqchar = branchreqchar;
1727: else if (*reqchar != branchreqchar) *reqchar = -2;
1728: }
1729: else *reqchar = -2;
1730: }
1731:
1732: /* Keep the shortest literal count */
1733:
1734: if (branchcountlits < *countlits) *countlits = branchcountlits;
1735: DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
1736:
1737: /* If lookbehind, check that this branch matches a fixed-length string,
1738: and put the length into the OP_REVERSE item. Temporarily mark the end of
1739: the branch with OP_END. */
1740:
1741: if (lookbehind)
1742: {
1743: *code = OP_END;
1744: length = find_fixedlength(last_branch);
1745: DPRINTF(("fixed length = %d\n", length));
1746: if (length < 0)
1747: {
1748: *errorptr = ERR25;
1749: *ptrptr = ptr;
1750: return FALSE;
1751: }
1752: reverse_count[0] = (length >> 8);
1753: reverse_count[1] = length & 255;
1754: }
1755:
1756: /* Reached end of expression, either ')' or end of pattern. Insert a
1757: terminating ket and the length of the whole bracketed item, and return,
1758: leaving the pointer at the terminating char. If any of the ims options
1759: were changed inside the group, compile a resetting op-code following. */
1760:
1761: if (*ptr != '|')
1762: {
1763: length = code - start_bracket;
1764: *code++ = OP_KET;
1765: *code++ = length >> 8;
1766: *code++ = length & 255;
1767: if (optchanged >= 0)
1768: {
1769: *code++ = OP_OPT;
1770: *code++ = oldoptions;
1771: }
1772: *codeptr = code;
1773: *ptrptr = ptr;
1774: return TRUE;
1775: }
1776:
1777: /* Another branch follows; insert an "or" node and advance the pointer. */
1778:
1779: *code = OP_ALT;
1780: last_branch = code;
1781: code += 3;
1782: ptr++;
1783: }
1784: /* Control never reaches here */
1785: }
1786:
1787:
1788:
1789:
1790: /*************************************************
1791: * Find first significant op code *
1792: *************************************************/
1793:
1794: /* This is called by several functions that scan a compiled expression looking
1795: for a fixed first character, or an anchoring op code etc. It skips over things
1796: that do not influence this. For one application, a change of caseless option is
1797: important.
1798:
1799: Arguments:
1800: code pointer to the start of the group
1801: options pointer to external options
1802: optbit the option bit whose changing is significant, or
1803: zero if none are
1804: optstop TRUE to return on option change, otherwise change the options
1805: value and continue
1806:
1807: Returns: pointer to the first significant opcode
1808: */
1809:
1810: static const uschar*
1811: first_significant_code(const uschar *code, int *options, int optbit,
1812: BOOL optstop)
1813: {
1814: for (;;)
1815: {
1816: switch ((int)*code)
1817: {
1818: case OP_OPT:
1819: if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1820: {
1821: if (optstop) return code;
1822: *options = (int)code[1];
1823: }
1824: code += 2;
1825: break;
1826:
1827: case OP_CREF:
1828: code += 2;
1829: break;
1830:
1831: case OP_WORD_BOUNDARY:
1832: case OP_NOT_WORD_BOUNDARY:
1833: code++;
1834: break;
1835:
1836: case OP_ASSERT_NOT:
1837: case OP_ASSERTBACK:
1838: case OP_ASSERTBACK_NOT:
1839: do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
1840: code += 3;
1841: break;
1842:
1843: default:
1844: return code;
1845: }
1846: }
1847: /* Control never reaches here */
1848: }
1849:
1850:
1851:
1852:
1853: /*************************************************
1854: * Check for anchored expression *
1855: *************************************************/
1856:
1857: /* Try to find out if this is an anchored regular expression. Consider each
1858: alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
1859: all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
1860: it's anchored. However, if this is a multiline pattern, then only OP_SOD
1861: counts, since OP_CIRC can match in the middle.
1862:
1863: A branch is also implicitly anchored if it starts with .* and DOTALL is set,
1864: because that will try the rest of the pattern at all possible matching points,
1865: so there is no point trying them again.
1866:
1867: Arguments:
1868: code points to start of expression (the bracket)
1869: options points to the options setting
1870:
1871: Returns: TRUE or FALSE
1872: */
1873:
1874: static BOOL
1875: is_anchored(register const uschar *code, int *options)
1876: {
1877: do {
1878: const uschar *scode = first_significant_code(code + 3, options,
1879: PCRE_MULTILINE, FALSE);
1880: register int op = *scode;
1881: if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1882: { if (!is_anchored(scode, options)) return FALSE; }
1883: else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
1884: (*options & PCRE_DOTALL) != 0)
1885: { if (scode[1] != OP_ANY) return FALSE; }
1886: else if (op != OP_SOD &&
1887: ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
1888: return FALSE;
1889: code += (code[1] << 8) + code[2];
1890: }
1891: while (*code == OP_ALT);
1892: return TRUE;
1893: }
1894:
1895:
1896:
1897: /*************************************************
1898: * Check for starting with ^ or .* *
1899: *************************************************/
1900:
1901: /* This is called to find out if every branch starts with ^ or .* so that
1902: "first char" processing can be done to speed things up in multiline
1903: matching and for non-DOTALL patterns that start with .* (which must start at
1904: the beginning or after \n).
1905:
1906: Argument: points to start of expression (the bracket)
1907: Returns: TRUE or FALSE
1908: */
1909:
1910: static BOOL
1911: is_startline(const uschar *code)
1912: {
1913: do {
1914: const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
1915: register int op = *scode;
1916: if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
1917: { if (!is_startline(scode)) return FALSE; }
1918: else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1919: { if (scode[1] != OP_ANY) return FALSE; }
1920: else if (op != OP_CIRC) return FALSE;
1921: code += (code[1] << 8) + code[2];
1922: }
1923: while (*code == OP_ALT);
1924: return TRUE;
1925: }
1926:
1927:
1928:
1929: /*************************************************
1930: * Check for fixed first char *
1931: *************************************************/
1932:
1933: /* Try to find out if there is a fixed first character. This is called for
1934: unanchored expressions, as it speeds up their processing quite considerably.
1935: Consider each alternative branch. If they all start with the same char, or with
1936: a bracket all of whose alternatives start with the same char (recurse ad lib),
1937: then we return that char, otherwise -1.
1938:
1939: Arguments:
1940: code points to start of expression (the bracket)
1941: options pointer to the options (used to check casing changes)
1942:
1943: Returns: -1 or the fixed first char
1944: */
1945:
1946: static int
1947: find_firstchar(const uschar *code, int *options)
1948: {
1949: register int c = -1;
1950: do {
1951: int d;
1952: const uschar *scode = first_significant_code(code + 3, options,
1953: PCRE_CASELESS, TRUE);
1954: register int op = *scode;
1955:
1956: if (op >= OP_BRA) op = OP_BRA;
1957:
1958: switch(op)
1959: {
1960: default:
1961: return -1;
1962:
1963: case OP_BRA:
1964: case OP_ASSERT:
1965: case OP_ONCE:
1966: case OP_COND:
1967: if ((d = find_firstchar(scode, options)) < 0) return -1;
1968: if (c < 0) c = d; else if (c != d) return -1;
1969: break;
1970:
1971: case OP_EXACT: /* Fall through */
1972: scode++;
1973:
1974: case OP_CHARS: /* Fall through */
1975: scode++;
1976:
1977: case OP_PLUS:
1978: case OP_MINPLUS:
1979: if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
1980: break;
1981: }
1982:
1983: code += (code[1] << 8) + code[2];
1984: }
1985: while (*code == OP_ALT);
1986: return c;
1987: }
1988:
1989:
1990:
1991:
1992:
1993: /*************************************************
1994: * Compile a Regular Expression *
1995: *************************************************/
1996:
1997: /* This function takes a string and returns a pointer to a block of store
1998: holding a compiled version of the expression.
1999:
2000: Arguments:
2001: pattern the regular expression
2002: options various option bits
2003: errorptr pointer to pointer to error text
2004: erroroffset ptr offset in pattern where error was detected
2005: tables pointer to character tables or NULL
2006:
2007: Returns: pointer to compiled data block, or NULL on error,
2008: with errorptr and erroroffset set
2009: */
2010:
2011: pcre *
2012: pcre_compile(const char *pattern, int options, const char **errorptr,
2013: int *erroroffset, const unsigned char *tables)
2014: {
2015: real_pcre *re;
2016: int length = 3; /* For initial BRA plus length */
2017: int runlength;
2018: int c, size, reqchar, countlits;
2019: int bracount = 0;
2020: int top_backref = 0;
2021: int branch_extra = 0;
2022: int branch_newextra;
2023: unsigned int brastackptr = 0;
2024: uschar *code;
2025: const uschar *ptr;
2026: compile_data compile_block;
2027: int brastack[BRASTACK_SIZE];
2028: uschar bralenstack[BRASTACK_SIZE];
2029:
2030: #ifdef DEBUG
2031: uschar *code_base, *code_end;
2032: #endif
2033:
2034: /* We can't pass back an error message if errorptr is NULL; I guess the best we
2035: can do is just return NULL. */
2036:
2037: if (errorptr == NULL) return NULL;
2038: *errorptr = NULL;
2039:
2040: /* However, we can give a message for this error */
2041:
2042: if (erroroffset == NULL)
2043: {
2044: *errorptr = ERR16;
2045: return NULL;
2046: }
2047: *erroroffset = 0;
2048:
2049: if ((options & ~PUBLIC_OPTIONS) != 0)
2050: {
2051: *errorptr = ERR17;
2052: return NULL;
2053: }
2054:
2055: /* Set up pointers to the individual character tables */
2056:
2057: if (tables == NULL) tables = pcre_default_tables;
2058: compile_block.lcc = tables + lcc_offset;
2059: compile_block.fcc = tables + fcc_offset;
2060: compile_block.cbits = tables + cbits_offset;
2061: compile_block.ctypes = tables + ctypes_offset;
2062:
2063: /* Reflect pattern for debugging output */
2064:
2065: DPRINTF(("------------------------------------------------------------------\n"));
2066: DPRINTF(("%s\n", pattern));
2067:
2068: /* The first thing to do is to make a pass over the pattern to compute the
2069: amount of store required to hold the compiled code. This does not have to be
2070: perfect as long as errors are overestimates. At the same time we can detect any
2071: internal flag settings. Make an attempt to correct for any counted white space
2072: if an "extended" flag setting appears late in the pattern. We can't be so
2073: clever for #-comments. */
2074:
2075: ptr = (const uschar *)(pattern - 1);
2076: while ((c = *(++ptr)) != 0)
2077: {
2078: int min, max;
2079: int class_charcount;
2080:
2081: if ((options & PCRE_EXTENDED) != 0)
2082: {
2083: if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2084: if (c == '#')
2085: {
2086: while ((c = *(++ptr)) != 0 && c != '\n');
2087: continue;
2088: }
2089: }
2090:
2091: switch(c)
2092: {
2093: /* A backslashed item may be an escaped "normal" character or a
2094: character type. For a "normal" character, put the pointers and
2095: character back so that tests for whitespace etc. in the input
2096: are done correctly. */
2097:
2098: case '\\':
2099: {
2100: const uschar *save_ptr = ptr;
2101: c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2102: if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2103: if (c >= 0)
2104: {
2105: ptr = save_ptr;
2106: c = '\\';
2107: goto NORMAL_CHAR;
2108: }
2109: }
2110: length++;
2111:
2112: /* A back reference needs an additional char, plus either one or 5
2113: bytes for a repeat. We also need to keep the value of the highest
2114: back reference. */
2115:
2116: if (c <= -ESC_REF)
2117: {
2118: int refnum = -c - ESC_REF;
2119: if (refnum > top_backref) top_backref = refnum;
2120: length++; /* For single back reference */
2121: if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2122: {
2123: ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2124: if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2125: if ((min == 0 && (max == 1 || max == -1)) ||
2126: (min == 1 && max == -1))
2127: length++;
2128: else length += 5;
2129: if (ptr[1] == '?') ptr++;
2130: }
2131: }
2132: continue;
2133:
2134: case '^':
2135: case '.':
2136: case '$':
2137: case '*': /* These repeats won't be after brackets; */
2138: case '+': /* those are handled separately */
2139: case '?':
2140: length++;
2141: continue;
2142:
2143: /* This covers the cases of repeats after a single char, metachar, class,
2144: or back reference. */
2145:
2146: case '{':
2147: if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2148: ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2149: if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2150: if ((min == 0 && (max == 1 || max == -1)) ||
2151: (min == 1 && max == -1))
2152: length++;
2153: else
2154: {
2155: length--; /* Uncount the original char or metachar */
2156: if (min == 1) length++; else if (min > 0) length += 4;
2157: if (max > 0) length += 4; else length += 2;
2158: }
2159: if (ptr[1] == '?') ptr++;
2160: continue;
2161:
2162: /* An alternation contains an offset to the next branch or ket. If any ims
2163: options changed in the previous branch(es), and/or if we are in a
2164: lookbehind assertion, extra space will be needed at the start of the
2165: branch. This is handled by branch_extra. */
2166:
2167: case '|':
2168: length += 3 + branch_extra;
2169: continue;
2170:
2171: /* A character class uses 33 characters. Don't worry about character types
2172: that aren't allowed in classes - they'll get picked up during the compile.
2173: A character class that contains only one character uses 2 or 3 bytes,
2174: depending on whether it is negated or not. Notice this where we can. */
2175:
2176: case '[':
2177: class_charcount = 0;
2178: if (*(++ptr) == '^') ptr++;
2179: do
2180: {
2181: if (*ptr == '\\')
2182: {
2183: int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2184: &compile_block);
2185: if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2186: if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2187: }
2188: else class_charcount++;
2189: ptr++;
2190: }
2191: while (*ptr != 0 && *ptr != ']');
2192:
2193: /* Repeats for negated single chars are handled by the general code */
2194:
2195: if (class_charcount == 1) length += 3; else
2196: {
2197: length += 33;
2198:
2199: /* A repeat needs either 1 or 5 bytes. */
2200:
2201: if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2202: {
2203: ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2204: if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2205: if ((min == 0 && (max == 1 || max == -1)) ||
2206: (min == 1 && max == -1))
2207: length++;
2208: else length += 5;
2209: if (ptr[1] == '?') ptr++;
2210: }
2211: }
2212: continue;
2213:
2214: /* Brackets may be genuine groups or special things */
2215:
2216: case '(':
2217: branch_newextra = 0;
2218:
2219: /* Handle special forms of bracket, which all start (? */
2220:
2221: if (ptr[1] == '?')
2222: {
2223: int set, unset;
2224: int *optset;
2225:
2226: switch (c = ptr[2])
2227: {
2228: /* Skip over comments entirely */
2229: case '#':
2230: ptr += 3;
2231: while (*ptr != 0 && *ptr != ')') ptr++;
2232: if (*ptr == 0)
2233: {
2234: *errorptr = ERR18;
2235: goto PCRE_ERROR_RETURN;
2236: }
2237: continue;
2238:
2239: /* Non-referencing groups and lookaheads just move the pointer on, and
2240: then behave like a non-special bracket, except that they don't increment
2241: the count of extracting brackets. Ditto for the "once only" bracket,
2242: which is in Perl from version 5.005. */
2243:
2244: case ':':
2245: case '=':
2246: case '!':
2247: case '>':
2248: ptr += 2;
2249: break;
2250:
2251: /* Lookbehinds are in Perl from version 5.005 */
2252:
2253: case '<':
2254: if (ptr[3] == '=' || ptr[3] == '!')
2255: {
2256: ptr += 3;
2257: branch_newextra = 3;
2258: length += 3; /* For the first branch */
2259: break;
2260: }
2261: *errorptr = ERR24;
2262: goto PCRE_ERROR_RETURN;
2263:
2264: /* Conditionals are in Perl from version 5.005. The bracket must either
2265: be followed by a number (for bracket reference) or by an assertion
2266: group. */
2267:
2268: case '(':
2269: if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2270: {
2271: ptr += 4;
2272: length += 2;
2273: while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2274: if (*ptr != ')')
2275: {
2276: *errorptr = ERR26;
2277: goto PCRE_ERROR_RETURN;
2278: }
2279: }
2280: else /* An assertion must follow */
2281: {
2282: ptr++; /* Can treat like ':' as far as spacing is concerned */
2283:
2284: if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
2285: {
2286: ptr += 2; /* To get right offset in message */
2287: *errorptr = ERR28;
2288: goto PCRE_ERROR_RETURN;
2289: }
2290: }
2291: break;
2292:
2293: /* Else loop checking valid options until ) is met. Anything else is an
2294: error. If we are without any brackets, i.e. at top level, the settings
2295: act as if specified in the options, so massage the options immediately.
2296: This is for backward compatibility with Perl 5.004. */
2297:
2298: default:
2299: set = unset = 0;
2300: optset = &set;
2301: ptr += 2;
2302:
2303: for (;; ptr++)
2304: {
2305: c = *ptr;
2306: switch (c)
2307: {
2308: case 'i':
2309: *optset |= PCRE_CASELESS;
2310: continue;
2311:
2312: case 'm':
2313: *optset |= PCRE_MULTILINE;
2314: continue;
2315:
2316: case 's':
2317: *optset |= PCRE_DOTALL;
2318: continue;
2319:
2320: case 'x':
2321: *optset |= PCRE_EXTENDED;
2322: continue;
2323:
2324: case 'X':
2325: *optset |= PCRE_EXTRA;
2326: continue;
2327:
2328: case 'U':
2329: *optset |= PCRE_UNGREEDY;
2330: continue;
2331:
2332: case '-':
2333: optset = &unset;
2334: continue;
2335:
2336: /* A termination by ')' indicates an options-setting-only item;
2337: this is global at top level; otherwise nothing is done here and
2338: it is handled during the compiling process on a per-bracket-group
2339: basis. */
2340:
2341: case ')':
2342: if (brastackptr == 0)
2343: {
2344: options = (options | set) & (~unset);
2345: set = unset = 0; /* To save length */
2346: }
2347: /* Fall through */
2348:
2349: /* A termination by ':' indicates the start of a nested group with
2350: the given options set. This is again handled at compile time, but
2351: we must allow for compiled space if any of the ims options are
2352: set. We also have to allow for resetting space at the end of
2353: the group, which is why 4 is added to the length and not just 2.
2354: If there are several changes of options within the same group, this
2355: will lead to an over-estimate on the length, but this shouldn't
2356: matter very much. We also have to allow for resetting options at
2357: the start of any alternations, which we do by setting
2358: branch_newextra to 2. Finally, we record whether the case-dependent
2359: flag ever changes within the regex. This is used by the "required
2360: character" code. */
2361:
2362: case ':':
2363: if (((set|unset) & PCRE_IMS) != 0)
2364: {
2365: length += 4;
2366: branch_newextra = 2;
2367: if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2368: }
2369: goto END_OPTIONS;
2370:
2371: /* Unrecognized option character */
2372:
2373: default:
2374: *errorptr = ERR12;
2375: goto PCRE_ERROR_RETURN;
2376: }
2377: }
2378:
2379: /* If we hit a closing bracket, that's it - this is a freestanding
2380: option-setting. We need to ensure that branch_extra is updated if
2381: necessary. The only values branch_newextra can have here are 0 or 2.
2382: If the value is 2, then branch_extra must either be 2 or 5, depending
2383: on whether this is a lookbehind group or not. */
2384:
2385: END_OPTIONS:
2386: if (c == ')')
2387: {
2388: if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2389: branch_extra += branch_newextra;
2390: continue;
2391: }
2392:
2393: /* If options were terminated by ':' control comes here. Fall through
2394: to handle the group below. */
2395: }
2396: }
2397:
2398: /* Extracting brackets must be counted so we can process escapes in a
2399: Perlish way. */
2400:
2401: else bracount++;
2402:
2403: /* Non-special forms of bracket. Save length for computing whole length
2404: at end if there's a repeat that requires duplication of the group. Also
2405: save the current value of branch_extra, and start the new group with
2406: the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
2407: for a lookbehind assertion. */
2408:
2409: if (brastackptr >= sizeof(brastack)/sizeof(int))
2410: {
2411: *errorptr = ERR19;
2412: goto PCRE_ERROR_RETURN;
2413: }
2414:
2415: bralenstack[brastackptr] = branch_extra;
2416: branch_extra = branch_newextra;
2417:
2418: brastack[brastackptr++] = length;
2419: length += 3;
2420: continue;
2421:
2422: /* Handle ket. Look for subsequent max/min; for certain sets of values we
2423: have to replicate this bracket up to that many times. If brastackptr is
2424: 0 this is an unmatched bracket which will generate an error, but take care
2425: not to try to access brastack[-1] when computing the length and restoring
2426: the branch_extra value. */
2427:
2428: case ')':
2429: length += 3;
2430: {
2431: int minval = 1;
2432: int maxval = 1;
2433: int duplength;
2434:
2435: if (brastackptr > 0)
2436: {
2437: duplength = length - brastack[--brastackptr];
2438: branch_extra = bralenstack[brastackptr];
2439: }
2440: else duplength = 0;
2441:
2442: /* Leave ptr at the final char; for read_repeat_counts this happens
2443: automatically; for the others we need an increment. */
2444:
2445: if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2446: {
2447: ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2448: &compile_block);
2449: if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2450: }
2451: else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2452: else if (c == '+') { maxval = -1; ptr++; }
2453: else if (c == '?') { minval = 0; ptr++; }
2454:
2455: /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2456: group, and if the maximum is greater than zero, we have to replicate
2457: maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2458: bracket set - hence the 7. */
2459:
2460: if (minval == 0)
2461: {
2462: length++;
2463: if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2464: }
2465:
2466: /* When the minimum is greater than zero, we have to replicate up to
2467: minval-1 times, with no additions required in the copies. Then, if
2468: there is a limited maximum we have to replicate up to maxval-1 times
2469: allowing for a BRAZERO item before each optional copy and nesting
2470: brackets for all but one of the optional copies. */
2471:
2472: else
2473: {
2474: length += (minval - 1) * duplength;
2475: if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2476: length += (maxval - minval) * (duplength + 7) - 6;
2477: }
2478: }
2479: continue;
2480:
2481: /* Non-special character. For a run of such characters the length required
2482: is the number of characters + 2, except that the maximum run length is 255.
2483: We won't get a skipped space or a non-data escape or the start of a #
2484: comment as the first character, so the length can't be zero. */
2485:
2486: NORMAL_CHAR:
2487: default:
2488: length += 2;
2489: runlength = 0;
2490: do
2491: {
2492: if ((options & PCRE_EXTENDED) != 0)
2493: {
2494: if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2495: if (c == '#')
2496: {
2497: while ((c = *(++ptr)) != 0 && c != '\n');
2498: continue;
2499: }
2500: }
2501:
2502: /* Backslash may introduce a data char or a metacharacter; stop the
2503: string before the latter. */
2504:
2505: if (c == '\\')
2506: {
2507: const uschar *saveptr = ptr;
2508: c = check_escape(&ptr, errorptr, bracount, options, FALSE,
2509: &compile_block);
2510: if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2511: if (c < 0) { ptr = saveptr; break; }
2512: }
2513:
2514: /* Ordinary character or single-char escape */
2515:
2516: runlength++;
2517: }
2518:
2519: /* This "while" is the end of the "do" above. */
2520:
2521: while (runlength < 255 &&
2522: (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
2523:
2524: ptr--;
2525: length += runlength;
2526: continue;
2527: }
2528: }
2529:
2530: length += 4; /* For final KET and END */
2531:
2532: if (length > 65539)
2533: {
2534: *errorptr = ERR20;
2535: return NULL;
2536: }
2537:
2538: /* Compute the size of data block needed and get it, either from malloc or
2539: externally provided function. We specify "code[0]" in the offsetof() expression
2540: rather than just "code", because it has been reported that one broken compiler
2541: fails on "code" because it is also an independent variable. It should make no
2542: difference to the value of the offsetof(). */
2543:
2544: size = length + offsetof(real_pcre, code[0]);
2545: re = (real_pcre *)(pcre_malloc)(size);
2546:
2547: if (re == NULL)
2548: {
2549: *errorptr = ERR21;
2550: return NULL;
2551: }
2552:
2553: /* Put in the magic number and the options. */
2554:
2555: re->magic_number = MAGIC_NUMBER;
2556: re->options = options;
2557: re->tables = tables;
2558:
2559: /* Set up a starting, non-extracting bracket, then compile the expression. On
2560: error, *errorptr will be set non-NULL, so we don't need to look at the result
2561: of the function here. */
2562:
2563: ptr = (const uschar *)pattern;
2564: code = re->code;
2565: *code = OP_BRA;
2566: bracount = 0;
2567: (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
2568: &reqchar, &countlits, &compile_block);
2569: re->top_bracket = bracount;
2570: re->top_backref = top_backref;
2571:
2572: /* If not reached end of pattern on success, there's an excess bracket. */
2573:
2574: if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
2575:
2576: /* Fill in the terminating state and check for disastrous overflow, but
2577: if debugging, leave the test till after things are printed out. */
2578:
2579: *code++ = OP_END;
2580:
2581: #ifndef DEBUG
2582: if (code - re->code > length) *errorptr = ERR23;
2583: #endif
2584:
2585: /* Give an error if there's back reference to a non-existent capturing
2586: subpattern. */
2587:
2588: if (top_backref > re->top_bracket) *errorptr = ERR15;
2589:
2590: /* Failed to compile */
2591:
2592: if (*errorptr != NULL)
2593: {
2594: (pcre_free)(re);
2595: PCRE_ERROR_RETURN:
2596: *erroroffset = ptr - (const uschar *)pattern;
2597: return NULL;
2598: }
2599:
2600: /* If the anchored option was not passed, set flag if we can determine that the
2601: pattern is anchored by virtue of ^ characters or \A or anything else (such as
2602: starting with .* when DOTALL is set).
2603:
2604: Otherwise, see if we can determine what the first character has to be, because
2605: that speeds up unanchored matches no end. If not, see if we can set the
2606: PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
2607: start with ^. and also when all branches start with .* for non-DOTALL matches.
2608: */
2609:
2610: if ((options & PCRE_ANCHORED) == 0)
2611: {
2612: int temp_options = options;
2613: if (is_anchored(re->code, &temp_options))
2614: re->options |= PCRE_ANCHORED;
2615: else
2616: {
2617: int ch = find_firstchar(re->code, &temp_options);
2618: if (ch >= 0)
2619: {
2620: re->first_char = ch;
2621: re->options |= PCRE_FIRSTSET;
2622: }
2623: else if (is_startline(re->code))
2624: re->options |= PCRE_STARTLINE;
2625: }
2626: }
2627:
2628: /* Save the last required character if there are at least two literal
2629: characters on all paths, or if there is no first character setting. */
2630:
2631: if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
2632: {
2633: re->req_char = reqchar;
2634: re->options |= PCRE_REQCHSET;
2635: }
2636:
2637: /* Print out the compiled data for debugging */
2638:
2639: #ifdef DEBUG
2640:
2641: printf("Length = %d top_bracket = %d top_backref = %d\n",
2642: length, re->top_bracket, re->top_backref);
2643:
2644: if (re->options != 0)
2645: {
2646: printf("%s%s%s%s%s%s%s%s%s\n",
2647: ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
2648: ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
2649: ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
2650: ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
2651: ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
2652: ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
2653: ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
2654: ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
2655: ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
2656: }
2657:
2658: if ((re->options & PCRE_FIRSTSET) != 0)
2659: {
2660: if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
2661: else printf("First char = \\x%02x\n", re->first_char);
2662: }
2663:
2664: if ((re->options & PCRE_REQCHSET) != 0)
2665: {
2666: if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
2667: else printf("Req char = \\x%02x\n", re->req_char);
2668: }
2669:
2670: code_end = code;
2671: code_base = code = re->code;
2672:
2673: while (code < code_end)
2674: {
2675: int charlength;
2676:
2677: printf("%3d ", code - code_base);
2678:
2679: if (*code >= OP_BRA)
2680: {
2681: printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
2682: code += 2;
2683: }
2684:
2685: else switch(*code)
2686: {
2687: case OP_OPT:
2688: printf(" %.2x %s", code[1], OP_names[*code]);
2689: code++;
2690: break;
2691:
2692: case OP_COND:
2693: printf("%3d Cond", (code[1] << 8) + code[2]);
2694: code += 2;
2695: break;
2696:
2697: case OP_CREF:
2698: printf(" %.2d %s", code[1], OP_names[*code]);
2699: code++;
2700: break;
2701:
2702: case OP_CHARS:
2703: charlength = *(++code);
2704: printf("%3d ", charlength);
2705: while (charlength-- > 0)
2706: if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
2707: break;
2708:
2709: case OP_KETRMAX:
2710: case OP_KETRMIN:
2711: case OP_ALT:
2712: case OP_KET:
2713: case OP_ASSERT:
2714: case OP_ASSERT_NOT:
2715: case OP_ASSERTBACK:
2716: case OP_ASSERTBACK_NOT:
2717: case OP_ONCE:
2718: printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2719: code += 2;
2720: break;
2721:
2722: case OP_REVERSE:
2723: printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
2724: code += 2;
2725: break;
2726:
2727: case OP_STAR:
2728: case OP_MINSTAR:
2729: case OP_PLUS:
2730: case OP_MINPLUS:
2731: case OP_QUERY:
2732: case OP_MINQUERY:
2733: case OP_TYPESTAR:
2734: case OP_TYPEMINSTAR:
2735: case OP_TYPEPLUS:
2736: case OP_TYPEMINPLUS:
2737: case OP_TYPEQUERY:
2738: case OP_TYPEMINQUERY:
2739: if (*code >= OP_TYPESTAR)
2740: printf(" %s", OP_names[code[1]]);
2741: else if (isprint(c = code[1])) printf(" %c", c);
2742: else printf(" \\x%02x", c);
2743: printf("%s", OP_names[*code++]);
2744: break;
2745:
2746: case OP_EXACT:
2747: case OP_UPTO:
2748: case OP_MINUPTO:
2749: if (isprint(c = code[3])) printf(" %c{", c);
2750: else printf(" \\x%02x{", c);
2751: if (*code != OP_EXACT) printf("0,");
2752: printf("%d}", (code[1] << 8) + code[2]);
2753: if (*code == OP_MINUPTO) printf("?");
2754: code += 3;
2755: break;
2756:
2757: case OP_TYPEEXACT:
2758: case OP_TYPEUPTO:
2759: case OP_TYPEMINUPTO:
2760: printf(" %s{", OP_names[code[3]]);
2761: if (*code != OP_TYPEEXACT) printf(",");
2762: printf("%d}", (code[1] << 8) + code[2]);
2763: if (*code == OP_TYPEMINUPTO) printf("?");
2764: code += 3;
2765: break;
2766:
2767: case OP_NOT:
2768: if (isprint(c = *(++code))) printf(" [^%c]", c);
2769: else printf(" [^\\x%02x]", c);
2770: break;
2771:
2772: case OP_NOTSTAR:
2773: case OP_NOTMINSTAR:
2774: case OP_NOTPLUS:
2775: case OP_NOTMINPLUS:
2776: case OP_NOTQUERY:
2777: case OP_NOTMINQUERY:
2778: if (isprint(c = code[1])) printf(" [^%c]", c);
2779: else printf(" [^\\x%02x]", c);
2780: printf("%s", OP_names[*code++]);
2781: break;
2782:
2783: case OP_NOTEXACT:
2784: case OP_NOTUPTO:
2785: case OP_NOTMINUPTO:
2786: if (isprint(c = code[3])) printf(" [^%c]{", c);
2787: else printf(" [^\\x%02x]{", c);
2788: if (*code != OP_NOTEXACT) printf(",");
2789: printf("%d}", (code[1] << 8) + code[2]);
2790: if (*code == OP_NOTMINUPTO) printf("?");
2791: code += 3;
2792: break;
2793:
2794: case OP_REF:
2795: printf(" \\%d", *(++code));
2796: code ++;
2797: goto CLASS_REF_REPEAT;
2798:
2799: case OP_CLASS:
2800: {
2801: int i, min, max;
2802: code++;
2803: printf(" [");
2804:
2805: for (i = 0; i < 256; i++)
2806: {
2807: if ((code[i/8] & (1 << (i&7))) != 0)
2808: {
2809: int j;
2810: for (j = i+1; j < 256; j++)
2811: if ((code[j/8] & (1 << (j&7))) == 0) break;
2812: if (i == '-' || i == ']') printf("\\");
2813: if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
2814: if (--j > i)
2815: {
2816: printf("-");
2817: if (j == '-' || j == ']') printf("\\");
2818: if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
2819: }
2820: i = j;
2821: }
2822: }
2823: printf("]");
2824: code += 32;
2825:
2826: CLASS_REF_REPEAT:
2827:
2828: switch(*code)
2829: {
2830: case OP_CRSTAR:
2831: case OP_CRMINSTAR:
2832: case OP_CRPLUS:
2833: case OP_CRMINPLUS:
2834: case OP_CRQUERY:
2835: case OP_CRMINQUERY:
2836: printf("%s", OP_names[*code]);
2837: break;
2838:
2839: case OP_CRRANGE:
2840: case OP_CRMINRANGE:
2841: min = (code[1] << 8) + code[2];
2842: max = (code[3] << 8) + code[4];
2843: if (max == 0) printf("{%d,}", min);
2844: else printf("{%d,%d}", min, max);
2845: if (*code == OP_CRMINRANGE) printf("?");
2846: code += 4;
2847: break;
2848:
2849: default:
2850: code--;
2851: }
2852: }
2853: break;
2854:
2855: /* Anything else is just a one-node item */
2856:
2857: default:
2858: printf(" %s", OP_names[*code]);
2859: break;
2860: }
2861:
2862: code++;
2863: printf("\n");
2864: }
2865: printf("------------------------------------------------------------------\n");
2866:
2867: /* This check is done here in the debugging case so that the code that
2868: was compiled can be seen. */
2869:
2870: if (code - re->code > length)
2871: {
2872: *errorptr = ERR23;
2873: (pcre_free)(re);
2874: *erroroffset = ptr - (uschar *)pattern;
2875: return NULL;
2876: }
2877: #endif
2878:
2879: return (pcre *)re;
2880: }
2881:
2882:
2883:
2884: /*************************************************
2885: * Match a back-reference *
2886: *************************************************/
2887:
2888: /* If a back reference hasn't been set, the length that is passed is greater
2889: than the number of characters left in the string, so the match fails.
2890:
2891: Arguments:
2892: offset index into the offset vector
2893: eptr points into the subject
2894: length length to be matched
2895: md points to match data block
2896: ims the ims flags
2897:
2898: Returns: TRUE if matched
2899: */
2900:
2901: static BOOL
2902: match_ref(int offset, register const uschar *eptr, int length, match_data *md,
2903: unsigned long int ims)
2904: {
2905: const uschar *p = md->start_subject + md->offset_vector[offset];
2906:
2907: #ifdef DEBUG
2908: if (eptr >= md->end_subject)
2909: printf("matching subject <null>");
2910: else
2911: {
2912: printf("matching subject ");
2913: pchars(eptr, length, TRUE, md);
2914: }
2915: printf(" against backref ");
2916: pchars(p, length, FALSE, md);
2917: printf("\n");
2918: #endif
2919:
2920: /* Always fail if not enough characters left */
2921:
2922: if (length > md->end_subject - eptr) return FALSE;
2923:
2924: /* Separate the caselesss case for speed */
2925:
2926: if ((ims & PCRE_CASELESS) != 0)
2927: {
2928: while (length-- > 0)
2929: if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
2930: }
2931: else
2932: { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
2933:
2934: return TRUE;
2935: }
2936:
2937:
2938:
2939: /*************************************************
2940: * Match from current position *
2941: *************************************************/
2942:
2943: /* On entry ecode points to the first opcode, and eptr to the first character
2944: in the subject string, while eptrb holds the value of eptr at the start of the
2945: last bracketed group - used for breaking infinite loops matching zero-length
2946: strings.
2947:
2948: Arguments:
2949: eptr pointer in subject
2950: ecode position in code
2951: offset_top current top pointer
2952: md pointer to "static" info for the match
2953: ims current /i, /m, and /s options
2954: condassert TRUE if called to check a condition assertion
2955: eptrb eptr at start of last bracket
2956:
2957: Returns: TRUE if matched
2958: */
2959:
2960: static BOOL
2961: match(register const uschar *eptr, register const uschar *ecode,
2962: int offset_top, match_data *md, unsigned long int ims, BOOL condassert,
2963: const uschar *eptrb)
2964: {
2965: unsigned long int original_ims = ims; /* Save for resetting on ')' */
2966:
2967: for (;;)
2968: {
2969: int op = (int)*ecode;
2970: int min, max, ctype;
2971: register int i;
2972: register int c;
2973: BOOL minimize = FALSE;
2974:
2975: /* Opening capturing bracket. If there is space in the offset vector, save
2976: the current subject position in the working slot at the top of the vector. We
2977: mustn't change the current values of the data slot, because they may be set
2978: from a previous iteration of this group, and be referred to by a reference
2979: inside the group.
2980:
2981: If the bracket fails to match, we need to restore this value and also the
2982: values of the final offsets, in case they were set by a previous iteration of
2983: the same bracket.
2984:
2985: If there isn't enough space in the offset vector, treat this as if it were a
2986: non-capturing bracket. Don't worry about setting the flag for the error case
2987: here; that is handled in the code for KET. */
2988:
2989: if (op > OP_BRA)
2990: {
2991: int number = op - OP_BRA;
2992: int offset = number << 1;
2993:
2994: #ifdef DEBUG
2995: printf("start bracket %d subject=", number);
2996: pchars(eptr, 16, TRUE, md);
2997: printf("\n");
2998: #endif
2999:
3000: if (offset < md->offset_max)
3001: {
3002: int save_offset1 = md->offset_vector[offset];
3003: int save_offset2 = md->offset_vector[offset+1];
3004: int save_offset3 = md->offset_vector[md->offset_end - number];
3005:
3006: DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3007: md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3008:
3009: do
3010: {
3011: if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3012: ecode += (ecode[1] << 8) + ecode[2];
3013: }
3014: while (*ecode == OP_ALT);
3015:
3016: DPRINTF(("bracket %d failed\n", number));
3017:
3018: md->offset_vector[offset] = save_offset1;
3019: md->offset_vector[offset+1] = save_offset2;
3020: md->offset_vector[md->offset_end - number] = save_offset3;
3021: return FALSE;
3022: }
3023:
3024: /* Insufficient room for saving captured contents */
3025:
3026: else op = OP_BRA;
3027: }
3028:
3029: /* Other types of node can be handled by a switch */
3030:
3031: switch(op)
3032: {
3033: case OP_BRA: /* Non-capturing bracket: optimized */
3034: DPRINTF(("start bracket 0\n"));
3035: do
3036: {
3037: if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3038: ecode += (ecode[1] << 8) + ecode[2];
3039: }
3040: while (*ecode == OP_ALT);
3041: DPRINTF(("bracket 0 failed\n"));
3042: return FALSE;
3043:
3044: /* Conditional group: compilation checked that there are no more than
3045: two branches. If the condition is false, skipping the first branch takes us
3046: past the end if there is only one branch, but that's OK because that is
3047: exactly what going to the ket would do. */
3048:
3049: case OP_COND:
3050: if (ecode[3] == OP_CREF) /* Condition is extraction test */
3051: {
3052: int offset = ecode[4] << 1; /* Doubled reference number */
3053: return match(eptr,
3054: ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3055: 5 : 3 + (ecode[1] << 8) + ecode[2]),
3056: offset_top, md, ims, FALSE, eptr);
3057: }
3058:
3059: /* The condition is an assertion. Call match() to evaluate it - setting
3060: the condassert argument TRUE causes it to stop at the end of an assertion. */
3061:
3062: else
3063: {
3064: if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
3065: {
3066: ecode += 3 + (ecode[4] << 8) + ecode[5];
3067: while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3068: }
3069: else ecode += (ecode[1] << 8) + ecode[2];
3070: return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
3071: }
3072: /* Control never reaches here */
3073:
3074: /* Skip over conditional reference data if encountered (should not be) */
3075:
3076: case OP_CREF:
3077: ecode += 2;
3078: break;
3079:
3080: /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3081: an empty string - recursion will then try other alternatives, if any. */
3082:
3083: case OP_END:
3084: if (md->notempty && eptr == md->start_match) return FALSE;
3085: md->end_match_ptr = eptr; /* Record where we ended */
3086: md->end_offset_top = offset_top; /* and how many extracts were taken */
3087: return TRUE;
3088:
3089: /* Change option settings */
3090:
3091: case OP_OPT:
3092: ims = ecode[1];
3093: ecode += 2;
3094: DPRINTF(("ims set to %02lx\n", ims));
3095: break;
3096:
3097: /* Assertion brackets. Check the alternative branches in turn - the
3098: matching won't pass the KET for an assertion. If any one branch matches,
3099: the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3100: start of each branch to move the current point backwards, so the code at
3101: this level is identical to the lookahead case. */
3102:
3103: case OP_ASSERT:
3104: case OP_ASSERTBACK:
3105: do
3106: {
3107: if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
3108: ecode += (ecode[1] << 8) + ecode[2];
3109: }
3110: while (*ecode == OP_ALT);
3111: if (*ecode == OP_KET) return FALSE;
3112:
3113: /* If checking an assertion for a condition, return TRUE. */
3114:
3115: if (condassert) return TRUE;
3116:
3117: /* Continue from after the assertion, updating the offsets high water
3118: mark, since extracts may have been taken during the assertion. */
3119:
3120: do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3121: ecode += 3;
3122: offset_top = md->end_offset_top;
3123: continue;
3124:
3125: /* Negative assertion: all branches must fail to match */
3126:
3127: case OP_ASSERT_NOT:
3128: case OP_ASSERTBACK_NOT:
3129: do
3130: {
3131: if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
3132: ecode += (ecode[1] << 8) + ecode[2];
3133: }
3134: while (*ecode == OP_ALT);
3135:
3136: if (condassert) return TRUE;
3137: ecode += 3;
3138: continue;
3139:
3140: /* Move the subject pointer back. This occurs only at the start of
3141: each branch of a lookbehind assertion. If we are too close to the start to
3142: move back, this match function fails. */
3143:
3144: case OP_REVERSE:
3145: eptr -= (ecode[1] << 8) + ecode[2];
3146: if (eptr < md->start_subject) return FALSE;
3147: ecode += 3;
3148: break;
3149:
3150:
3151: /* "Once" brackets are like assertion brackets except that after a match,
3152: the point in the subject string is not moved back. Thus there can never be
3153: a move back into the brackets. Check the alternative branches in turn - the
3154: matching won't pass the KET for this kind of subpattern. If any one branch
3155: matches, we carry on as at the end of a normal bracket, leaving the subject
3156: pointer. */
3157:
3158: case OP_ONCE:
3159: {
3160: const uschar *prev = ecode;
3161:
3162: do
3163: {
3164: if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
3165: ecode += (ecode[1] << 8) + ecode[2];
3166: }
3167: while (*ecode == OP_ALT);
3168:
3169: /* If hit the end of the group (which could be repeated), fail */
3170:
3171: if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3172:
3173: /* Continue as from after the assertion, updating the offsets high water
3174: mark, since extracts may have been taken. */
3175:
3176: do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3177:
3178: offset_top = md->end_offset_top;
3179: eptr = md->end_match_ptr;
3180:
3181: /* For a non-repeating ket, just continue at this level. This also
3182: happens for a repeating ket if no characters were matched in the group.
3183: This is the forcible breaking of infinite loops as implemented in Perl
3184: 5.005. If there is an options reset, it will get obeyed in the normal
3185: course of events. */
3186:
3187: if (*ecode == OP_KET || eptr == eptrb)
3188: {
3189: ecode += 3;
3190: break;
3191: }
3192:
3193: /* The repeating kets try the rest of the pattern or restart from the
3194: preceding bracket, in the appropriate order. We need to reset any options
3195: that changed within the bracket before re-running it, so check the next
3196: opcode. */
3197:
3198: if (ecode[3] == OP_OPT)
3199: {
3200: ims = (ims & ~PCRE_IMS) | ecode[4];
3201: DPRINTF(("ims set to %02lx at group repeat\n", ims));
3202: }
3203:
3204: if (*ecode == OP_KETRMIN)
3205: {
3206: if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3207: match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3208: }
3209: else /* OP_KETRMAX */
3210: {
3211: if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3212: match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3213: }
3214: }
3215: return FALSE;
3216:
3217: /* An alternation is the end of a branch; scan along to find the end of the
3218: bracketed group and go to there. */
3219:
3220: case OP_ALT:
3221: do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3222: break;
3223:
3224: /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3225: that it may occur zero times. It may repeat infinitely, or not at all -
3226: i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3227: repeat limits are compiled as a number of copies, with the optional ones
3228: preceded by BRAZERO or BRAMINZERO. */
3229:
3230: case OP_BRAZERO:
3231: {
3232: const uschar *next = ecode+1;
3233: if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
3234: do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3235: ecode = next + 3;
3236: }
3237: break;
3238:
3239: case OP_BRAMINZERO:
3240: {
3241: const uschar *next = ecode+1;
3242: do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3243: if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3244: ecode++;
3245: }
3246: break;
3247:
3248: /* End of a group, repeated or non-repeating. If we are at the end of
3249: an assertion "group", stop matching and return TRUE, but record the
3250: current high water mark for use by positive assertions. Do this also
3251: for the "once" (not-backing-up) groups. */
3252:
3253: case OP_KET:
3254: case OP_KETRMIN:
3255: case OP_KETRMAX:
3256: {
3257: const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3258:
3259: if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3260: *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3261: *prev == OP_ONCE)
3262: {
3263: md->end_match_ptr = eptr; /* For ONCE */
3264: md->end_offset_top = offset_top;
3265: return TRUE;
3266: }
3267:
3268: /* In all other cases except a conditional group we have to check the
3269: group number back at the start and if necessary complete handling an
3270: extraction by setting the offsets and bumping the high water mark. */
3271:
3272: if (*prev != OP_COND)
3273: {
3274: int number = *prev - OP_BRA;
3275: int offset = number << 1;
3276:
3277: DPRINTF(("end bracket %d\n", number));
3278:
3279: if (number > 0)
3280: {
3281: if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3282: {
3283: md->offset_vector[offset] =
3284: md->offset_vector[md->offset_end - number];
3285: md->offset_vector[offset+1] = eptr - md->start_subject;
3286: if (offset_top <= offset) offset_top = offset + 2;
3287: }
3288: }
3289: }
3290:
3291: /* Reset the value of the ims flags, in case they got changed during
3292: the group. */
3293:
3294: ims = original_ims;
3295: DPRINTF(("ims reset to %02lx\n", ims));
3296:
3297: /* For a non-repeating ket, just continue at this level. This also
3298: happens for a repeating ket if no characters were matched in the group.
3299: This is the forcible breaking of infinite loops as implemented in Perl
3300: 5.005. If there is an options reset, it will get obeyed in the normal
3301: course of events. */
3302:
3303: if (*ecode == OP_KET || eptr == eptrb)
3304: {
3305: ecode += 3;
3306: break;
3307: }
3308:
3309: /* The repeating kets try the rest of the pattern or restart from the
3310: preceding bracket, in the appropriate order. */
3311:
3312: if (*ecode == OP_KETRMIN)
3313: {
3314: if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
3315: match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
3316: }
3317: else /* OP_KETRMAX */
3318: {
3319: if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
3320: match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
3321: }
3322: }
3323: return FALSE;
3324:
3325: /* Start of subject unless notbol, or after internal newline if multiline */
3326:
3327: case OP_CIRC:
3328: if (md->notbol && eptr == md->start_subject) return FALSE;
3329: if ((ims & PCRE_MULTILINE) != 0)
3330: {
3331: if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
3332: ecode++;
3333: break;
3334: }
3335: /* ... else fall through */
3336:
3337: /* Start of subject assertion */
3338:
3339: case OP_SOD:
3340: if (eptr != md->start_subject) return FALSE;
3341: ecode++;
3342: break;
3343:
3344: /* Assert before internal newline if multiline, or before a terminating
3345: newline unless endonly is set, else end of subject unless noteol is set. */
3346:
3347: case OP_DOLL:
3348: if ((ims & PCRE_MULTILINE) != 0)
3349: {
3350: if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
3351: else { if (md->noteol) return FALSE; }
3352: ecode++;
3353: break;
3354: }
3355: else
3356: {
3357: if (md->noteol) return FALSE;
3358: if (!md->endonly)
3359: {
3360: if (eptr < md->end_subject - 1 ||
3361: (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3362:
3363: ecode++;
3364: break;
3365: }
3366: }
3367: /* ... else fall through */
3368:
3369: /* End of subject assertion (\z) */
3370:
3371: case OP_EOD:
3372: if (eptr < md->end_subject) return FALSE;
3373: ecode++;
3374: break;
3375:
3376: /* End of subject or ending \n assertion (\Z) */
3377:
3378: case OP_EODN:
3379: if (eptr < md->end_subject - 1 ||
3380: (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
3381: ecode++;
3382: break;
3383:
3384: /* Word boundary assertions */
3385:
3386: case OP_NOT_WORD_BOUNDARY:
3387: case OP_WORD_BOUNDARY:
3388: {
3389: BOOL prev_is_word = (eptr != md->start_subject) &&
3390: ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3391: BOOL cur_is_word = (eptr < md->end_subject) &&
3392: ((md->ctypes[*eptr] & ctype_word) != 0);
3393: if ((*ecode++ == OP_WORD_BOUNDARY)?
3394: cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3395: return FALSE;
3396: }
3397: break;
3398:
3399: /* Match a single character type; inline for speed */
3400:
3401: case OP_ANY:
3402: if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
3403: return FALSE;
3404: if (eptr++ >= md->end_subject) return FALSE;
3405: ecode++;
3406: break;
3407:
3408: case OP_NOT_DIGIT:
3409: if (eptr >= md->end_subject ||
3410: (md->ctypes[*eptr++] & ctype_digit) != 0)
3411: return FALSE;
3412: ecode++;
3413: break;
3414:
3415: case OP_DIGIT:
3416: if (eptr >= md->end_subject ||
3417: (md->ctypes[*eptr++] & ctype_digit) == 0)
3418: return FALSE;
3419: ecode++;
3420: break;
3421:
3422: case OP_NOT_WHITESPACE:
3423: if (eptr >= md->end_subject ||
3424: (md->ctypes[*eptr++] & ctype_space) != 0)
3425: return FALSE;
3426: ecode++;
3427: break;
3428:
3429: case OP_WHITESPACE:
3430: if (eptr >= md->end_subject ||
3431: (md->ctypes[*eptr++] & ctype_space) == 0)
3432: return FALSE;
3433: ecode++;
3434: break;
3435:
3436: case OP_NOT_WORDCHAR:
3437: if (eptr >= md->end_subject ||
3438: (md->ctypes[*eptr++] & ctype_word) != 0)
3439: return FALSE;
3440: ecode++;
3441: break;
3442:
3443: case OP_WORDCHAR:
3444: if (eptr >= md->end_subject ||
3445: (md->ctypes[*eptr++] & ctype_word) == 0)
3446: return FALSE;
3447: ecode++;
3448: break;
3449:
3450: /* Match a back reference, possibly repeatedly. Look past the end of the
3451: item to see if there is repeat information following. The code is similar
3452: to that for character classes, but repeated for efficiency. Then obey
3453: similar code to character type repeats - written out again for speed.
3454: However, if the referenced string is the empty string, always treat
3455: it as matched, any number of times (otherwise there could be infinite
3456: loops). */
3457:
3458: case OP_REF:
3459: {
3460: int length;
3461: int offset = ecode[1] << 1; /* Doubled reference number */
3462: ecode += 2; /* Advance past the item */
3463:
3464: /* If the reference is unset, set the length to be longer than the amount
3465: of subject left; this ensures that every attempt at a match fails. We
3466: can't just fail here, because of the possibility of quantifiers with zero
3467: minima. */
3468:
3469: length = (offset >= offset_top || md->offset_vector[offset] < 0)?
3470: md->end_subject - eptr + 1 :
3471: md->offset_vector[offset+1] - md->offset_vector[offset];
3472:
3473: /* Set up for repetition, or handle the non-repeated case */
3474:
3475: switch (*ecode)
3476: {
3477: case OP_CRSTAR:
3478: case OP_CRMINSTAR:
3479: case OP_CRPLUS:
3480: case OP_CRMINPLUS:
3481: case OP_CRQUERY:
3482: case OP_CRMINQUERY:
3483: c = *ecode++ - OP_CRSTAR;
3484: minimize = (c & 1) != 0;
3485: min = rep_min[c]; /* Pick up values from tables; */
3486: max = rep_max[c]; /* zero for max => infinity */
1.2 ! paf 3487: if (max == 0) max = PCRE_MAX_POS;
1.1 paf 3488: break;
3489:
3490: case OP_CRRANGE:
3491: case OP_CRMINRANGE:
3492: minimize = (*ecode == OP_CRMINRANGE);
3493: min = (ecode[1] << 8) + ecode[2];
3494: max = (ecode[3] << 8) + ecode[4];
1.2 ! paf 3495: if (max == 0) max = PCRE_MAX_POS;
1.1 paf 3496: ecode += 5;
3497: break;
3498:
3499: default: /* No repeat follows */
3500: if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3501: eptr += length;
3502: continue; /* With the main loop */
3503: }
3504:
3505: /* If the length of the reference is zero, just continue with the
3506: main loop. */
3507:
3508: if (length == 0) continue;
3509:
3510: /* First, ensure the minimum number of matches are present. We get back
3511: the length of the reference string explicitly rather than passing the
3512: address of eptr, so that eptr can be a register variable. */
3513:
3514: for (i = 1; i <= min; i++)
3515: {
3516: if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
3517: eptr += length;
3518: }
3519:
3520: /* If min = max, continue at the same level without recursion.
3521: They are not both allowed to be zero. */
3522:
3523: if (min == max) continue;
3524:
3525: /* If minimizing, keep trying and advancing the pointer */
3526:
3527: if (minimize)
3528: {
3529: for (i = min;; i++)
3530: {
3531: if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3532: return TRUE;
3533: if (i >= max || !match_ref(offset, eptr, length, md, ims))
3534: return FALSE;
3535: eptr += length;
3536: }
3537: /* Control never gets here */
3538: }
3539:
3540: /* If maximizing, find the longest string and work backwards */
3541:
3542: else
3543: {
3544: const uschar *pp = eptr;
3545: for (i = min; i < max; i++)
3546: {
3547: if (!match_ref(offset, eptr, length, md, ims)) break;
3548: eptr += length;
3549: }
3550: while (eptr >= pp)
3551: {
3552: if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3553: return TRUE;
3554: eptr -= length;
3555: }
3556: return FALSE;
3557: }
3558: }
3559: /* Control never gets here */
3560:
3561:
3562:
3563: /* Match a character class, possibly repeatedly. Look past the end of the
3564: item to see if there is repeat information following. Then obey similar
3565: code to character type repeats - written out again for speed. */
3566:
3567: case OP_CLASS:
3568: {
3569: const uschar *data = ecode + 1; /* Save for matching */
3570: ecode += 33; /* Advance past the item */
3571:
3572: switch (*ecode)
3573: {
3574: case OP_CRSTAR:
3575: case OP_CRMINSTAR:
3576: case OP_CRPLUS:
3577: case OP_CRMINPLUS:
3578: case OP_CRQUERY:
3579: case OP_CRMINQUERY:
3580: c = *ecode++ - OP_CRSTAR;
3581: minimize = (c & 1) != 0;
3582: min = rep_min[c]; /* Pick up values from tables; */
3583: max = rep_max[c]; /* zero for max => infinity */
1.2 ! paf 3584: if (max == 0) max = PCRE_MAX_POS;
1.1 paf 3585: break;
3586:
3587: case OP_CRRANGE:
3588: case OP_CRMINRANGE:
3589: minimize = (*ecode == OP_CRMINRANGE);
3590: min = (ecode[1] << 8) + ecode[2];
3591: max = (ecode[3] << 8) + ecode[4];
1.2 ! paf 3592: if (max == 0) max = PCRE_MAX_POS;
1.1 paf 3593: ecode += 5;
3594: break;
3595:
3596: default: /* No repeat follows */
3597: min = max = 1;
3598: break;
3599: }
3600:
3601: /* First, ensure the minimum number of matches are present. */
3602:
3603: for (i = 1; i <= min; i++)
3604: {
3605: if (eptr >= md->end_subject) return FALSE;
3606: c = *eptr++;
3607: if ((data[c/8] & (1 << (c&7))) != 0) continue;
3608: return FALSE;
3609: }
3610:
3611: /* If max == min we can continue with the main loop without the
3612: need to recurse. */
3613:
3614: if (min == max) continue;
3615:
3616: /* If minimizing, keep testing the rest of the expression and advancing
3617: the pointer while it matches the class. */
3618:
3619: if (minimize)
3620: {
3621: for (i = min;; i++)
3622: {
3623: if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3624: return TRUE;
3625: if (i >= max || eptr >= md->end_subject) return FALSE;
3626: c = *eptr++;
3627: if ((data[c/8] & (1 << (c&7))) != 0) continue;
3628: return FALSE;
3629: }
3630: /* Control never gets here */
3631: }
3632:
3633: /* If maximizing, find the longest possible run, then work backwards. */
3634:
3635: else
3636: {
3637: const uschar *pp = eptr;
3638: for (i = min; i < max; eptr++, i++)
3639: {
3640: if (eptr >= md->end_subject) break;
3641: c = *eptr;
3642: if ((data[c/8] & (1 << (c&7))) != 0) continue;
3643: break;
3644: }
3645:
3646: while (eptr >= pp)
3647: if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3648: return TRUE;
3649: return FALSE;
3650: }
3651: }
3652: /* Control never gets here */
3653:
3654: /* Match a run of characters */
3655:
3656: case OP_CHARS:
3657: {
3658: register int length = ecode[1];
3659: ecode += 2;
3660:
3661: #ifdef DEBUG /* Sigh. Some compilers never learn. */
3662: if (eptr >= md->end_subject)
3663: printf("matching subject <null> against pattern ");
3664: else
3665: {
3666: printf("matching subject ");
3667: pchars(eptr, length, TRUE, md);
3668: printf(" against pattern ");
3669: }
3670: pchars(ecode, length, FALSE, md);
3671: printf("\n");
3672: #endif
3673:
3674: if (length > md->end_subject - eptr) return FALSE;
3675: if ((ims & PCRE_CASELESS) != 0)
3676: {
3677: while (length-- > 0)
3678: if (md->lcc[*ecode++] != md->lcc[*eptr++])
3679: return FALSE;
3680: }
3681: else
3682: {
3683: while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
3684: }
3685: }
3686: break;
3687:
3688: /* Match a single character repeatedly; different opcodes share code. */
3689:
3690: case OP_EXACT:
3691: min = max = (ecode[1] << 8) + ecode[2];
3692: ecode += 3;
3693: goto REPEATCHAR;
3694:
3695: case OP_UPTO:
3696: case OP_MINUPTO:
3697: min = 0;
3698: max = (ecode[1] << 8) + ecode[2];
3699: minimize = *ecode == OP_MINUPTO;
3700: ecode += 3;
3701: goto REPEATCHAR;
3702:
3703: case OP_STAR:
3704: case OP_MINSTAR:
3705: case OP_PLUS:
3706: case OP_MINPLUS:
3707: case OP_QUERY:
3708: case OP_MINQUERY:
3709: c = *ecode++ - OP_STAR;
3710: minimize = (c & 1) != 0;
3711: min = rep_min[c]; /* Pick up values from tables; */
3712: max = rep_max[c]; /* zero for max => infinity */
1.2 ! paf 3713: if (max == 0) max = PCRE_MAX_POS;
1.1 paf 3714:
3715: /* Common code for all repeated single-character matches. We can give
3716: up quickly if there are fewer than the minimum number of characters left in
3717: the subject. */
3718:
3719: REPEATCHAR:
3720: if (min > md->end_subject - eptr) return FALSE;
3721: c = *ecode++;
3722:
3723: /* The code is duplicated for the caseless and caseful cases, for speed,
3724: since matching characters is likely to be quite common. First, ensure the
3725: minimum number of matches are present. If min = max, continue at the same
3726: level without recursing. Otherwise, if minimizing, keep trying the rest of
3727: the expression and advancing one matching character if failing, up to the
3728: maximum. Alternatively, if maximizing, find the maximum number of
3729: characters and work backwards. */
3730:
3731: DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
3732: max, eptr));
3733:
3734: if ((ims & PCRE_CASELESS) != 0)
3735: {
3736: c = md->lcc[c];
3737: for (i = 1; i <= min; i++)
3738: if (c != md->lcc[*eptr++]) return FALSE;
3739: if (min == max) continue;
3740: if (minimize)
3741: {
3742: for (i = min;; i++)
3743: {
3744: if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3745: return TRUE;
3746: if (i >= max || eptr >= md->end_subject ||
3747: c != md->lcc[*eptr++])
3748: return FALSE;
3749: }
3750: /* Control never gets here */
3751: }
3752: else
3753: {
3754: const uschar *pp = eptr;
3755: for (i = min; i < max; i++)
3756: {
3757: if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
3758: eptr++;
3759: }
3760: while (eptr >= pp)
3761: if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3762: return TRUE;
3763: return FALSE;
3764: }
3765: /* Control never gets here */
3766: }
3767:
3768: /* Caseful comparisons */
3769:
3770: else
3771: {
3772: for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
3773: if (min == max) continue;
3774: if (minimize)
3775: {
3776: for (i = min;; i++)
3777: {
3778: if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3779: return TRUE;
3780: if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
3781: }
3782: /* Control never gets here */
3783: }
3784: else
3785: {
3786: const uschar *pp = eptr;
3787: for (i = min; i < max; i++)
3788: {
3789: if (eptr >= md->end_subject || c != *eptr) break;
3790: eptr++;
3791: }
3792: while (eptr >= pp)
3793: if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3794: return TRUE;
3795: return FALSE;
3796: }
3797: }
3798: /* Control never gets here */
3799:
3800: /* Match a negated single character */
3801:
3802: case OP_NOT:
3803: if (eptr >= md->end_subject) return FALSE;
3804: ecode++;
3805: if ((ims & PCRE_CASELESS) != 0)
3806: {
3807: if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
3808: }
3809: else
3810: {
3811: if (*ecode++ == *eptr++) return FALSE;
3812: }
3813: break;
3814:
3815: /* Match a negated single character repeatedly. This is almost a repeat of
3816: the code for a repeated single character, but I haven't found a nice way of
3817: commoning these up that doesn't require a test of the positive/negative
3818: option for each character match. Maybe that wouldn't add very much to the
3819: time taken, but character matching *is* what this is all about... */
3820:
3821: case OP_NOTEXACT:
3822: min = max = (ecode[1] << 8) + ecode[2];
3823: ecode += 3;
3824: goto REPEATNOTCHAR;
3825:
3826: case OP_NOTUPTO:
3827: case OP_NOTMINUPTO:
3828: min = 0;
3829: max = (ecode[1] << 8) + ecode[2];
3830: minimize = *ecode == OP_NOTMINUPTO;
3831: ecode += 3;
3832: goto REPEATNOTCHAR;
3833:
3834: case OP_NOTSTAR:
3835: case OP_NOTMINSTAR:
3836: case OP_NOTPLUS:
3837: case OP_NOTMINPLUS:
3838: case OP_NOTQUERY:
3839: case OP_NOTMINQUERY:
3840: c = *ecode++ - OP_NOTSTAR;
3841: minimize = (c & 1) != 0;
3842: min = rep_min[c]; /* Pick up values from tables; */
3843: max = rep_max[c]; /* zero for max => infinity */
1.2 ! paf 3844: if (max == 0) max = PCRE_MAX_POS;
1.1 paf 3845:
3846: /* Common code for all repeated negated single-character matches. We can give
3847: up quickly if there are fewer than the minimum number of characters left in
3848: the subject. */
3849:
3850: REPEATNOTCHAR:
3851: if (min > md->end_subject - eptr) return FALSE;
3852: c = *ecode++;
3853:
3854: /* The code is duplicated for the caseless and caseful cases, for speed,
3855: since matching characters is likely to be quite common. First, ensure the
3856: minimum number of matches are present. If min = max, continue at the same
3857: level without recursing. Otherwise, if minimizing, keep trying the rest of
3858: the expression and advancing one matching character if failing, up to the
3859: maximum. Alternatively, if maximizing, find the maximum number of
3860: characters and work backwards. */
3861:
3862: DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
3863: max, eptr));
3864:
3865: if ((ims & PCRE_CASELESS) != 0)
3866: {
3867: c = md->lcc[c];
3868: for (i = 1; i <= min; i++)
3869: if (c == md->lcc[*eptr++]) return FALSE;
3870: if (min == max) continue;
3871: if (minimize)
3872: {
3873: for (i = min;; i++)
3874: {
3875: if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3876: return TRUE;
3877: if (i >= max || eptr >= md->end_subject ||
3878: c == md->lcc[*eptr++])
3879: return FALSE;
3880: }
3881: /* Control never gets here */
3882: }
3883: else
3884: {
3885: const uschar *pp = eptr;
3886: for (i = min; i < max; i++)
3887: {
3888: if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
3889: eptr++;
3890: }
3891: while (eptr >= pp)
3892: if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3893: return TRUE;
3894: return FALSE;
3895: }
3896: /* Control never gets here */
3897: }
3898:
3899: /* Caseful comparisons */
3900:
3901: else
3902: {
3903: for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
3904: if (min == max) continue;
3905: if (minimize)
3906: {
3907: for (i = min;; i++)
3908: {
3909: if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
3910: return TRUE;
3911: if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
3912: }
3913: /* Control never gets here */
3914: }
3915: else
3916: {
3917: const uschar *pp = eptr;
3918: for (i = min; i < max; i++)
3919: {
3920: if (eptr >= md->end_subject || c == *eptr) break;
3921: eptr++;
3922: }
3923: while (eptr >= pp)
3924: if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
3925: return TRUE;
3926: return FALSE;
3927: }
3928: }
3929: /* Control never gets here */
3930:
3931: /* Match a single character type repeatedly; several different opcodes
3932: share code. This is very similar to the code for single characters, but we
3933: repeat it in the interests of efficiency. */
3934:
3935: case OP_TYPEEXACT:
3936: min = max = (ecode[1] << 8) + ecode[2];
3937: minimize = TRUE;
3938: ecode += 3;
3939: goto REPEATTYPE;
3940:
3941: case OP_TYPEUPTO:
3942: case OP_TYPEMINUPTO:
3943: min = 0;
3944: max = (ecode[1] << 8) + ecode[2];
3945: minimize = *ecode == OP_TYPEMINUPTO;
3946: ecode += 3;
3947: goto REPEATTYPE;
3948:
3949: case OP_TYPESTAR:
3950: case OP_TYPEMINSTAR:
3951: case OP_TYPEPLUS:
3952: case OP_TYPEMINPLUS:
3953: case OP_TYPEQUERY:
3954: case OP_TYPEMINQUERY:
3955: c = *ecode++ - OP_TYPESTAR;
3956: minimize = (c & 1) != 0;
3957: min = rep_min[c]; /* Pick up values from tables; */
3958: max = rep_max[c]; /* zero for max => infinity */
1.2 ! paf 3959: if (max == 0) max = PCRE_MAX_POS;
1.1 paf 3960:
3961: /* Common code for all repeated single character type matches */
3962:
3963: REPEATTYPE:
3964: ctype = *ecode++; /* Code for the character type */
3965:
3966: /* First, ensure the minimum number of matches are present. Use inline
3967: code for maximizing the speed, and do the type test once at the start
3968: (i.e. keep it out of the loop). Also test that there are at least the
3969: minimum number of characters before we start. */
3970:
3971: if (min > md->end_subject - eptr) return FALSE;
3972: if (min > 0) switch(ctype)
3973: {
3974: case OP_ANY:
3975: if ((ims & PCRE_DOTALL) == 0)
3976: { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
3977: else eptr += min;
3978: break;
3979:
3980: case OP_NOT_DIGIT:
3981: for (i = 1; i <= min; i++)
3982: if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
3983: break;
3984:
3985: case OP_DIGIT:
3986: for (i = 1; i <= min; i++)
3987: if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
3988: break;
3989:
3990: case OP_NOT_WHITESPACE:
3991: for (i = 1; i <= min; i++)
3992: if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
3993: break;
3994:
3995: case OP_WHITESPACE:
3996: for (i = 1; i <= min; i++)
3997: if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
3998: break;
3999:
4000: case OP_NOT_WORDCHAR:
4001: for (i = 1; i <= min; i++)
4002: if ((md->ctypes[*eptr++] & ctype_word) != 0)
4003: return FALSE;
4004: break;
4005:
4006: case OP_WORDCHAR:
4007: for (i = 1; i <= min; i++)
4008: if ((md->ctypes[*eptr++] & ctype_word) == 0)
4009: return FALSE;
4010: break;
4011: }
4012:
4013: /* If min = max, continue at the same level without recursing */
4014:
4015: if (min == max) continue;
4016:
4017: /* If minimizing, we have to test the rest of the pattern before each
4018: subsequent match. */
4019:
4020: if (minimize)
4021: {
4022: for (i = min;; i++)
4023: {
4024: if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
4025: if (i >= max || eptr >= md->end_subject) return FALSE;
4026:
4027: c = *eptr++;
4028: switch(ctype)
4029: {
4030: case OP_ANY:
4031: if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
4032: break;
4033:
4034: case OP_NOT_DIGIT:
4035: if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4036: break;
4037:
4038: case OP_DIGIT:
4039: if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4040: break;
4041:
4042: case OP_NOT_WHITESPACE:
4043: if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4044: break;
4045:
4046: case OP_WHITESPACE:
4047: if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4048: break;
4049:
4050: case OP_NOT_WORDCHAR:
4051: if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4052: break;
4053:
4054: case OP_WORDCHAR:
4055: if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4056: break;
4057: }
4058: }
4059: /* Control never gets here */
4060: }
4061:
4062: /* If maximizing it is worth using inline code for speed, doing the type
4063: test once at the start (i.e. keep it out of the loop). */
4064:
4065: else
4066: {
4067: const uschar *pp = eptr;
4068: switch(ctype)
4069: {
4070: case OP_ANY:
4071: if ((ims & PCRE_DOTALL) == 0)
4072: {
4073: for (i = min; i < max; i++)
4074: {
4075: if (eptr >= md->end_subject || *eptr == '\n') break;
4076: eptr++;
4077: }
4078: }
4079: else
4080: {
4081: c = max - min;
4082: if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4083: eptr += c;
4084: }
4085: break;
4086:
4087: case OP_NOT_DIGIT:
4088: for (i = min; i < max; i++)
4089: {
4090: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4091: break;
4092: eptr++;
4093: }
4094: break;
4095:
4096: case OP_DIGIT:
4097: for (i = min; i < max; i++)
4098: {
4099: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4100: break;
4101: eptr++;
4102: }
4103: break;
4104:
4105: case OP_NOT_WHITESPACE:
4106: for (i = min; i < max; i++)
4107: {
4108: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4109: break;
4110: eptr++;
4111: }
4112: break;
4113:
4114: case OP_WHITESPACE:
4115: for (i = min; i < max; i++)
4116: {
4117: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4118: break;
4119: eptr++;
4120: }
4121: break;
4122:
4123: case OP_NOT_WORDCHAR:
4124: for (i = min; i < max; i++)
4125: {
4126: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4127: break;
4128: eptr++;
4129: }
4130: break;
4131:
4132: case OP_WORDCHAR:
4133: for (i = min; i < max; i++)
4134: {
4135: if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4136: break;
4137: eptr++;
4138: }
4139: break;
4140: }
4141:
4142: while (eptr >= pp)
4143: if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
4144: return TRUE;
4145: return FALSE;
4146: }
4147: /* Control never gets here */
4148:
4149: /* There's been some horrible disaster. */
4150:
4151: default:
4152: DPRINTF(("Unknown opcode %d\n", *ecode));
4153: md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4154: return FALSE;
4155: }
4156:
4157: /* Do not stick any code in here without much thought; it is assumed
4158: that "continue" in the code above comes out to here to repeat the main
4159: loop. */
4160:
4161: } /* End of main loop */
4162: /* Control never reaches here */
4163: }
4164:
4165:
4166:
4167:
4168: /*************************************************
4169: * Execute a Regular Expression *
4170: *************************************************/
4171:
4172: /* This function applies a compiled re to a subject string and picks out
4173: portions of the string if it matches. Two elements in the vector are set for
4174: each substring: the offsets to the start and end of the substring.
4175:
4176: Arguments:
4177: external_re points to the compiled expression
4178: external_extra points to "hints" from pcre_study() or is NULL
4179: subject points to the subject string
4180: length length of subject string (may contain binary zeros)
4181: start_offset where to start in the subject string
4182: options option bits
4183: offsets points to a vector of ints to be filled in with offsets
4184: offsetcount the number of elements in the vector
4185:
4186: Returns: > 0 => success; value is the number of elements filled in
4187: = 0 => success, but offsets is not big enough
4188: -1 => failed to match
4189: < -1 => some kind of unexpected problem
4190: */
4191:
4192: int
4193: pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4194: const char *subject, int length, int start_offset, int options, int *offsets,
4195: int offsetcount)
4196: {
4197: int resetcount, ocount;
4198: int first_char = -1;
4199: int req_char = -1;
4200: int req_char2 = -1;
4201: unsigned long int ims = 0;
4202: match_data match_block;
4203: const uschar *start_bits = NULL;
4204: const uschar *start_match = (const uschar *)subject + start_offset;
4205: const uschar *end_subject;
4206: const uschar *req_char_ptr = start_match - 1;
4207: const real_pcre *re = (const real_pcre *)external_re;
4208: const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4209: BOOL using_temporary_offsets = FALSE;
4210: BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4211: BOOL startline = (re->options & PCRE_STARTLINE) != 0;
4212:
4213: if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4214:
4215: if (re == NULL || subject == NULL ||
4216: (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4217: if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4218:
4219: match_block.start_subject = (const uschar *)subject;
4220: match_block.end_subject = match_block.start_subject + length;
4221: end_subject = match_block.end_subject;
4222:
4223: match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4224:
4225: match_block.notbol = (options & PCRE_NOTBOL) != 0;
4226: match_block.noteol = (options & PCRE_NOTEOL) != 0;
4227: match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4228:
4229: match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4230:
4231: match_block.lcc = re->tables + lcc_offset;
4232: match_block.ctypes = re->tables + ctypes_offset;
4233:
4234: /* The ims options can vary during the matching as a result of the presence
4235: of (?ims) items in the pattern. They are kept in a local variable so that
4236: restoring at the exit of a group is easy. */
4237:
4238: ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4239:
4240: /* If the expression has got more back references than the offsets supplied can
4241: hold, we get a temporary bit of working store to use during the matching.
4242: Otherwise, we can use the vector supplied, rounding down its size to a multiple
4243: of 3. */
4244:
4245: ocount = offsetcount - (offsetcount % 3);
4246:
4247: if (re->top_backref > 0 && re->top_backref >= ocount/3)
4248: {
4249: ocount = re->top_backref * 3 + 3;
4250: match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4251: if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4252: using_temporary_offsets = TRUE;
4253: DPRINTF(("Got memory to hold back references\n"));
4254: }
4255: else match_block.offset_vector = offsets;
4256:
4257: match_block.offset_end = ocount;
4258: match_block.offset_max = (2*ocount)/3;
4259: match_block.offset_overflow = FALSE;
4260:
4261: /* Compute the minimum number of offsets that we need to reset each time. Doing
4262: this makes a huge difference to execution time when there aren't many brackets
4263: in the pattern. */
4264:
4265: resetcount = 2 + re->top_bracket * 2;
4266: if (resetcount > offsetcount) resetcount = ocount;
4267:
4268: /* Reset the working variable associated with each extraction. These should
4269: never be used unless previously set, but they get saved and restored, and so we
4270: initialize them to avoid reading uninitialized locations. */
4271:
4272: if (match_block.offset_vector != NULL)
4273: {
4274: register int *iptr = match_block.offset_vector + ocount;
4275: register int *iend = iptr - resetcount/2 + 1;
4276: while (--iptr >= iend) *iptr = -1;
4277: }
4278:
4279: /* Set up the first character to match, if available. The first_char value is
4280: never set for an anchored regular expression, but the anchoring may be forced
4281: at run time, so we have to test for anchoring. The first char may be unset for
4282: an unanchored pattern, of course. If there's no first char and the pattern was
4283: studied, there may be a bitmap of possible first characters. */
4284:
4285: if (!anchored)
4286: {
4287: if ((re->options & PCRE_FIRSTSET) != 0)
4288: {
4289: first_char = re->first_char;
4290: if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4291: }
4292: else
4293: if (!startline && extra != NULL &&
4294: (extra->options & PCRE_STUDY_MAPPED) != 0)
4295: start_bits = extra->start_bits;
4296: }
4297:
4298: /* For anchored or unanchored matches, there may be a "last known required
4299: character" set. If the PCRE_CASELESS is set, implying that the match starts
4300: caselessly, or if there are any changes of this flag within the regex, set up
4301: both cases of the character. Otherwise set the two values the same, which will
4302: avoid duplicate testing (which takes significant time). This covers the vast
4303: majority of cases. It will be suboptimal when the case flag changes in a regex
4304: and the required character in fact is caseful. */
4305:
4306: if ((re->options & PCRE_REQCHSET) != 0)
4307: {
4308: req_char = re->req_char;
4309: req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4310: (re->tables + fcc_offset)[req_char] : req_char;
4311: }
4312:
4313: /* Loop for handling unanchored repeated matching attempts; for anchored regexs
4314: the loop runs just once. */
4315:
4316: do
4317: {
4318: int rc;
4319: register int *iptr = match_block.offset_vector;
4320: register int *iend = iptr + resetcount;
4321:
4322: /* Reset the maximum number of extractions we might see. */
4323:
4324: while (iptr < iend) *iptr++ = -1;
4325:
4326: /* Advance to a unique first char if possible */
4327:
4328: if (first_char >= 0)
4329: {
4330: if ((ims & PCRE_CASELESS) != 0)
4331: while (start_match < end_subject &&
4332: match_block.lcc[*start_match] != first_char)
4333: start_match++;
4334: else
4335: while (start_match < end_subject && *start_match != first_char)
4336: start_match++;
4337: }
4338:
4339: /* Or to just after \n for a multiline match if possible */
4340:
4341: else if (startline)
4342: {
4343: if (start_match > match_block.start_subject + start_offset)
4344: {
4345: while (start_match < end_subject && start_match[-1] != '\n')
4346: start_match++;
4347: }
4348: }
4349:
4350: /* Or to a non-unique first char after study */
4351:
4352: else if (start_bits != NULL)
4353: {
4354: while (start_match < end_subject)
4355: {
4356: register int c = *start_match;
4357: if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
4358: }
4359: }
4360:
4361: #ifdef DEBUG /* Sigh. Some compilers never learn. */
4362: printf(">>>> Match against: ");
4363: pchars(start_match, end_subject - start_match, TRUE, &match_block);
4364: printf("\n");
4365: #endif
4366:
4367: /* If req_char is set, we know that that character must appear in the subject
4368: for the match to succeed. If the first character is set, req_char must be
4369: later in the subject; otherwise the test starts at the match point. This
4370: optimization can save a huge amount of backtracking in patterns with nested
4371: unlimited repeats that aren't going to match. We don't know what the state of
4372: case matching may be when this character is hit, so test for it in both its
4373: cases if necessary. However, the different cased versions will not be set up
4374: unless PCRE_CASELESS was given or the casing state changes within the regex.
4375: Writing separate code makes it go faster, as does using an autoincrement and
4376: backing off on a match. */
4377:
4378: if (req_char >= 0)
4379: {
4380: register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
4381:
4382: /* We don't need to repeat the search if we haven't yet reached the
4383: place we found it at last time. */
4384:
4385: if (p > req_char_ptr)
4386: {
4387: /* Do a single test if no case difference is set up */
4388:
4389: if (req_char == req_char2)
4390: {
4391: while (p < end_subject)
4392: {
4393: if (*p++ == req_char) { p--; break; }
4394: }
4395: }
4396:
4397: /* Otherwise test for either case */
4398:
4399: else
4400: {
4401: while (p < end_subject)
4402: {
4403: register int pp = *p++;
4404: if (pp == req_char || pp == req_char2) { p--; break; }
4405: }
4406: }
4407:
4408: /* If we can't find the required character, break the matching loop */
4409:
4410: if (p >= end_subject) break;
4411:
4412: /* If we have found the required character, save the point where we
4413: found it, so that we don't search again next time round the loop if
4414: the start hasn't passed this character yet. */
4415:
4416: req_char_ptr = p;
4417: }
4418: }
4419:
4420: /* When a match occurs, substrings will be set for all internal extractions;
4421: we just need to set up the whole thing as substring 0 before returning. If
4422: there were too many extractions, set the return code to zero. In the case
4423: where we had to get some local store to hold offsets for backreferences, copy
4424: those back references that we can. In this case there need not be overflow
4425: if certain parts of the pattern were not used. */
4426:
4427: match_block.start_match = start_match;
4428: if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
4429: continue;
4430:
4431: /* Copy the offset information from temporary store if necessary */
4432:
4433: if (using_temporary_offsets)
4434: {
4435: if (offsetcount >= 4)
4436: {
4437: memcpy(offsets + 2, match_block.offset_vector + 2,
4438: (offsetcount - 2) * sizeof(int));
4439: DPRINTF(("Copied offsets from temporary memory\n"));
4440: }
4441: if (match_block.end_offset_top > offsetcount)
4442: match_block.offset_overflow = TRUE;
4443:
4444: DPRINTF(("Freeing temporary memory\n"));
4445: (pcre_free)(match_block.offset_vector);
4446: }
4447:
4448: rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
4449:
4450: if (match_block.offset_end < 2) rc = 0; else
4451: {
4452: offsets[0] = start_match - match_block.start_subject;
4453: offsets[1] = match_block.end_match_ptr - match_block.start_subject;
4454: }
4455:
4456: DPRINTF((">>>> returning %d\n", rc));
4457: return rc;
4458: }
4459:
4460: /* This "while" is the end of the "do" above */
4461:
4462: while (!anchored &&
4463: match_block.errorcode == PCRE_ERROR_NOMATCH &&
4464: start_match++ < end_subject);
4465:
4466: if (using_temporary_offsets)
4467: {
4468: DPRINTF(("Freeing temporary memory\n"));
4469: (pcre_free)(match_block.offset_vector);
4470: }
4471:
4472: DPRINTF((">>>> returning %d\n", match_block.errorcode));
4473:
4474: return match_block.errorcode;
4475: }
4476:
4477: /* End of pcre.c */
E-mail: