Annotation of win32/pcre/pcre_dfa_exec.c, revision 1.2
1.1 misha 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
9: Copyright (c) 1997-2008 University of Cambridge
10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40:
41: /* This module contains the external function pcre_dfa_exec(), which is an
42: alternative matching function that uses a sort of DFA algorithm (not a true
43: FSM). This is NOT Perl-compatible, but it has advantages in certain
44: applications. */
45:
46:
47: #ifdef HAVE_CONFIG_H
48: #include "config.h"
49: #endif
50:
51: #define NLBLOCK md /* Name of the block containing newline information; md is what internal_dfa_exec() calls its match-data parameter. NOTE(review): defined ahead of the pcre_internal.h include, presumably for macros in that header - confirm */
52: #define PSSTART start_subject /* Name of the field containing the processed string start (read below as md->start_subject) */
53: #define PSEND end_subject /* Name of the field containing the processed string end (read below as md->end_subject) */
54:
55: #include "pcre_internal.h"
56:
57:
58: /* For use to indent debugging output */
59:
60: #define SP " " /* Indent text: printed through a %.*s conversion whose precision is rlevel*2-2 in the debugging calls */
61:
62:
63:
64: /*************************************************
65: * Code parameters and static tables *
66: *************************************************/
67:
68: /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
69: into others, under special conditions. A gap of 20 between the blocks should be
70: enough. The resulting opcodes don't have to be less than 256 because they are
71: never stored, so we push them well clear of the normal opcodes. */
72:
73: #define OP_PROP_EXTRA 300 /* Added to codevalue when the operand d is OP_PROP or OP_NOTPROP */
74: #define OP_EXTUNI_EXTRA 320 /* Added to codevalue when the operand d is OP_EXTUNI */
75: #define OP_ANYNL_EXTRA 340 /* Added to codevalue when the operand d is OP_ANYNL */
76: #define OP_HSPACE_EXTRA 360 /* Added to codevalue when the operand d is OP_HSPACE or OP_NOT_HSPACE */
77: #define OP_VSPACE_EXTRA 380 /* Added to codevalue when the operand d is OP_VSPACE or OP_NOT_VSPACE */
78:
79:
80: /* This table identifies those opcodes that are followed immediately by a
81: character that is to be tested in some way. This makes it possible to
82: centralize the loading of these characters. In the case of Type * etc, the
83: "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
84: small value. ***NOTE*** If the start of this table is modified, the two tables
85: that follow must also be modified. */
86:
87: static const uschar coptable[] = { /* Indexed by opcode: offset from the opcode to the character (or type opcode) that follows it; 0 means there is none to load */
88: 0, /* End */
89: 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
90: 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
91: 0, 0, 0, /* Any, AllAny, Anybyte */
92: 0, 0, 0, /* NOTPROP, PROP, EXTUNI */
93: 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
94: 0, 0, 0, 0, 0, /* \Z, \z, Opt, ^, $ */
95: 1, /* Char */
96: 1, /* Charnc */
97: 1, /* Not */
98: /* Positive single-char repeats */
99: 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
100: 3, 3, 3, /* upto, minupto, exact */
101: 1, 1, 1, 3, /* *+, ++, ?+, upto+ */
102: /* Negative single-char repeats - only for chars < 256 */
103: 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
104: 3, 3, 3, /* NOT upto, minupto, exact */
105: 1, 1, 1, 3, /* NOT *+, ++, ?+, upto+ */
106: /* Positive type repeats */
107: 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
108: 3, 3, 3, /* Type upto, minupto, exact (the repeat count is read from offset 1, so the type opcode sits at offset 3) */
109: 1, 1, 1, 3, /* Type *+, ++, ?+, upto+ */
110: /* Character class & ref repeats */
111: 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
112: 0, 0, /* CRRANGE, CRMINRANGE */
113: 0, /* CLASS */
114: 0, /* NCLASS */
115: 0, /* XCLASS - variable length */
116: 0, /* REF */
117: 0, /* RECURSE */
118: 0, /* CALLOUT */
119: 0, /* Alt */
120: 0, /* Ket */
121: 0, /* KetRmax */
122: 0, /* KetRmin */
123: 0, /* Assert */
124: 0, /* Assert not */
125: 0, /* Assert behind */
126: 0, /* Assert behind not */
127: 0, /* Reverse */
128: 0, 0, 0, 0, /* ONCE, BRA, CBRA, COND */
129: 0, 0, 0, /* SBRA, SCBRA, SCOND */
130: 0, /* CREF */
131: 0, /* RREF */
132: 0, /* DEF */
133: 0, 0, /* BRAZERO, BRAMINZERO */
134: 0, 0, 0, 0, /* PRUNE, SKIP, THEN, COMMIT */
135: 0, 0, 0 /* FAIL, ACCEPT, SKIPZERO */
136: };
137:
138: /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
139: and \w */
140:
141: static const uschar toptable1[] = { /* Indexed by type opcode: the ctype bit mask that is ANDed with ctypes[c] */
142: 0, 0, 0, 0, 0, 0, /* Opcodes that precede \D: no character-type bit */
143: ctype_digit, ctype_digit, /* \D, \d */
144: ctype_space, ctype_space, /* \S, \s */
145: ctype_word, ctype_word, /* \W, \w */
146: 0, 0 /* OP_ANY, OP_ALLANY: no bit is selected */
147: };
148:
149: static const uschar toptable2[] = { /* Indexed by type opcode: XORed with the masked ctypes bits; a non-zero result counts as a match */
150: 0, 0, 0, 0, 0, 0, /* Opcodes that precede \D */
151: ctype_digit, 0, /* \D: non-zero only when the digit bit is clear; \d: non-zero only when it is set */
152: ctype_space, 0, /* \S, \s: same scheme with the space bit */
153: ctype_word, 0, /* \W, \w: same scheme with the word bit */
154: 1, 1 /* OP_ANY, OP_ALLANY: the mask is 0, so the result is always 1 */
155: };
156:
157:
158: /* Structure for holding data about a particular state, which is in effect the
159: current data for an active path through the match tree. It must consist
160: entirely of ints because the working vector we are passed, and which we put
161: these structures in, is a vector of ints. */
162:
163: typedef struct stateblock {
164: int offset; /* Offset to opcode, relative to start_code; held negated while a state is waiting for characters to be skipped */
165: int count; /* Count for repeats */
166: int ims; /* ims flag bits, copied from the ims variable when the state is added */
167: int data; /* Some use extra data: for a negated offset, the number of characters still to skip */
168: } stateblock;
169:
170: #define INTS_PER_STATEBLOCK (sizeof(stateblock)/sizeof(int)) /* Used to turn the workspace size in ints into a number of states per list */
171:
172:
173: #ifdef DEBUG
174: /*************************************************
175: * Print character string *
176: *************************************************/
177:
178: /* Character string printing function for debugging.
179:
180: Arguments:
181: p points to string
182: length number of bytes
183: f where to print
184:
185: Returns: nothing
186: */
187:
188: static void
189: pchars(unsigned char *p, int length, FILE *f)
190: {
191: int c; /* The byte currently being printed */
192: while (length-- > 0) /* Emits exactly length bytes; no check is made for a terminating zero */
193: {
194: if (isprint(c = *(p++))) /* Printable bytes are written unchanged */
195: fprintf(f, "%c", c);
196: else /* Anything else is written as a backslash-x escape with two hex digits */
197: fprintf(f, "\\x%02x", c);
198: }
199: }
200: #endif
201:
202:
203:
204: /*************************************************
205: * Execute a Regular Expression - DFA engine *
206: *************************************************/
207:
208: /* This internal function applies a compiled pattern to a subject string,
209: starting at a given point, using a DFA engine. This function is called from the
210: external one, possibly multiple times if the pattern is not anchored. The
211: function calls itself recursively for some kinds of subpattern.
212:
213: Arguments:
214: md the match_data block with fixed information
215: this_start_code the opening bracket of this subexpression's code
216: current_subject where we currently are in the subject string
217: start_offset start offset in the subject string
218: offsets vector to contain the matching string offsets
219: offsetcount size of same
220: workspace vector of workspace
221: wscount size of same
222: ims the current ims flags
223: rlevel function call recursion level
224: recursing regex recursive call level
225:
226: Returns: > 0 => number of match offset pairs placed in offsets
227: = 0 => offsets overflowed; longest matches are present
228: -1 => failed to match
229: < -1 => some kind of unexpected problem
230:
231: The following macros are used for adding states to the two state vectors (one
232: for the current character, one for the following character). */
233:
234: #define ADD_ACTIVE(x,y) /* Append a state with offset x and count y to the active list; the data field is not written */ \
235: if (active_count++ < wscount) /* wscount is the per-list state capacity by the time this is used */ \
236: { \
237: next_active_state->offset = (x); \
238: next_active_state->count = (y); \
239: next_active_state->ims = ims; /* Taken from the ims variable in the enclosing scope */ \
240: next_active_state++; \
241: DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
242: } \
243: else return PCRE_ERROR_DFA_WSSIZE /* List full: the enclosing function returns; the caller supplies the semicolon */
244:
245: #define ADD_ACTIVE_DATA(x,y,z) /* As ADD_ACTIVE, but also stores z in the data field */ \
246: if (active_count++ < wscount) \
247: { \
248: next_active_state->offset = (x); \
249: next_active_state->count = (y); \
250: next_active_state->ims = ims; \
251: next_active_state->data = (z); \
252: next_active_state++; \
253: DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
254: } \
255: else return PCRE_ERROR_DFA_WSSIZE /* List full: the enclosing function returns */
256:
257: #define ADD_NEW(x,y) /* Append a state with offset x and count y to the new list (for the following character); data is not written */ \
258: if (new_count++ < wscount) \
259: { \
260: next_new_state->offset = (x); \
261: next_new_state->count = (y); \
262: next_new_state->ims = ims; \
263: next_new_state++; \
264: DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
265: } \
266: else return PCRE_ERROR_DFA_WSSIZE /* List full: the enclosing function returns */
267:
268: #define ADD_NEW_DATA(x,y,z) /* As ADD_NEW, but also stores z in the data field */ \
269: if (new_count++ < wscount) \
270: { \
271: next_new_state->offset = (x); \
272: next_new_state->count = (y); \
273: next_new_state->ims = ims; \
274: next_new_state->data = (z); \
275: next_new_state++; \
276: DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
277: } \
278: else return PCRE_ERROR_DFA_WSSIZE /* List full: the enclosing function returns */
279:
280: /* And now, here is the code */
281:
282: static int
283: internal_dfa_exec(
284: dfa_match_data *md,
285: const uschar *this_start_code,
286: const uschar *current_subject,
287: int start_offset,
288: int *offsets,
289: int offsetcount,
290: int *workspace,
291: int wscount,
292: int ims,
293: int rlevel,
294: int recursing)
295: {
296: stateblock *active_states, *new_states, *temp_states;
297: stateblock *next_active_state, *next_new_state;
298:
299: const uschar *ctypes, *lcc, *fcc;
300: const uschar *ptr;
301: const uschar *end_code, *first_op;
302:
303: int active_count, new_count, match_count;
304:
305: /* Some fields in the md block are frequently referenced, so we load them into
306: independent variables in the hope that this will perform better. */
307:
308: const uschar *start_subject = md->start_subject;
309: const uschar *end_subject = md->end_subject;
310: const uschar *start_code = md->start_code;
311:
312: #ifdef SUPPORT_UTF8
313: BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
314: #else
315: BOOL utf8 = FALSE;
316: #endif
317:
318: rlevel++;
319: offsetcount &= (-2);
320:
321: wscount -= 2;
322: wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
323: (2 * INTS_PER_STATEBLOCK);
324:
325: DPRINTF(("\n%.*s---------------------\n"
326: "%.*sCall to internal_dfa_exec f=%d r=%d\n",
327: rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
328:
329: ctypes = md->tables + ctypes_offset;
330: lcc = md->tables + lcc_offset;
331: fcc = md->tables + fcc_offset;
332:
333: match_count = PCRE_ERROR_NOMATCH; /* A negative number */
334:
335: active_states = (stateblock *)(workspace + 2);
336: next_new_state = new_states = active_states + wscount;
337: new_count = 0;
338:
339: first_op = this_start_code + 1 + LINK_SIZE +
340: ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
341:
342: /* The first thing in any (sub) pattern is a bracket of some sort. Push all
343: the alternative states onto the list, and find out where the end is. This
344: makes it possible to use this function recursively, when we want to stop at a
345: matching internal ket rather than at the end.
346:
347: If the first opcode in the first alternative is OP_REVERSE, we are dealing with
348: a backward assertion. In that case, we have to find out the maximum amount to
349: move back, and set up each alternative appropriately. */
350:
351: if (*first_op == OP_REVERSE)
352: {
353: int max_back = 0;
354: int gone_back;
355:
356: end_code = this_start_code;
357: do
358: {
359: int back = GET(end_code, 2+LINK_SIZE);
360: if (back > max_back) max_back = back;
361: end_code += GET(end_code, 1);
362: }
363: while (*end_code == OP_ALT);
364:
365: /* If we can't go back the amount required for the longest lookbehind
366: pattern, go back as far as we can; some alternatives may still be viable. */
367:
368: #ifdef SUPPORT_UTF8
369: /* In character mode we have to step back character by character */
370:
371: if (utf8)
372: {
373: for (gone_back = 0; gone_back < max_back; gone_back++)
374: {
375: if (current_subject <= start_subject) break;
376: current_subject--;
377: while (current_subject > start_subject &&
378: (*current_subject & 0xc0) == 0x80)
379: current_subject--;
380: }
381: }
382: else
383: #endif
384:
385: /* In byte-mode we can do this quickly. */
386:
387: {
388: gone_back = (current_subject - max_back < start_subject)?
389: current_subject - start_subject : max_back;
390: current_subject -= gone_back;
391: }
392:
393: /* Now we can process the individual branches. */
394:
395: end_code = this_start_code;
396: do
397: {
398: int back = GET(end_code, 2+LINK_SIZE);
399: if (back <= gone_back)
400: {
401: int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
402: ADD_NEW_DATA(-bstate, 0, gone_back - back);
403: }
404: end_code += GET(end_code, 1);
405: }
406: while (*end_code == OP_ALT);
407: }
408:
409: /* This is the code for a "normal" subpattern (not a backward assertion). The
410: start of a whole pattern is always one of these. If we are at the top level,
411: we may be asked to restart matching from the same point that we reached for a
412: previous partial match. We still have to scan through the top-level branches to
413: find the end state. */
414:
415: else
416: {
417: end_code = this_start_code;
418:
419: /* Restarting */
420:
421: if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
422: {
423: do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
424: new_count = workspace[1];
425: if (!workspace[0])
426: memcpy(new_states, active_states, new_count * sizeof(stateblock));
427: }
428:
429: /* Not restarting */
430:
431: else
432: {
433: int length = 1 + LINK_SIZE +
434: ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
435: do
436: {
437: ADD_NEW(end_code - start_code + length, 0);
438: end_code += GET(end_code, 1);
439: length = 1 + LINK_SIZE;
440: }
441: while (*end_code == OP_ALT);
442: }
443: }
444:
445: workspace[0] = 0; /* Bit indicating which vector is current */
446:
447: DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
448:
449: /* Loop for scanning the subject */
450:
451: ptr = current_subject;
452: for (;;)
453: {
454: int i, j;
455: int clen, dlen;
456: unsigned int c, d;
457:
458: /* Make the new state list into the active state list and empty the
459: new state list. */
460:
461: temp_states = active_states;
462: active_states = new_states;
463: new_states = temp_states;
464: active_count = new_count;
465: new_count = 0;
466:
467: workspace[0] ^= 1; /* Remember for the restarting feature */
468: workspace[1] = active_count;
469:
470: #ifdef DEBUG
471: printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
472: pchars((uschar *)ptr, strlen((char *)ptr), stdout);
473: printf("\"\n");
474:
475: printf("%.*sActive states: ", rlevel*2-2, SP);
476: for (i = 0; i < active_count; i++)
477: printf("%d/%d ", active_states[i].offset, active_states[i].count);
478: printf("\n");
479: #endif
480:
481: /* Set the pointers for adding new states */
482:
483: next_active_state = active_states + active_count;
484: next_new_state = new_states;
485:
486: /* Load the current character from the subject outside the loop, as many
487: different states may want to look at it, and we assume that at least one
488: will. */
489:
490: if (ptr < end_subject)
491: {
492: clen = 1; /* Number of bytes in the character */
493: #ifdef SUPPORT_UTF8
494: if (utf8) { GETCHARLEN(c, ptr, clen); } else
495: #endif /* SUPPORT_UTF8 */
496: c = *ptr;
497: }
498: else
499: {
500: clen = 0; /* This indicates the end of the subject */
501: c = NOTACHAR; /* This value should never actually be used */
502: }
503:
504: /* Scan up the active states and act on each one. The result of an action
505: may be to add more states to the currently active list (e.g. on hitting a
506: parenthesis) or it may be to put states on the new list, for considering
507: when we move the character pointer on. */
508:
509: for (i = 0; i < active_count; i++)
510: {
511: stateblock *current_state = active_states + i;
512: const uschar *code;
513: int state_offset = current_state->offset;
514: int count, codevalue;
515:
516: #ifdef DEBUG
517: printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
518: if (clen == 0) printf("EOL\n");
519: else if (c > 32 && c < 127) printf("'%c'\n", c);
520: else printf("0x%02x\n", c);
521: #endif
522:
523: /* This variable is referred to implicitly in the ADD_xxx macros. */
524:
525: ims = current_state->ims;
526:
527: /* A negative offset is a special case meaning "hold off going to this
528: (negated) state until the number of characters in the data field have
529: been skipped". */
530:
531: if (state_offset < 0)
532: {
533: if (current_state->data > 0)
534: {
535: DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
536: ADD_NEW_DATA(state_offset, current_state->count,
537: current_state->data - 1);
538: continue;
539: }
540: else
541: {
542: current_state->offset = state_offset = -state_offset;
543: }
544: }
545:
546: /* Check for a duplicate state with the same count, and skip if found. */
547:
548: for (j = 0; j < i; j++)
549: {
550: if (active_states[j].offset == state_offset &&
551: active_states[j].count == current_state->count)
552: {
553: DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
554: goto NEXT_ACTIVE_STATE;
555: }
556: }
557:
558: /* The state offset is the offset to the opcode */
559:
560: code = start_code + state_offset;
561: codevalue = *code;
562:
563: /* If this opcode is followed by an inline character, load it. It is
564: tempting to test for the presence of a subject character here, but that
565: is wrong, because sometimes zero repetitions of the subject are
566: permitted.
567:
568: We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
569: argument that is not a data character - but is always one byte long. We
570: have to take special action to deal with \P, \p, \H, \h, \V, \v and \X in
571: this case. To keep the other cases fast, convert these ones to new opcodes.
572: */
573:
574: if (coptable[codevalue] > 0)
575: {
576: dlen = 1;
577: #ifdef SUPPORT_UTF8
578: if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
579: #endif /* SUPPORT_UTF8 */
580: d = code[coptable[codevalue]];
581: if (codevalue >= OP_TYPESTAR)
582: {
583: switch(d)
584: {
585: case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
586: case OP_NOTPROP:
587: case OP_PROP: codevalue += OP_PROP_EXTRA; break;
588: case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
589: case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
590: case OP_NOT_HSPACE:
591: case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
592: case OP_NOT_VSPACE:
593: case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
594: default: break;
595: }
596: }
597: }
598: else
599: {
600: dlen = 0; /* Not strictly necessary, but compilers moan */
601: d = NOTACHAR; /* if these variables are not set. */
602: }
603:
604:
605: /* Now process the individual opcodes */
606:
607: switch (codevalue)
608: {
609:
610: /* ========================================================================== */
611: /* Reached a closing bracket. If not at the end of the pattern, carry
612: on with the next opcode. Otherwise, unless we have an empty string and
613: PCRE_NOTEMPTY is set, save the match data, shifting up all previous
614: matches so we always have the longest first. */
615:
616: case OP_KET:
617: case OP_KETRMIN:
618: case OP_KETRMAX:
619: if (code != end_code)
620: {
621: ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
622: if (codevalue != OP_KET)
623: {
624: ADD_ACTIVE(state_offset - GET(code, 1), 0);
625: }
626: }
627: else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
628: {
629: if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
630: else if (match_count > 0 && ++match_count * 2 >= offsetcount)
631: match_count = 0;
632: count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
633: if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
634: if (offsetcount >= 2)
635: {
636: offsets[0] = current_subject - start_subject;
637: offsets[1] = ptr - start_subject;
638: DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
639: offsets[1] - offsets[0], current_subject));
640: }
641: if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
642: {
643: DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
644: "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
645: match_count, rlevel*2-2, SP));
646: return match_count;
647: }
648: }
649: break;
650:
651: /* ========================================================================== */
652: /* These opcodes add to the current list of states without looking
653: at the current character. */
654:
655: /*-----------------------------------------------------------------*/
656: case OP_ALT:
657: do { code += GET(code, 1); } while (*code == OP_ALT);
658: ADD_ACTIVE(code - start_code, 0);
659: break;
660:
661: /*-----------------------------------------------------------------*/
662: case OP_BRA:
663: case OP_SBRA:
664: do
665: {
666: ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
667: code += GET(code, 1);
668: }
669: while (*code == OP_ALT);
670: break;
671:
672: /*-----------------------------------------------------------------*/
673: case OP_CBRA:
674: case OP_SCBRA:
675: ADD_ACTIVE(code - start_code + 3 + LINK_SIZE, 0);
676: code += GET(code, 1);
677: while (*code == OP_ALT)
678: {
679: ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
680: code += GET(code, 1);
681: }
682: break;
683:
684: /*-----------------------------------------------------------------*/
685: case OP_BRAZERO:
686: case OP_BRAMINZERO:
687: ADD_ACTIVE(state_offset + 1, 0);
688: code += 1 + GET(code, 2);
689: while (*code == OP_ALT) code += GET(code, 1);
690: ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
691: break;
692:
693: /*-----------------------------------------------------------------*/
694: case OP_SKIPZERO:
695: code += 1 + GET(code, 2);
696: while (*code == OP_ALT) code += GET(code, 1);
697: ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
698: break;
699:
700: /*-----------------------------------------------------------------*/
701: case OP_CIRC:
702: if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
703: ((ims & PCRE_MULTILINE) != 0 &&
704: ptr != end_subject &&
705: WAS_NEWLINE(ptr)))
706: { ADD_ACTIVE(state_offset + 1, 0); }
707: break;
708:
709: /*-----------------------------------------------------------------*/
710: case OP_EOD:
711: if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
712: break;
713:
714: /*-----------------------------------------------------------------*/
715: case OP_OPT:
716: ims = code[1];
717: ADD_ACTIVE(state_offset + 2, 0);
718: break;
719:
720: /*-----------------------------------------------------------------*/
721: case OP_SOD:
722: if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
723: break;
724:
725: /*-----------------------------------------------------------------*/
726: case OP_SOM:
727: if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
728: break;
729:
730:
731: /* ========================================================================== */
732: /* These opcodes inspect the next subject character, and sometimes
733: the previous one as well, but do not have an argument. The variable
734: clen contains the length of the current character and is zero if we are
735: at the end of the subject. */
736:
737: /*-----------------------------------------------------------------*/
738: case OP_ANY:
739: if (clen > 0 && !IS_NEWLINE(ptr))
740: { ADD_NEW(state_offset + 1, 0); }
741: break;
742:
743: /*-----------------------------------------------------------------*/
744: case OP_ALLANY:
745: if (clen > 0)
746: { ADD_NEW(state_offset + 1, 0); }
747: break;
748:
749: /*-----------------------------------------------------------------*/
750: case OP_EODN:
751: if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
752: { ADD_ACTIVE(state_offset + 1, 0); }
753: break;
754:
755: /*-----------------------------------------------------------------*/
756: case OP_DOLL:
757: if ((md->moptions & PCRE_NOTEOL) == 0)
758: {
759: if (clen == 0 ||
760: (IS_NEWLINE(ptr) &&
761: ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
762: ))
763: { ADD_ACTIVE(state_offset + 1, 0); }
764: }
765: else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
766: { ADD_ACTIVE(state_offset + 1, 0); }
767: break;
768:
769: /*-----------------------------------------------------------------*/
770:
771: case OP_DIGIT:
772: case OP_WHITESPACE:
773: case OP_WORDCHAR:
774: if (clen > 0 && c < 256 &&
775: ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
776: { ADD_NEW(state_offset + 1, 0); }
777: break;
778:
779: /*-----------------------------------------------------------------*/
780: case OP_NOT_DIGIT:
781: case OP_NOT_WHITESPACE:
782: case OP_NOT_WORDCHAR:
783: if (clen > 0 && (c >= 256 ||
784: ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
785: { ADD_NEW(state_offset + 1, 0); }
786: break;
787:
788: /*-----------------------------------------------------------------*/
789: case OP_WORD_BOUNDARY:
790: case OP_NOT_WORD_BOUNDARY:
791: {
792: int left_word, right_word;
793:
794: if (ptr > start_subject)
795: {
796: const uschar *temp = ptr - 1;
797: #ifdef SUPPORT_UTF8
798: if (utf8) BACKCHAR(temp);
799: #endif
800: GETCHARTEST(d, temp);
801: left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
802: }
803: else left_word = 0;
804:
805: if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
806: else right_word = 0;
807:
808: if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
809: { ADD_ACTIVE(state_offset + 1, 0); }
810: }
811: break;
812:
813:
814: /*-----------------------------------------------------------------*/
815: /* Check the next character by Unicode property. We will get here only
816: if the support is in the binary; otherwise a compile-time error occurs.
817: */
818:
819: #ifdef SUPPORT_UCP
820: case OP_PROP:
821: case OP_NOTPROP:
822: if (clen > 0)
823: {
824: BOOL OK;
1.2 ! misha 825: const ucd_record * prop = GET_UCD(c);
1.1 misha 826: switch(code[1])
827: {
828: case PT_ANY:
829: OK = TRUE;
830: break;
831:
832: case PT_LAMP:
1.2 ! misha 833: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1.1 misha 834: break;
835:
836: case PT_GC:
1.2 ! misha 837: OK = _pcre_ucp_gentype[prop->chartype] == code[2];
1.1 misha 838: break;
839:
840: case PT_PC:
1.2 ! misha 841: OK = prop->chartype == code[2];
1.1 misha 842: break;
843:
844: case PT_SC:
1.2 ! misha 845: OK = prop->script == code[2];
1.1 misha 846: break;
847:
848: /* Should never occur, but keep compilers from grumbling. */
849:
850: default:
851: OK = codevalue != OP_PROP;
852: break;
853: }
854:
855: if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
856: }
857: break;
858: #endif
859:
860:
861:
862: /* ========================================================================== */
863: /* These opcodes likewise inspect the subject character, but have an
864: argument that is not a data character. It is one of these opcodes:
865: OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
866: OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
867:
868: case OP_TYPEPLUS:
869: case OP_TYPEMINPLUS:
870: case OP_TYPEPOSPLUS:
871: count = current_state->count; /* Already matched */
872: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
873: if (clen > 0)
874: {
875: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
876: (c < 256 &&
877: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
878: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
879: {
880: if (count > 0 && codevalue == OP_TYPEPOSPLUS)
881: {
882: active_count--; /* Remove non-match possibility */
883: next_active_state--;
884: }
885: count++;
886: ADD_NEW(state_offset, count);
887: }
888: }
889: break;
890:
891: /*-----------------------------------------------------------------*/
892: case OP_TYPEQUERY:
893: case OP_TYPEMINQUERY:
894: case OP_TYPEPOSQUERY:
895: ADD_ACTIVE(state_offset + 2, 0);
896: if (clen > 0)
897: {
898: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
899: (c < 256 &&
900: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
901: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
902: {
903: if (codevalue == OP_TYPEPOSQUERY)
904: {
905: active_count--; /* Remove non-match possibility */
906: next_active_state--;
907: }
908: ADD_NEW(state_offset + 2, 0);
909: }
910: }
911: break;
912:
913: /*-----------------------------------------------------------------*/
914: case OP_TYPESTAR:
915: case OP_TYPEMINSTAR:
916: case OP_TYPEPOSSTAR:
917: ADD_ACTIVE(state_offset + 2, 0);
918: if (clen > 0)
919: {
920: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
921: (c < 256 &&
922: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
923: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
924: {
925: if (codevalue == OP_TYPEPOSSTAR)
926: {
927: active_count--; /* Remove non-match possibility */
928: next_active_state--;
929: }
930: ADD_NEW(state_offset, 0);
931: }
932: }
933: break;
934:
935: /*-----------------------------------------------------------------*/
936: case OP_TYPEEXACT:
937: count = current_state->count; /* Number already matched */
938: if (clen > 0)
939: {
940: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
941: (c < 256 &&
942: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
943: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
944: {
945: if (++count >= GET2(code, 1))
946: { ADD_NEW(state_offset + 4, 0); }
947: else
948: { ADD_NEW(state_offset, count); }
949: }
950: }
951: break;
952:
953: /*-----------------------------------------------------------------*/
954: case OP_TYPEUPTO:
955: case OP_TYPEMINUPTO:
956: case OP_TYPEPOSUPTO:
957: ADD_ACTIVE(state_offset + 4, 0);
958: count = current_state->count; /* Number already matched */
959: if (clen > 0)
960: {
961: if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
962: (c < 256 &&
963: (d != OP_ANY || !IS_NEWLINE(ptr)) &&
964: ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
965: {
966: if (codevalue == OP_TYPEPOSUPTO)
967: {
968: active_count--; /* Remove non-match possibility */
969: next_active_state--;
970: }
971: if (++count >= GET2(code, 1))
972: { ADD_NEW(state_offset + 4, 0); }
973: else
974: { ADD_NEW(state_offset, count); }
975: }
976: }
977: break;
978:
979: /* ========================================================================== */
980: /* These are virtual opcodes that are used when something like
981: OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
982: argument. It keeps the code above fast for the other cases. The argument
983: is in the d variable. */
984:
985: #ifdef SUPPORT_UCP
986: case OP_PROP_EXTRA + OP_TYPEPLUS:
987: case OP_PROP_EXTRA + OP_TYPEMINPLUS:
988: case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
989: count = current_state->count; /* Already matched */
990: if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
991: if (clen > 0)
992: {
993: BOOL OK;
1.2 ! misha 994: const ucd_record * prop = GET_UCD(c);
1.1 misha 995: switch(code[2])
996: {
997: case PT_ANY:
998: OK = TRUE;
999: break;
1000:
1001: case PT_LAMP:
1.2 ! misha 1002: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1.1 misha 1003: break;
1004:
1005: case PT_GC:
1.2 ! misha 1006: OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1.1 misha 1007: break;
1008:
1009: case PT_PC:
1.2 ! misha 1010: OK = prop->chartype == code[3];
1.1 misha 1011: break;
1012:
1013: case PT_SC:
1.2 ! misha 1014: OK = prop->script == code[3];
1.1 misha 1015: break;
1016:
1017: /* Should never occur, but keep compilers from grumbling. */
1018:
1019: default:
1020: OK = codevalue != OP_PROP;
1021: break;
1022: }
1023:
1024: if (OK == (d == OP_PROP))
1025: {
1026: if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1027: {
1028: active_count--; /* Remove non-match possibility */
1029: next_active_state--;
1030: }
1031: count++;
1032: ADD_NEW(state_offset, count);
1033: }
1034: }
1035: break;
1036:
1037: /*-----------------------------------------------------------------*/
1038: case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1039: case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1040: case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1041: count = current_state->count; /* Already matched */
1042: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1.2 ! misha 1043: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1.1 misha 1044: {
1045: const uschar *nptr = ptr + clen;
1046: int ncount = 0;
1047: if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1048: {
1049: active_count--; /* Remove non-match possibility */
1050: next_active_state--;
1051: }
1052: while (nptr < end_subject)
1053: {
1054: int nd;
1055: int ndlen = 1;
1056: GETCHARLEN(nd, nptr, ndlen);
1.2 ! misha 1057: if (UCD_CATEGORY(nd) != ucp_M) break;
1.1 misha 1058: ncount++;
1059: nptr += ndlen;
1060: }
1061: count++;
1062: ADD_NEW_DATA(-state_offset, count, ncount);
1063: }
1064: break;
1065: #endif
1066:
1067: /*-----------------------------------------------------------------*/
1068: case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1069: case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1070: case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1071: count = current_state->count; /* Already matched */
1072: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1073: if (clen > 0)
1074: {
1075: int ncount = 0;
1076: switch (c)
1077: {
1078: case 0x000b:
1079: case 0x000c:
1080: case 0x0085:
1081: case 0x2028:
1082: case 0x2029:
1083: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1084: goto ANYNL01;
1085:
1086: case 0x000d:
1087: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1088: /* Fall through */
1089:
1090: ANYNL01:
1091: case 0x000a:
1092: if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1093: {
1094: active_count--; /* Remove non-match possibility */
1095: next_active_state--;
1096: }
1097: count++;
1098: ADD_NEW_DATA(-state_offset, count, ncount);
1099: break;
1100:
1101: default:
1102: break;
1103: }
1104: }
1105: break;
1106:
1107: /*-----------------------------------------------------------------*/
1108: case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1109: case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1110: case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1111: count = current_state->count; /* Already matched */
1112: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1113: if (clen > 0)
1114: {
1115: BOOL OK;
1116: switch (c)
1117: {
1118: case 0x000a:
1119: case 0x000b:
1120: case 0x000c:
1121: case 0x000d:
1122: case 0x0085:
1123: case 0x2028:
1124: case 0x2029:
1125: OK = TRUE;
1126: break;
1127:
1128: default:
1129: OK = FALSE;
1130: break;
1131: }
1132:
1133: if (OK == (d == OP_VSPACE))
1134: {
1135: if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1136: {
1137: active_count--; /* Remove non-match possibility */
1138: next_active_state--;
1139: }
1140: count++;
1141: ADD_NEW_DATA(-state_offset, count, 0);
1142: }
1143: }
1144: break;
1145:
1146: /*-----------------------------------------------------------------*/
1147: case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1148: case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1149: case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1150: count = current_state->count; /* Already matched */
1151: if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1152: if (clen > 0)
1153: {
1154: BOOL OK;
1155: switch (c)
1156: {
1157: case 0x09: /* HT */
1158: case 0x20: /* SPACE */
1159: case 0xa0: /* NBSP */
1160: case 0x1680: /* OGHAM SPACE MARK */
1161: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1162: case 0x2000: /* EN QUAD */
1163: case 0x2001: /* EM QUAD */
1164: case 0x2002: /* EN SPACE */
1165: case 0x2003: /* EM SPACE */
1166: case 0x2004: /* THREE-PER-EM SPACE */
1167: case 0x2005: /* FOUR-PER-EM SPACE */
1168: case 0x2006: /* SIX-PER-EM SPACE */
1169: case 0x2007: /* FIGURE SPACE */
1170: case 0x2008: /* PUNCTUATION SPACE */
1171: case 0x2009: /* THIN SPACE */
1172: case 0x200A: /* HAIR SPACE */
1173: case 0x202f: /* NARROW NO-BREAK SPACE */
1174: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1175: case 0x3000: /* IDEOGRAPHIC SPACE */
1176: OK = TRUE;
1177: break;
1178:
1179: default:
1180: OK = FALSE;
1181: break;
1182: }
1183:
1184: if (OK == (d == OP_HSPACE))
1185: {
1186: if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1187: {
1188: active_count--; /* Remove non-match possibility */
1189: next_active_state--;
1190: }
1191: count++;
1192: ADD_NEW_DATA(-state_offset, count, 0);
1193: }
1194: }
1195: break;
1196:
1197: /*-----------------------------------------------------------------*/
1198: #ifdef SUPPORT_UCP
1199: case OP_PROP_EXTRA + OP_TYPEQUERY:
1200: case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1201: case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1202: count = 4;
1203: goto QS1;
1204:
1205: case OP_PROP_EXTRA + OP_TYPESTAR:
1206: case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1207: case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1208: count = 0;
1209:
1210: QS1:
1211:
1212: ADD_ACTIVE(state_offset + 4, 0);
1213: if (clen > 0)
1214: {
1215: BOOL OK;
1.2 ! misha 1216: const ucd_record * prop = GET_UCD(c);
1.1 misha 1217: switch(code[2])
1218: {
1219: case PT_ANY:
1220: OK = TRUE;
1221: break;
1222:
1223: case PT_LAMP:
1.2 ! misha 1224: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1.1 misha 1225: break;
1226:
1227: case PT_GC:
1.2 ! misha 1228: OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1.1 misha 1229: break;
1230:
1231: case PT_PC:
1.2 ! misha 1232: OK = prop->chartype == code[3];
1.1 misha 1233: break;
1234:
1235: case PT_SC:
1.2 ! misha 1236: OK = prop->script == code[3];
1.1 misha 1237: break;
1238:
1239: /* Should never occur, but keep compilers from grumbling. */
1240:
1241: default:
1242: OK = codevalue != OP_PROP;
1243: break;
1244: }
1245:
1246: if (OK == (d == OP_PROP))
1247: {
1248: if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1249: codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1250: {
1251: active_count--; /* Remove non-match possibility */
1252: next_active_state--;
1253: }
1254: ADD_NEW(state_offset + count, 0);
1255: }
1256: }
1257: break;
1258:
1259: /*-----------------------------------------------------------------*/
1260: case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1261: case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1262: case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1263: count = 2;
1264: goto QS2;
1265:
1266: case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1267: case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1268: case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1269: count = 0;
1270:
1271: QS2:
1272:
1273: ADD_ACTIVE(state_offset + 2, 0);
1.2 ! misha 1274: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1.1 misha 1275: {
1276: const uschar *nptr = ptr + clen;
1277: int ncount = 0;
1278: if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1279: codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1280: {
1281: active_count--; /* Remove non-match possibility */
1282: next_active_state--;
1283: }
1284: while (nptr < end_subject)
1285: {
1286: int nd;
1287: int ndlen = 1;
1288: GETCHARLEN(nd, nptr, ndlen);
1.2 ! misha 1289: if (UCD_CATEGORY(nd) != ucp_M) break;
1.1 misha 1290: ncount++;
1291: nptr += ndlen;
1292: }
1293: ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1294: }
1295: break;
1296: #endif
1297:
1298: /*-----------------------------------------------------------------*/
1299: case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1300: case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1301: case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1302: count = 2;
1303: goto QS3;
1304:
1305: case OP_ANYNL_EXTRA + OP_TYPESTAR:
1306: case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1307: case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1308: count = 0;
1309:
1310: QS3:
1311: ADD_ACTIVE(state_offset + 2, 0);
1312: if (clen > 0)
1313: {
1314: int ncount = 0;
1315: switch (c)
1316: {
1317: case 0x000b:
1318: case 0x000c:
1319: case 0x0085:
1320: case 0x2028:
1321: case 0x2029:
1322: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1323: goto ANYNL02;
1324:
1325: case 0x000d:
1326: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1327: /* Fall through */
1328:
1329: ANYNL02:
1330: case 0x000a:
1331: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1332: codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1333: {
1334: active_count--; /* Remove non-match possibility */
1335: next_active_state--;
1336: }
1337: ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1338: break;
1339:
1340: default:
1341: break;
1342: }
1343: }
1344: break;
1345:
1346: /*-----------------------------------------------------------------*/
1347: case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1348: case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1349: case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1350: count = 2;
1351: goto QS4;
1352:
1353: case OP_VSPACE_EXTRA + OP_TYPESTAR:
1354: case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1355: case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1356: count = 0;
1357:
1358: QS4:
1359: ADD_ACTIVE(state_offset + 2, 0);
1360: if (clen > 0)
1361: {
1362: BOOL OK;
1363: switch (c)
1364: {
1365: case 0x000a:
1366: case 0x000b:
1367: case 0x000c:
1368: case 0x000d:
1369: case 0x0085:
1370: case 0x2028:
1371: case 0x2029:
1372: OK = TRUE;
1373: break;
1374:
1375: default:
1376: OK = FALSE;
1377: break;
1378: }
1379: if (OK == (d == OP_VSPACE))
1380: {
1381: if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1382: codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1383: {
1384: active_count--; /* Remove non-match possibility */
1385: next_active_state--;
1386: }
1387: ADD_NEW_DATA(-(state_offset + count), 0, 0);
1388: }
1389: }
1390: break;
1391:
1392: /*-----------------------------------------------------------------*/
1393: case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1394: case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1395: case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1396: count = 2;
1397: goto QS5;
1398:
1399: case OP_HSPACE_EXTRA + OP_TYPESTAR:
1400: case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1401: case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1402: count = 0;
1403:
1404: QS5:
1405: ADD_ACTIVE(state_offset + 2, 0);
1406: if (clen > 0)
1407: {
1408: BOOL OK;
1409: switch (c)
1410: {
1411: case 0x09: /* HT */
1412: case 0x20: /* SPACE */
1413: case 0xa0: /* NBSP */
1414: case 0x1680: /* OGHAM SPACE MARK */
1415: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1416: case 0x2000: /* EN QUAD */
1417: case 0x2001: /* EM QUAD */
1418: case 0x2002: /* EN SPACE */
1419: case 0x2003: /* EM SPACE */
1420: case 0x2004: /* THREE-PER-EM SPACE */
1421: case 0x2005: /* FOUR-PER-EM SPACE */
1422: case 0x2006: /* SIX-PER-EM SPACE */
1423: case 0x2007: /* FIGURE SPACE */
1424: case 0x2008: /* PUNCTUATION SPACE */
1425: case 0x2009: /* THIN SPACE */
1426: case 0x200A: /* HAIR SPACE */
1427: case 0x202f: /* NARROW NO-BREAK SPACE */
1428: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1429: case 0x3000: /* IDEOGRAPHIC SPACE */
1430: OK = TRUE;
1431: break;
1432:
1433: default:
1434: OK = FALSE;
1435: break;
1436: }
1437:
1438: if (OK == (d == OP_HSPACE))
1439: {
1440: if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1441: codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1442: {
1443: active_count--; /* Remove non-match possibility */
1444: next_active_state--;
1445: }
1446: ADD_NEW_DATA(-(state_offset + count), 0, 0);
1447: }
1448: }
1449: break;
1450:
1451: /*-----------------------------------------------------------------*/
1452: #ifdef SUPPORT_UCP
1453: case OP_PROP_EXTRA + OP_TYPEEXACT:
1454: case OP_PROP_EXTRA + OP_TYPEUPTO:
1455: case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1456: case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1457: if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1458: { ADD_ACTIVE(state_offset + 6, 0); }
1459: count = current_state->count; /* Number already matched */
1460: if (clen > 0)
1461: {
1462: BOOL OK;
1.2 ! misha 1463: const ucd_record * prop = GET_UCD(c);
1.1 misha 1464: switch(code[4])
1465: {
1466: case PT_ANY:
1467: OK = TRUE;
1468: break;
1469:
1470: case PT_LAMP:
1.2 ! misha 1471: OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1.1 misha 1472: break;
1473:
1474: case PT_GC:
1.2 ! misha 1475: OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1.1 misha 1476: break;
1477:
1478: case PT_PC:
1.2 ! misha 1479: OK = prop->chartype == code[5];
1.1 misha 1480: break;
1481:
1482: case PT_SC:
1.2 ! misha 1483: OK = prop->script == code[5];
1.1 misha 1484: break;
1485:
1486: /* Should never occur, but keep compilers from grumbling. */
1487:
1488: default:
1489: OK = codevalue != OP_PROP;
1490: break;
1491: }
1492:
1493: if (OK == (d == OP_PROP))
1494: {
1495: if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1496: {
1497: active_count--; /* Remove non-match possibility */
1498: next_active_state--;
1499: }
1500: if (++count >= GET2(code, 1))
1501: { ADD_NEW(state_offset + 6, 0); }
1502: else
1503: { ADD_NEW(state_offset, count); }
1504: }
1505: }
1506: break;
1507:
1508: /*-----------------------------------------------------------------*/
1509: case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1510: case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1511: case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1512: case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1513: if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1514: { ADD_ACTIVE(state_offset + 4, 0); }
1515: count = current_state->count; /* Number already matched */
1.2 ! misha 1516: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1.1 misha 1517: {
1518: const uschar *nptr = ptr + clen;
1519: int ncount = 0;
1520: if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1521: {
1522: active_count--; /* Remove non-match possibility */
1523: next_active_state--;
1524: }
1525: while (nptr < end_subject)
1526: {
1527: int nd;
1528: int ndlen = 1;
1529: GETCHARLEN(nd, nptr, ndlen);
1.2 ! misha 1530: if (UCD_CATEGORY(nd) != ucp_M) break;
1.1 misha 1531: ncount++;
1532: nptr += ndlen;
1533: }
1534: if (++count >= GET2(code, 1))
1535: { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1536: else
1537: { ADD_NEW_DATA(-state_offset, count, ncount); }
1538: }
1539: break;
1540: #endif
1541:
1542: /*-----------------------------------------------------------------*/
1543: case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1544: case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1545: case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1546: case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1547: if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1548: { ADD_ACTIVE(state_offset + 4, 0); }
1549: count = current_state->count; /* Number already matched */
1550: if (clen > 0)
1551: {
1552: int ncount = 0;
1553: switch (c)
1554: {
1555: case 0x000b:
1556: case 0x000c:
1557: case 0x0085:
1558: case 0x2028:
1559: case 0x2029:
1560: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1561: goto ANYNL03;
1562:
1563: case 0x000d:
1564: if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1565: /* Fall through */
1566:
1567: ANYNL03:
1568: case 0x000a:
1569: if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1570: {
1571: active_count--; /* Remove non-match possibility */
1572: next_active_state--;
1573: }
1574: if (++count >= GET2(code, 1))
1575: { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1576: else
1577: { ADD_NEW_DATA(-state_offset, count, ncount); }
1578: break;
1579:
1580: default:
1581: break;
1582: }
1583: }
1584: break;
1585:
1586: /*-----------------------------------------------------------------*/
1587: case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1588: case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1589: case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1590: case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1591: if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1592: { ADD_ACTIVE(state_offset + 4, 0); }
1593: count = current_state->count; /* Number already matched */
1594: if (clen > 0)
1595: {
1596: BOOL OK;
1597: switch (c)
1598: {
1599: case 0x000a:
1600: case 0x000b:
1601: case 0x000c:
1602: case 0x000d:
1603: case 0x0085:
1604: case 0x2028:
1605: case 0x2029:
1606: OK = TRUE;
1607: break;
1608:
1609: default:
1610: OK = FALSE;
1611: }
1612:
1613: if (OK == (d == OP_VSPACE))
1614: {
1615: if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1616: {
1617: active_count--; /* Remove non-match possibility */
1618: next_active_state--;
1619: }
1620: if (++count >= GET2(code, 1))
1621: { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1622: else
1623: { ADD_NEW_DATA(-state_offset, count, 0); }
1624: }
1625: }
1626: break;
1627:
1628: /*-----------------------------------------------------------------*/
1629: case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1630: case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1631: case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1632: case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1633: if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1634: { ADD_ACTIVE(state_offset + 4, 0); }
1635: count = current_state->count; /* Number already matched */
1636: if (clen > 0)
1637: {
1638: BOOL OK;
1639: switch (c)
1640: {
1641: case 0x09: /* HT */
1642: case 0x20: /* SPACE */
1643: case 0xa0: /* NBSP */
1644: case 0x1680: /* OGHAM SPACE MARK */
1645: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1646: case 0x2000: /* EN QUAD */
1647: case 0x2001: /* EM QUAD */
1648: case 0x2002: /* EN SPACE */
1649: case 0x2003: /* EM SPACE */
1650: case 0x2004: /* THREE-PER-EM SPACE */
1651: case 0x2005: /* FOUR-PER-EM SPACE */
1652: case 0x2006: /* SIX-PER-EM SPACE */
1653: case 0x2007: /* FIGURE SPACE */
1654: case 0x2008: /* PUNCTUATION SPACE */
1655: case 0x2009: /* THIN SPACE */
1656: case 0x200A: /* HAIR SPACE */
1657: case 0x202f: /* NARROW NO-BREAK SPACE */
1658: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1659: case 0x3000: /* IDEOGRAPHIC SPACE */
1660: OK = TRUE;
1661: break;
1662:
1663: default:
1664: OK = FALSE;
1665: break;
1666: }
1667:
1668: if (OK == (d == OP_HSPACE))
1669: {
1670: if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1671: {
1672: active_count--; /* Remove non-match possibility */
1673: next_active_state--;
1674: }
1675: if (++count >= GET2(code, 1))
1676: { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1677: else
1678: { ADD_NEW_DATA(-state_offset, count, 0); }
1679: }
1680: }
1681: break;
1682:
1683: /* ========================================================================== */
1684: /* These opcodes are followed by a character that is usually compared
1685: to the current subject character; it is loaded into d. We still get
1686: here even if there is no subject character, because in some cases zero
1687: repetitions are permitted. */
1688:
1689: /*-----------------------------------------------------------------*/
1690: case OP_CHAR:
1691: if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1692: break;
1693:
1694: /*-----------------------------------------------------------------*/
1695: case OP_CHARNC:
1696: if (clen == 0) break;
1697:
1698: #ifdef SUPPORT_UTF8
1699: if (utf8)
1700: {
1701: if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1702: {
1703: unsigned int othercase;
1704: if (c < 128) othercase = fcc[c]; else
1705:
1706: /* If we have Unicode property support, we can use it to test the
1707: other case of the character. */
1708:
1709: #ifdef SUPPORT_UCP
1.2 ! misha 1710: othercase = UCD_OTHERCASE(c);
1.1 misha 1711: #else
1712: othercase = NOTACHAR;
1713: #endif
1714:
1715: if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1716: }
1717: }
1718: else
1719: #endif /* SUPPORT_UTF8 */
1720:
1721: /* Non-UTF-8 mode */
1722: {
1723: if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1724: }
1725: break;
1726:
1727:
1728: #ifdef SUPPORT_UCP
1729: /*-----------------------------------------------------------------*/
1730: /* This is a tricky one because it can match more than one character.
1731: Find out how many characters to skip, and then set up a negative state
1732: to wait for them to pass before continuing. */
1733:
1734: case OP_EXTUNI:
1.2 ! misha 1735: if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1.1 misha 1736: {
1737: const uschar *nptr = ptr + clen;
1738: int ncount = 0;
1739: while (nptr < end_subject)
1740: {
1741: int nclen = 1;
1742: GETCHARLEN(c, nptr, nclen);
1.2 ! misha 1743: if (UCD_CATEGORY(c) != ucp_M) break;
1.1 misha 1744: ncount++;
1745: nptr += nclen;
1746: }
1747: ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1748: }
1749: break;
1750: #endif
1751:
1752: /*-----------------------------------------------------------------*/
1753: /* This is tricky like EXTUNI because it too can match more than one
1754: character (when CR is followed by LF). In this case, set up a negative
1755: state to wait for one character to pass before continuing. */
1756:
1757: case OP_ANYNL:
1758: if (clen > 0) switch(c)
1759: {
1760: case 0x000b:
1761: case 0x000c:
1762: case 0x0085:
1763: case 0x2028:
1764: case 0x2029:
1765: if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1766:
1767: case 0x000a:
1768: ADD_NEW(state_offset + 1, 0);
1769: break;
1770:
1771: case 0x000d:
1772: if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1773: {
1774: ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1775: }
1776: else
1777: {
1778: ADD_NEW(state_offset + 1, 0);
1779: }
1780: break;
1781: }
1782: break;
1783:
1784: /*-----------------------------------------------------------------*/
1785: case OP_NOT_VSPACE:
1786: if (clen > 0) switch(c)
1787: {
1788: case 0x000a:
1789: case 0x000b:
1790: case 0x000c:
1791: case 0x000d:
1792: case 0x0085:
1793: case 0x2028:
1794: case 0x2029:
1795: break;
1796:
1797: default:
1798: ADD_NEW(state_offset + 1, 0);
1799: break;
1800: }
1801: break;
1802:
1803: /*-----------------------------------------------------------------*/
1804: case OP_VSPACE:
1805: if (clen > 0) switch(c)
1806: {
1807: case 0x000a:
1808: case 0x000b:
1809: case 0x000c:
1810: case 0x000d:
1811: case 0x0085:
1812: case 0x2028:
1813: case 0x2029:
1814: ADD_NEW(state_offset + 1, 0);
1815: break;
1816:
1817: default: break;
1818: }
1819: break;
1820:
1821: /*-----------------------------------------------------------------*/
1822: case OP_NOT_HSPACE:
1823: if (clen > 0) switch(c)
1824: {
1825: case 0x09: /* HT */
1826: case 0x20: /* SPACE */
1827: case 0xa0: /* NBSP */
1828: case 0x1680: /* OGHAM SPACE MARK */
1829: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1830: case 0x2000: /* EN QUAD */
1831: case 0x2001: /* EM QUAD */
1832: case 0x2002: /* EN SPACE */
1833: case 0x2003: /* EM SPACE */
1834: case 0x2004: /* THREE-PER-EM SPACE */
1835: case 0x2005: /* FOUR-PER-EM SPACE */
1836: case 0x2006: /* SIX-PER-EM SPACE */
1837: case 0x2007: /* FIGURE SPACE */
1838: case 0x2008: /* PUNCTUATION SPACE */
1839: case 0x2009: /* THIN SPACE */
1840: case 0x200A: /* HAIR SPACE */
1841: case 0x202f: /* NARROW NO-BREAK SPACE */
1842: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1843: case 0x3000: /* IDEOGRAPHIC SPACE */
1844: break;
1845:
1846: default:
1847: ADD_NEW(state_offset + 1, 0);
1848: break;
1849: }
1850: break;
1851:
1852: /*-----------------------------------------------------------------*/
1853: case OP_HSPACE:
1854: if (clen > 0) switch(c)
1855: {
1856: case 0x09: /* HT */
1857: case 0x20: /* SPACE */
1858: case 0xa0: /* NBSP */
1859: case 0x1680: /* OGHAM SPACE MARK */
1860: case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1861: case 0x2000: /* EN QUAD */
1862: case 0x2001: /* EM QUAD */
1863: case 0x2002: /* EN SPACE */
1864: case 0x2003: /* EM SPACE */
1865: case 0x2004: /* THREE-PER-EM SPACE */
1866: case 0x2005: /* FOUR-PER-EM SPACE */
1867: case 0x2006: /* SIX-PER-EM SPACE */
1868: case 0x2007: /* FIGURE SPACE */
1869: case 0x2008: /* PUNCTUATION SPACE */
1870: case 0x2009: /* THIN SPACE */
1871: case 0x200A: /* HAIR SPACE */
1872: case 0x202f: /* NARROW NO-BREAK SPACE */
1873: case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1874: case 0x3000: /* IDEOGRAPHIC SPACE */
1875: ADD_NEW(state_offset + 1, 0);
1876: break;
1877: }
1878: break;
1879:
1880: /*-----------------------------------------------------------------*/
1881: /* Match a negated single character. This is only used for one-byte
1882: characters, that is, we know that d < 256. The character we are
1883: checking (c) can be multibyte. */
1884:
1885: case OP_NOT:
1886: if (clen > 0)
1887: {
1888: unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1889: if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1890: }
1891: break;
1892:
1893: /*-----------------------------------------------------------------*/
1894: case OP_PLUS:
1895: case OP_MINPLUS:
1896: case OP_POSPLUS:
1897: case OP_NOTPLUS:
1898: case OP_NOTMINPLUS:
1899: case OP_NOTPOSPLUS:
1900: count = current_state->count; /* Already matched */
1901: if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1902: if (clen > 0)
1903: {
1904: unsigned int otherd = NOTACHAR;
1905: if ((ims & PCRE_CASELESS) != 0)
1906: {
1907: #ifdef SUPPORT_UTF8
1908: if (utf8 && d >= 128)
1909: {
1910: #ifdef SUPPORT_UCP
1.2 ! misha 1911: otherd = UCD_OTHERCASE(d);
1.1 misha 1912: #endif /* SUPPORT_UCP */
1913: }
1914: else
1915: #endif /* SUPPORT_UTF8 */
1916: otherd = fcc[d];
1917: }
1918: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1919: {
1920: if (count > 0 &&
1921: (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1922: {
1923: active_count--; /* Remove non-match possibility */
1924: next_active_state--;
1925: }
1926: count++;
1927: ADD_NEW(state_offset, count);
1928: }
1929: }
1930: break;
1931:
1932: /*-----------------------------------------------------------------*/
1933: case OP_QUERY:
1934: case OP_MINQUERY:
1935: case OP_POSQUERY:
1936: case OP_NOTQUERY:
1937: case OP_NOTMINQUERY:
1938: case OP_NOTPOSQUERY:
1939: ADD_ACTIVE(state_offset + dlen + 1, 0);
1940: if (clen > 0)
1941: {
1942: unsigned int otherd = NOTACHAR;
1943: if ((ims & PCRE_CASELESS) != 0)
1944: {
1945: #ifdef SUPPORT_UTF8
1946: if (utf8 && d >= 128)
1947: {
1948: #ifdef SUPPORT_UCP
1.2 ! misha 1949: otherd = UCD_OTHERCASE(d);
1.1 misha 1950: #endif /* SUPPORT_UCP */
1951: }
1952: else
1953: #endif /* SUPPORT_UTF8 */
1954: otherd = fcc[d];
1955: }
1956: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1957: {
1958: if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1959: {
1960: active_count--; /* Remove non-match possibility */
1961: next_active_state--;
1962: }
1963: ADD_NEW(state_offset + dlen + 1, 0);
1964: }
1965: }
1966: break;
1967:
1968: /*-----------------------------------------------------------------*/
1969: case OP_STAR:
1970: case OP_MINSTAR:
1971: case OP_POSSTAR:
1972: case OP_NOTSTAR:
1973: case OP_NOTMINSTAR:
1974: case OP_NOTPOSSTAR:
1975: ADD_ACTIVE(state_offset + dlen + 1, 0);
1976: if (clen > 0)
1977: {
1978: unsigned int otherd = NOTACHAR;
1979: if ((ims & PCRE_CASELESS) != 0)
1980: {
1981: #ifdef SUPPORT_UTF8
1982: if (utf8 && d >= 128)
1983: {
1984: #ifdef SUPPORT_UCP
1.2 ! misha 1985: otherd = UCD_OTHERCASE(d);
1.1 misha 1986: #endif /* SUPPORT_UCP */
1987: }
1988: else
1989: #endif /* SUPPORT_UTF8 */
1990: otherd = fcc[d];
1991: }
1992: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1993: {
1994: if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1995: {
1996: active_count--; /* Remove non-match possibility */
1997: next_active_state--;
1998: }
1999: ADD_NEW(state_offset, 0);
2000: }
2001: }
2002: break;
2003:
2004: /*-----------------------------------------------------------------*/
2005: case OP_EXACT:
2006: case OP_NOTEXACT:
2007: count = current_state->count; /* Number already matched */
2008: if (clen > 0)
2009: {
2010: unsigned int otherd = NOTACHAR;
2011: if ((ims & PCRE_CASELESS) != 0)
2012: {
2013: #ifdef SUPPORT_UTF8
2014: if (utf8 && d >= 128)
2015: {
2016: #ifdef SUPPORT_UCP
1.2 ! misha 2017: otherd = UCD_OTHERCASE(d);
1.1 misha 2018: #endif /* SUPPORT_UCP */
2019: }
2020: else
2021: #endif /* SUPPORT_UTF8 */
2022: otherd = fcc[d];
2023: }
2024: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2025: {
2026: if (++count >= GET2(code, 1))
2027: { ADD_NEW(state_offset + dlen + 3, 0); }
2028: else
2029: { ADD_NEW(state_offset, count); }
2030: }
2031: }
2032: break;
2033:
2034: /*-----------------------------------------------------------------*/
2035: case OP_UPTO:
2036: case OP_MINUPTO:
2037: case OP_POSUPTO:
2038: case OP_NOTUPTO:
2039: case OP_NOTMINUPTO:
2040: case OP_NOTPOSUPTO:
2041: ADD_ACTIVE(state_offset + dlen + 3, 0);
2042: count = current_state->count; /* Number already matched */
2043: if (clen > 0)
2044: {
2045: unsigned int otherd = NOTACHAR;
2046: if ((ims & PCRE_CASELESS) != 0)
2047: {
2048: #ifdef SUPPORT_UTF8
2049: if (utf8 && d >= 128)
2050: {
2051: #ifdef SUPPORT_UCP
1.2 ! misha 2052: otherd = UCD_OTHERCASE(d);
1.1 misha 2053: #endif /* SUPPORT_UCP */
2054: }
2055: else
2056: #endif /* SUPPORT_UTF8 */
2057: otherd = fcc[d];
2058: }
2059: if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2060: {
2061: if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2062: {
2063: active_count--; /* Remove non-match possibility */
2064: next_active_state--;
2065: }
2066: if (++count >= GET2(code, 1))
2067: { ADD_NEW(state_offset + dlen + 3, 0); }
2068: else
2069: { ADD_NEW(state_offset, count); }
2070: }
2071: }
2072: break;
2073:
2074:
2075: /* ========================================================================== */
2076: /* These are the class-handling opcodes */
2077:
2078: case OP_CLASS:
2079: case OP_NCLASS:
2080: case OP_XCLASS:
2081: {
2082: BOOL isinclass = FALSE;
2083: int next_state_offset;
2084: const uschar *ecode;
2085:
2086: /* For a simple class, there is always just a 32-byte table, and we
2087: can set isinclass from it. */
2088:
2089: if (codevalue != OP_XCLASS)
2090: {
2091: ecode = code + 33;
2092: if (clen > 0)
2093: {
2094: isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2095: ((code[1 + c/8] & (1 << (c&7))) != 0);
2096: }
2097: }
2098:
2099: /* An extended class may have a table or a list of single characters,
2100: ranges, or both, and it may be positive or negative. There's a
2101: function that sorts all this out. */
2102:
2103: else
2104: {
2105: ecode = code + GET(code, 1);
2106: if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2107: }
2108:
2109: /* At this point, isinclass is set for all kinds of class, and ecode
2110: points to the byte after the end of the class. If there is a
2111: quantifier, this is where it will be. */
2112:
2113: next_state_offset = ecode - start_code;
2114:
2115: switch (*ecode)
2116: {
2117: case OP_CRSTAR:
2118: case OP_CRMINSTAR:
2119: ADD_ACTIVE(next_state_offset + 1, 0);
2120: if (isinclass) { ADD_NEW(state_offset, 0); }
2121: break;
2122:
2123: case OP_CRPLUS:
2124: case OP_CRMINPLUS:
2125: count = current_state->count; /* Already matched */
2126: if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2127: if (isinclass) { count++; ADD_NEW(state_offset, count); }
2128: break;
2129:
2130: case OP_CRQUERY:
2131: case OP_CRMINQUERY:
2132: ADD_ACTIVE(next_state_offset + 1, 0);
2133: if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2134: break;
2135:
2136: case OP_CRRANGE:
2137: case OP_CRMINRANGE:
2138: count = current_state->count; /* Already matched */
2139: if (count >= GET2(ecode, 1))
2140: { ADD_ACTIVE(next_state_offset + 5, 0); }
2141: if (isinclass)
2142: {
2143: int max = GET2(ecode, 3);
2144: if (++count >= max && max != 0) /* Max 0 => no limit */
2145: { ADD_NEW(next_state_offset + 5, 0); }
2146: else
2147: { ADD_NEW(state_offset, count); }
2148: }
2149: break;
2150:
2151: default:
2152: if (isinclass) { ADD_NEW(next_state_offset, 0); }
2153: break;
2154: }
2155: }
2156: break;
2157:
2158: /* ========================================================================== */
2159: /* These are the opcodes for fancy brackets of various kinds. We have
2160: to use recursion in order to handle them. The "always failing" assertion
2161: (?!) is optimised when compiling to OP_FAIL, so we have to support that,
2162: though the other "backtracking verbs" are not supported. */
2163:
2164: case OP_FAIL:
2165: break;
2166:
2167: case OP_ASSERT:
2168: case OP_ASSERT_NOT:
2169: case OP_ASSERTBACK:
2170: case OP_ASSERTBACK_NOT:
2171: {
2172: int rc;
2173: int local_offsets[2];
2174: int local_workspace[1000];
2175: const uschar *endasscode = code + GET(code, 1);
2176:
2177: while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2178:
2179: rc = internal_dfa_exec(
2180: md, /* static match data */
2181: code, /* this subexpression's code */
2182: ptr, /* where we currently are */
2183: ptr - start_subject, /* start offset */
2184: local_offsets, /* offset vector */
2185: sizeof(local_offsets)/sizeof(int), /* size of same */
2186: local_workspace, /* workspace vector */
2187: sizeof(local_workspace)/sizeof(int), /* size of same */
2188: ims, /* the current ims flags */
2189: rlevel, /* function recursion level */
2190: recursing); /* pass on regex recursion */
2191:
2192: if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2193: { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2194: }
2195: break;
2196:
2197: /*-----------------------------------------------------------------*/
2198: case OP_COND:
2199: case OP_SCOND:
2200: {
2201: int local_offsets[1000];
2202: int local_workspace[1000];
2203: int condcode = code[LINK_SIZE+1];
2204:
2205: /* Back reference conditions are not supported */
2206:
2207: if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2208:
2209: /* The DEFINE condition is always false */
2210:
2211: if (condcode == OP_DEF)
2212: {
2213: ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
2214: }
2215:
2216: /* The only supported version of OP_RREF is for the value RREF_ANY,
2217: which means "test if in any recursion". We can't test for specifically
2218: recursed groups. */
2219:
2220: else if (condcode == OP_RREF)
2221: {
2222: int value = GET2(code, LINK_SIZE+2);
2223: if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2224: if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2225: else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2226: }
2227:
2228: /* Otherwise, the condition is an assertion */
2229:
2230: else
2231: {
2232: int rc;
2233: const uschar *asscode = code + LINK_SIZE + 1;
2234: const uschar *endasscode = asscode + GET(asscode, 1);
2235:
2236: while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2237:
2238: rc = internal_dfa_exec(
2239: md, /* fixed match data */
2240: asscode, /* this subexpression's code */
2241: ptr, /* where we currently are */
2242: ptr - start_subject, /* start offset */
2243: local_offsets, /* offset vector */
2244: sizeof(local_offsets)/sizeof(int), /* size of same */
2245: local_workspace, /* workspace vector */
2246: sizeof(local_workspace)/sizeof(int), /* size of same */
2247: ims, /* the current ims flags */
2248: rlevel, /* function recursion level */
2249: recursing); /* pass on regex recursion */
2250:
2251: if ((rc >= 0) ==
2252: (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2253: { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2254: else
2255: { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
2256: }
2257: }
2258: break;
2259:
2260: /*-----------------------------------------------------------------*/
2261: case OP_RECURSE:
2262: {
2263: int local_offsets[1000];
2264: int local_workspace[1000];
2265: int rc;
2266:
2267: DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2268: recursing + 1));
2269:
2270: rc = internal_dfa_exec(
2271: md, /* fixed match data */
2272: start_code + GET(code, 1), /* this subexpression's code */
2273: ptr, /* where we currently are */
2274: ptr - start_subject, /* start offset */
2275: local_offsets, /* offset vector */
2276: sizeof(local_offsets)/sizeof(int), /* size of same */
2277: local_workspace, /* workspace vector */
2278: sizeof(local_workspace)/sizeof(int), /* size of same */
2279: ims, /* the current ims flags */
2280: rlevel, /* function recursion level */
2281: recursing + 1); /* regex recurse level */
2282:
2283: DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2284: recursing + 1, rc));
2285:
2286: /* Ran out of internal offsets */
2287:
2288: if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2289:
2290: /* For each successful matched substring, set up the next state with a
2291: count of characters to skip before trying it. Note that the count is in
2292: characters, not bytes. */
2293:
2294: if (rc > 0)
2295: {
2296: for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2297: {
2298: const uschar *p = start_subject + local_offsets[rc];
2299: const uschar *pp = start_subject + local_offsets[rc+1];
2300: int charcount = local_offsets[rc+1] - local_offsets[rc];
2301: while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2302: if (charcount > 0)
2303: {
2304: ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2305: }
2306: else
2307: {
2308: ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2309: }
2310: }
2311: }
2312: else if (rc != PCRE_ERROR_NOMATCH) return rc;
2313: }
2314: break;
2315:
2316: /*-----------------------------------------------------------------*/
2317: case OP_ONCE:
2318: {
2319: int local_offsets[2];
2320: int local_workspace[1000];
2321:
2322: int rc = internal_dfa_exec(
2323: md, /* fixed match data */
2324: code, /* this subexpression's code */
2325: ptr, /* where we currently are */
2326: ptr - start_subject, /* start offset */
2327: local_offsets, /* offset vector */
2328: sizeof(local_offsets)/sizeof(int), /* size of same */
2329: local_workspace, /* workspace vector */
2330: sizeof(local_workspace)/sizeof(int), /* size of same */
2331: ims, /* the current ims flags */
2332: rlevel, /* function recursion level */
2333: recursing); /* pass on regex recursion */
2334:
2335: if (rc >= 0)
2336: {
2337: const uschar *end_subpattern = code;
2338: int charcount = local_offsets[1] - local_offsets[0];
2339: int next_state_offset, repeat_state_offset;
2340:
2341: do { end_subpattern += GET(end_subpattern, 1); }
2342: while (*end_subpattern == OP_ALT);
2343: next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2344:
2345: /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2346: arrange for the repeat state also to be added to the relevant list.
2347: Calculate the offset, or set -1 for no repeat. */
2348:
2349: repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2350: *end_subpattern == OP_KETRMIN)?
2351: end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2352:
2353: /* If we have matched an empty string, add the next state at the
2354: current character pointer. This is important so that the duplicate
2355: checking kicks in, which is what breaks infinite loops that match an
2356: empty string. */
2357:
2358: if (charcount == 0)
2359: {
2360: ADD_ACTIVE(next_state_offset, 0);
2361: }
2362:
2363: /* Optimization: if there are no more active states, and there
2364: are no new states yet set up, then skip over the subject string
2365: right here, to save looping. Otherwise, set up the new state to swing
2366: into action when the end of the substring is reached. */
2367:
2368: else if (i + 1 >= active_count && new_count == 0)
2369: {
2370: ptr += charcount;
2371: clen = 0;
2372: ADD_NEW(next_state_offset, 0);
2373:
2374: /* If we are adding a repeat state at the new character position,
2375: we must fudge things so that it is the only current state.
2376: Otherwise, it might be a duplicate of one we processed before, and
2377: that would cause it to be skipped. */
2378:
2379: if (repeat_state_offset >= 0)
2380: {
2381: next_active_state = active_states;
2382: active_count = 0;
2383: i = -1;
2384: ADD_ACTIVE(repeat_state_offset, 0);
2385: }
2386: }
2387: else
2388: {
2389: const uschar *p = start_subject + local_offsets[0];
2390: const uschar *pp = start_subject + local_offsets[1];
2391: while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2392: ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2393: if (repeat_state_offset >= 0)
2394: { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2395: }
2396:
2397: }
2398: else if (rc != PCRE_ERROR_NOMATCH) return rc;
2399: }
2400: break;
2401:
2402:
2403: /* ========================================================================== */
2404: /* Handle callouts */
2405:
2406: case OP_CALLOUT:
2407: if (pcre_callout != NULL)
2408: {
2409: int rrc;
2410: pcre_callout_block cb;
2411: cb.version = 1; /* Version 1 of the callout block */
2412: cb.callout_number = code[1];
2413: cb.offset_vector = offsets;
2414: cb.subject = (PCRE_SPTR)start_subject;
2415: cb.subject_length = end_subject - start_subject;
2416: cb.start_match = current_subject - start_subject;
2417: cb.current_position = ptr - start_subject;
2418: cb.pattern_position = GET(code, 2);
2419: cb.next_item_length = GET(code, 2 + LINK_SIZE);
2420: cb.capture_top = 1;
2421: cb.capture_last = -1;
2422: cb.callout_data = md->callout_data;
2423: if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc; /* Abandon */
2424: if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
2425: }
2426: break;
2427:
2428:
2429: /* ========================================================================== */
2430: default: /* Unsupported opcode */
2431: return PCRE_ERROR_DFA_UITEM;
2432: }
2433:
2434: NEXT_ACTIVE_STATE: continue;
2435:
2436: } /* End of loop scanning active states */
2437:
2438: /* We have finished the processing at the current subject character. If no
2439: new states have been set for the next character, we have found all the
2440: matches that we are going to find. If we are at the top level and partial
2441: matching has been requested, check for appropriate conditions. */
2442:
2443: if (new_count <= 0)
2444: {
2445: if (match_count < 0 && /* No matches found */
2446: rlevel == 1 && /* Top level match function */
2447: (md->moptions & PCRE_PARTIAL) != 0 && /* Want partial matching */
2448: ptr >= end_subject && /* Reached end of subject */
2449: ptr > current_subject) /* Matched non-empty string */
2450: {
2451: if (offsetcount >= 2)
2452: {
2453: offsets[0] = current_subject - start_subject;
2454: offsets[1] = end_subject - start_subject;
2455: }
2456: match_count = PCRE_ERROR_PARTIAL;
2457: }
2458:
2459: DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2460: "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2461: rlevel*2-2, SP));
2462: break; /* In effect, "return", but see the comment below */
2463: }
2464:
2465: /* One or more states are active for the next character. */
2466:
2467: ptr += clen; /* Advance to next subject character */
2468: } /* Loop to move along the subject string */
2469:
2470: /* Control gets here from "break" a few lines above. We do it this way because
2471: if we use "return" above, we have compiler trouble. Some compilers warn if
2472: there's nothing here because they think the function doesn't return a value. On
2473: the other hand, if we put a dummy statement here, some more clever compilers
2474: complain that it can't be reached. Sigh. */
2475:
2476: return match_count;
2477: }
2478:
2479:
2480:
2481:
2482: /*************************************************
2483: * Execute a Regular Expression - DFA engine *
2484: *************************************************/
2485:
2486: /* This external function applies a compiled re to a subject string using a DFA
2487: engine. This function calls the internal function multiple times if the pattern
2488: is not anchored.
2489:
2490: Arguments:
2491: argument_re points to the compiled expression
2492: extra_data points to extra data or is NULL
2493: subject points to the subject string
2494: length length of subject string (may contain binary zeros)
2495: start_offset where to start in the subject string
2496: options option bits
2497: offsets vector of match offsets
2498: offsetcount size of same
2499: workspace workspace vector
2500: wscount size of same
2501:
2502: Returns: > 0 => number of match offset pairs placed in offsets
2503: = 0 => offsets overflowed; longest matches are present
2504: -1 => failed to match
2505: < -1 => some kind of unexpected problem
2506: */
2507:
1.2 ! misha 2508: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
1.1 misha 2509: pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2510: const char *subject, int length, int start_offset, int options, int *offsets,
2511: int offsetcount, int *workspace, int wscount)
2512: {
2513: real_pcre *re = (real_pcre *)argument_re;
2514: dfa_match_data match_block;
2515: dfa_match_data *md = &match_block;
2516: BOOL utf8, anchored, startline, firstline;
2517: const uschar *current_subject, *end_subject, *lcc;
2518:
2519: pcre_study_data internal_study;
2520: const pcre_study_data *study = NULL;
2521: real_pcre internal_re;
2522:
2523: const uschar *req_byte_ptr;
2524: const uschar *start_bits = NULL;
2525: BOOL first_byte_caseless = FALSE;
2526: BOOL req_byte_caseless = FALSE;
2527: int first_byte = -1;
2528: int req_byte = -1;
2529: int req_byte2 = -1;
2530: int newline;
2531:
2532: /* Plausibility checks */
2533:
2534: if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2535: if (re == NULL || subject == NULL || workspace == NULL ||
2536: (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2537: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2538: if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2539:
2540: /* We need to find the pointer to any study data before we test for byte
2541: flipping, so we scan the extra_data block first. This may set two fields in the
2542: match block, so we must initialize them beforehand. However, the other fields
2543: in the match block must not be set until after the byte flipping. */
2544:
2545: md->tables = re->tables;
2546: md->callout_data = NULL;
2547:
2548: if (extra_data != NULL)
2549: {
2550: unsigned int flags = extra_data->flags;
2551: if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2552: study = (const pcre_study_data *)extra_data->study_data;
2553: if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2554: if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2555: return PCRE_ERROR_DFA_UMLIMIT;
2556: if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2557: md->callout_data = extra_data->callout_data;
2558: if ((flags & PCRE_EXTRA_TABLES) != 0)
2559: md->tables = extra_data->tables;
2560: }
2561:
2562: /* Check that the first field in the block is the magic number. If it is not,
2563: test for a regex that was compiled on a host of opposite endianness. If this is
2564: the case, flipped values are put in internal_re and internal_study if there was
2565: study data too. */
2566:
2567: if (re->magic_number != MAGIC_NUMBER)
2568: {
2569: re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2570: if (re == NULL) return PCRE_ERROR_BADMAGIC;
2571: if (study != NULL) study = &internal_study;
2572: }
2573:
2574: /* Set some local values */
2575:
2576: current_subject = (const unsigned char *)subject + start_offset;
2577: end_subject = (const unsigned char *)subject + length;
2578: req_byte_ptr = current_subject - 1;
2579:
2580: #ifdef SUPPORT_UTF8
2581: utf8 = (re->options & PCRE_UTF8) != 0;
2582: #else
2583: utf8 = FALSE;
2584: #endif
2585:
2586: anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2587: (re->options & PCRE_ANCHORED) != 0;
2588:
2589: /* The remaining fixed data for passing around. */
2590:
2591: md->start_code = (const uschar *)argument_re +
2592: re->name_table_offset + re->name_count * re->name_entry_size;
2593: md->start_subject = (const unsigned char *)subject;
2594: md->end_subject = end_subject;
2595: md->moptions = options;
2596: md->poptions = re->options;
2597:
2598: /* If the BSR option is not set at match time, copy what was set
2599: at compile time. */
2600:
2601: if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2602: {
2603: if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2604: md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2605: #ifdef BSR_ANYCRLF
2606: else md->moptions |= PCRE_BSR_ANYCRLF;
2607: #endif
2608: }
2609:
2610: /* Handle different types of newline. The three bits give eight cases. If
2611: nothing is set at run time, whatever was used at compile time applies. */
2612:
2613: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2614: PCRE_NEWLINE_BITS)
2615: {
2616: case 0: newline = NEWLINE; break; /* Compile-time default */
2617: case PCRE_NEWLINE_CR: newline = '\r'; break;
2618: case PCRE_NEWLINE_LF: newline = '\n'; break;
2619: case PCRE_NEWLINE_CR+
2620: PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
2621: case PCRE_NEWLINE_ANY: newline = -1; break;
2622: case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2623: default: return PCRE_ERROR_BADNEWLINE;
2624: }
2625:
2626: if (newline == -2)
2627: {
2628: md->nltype = NLTYPE_ANYCRLF;
2629: }
2630: else if (newline < 0)
2631: {
2632: md->nltype = NLTYPE_ANY;
2633: }
2634: else
2635: {
2636: md->nltype = NLTYPE_FIXED;
2637: if (newline > 255)
2638: {
2639: md->nllen = 2;
2640: md->nl[0] = (newline >> 8) & 255;
2641: md->nl[1] = newline & 255;
2642: }
2643: else
2644: {
2645: md->nllen = 1;
2646: md->nl[0] = newline;
2647: }
2648: }
2649:
2650: /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2651: back the character offset. */
2652:
2653: #ifdef SUPPORT_UTF8
2654: if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2655: {
2656: if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2657: return PCRE_ERROR_BADUTF8;
2658: if (start_offset > 0 && start_offset < length)
2659: {
2660: int tb = ((uschar *)subject)[start_offset];
2661: if (tb > 127)
2662: {
2663: tb &= 0xc0;
2664: if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2665: }
2666: }
2667: }
2668: #endif
2669:
2670: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2671: is a feature that makes it possible to save compiled regex and re-use them
2672: in other programs later. */
2673:
2674: if (md->tables == NULL) md->tables = _pcre_default_tables;
2675:
2676: /* The lower casing table and the "must be at the start of a line" flag are
2677: used in a loop when finding where to start. */
2678:
2679: lcc = md->tables + lcc_offset;
2680: startline = (re->flags & PCRE_STARTLINE) != 0;
2681: firstline = (re->options & PCRE_FIRSTLINE) != 0;
2682:
2683: /* Set up the first character to match, if available. The first_byte value is
2684: never set for an anchored regular expression, but the anchoring may be forced
2685: at run time, so we have to test for anchoring. The first char may be unset for
2686: an unanchored pattern, of course. If there's no first char and the pattern was
2687: studied, there may be a bitmap of possible first characters. */
2688:
2689: if (!anchored)
2690: {
2691: if ((re->flags & PCRE_FIRSTSET) != 0)
2692: {
2693: first_byte = re->first_byte & 255;
2694: if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2695: first_byte = lcc[first_byte];
2696: }
2697: else
2698: {
2699: if (startline && study != NULL &&
2700: (study->options & PCRE_STUDY_MAPPED) != 0)
2701: start_bits = study->start_bits;
2702: }
2703: }
2704:
2705: /* For anchored or unanchored matches, there may be a "last known required
2706: character" set. */
2707:
2708: if ((re->flags & PCRE_REQCHSET) != 0)
2709: {
2710: req_byte = re->req_byte & 255;
2711: req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2712: req_byte2 = (md->tables + fcc_offset)[req_byte]; /* case flipped */
2713: }
2714:
2715: /* Call the main matching function, looping for a non-anchored regex after a
2716: failed match. Unless restarting, optimize by moving to the first match
2717: character if possible, when not anchored. Then unless wanting a partial match,
2718: check for a required later character. */
2719:
2720: for (;;)
2721: {
2722: int rc;
2723:
2724: if ((options & PCRE_DFA_RESTART) == 0)
2725: {
2726: const uschar *save_end_subject = end_subject;
2727:
2728: /* Advance to a unique first char if possible. If firstline is TRUE, the
2729: start of the match is constrained to the first line of a multiline string.
2730: Implement this by temporarily adjusting end_subject so that we stop
2731: scanning at a newline. If the match fails at the newline, later code breaks
2732: this loop. */
2733:
2734: if (firstline)
2735: {
1.2 ! misha 2736: USPTR t = current_subject;
! 2737: #ifdef SUPPORT_UTF8
! 2738: if (utf8)
! 2739: {
! 2740: while (t < md->end_subject && !IS_NEWLINE(t))
! 2741: {
! 2742: t++;
! 2743: while (t < end_subject && (*t & 0xc0) == 0x80) t++;
! 2744: }
! 2745: }
! 2746: else
! 2747: #endif
1.1 misha 2748: while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2749: end_subject = t;
2750: }
2751:
2752: if (first_byte >= 0)
2753: {
2754: if (first_byte_caseless)
2755: while (current_subject < end_subject &&
2756: lcc[*current_subject] != first_byte)
2757: current_subject++;
2758: else
2759: while (current_subject < end_subject && *current_subject != first_byte)
2760: current_subject++;
2761: }
2762:
2763: /* Or to just after a linebreak for a multiline match if possible */
2764:
2765: else if (startline)
2766: {
2767: if (current_subject > md->start_subject + start_offset)
2768: {
1.2 ! misha 2769: #ifdef SUPPORT_UTF8
! 2770: if (utf8)
! 2771: {
! 2772: while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
! 2773: {
! 2774: current_subject++;
! 2775: while(current_subject < end_subject &&
! 2776: (*current_subject & 0xc0) == 0x80)
! 2777: current_subject++;
! 2778: }
! 2779: }
! 2780: else
! 2781: #endif
! 2782: while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
1.1 misha 2783: current_subject++;
2784:
2785: /* If we have just passed a CR and the newline option is ANY or
2786: ANYCRLF, and we are now at a LF, advance the match position by one more
2787: character. */
2788:
2789: if (current_subject[-1] == '\r' &&
2790: (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2791: current_subject < end_subject &&
2792: *current_subject == '\n')
2793: current_subject++;
2794: }
2795: }
2796:
2797: /* Or to a non-unique first char after study */
2798:
2799: else if (start_bits != NULL)
2800: {
2801: while (current_subject < end_subject)
2802: {
2803: register unsigned int c = *current_subject;
2804: if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2805: else break;
2806: }
2807: }
2808:
2809: /* Restore fudged end_subject */
2810:
2811: end_subject = save_end_subject;
2812: }
2813:
2814: /* If req_byte is set, we know that that character must appear in the subject
2815: for the match to succeed. If the first character is set, req_byte must be
2816: later in the subject; otherwise the test starts at the match point. This
2817: optimization can save a huge amount of work in patterns with nested unlimited
2818: repeats that aren't going to match. Writing separate code for cased/caseless
2819: versions makes it go faster, as does using an autoincrement and backing off
2820: on a match.
2821:
2822: HOWEVER: when the subject string is very, very long, searching to its end can
2823: take a long time, and give bad performance on quite ordinary patterns. This
2824: showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2825: don't do this when the string is sufficiently long.
2826:
2827: ALSO: this processing is disabled when partial matching is requested.
2828: */
2829:
2830: if (req_byte >= 0 &&
2831: end_subject - current_subject < REQ_BYTE_MAX &&
2832: (options & PCRE_PARTIAL) == 0)
2833: {
2834: register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2835:
2836: /* We don't need to repeat the search if we haven't yet reached the
2837: place we found it at last time. */
2838:
2839: if (p > req_byte_ptr)
2840: {
2841: if (req_byte_caseless)
2842: {
2843: while (p < end_subject)
2844: {
2845: register int pp = *p++;
2846: if (pp == req_byte || pp == req_byte2) { p--; break; }
2847: }
2848: }
2849: else
2850: {
2851: while (p < end_subject)
2852: {
2853: if (*p++ == req_byte) { p--; break; }
2854: }
2855: }
2856:
2857: /* If we can't find the required character, break the matching loop,
2858: which will cause a return or PCRE_ERROR_NOMATCH. */
2859:
2860: if (p >= end_subject) break;
2861:
2862: /* If we have found the required character, save the point where we
2863: found it, so that we don't search again next time round the loop if
2864: the start hasn't passed this character yet. */
2865:
2866: req_byte_ptr = p;
2867: }
2868: }
2869:
2870: /* OK, now we can do the business */
2871:
2872: rc = internal_dfa_exec(
2873: md, /* fixed match data */
2874: md->start_code, /* this subexpression's code */
2875: current_subject, /* where we currently are */
2876: start_offset, /* start offset in subject */
2877: offsets, /* offset vector */
2878: offsetcount, /* size of same */
2879: workspace, /* workspace vector */
2880: wscount, /* size of same */
2881: re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2882: 0, /* function recurse level */
2883: 0); /* regex recurse level */
2884:
2885: /* Anything other than "no match" means we are done, always; otherwise, carry
2886: on only if not anchored. */
2887:
2888: if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2889:
2890: /* Advance to the next subject character unless we are at the end of a line
2891: and firstline is set. */
2892:
2893: if (firstline && IS_NEWLINE(current_subject)) break;
2894: current_subject++;
2895: if (utf8)
2896: {
2897: while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2898: current_subject++;
2899: }
2900: if (current_subject > end_subject) break;
2901:
2902: /* If we have just passed a CR and we are now at a LF, and the pattern does
2903: not contain any explicit matches for \r or \n, and the newline option is CRLF
2904: or ANY or ANYCRLF, advance the match position by one more character. */
2905:
2906: if (current_subject[-1] == '\r' &&
2907: current_subject < end_subject &&
2908: *current_subject == '\n' &&
2909: (re->flags & PCRE_HASCRORLF) == 0 &&
2910: (md->nltype == NLTYPE_ANY ||
2911: md->nltype == NLTYPE_ANYCRLF ||
2912: md->nllen == 2))
2913: current_subject++;
2914:
2915: } /* "Bumpalong" loop */
2916:
2917: return PCRE_ERROR_NOMATCH;
2918: }
2919:
2920: /* End of pcre_dfa_exec.c */
E-mail: