Annotation of win32/pcre/pcre_study.c, revision 1.4
1.1 misha 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
1.3 misha 9: Copyright (c) 1997-2010 University of Cambridge
1.1 misha 10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40:
41: /* This module contains the external function pcre_study(), along with local
42: supporting functions. */
43:
44:
45: #ifdef HAVE_CONFIG_H
46: #include "config.h"
47: #endif
48:
49: #include "pcre_internal.h"
50:
1.3 misha 51: #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
1.1 misha 52:
53: /* Returns from set_start_bits() */
54:
55: enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
56:
57:
1.3 misha 58:
59: /*************************************************
60: * Find the minimum subject length for a group *
61: *************************************************/
62:
63: /* Scan a parenthesized group and compute the minimum length of subject that
64: is needed to match it. This is a lower bound; it does not mean there is a
65: string of that length that matches. In UTF8 mode, the result is in characters
66: rather than bytes.
67:
68: Arguments:
69: code pointer to start of group (the bracket)
70: startcode pointer to start of the whole pattern
71: options the compiling options
72:
73: Returns: the minimum length
74: -1 if \C was encountered
75: -2 internal error (missing capturing bracket)
76: */
77:
78: static int
79: find_minlength(const uschar *code, const uschar *startcode, int options)
80: {
81: int length = -1;
82: BOOL utf8 = (options & PCRE_UTF8) != 0;
83: BOOL had_recurse = FALSE;
84: register int branchlength = 0;
85: register uschar *cc = (uschar *)code + 1 + LINK_SIZE;
86:
87: if (*code == OP_CBRA || *code == OP_SCBRA) cc += 2;
88:
89: /* Scan along the opcodes for this branch. If we get to the end of the
90: branch, check the length against that of the other branches. */
91:
92: for (;;)
93: {
94: int d, min;
95: uschar *cs, *ce;
96: register int op = *cc;
97:
98: switch (op)
99: {
100: case OP_COND:
101: case OP_SCOND:
102:
103: /* If there is only one branch in a condition, the implied branch has zero
104: length, so we don't add anything. This covers the DEFINE "condition"
105: automatically. */
106:
107: cs = cc + GET(cc, 1);
108: if (*cs != OP_ALT)
109: {
110: cc = cs + 1 + LINK_SIZE;
111: break;
112: }
113:
114: /* Otherwise we can fall through and treat it the same as any other
115: subpattern. */
116:
117: case OP_CBRA:
118: case OP_SCBRA:
119: case OP_BRA:
120: case OP_SBRA:
121: case OP_ONCE:
122: d = find_minlength(cc, startcode, options);
123: if (d < 0) return d;
124: branchlength += d;
125: do cc += GET(cc, 1); while (*cc == OP_ALT);
126: cc += 1 + LINK_SIZE;
127: break;
128:
129: /* Reached end of a branch; if it's a ket it is the end of a nested
130: call. If it's ALT it is an alternation in a nested call. If it is
131: END it's the end of the outer call. All can be handled by the same code. */
132:
133: case OP_ALT:
134: case OP_KET:
135: case OP_KETRMAX:
136: case OP_KETRMIN:
137: case OP_END:
138: if (length < 0 || (!had_recurse && branchlength < length))
139: length = branchlength;
140: if (*cc != OP_ALT) return length;
141: cc += 1 + LINK_SIZE;
142: branchlength = 0;
143: had_recurse = FALSE;
144: break;
145:
146: /* Skip over assertive subpatterns */
147:
148: case OP_ASSERT:
149: case OP_ASSERT_NOT:
150: case OP_ASSERTBACK:
151: case OP_ASSERTBACK_NOT:
152: do cc += GET(cc, 1); while (*cc == OP_ALT);
153: /* Fall through */
154:
155: /* Skip over things that don't match chars */
156:
157: case OP_REVERSE:
158: case OP_CREF:
159: case OP_NCREF:
160: case OP_RREF:
161: case OP_NRREF:
162: case OP_DEF:
163: case OP_OPT:
164: case OP_CALLOUT:
165: case OP_SOD:
166: case OP_SOM:
167: case OP_EOD:
168: case OP_EODN:
169: case OP_CIRC:
170: case OP_DOLL:
171: case OP_NOT_WORD_BOUNDARY:
172: case OP_WORD_BOUNDARY:
173: cc += _pcre_OP_lengths[*cc];
174: break;
175:
176: /* Skip over a subpattern that has a {0} or {0,x} quantifier */
177:
178: case OP_BRAZERO:
179: case OP_BRAMINZERO:
180: case OP_SKIPZERO:
181: cc += _pcre_OP_lengths[*cc];
182: do cc += GET(cc, 1); while (*cc == OP_ALT);
183: cc += 1 + LINK_SIZE;
184: break;
185:
186: /* Handle literal characters and + repetitions */
187:
188: case OP_CHAR:
189: case OP_CHARNC:
190: case OP_NOT:
191: case OP_PLUS:
192: case OP_MINPLUS:
193: case OP_POSPLUS:
194: case OP_NOTPLUS:
195: case OP_NOTMINPLUS:
196: case OP_NOTPOSPLUS:
197: branchlength++;
198: cc += 2;
199: #ifdef SUPPORT_UTF8
200: if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
201: #endif
202: break;
203:
204: case OP_TYPEPLUS:
205: case OP_TYPEMINPLUS:
206: case OP_TYPEPOSPLUS:
207: branchlength++;
208: cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
209: break;
210:
211: /* Handle exact repetitions. The count is already in characters, but we
212: need to skip over a multibyte character in UTF8 mode. */
213:
214: case OP_EXACT:
215: case OP_NOTEXACT:
216: branchlength += GET2(cc,1);
217: cc += 4;
218: #ifdef SUPPORT_UTF8
219: if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
220: #endif
221: break;
222:
223: case OP_TYPEEXACT:
224: branchlength += GET2(cc,1);
225: cc += (cc[3] == OP_PROP || cc[3] == OP_NOTPROP)? 6 : 4;
226: break;
227:
228: /* Handle single-char non-literal matchers */
229:
230: case OP_PROP:
231: case OP_NOTPROP:
232: cc += 2;
233: /* Fall through */
234:
235: case OP_NOT_DIGIT:
236: case OP_DIGIT:
237: case OP_NOT_WHITESPACE:
238: case OP_WHITESPACE:
239: case OP_NOT_WORDCHAR:
240: case OP_WORDCHAR:
241: case OP_ANY:
242: case OP_ALLANY:
243: case OP_EXTUNI:
244: case OP_HSPACE:
245: case OP_NOT_HSPACE:
246: case OP_VSPACE:
247: case OP_NOT_VSPACE:
248: branchlength++;
249: cc++;
250: break;
251:
252: /* "Any newline" might match two characters */
253:
254: case OP_ANYNL:
255: branchlength += 2;
256: cc++;
257: break;
258:
259: /* The single-byte matcher means we can't proceed in UTF-8 mode */
260:
261: case OP_ANYBYTE:
262: #ifdef SUPPORT_UTF8
263: if (utf8) return -1;
264: #endif
265: branchlength++;
266: cc++;
267: break;
268:
269: /* For repeated character types, we have to test for \p and \P, which have
270: an extra two bytes of parameters. */
271:
272: case OP_TYPESTAR:
273: case OP_TYPEMINSTAR:
274: case OP_TYPEQUERY:
275: case OP_TYPEMINQUERY:
276: case OP_TYPEPOSSTAR:
277: case OP_TYPEPOSQUERY:
278: if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
279: cc += _pcre_OP_lengths[op];
280: break;
281:
282: case OP_TYPEUPTO:
283: case OP_TYPEMINUPTO:
284: case OP_TYPEPOSUPTO:
285: if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
286: cc += _pcre_OP_lengths[op];
287: break;
288:
289: /* Check a class for variable quantification */
290:
291: #ifdef SUPPORT_UTF8
292: case OP_XCLASS:
293: cc += GET(cc, 1) - 33;
294: /* Fall through */
295: #endif
296:
297: case OP_CLASS:
298: case OP_NCLASS:
299: cc += 33;
300:
301: switch (*cc)
302: {
303: case OP_CRPLUS:
304: case OP_CRMINPLUS:
305: branchlength++;
306: /* Fall through */
307:
308: case OP_CRSTAR:
309: case OP_CRMINSTAR:
310: case OP_CRQUERY:
311: case OP_CRMINQUERY:
312: cc++;
313: break;
314:
315: case OP_CRRANGE:
316: case OP_CRMINRANGE:
317: branchlength += GET2(cc,1);
318: cc += 5;
319: break;
320:
321: default:
322: branchlength++;
323: break;
324: }
325: break;
326:
327: /* Backreferences and subroutine calls are treated in the same way: we find
328: the minimum length for the subpattern. A recursion, however, causes an
329: a flag to be set that causes the length of this branch to be ignored. The
330: logic is that a recursion can only make sense if there is another
331: alternation that stops the recursing. That will provide the minimum length
332: (when no recursion happens). A backreference within the group that it is
333: referencing behaves in the same way.
334:
335: If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
336: matches an empty string (by default it causes a matching failure), so in
337: that case we must set the minimum length to zero. */
338:
339: case OP_REF:
340: if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
341: {
342: ce = cs = (uschar *)_pcre_find_bracket(startcode, utf8, GET2(cc, 1));
343: if (cs == NULL) return -2;
344: do ce += GET(ce, 1); while (*ce == OP_ALT);
345: if (cc > cs && cc < ce)
346: {
347: d = 0;
348: had_recurse = TRUE;
349: }
350: else d = find_minlength(cs, startcode, options);
351: }
352: else d = 0;
353: cc += 3;
354:
355: /* Handle repeated back references */
356:
357: switch (*cc)
358: {
359: case OP_CRSTAR:
360: case OP_CRMINSTAR:
361: case OP_CRQUERY:
362: case OP_CRMINQUERY:
363: min = 0;
364: cc++;
365: break;
366:
367: case OP_CRRANGE:
368: case OP_CRMINRANGE:
369: min = GET2(cc, 1);
370: cc += 5;
371: break;
372:
373: default:
374: min = 1;
375: break;
376: }
377:
378: branchlength += min * d;
379: break;
380:
381: case OP_RECURSE:
382: cs = ce = (uschar *)startcode + GET(cc, 1);
383: if (cs == NULL) return -2;
384: do ce += GET(ce, 1); while (*ce == OP_ALT);
385: if (cc > cs && cc < ce)
386: had_recurse = TRUE;
387: else
388: branchlength += find_minlength(cs, startcode, options);
389: cc += 1 + LINK_SIZE;
390: break;
391:
392: /* Anything else does not or need not match a character. We can get the
393: item's length from the table, but for those that can match zero occurrences
394: of a character, we must take special action for UTF-8 characters. */
395:
396: case OP_UPTO:
397: case OP_NOTUPTO:
398: case OP_MINUPTO:
399: case OP_NOTMINUPTO:
400: case OP_POSUPTO:
401: case OP_STAR:
402: case OP_MINSTAR:
403: case OP_NOTMINSTAR:
404: case OP_POSSTAR:
405: case OP_NOTPOSSTAR:
406: case OP_QUERY:
407: case OP_MINQUERY:
408: case OP_NOTMINQUERY:
409: case OP_POSQUERY:
410: case OP_NOTPOSQUERY:
411: cc += _pcre_OP_lengths[op];
412: #ifdef SUPPORT_UTF8
413: if (utf8 && cc[-1] >= 0xc0) cc += _pcre_utf8_table4[cc[-1] & 0x3f];
414: #endif
415: break;
416:
417: /* Skip these, but we need to add in the name length. */
418:
419: case OP_MARK:
420: case OP_PRUNE_ARG:
421: case OP_SKIP_ARG:
1.4 ! misha 422: cc += _pcre_OP_lengths[op] + cc[1];
! 423: break;
! 424:
1.3 misha 425: case OP_THEN_ARG:
1.4 ! misha 426: cc += _pcre_OP_lengths[op] + cc[1+LINK_SIZE];
1.3 misha 427: break;
428:
429: /* For the record, these are the opcodes that are matched by "default":
430: OP_ACCEPT, OP_CLOSE, OP_COMMIT, OP_FAIL, OP_PRUNE, OP_SET_SOM, OP_SKIP,
431: OP_THEN. */
432:
433: default:
434: cc += _pcre_OP_lengths[op];
435: break;
436: }
437: }
438: /* Control never gets here */
439: }
440:
441:
442:
1.1 misha 443: /*************************************************
444: * Set a bit and maybe its alternate case *
445: *************************************************/
446:
1.3 misha 447: /* Given a character, set its first byte's bit in the table, and also the
448: corresponding bit for the other version of a letter if we are caseless. In
449: UTF-8 mode, for characters greater than 127, we can only do the caseless thing
450: when Unicode property support is available.
1.1 misha 451:
452: Arguments:
453: start_bits points to the bit map
1.3 misha 454: p points to the character
1.1 misha 455: caseless the caseless flag
456: cd the block with char table pointers
1.3 misha 457: utf8 TRUE for UTF-8 mode
458:
459: Returns: pointer after the character
460: */
461:
462: static const uschar *
463: set_table_bit(uschar *start_bits, const uschar *p, BOOL caseless,
464: compile_data *cd, BOOL utf8)
465: {
466: unsigned int c = *p;
467:
468: SET_BIT(c);
469:
470: #ifdef SUPPORT_UTF8
471: if (utf8 && c > 127)
472: {
473: GETCHARINC(c, p);
474: #ifdef SUPPORT_UCP
475: if (caseless)
476: {
477: uschar buff[8];
478: c = UCD_OTHERCASE(c);
479: (void)_pcre_ord2utf8(c, buff);
480: SET_BIT(buff[0]);
481: }
482: #endif
483: return p;
484: }
485: #endif
486:
487: /* Not UTF-8 mode, or character is less than 127. */
488:
489: if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
490: return p + 1;
491: }
492:
493:
494:
495: /*************************************************
496: * Set bits for a positive character type *
497: *************************************************/
498:
499: /* This function sets starting bits for a character type. In UTF-8 mode, we can
500: only do a direct setting for bytes less than 128, as otherwise there can be
501: confusion with bytes in the middle of UTF-8 characters. In a "traditional"
502: environment, the tables will only recognize ASCII characters anyway, but in at
503: least one Windows environment, some higher bytes bits were set in the tables.
504: So we deal with that case by considering the UTF-8 encoding.
505:
506: Arguments:
507: start_bits the starting bitmap
508: cbit type the type of character wanted
509: table_limit 32 for non-UTF-8; 16 for UTF-8
510: cd the block with char table pointers
511:
512: Returns: nothing
513: */
514:
515: static void
516: set_type_bits(uschar *start_bits, int cbit_type, int table_limit,
517: compile_data *cd)
518: {
519: register int c;
520: for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
521: if (table_limit == 32) return;
522: for (c = 128; c < 256; c++)
523: {
524: if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
525: {
526: uschar buff[8];
527: (void)_pcre_ord2utf8(c, buff);
528: SET_BIT(buff[0]);
529: }
530: }
531: }
532:
533:
534: /*************************************************
535: * Set bits for a negative character type *
536: *************************************************/
537:
538: /* This function sets starting bits for a negative character type such as \D.
539: In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
540: otherwise there can be confusion with bytes in the middle of UTF-8 characters.
541: Unlike in the positive case, where we can set appropriate starting bits for
542: specific high-valued UTF-8 characters, in this case we have to set the bits for
543: all high-valued characters. The lowest is 0xc2, but we overkill by starting at
544: 0xc0 (192) for simplicity.
1.1 misha 545:
1.3 misha 546: Arguments:
547: start_bits the starting bitmap
548: cbit type the type of character wanted
549: table_limit 32 for non-UTF-8; 16 for UTF-8
550: cd the block with char table pointers
551:
552: Returns: nothing
1.1 misha 553: */
554:
555: static void
1.3 misha 556: set_nottype_bits(uschar *start_bits, int cbit_type, int table_limit,
557: compile_data *cd)
1.1 misha 558: {
1.3 misha 559: register int c;
560: for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
561: if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
1.1 misha 562: }
563:
564:
565:
566: /*************************************************
567: * Create bitmap of starting bytes *
568: *************************************************/
569:
570: /* This function scans a compiled unanchored expression recursively and
571: attempts to build a bitmap of the set of possible starting bytes. As time goes
572: by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
573: useful for parenthesized groups in patterns such as (a*)b where the group
574: provides some optional starting bytes but scanning must continue at the outer
575: level to find at least one mandatory byte. At the outermost level, this
576: function fails unless the result is SSB_DONE.
577:
578: Arguments:
579: code points to an expression
580: start_bits points to a 32-byte table, initialized to 0
581: caseless the current state of the caseless flag
582: utf8 TRUE if in UTF-8 mode
583: cd the block with char table pointers
584:
585: Returns: SSB_FAIL => Failed to find any starting bytes
586: SSB_DONE => Found mandatory starting bytes
587: SSB_CONTINUE => Found optional starting bytes
588: */
589:
590: static int
591: set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
592: BOOL utf8, compile_data *cd)
593: {
594: register int c;
595: int yield = SSB_DONE;
1.3 misha 596: int table_limit = utf8? 16:32;
1.1 misha 597:
598: #if 0
599: /* ========================================================================= */
600: /* The following comment and code was inserted in January 1999. In May 2006,
601: when it was observed to cause compiler warnings about unused values, I took it
602: out again. If anybody is still using OS/2, they will have to put it back
603: manually. */
604:
605: /* This next statement and the later reference to dummy are here in order to
606: trick the optimizer of the IBM C compiler for OS/2 into generating correct
607: code. Apparently IBM isn't going to fix the problem, and we would rather not
608: disable optimization (in this module it actually makes a big difference, and
609: the pcre module can use all the optimization it can get). */
610:
611: volatile int dummy;
612: /* ========================================================================= */
613: #endif
614:
615: do
616: {
617: const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
618: BOOL try_next = TRUE;
619:
620: while (try_next) /* Loop for items in this branch */
621: {
622: int rc;
623: switch(*tcode)
624: {
625: /* Fail if we reach something we don't understand */
626:
627: default:
628: return SSB_FAIL;
629:
630: /* If we hit a bracket or a positive lookahead assertion, recurse to set
631: bits from within the subpattern. If it can't find anything, we have to
632: give up. If it finds some mandatory character(s), we are done for this
633: branch. Otherwise, carry on scanning after the subpattern. */
634:
635: case OP_BRA:
636: case OP_SBRA:
637: case OP_CBRA:
638: case OP_SCBRA:
639: case OP_ONCE:
640: case OP_ASSERT:
641: rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
642: if (rc == SSB_FAIL) return SSB_FAIL;
643: if (rc == SSB_DONE) try_next = FALSE; else
644: {
645: do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
646: tcode += 1 + LINK_SIZE;
647: }
648: break;
649:
650: /* If we hit ALT or KET, it means we haven't found anything mandatory in
651: this branch, though we might have found something optional. For ALT, we
652: continue with the next alternative, but we have to arrange that the final
653: result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
654: return SSB_CONTINUE: if this is the top level, that indicates failure,
655: but after a nested subpattern, it causes scanning to continue. */
656:
657: case OP_ALT:
658: yield = SSB_CONTINUE;
659: try_next = FALSE;
660: break;
661:
662: case OP_KET:
663: case OP_KETRMAX:
664: case OP_KETRMIN:
665: return SSB_CONTINUE;
666:
667: /* Skip over callout */
668:
669: case OP_CALLOUT:
670: tcode += 2 + 2*LINK_SIZE;
671: break;
672:
673: /* Skip over lookbehind and negative lookahead assertions */
674:
675: case OP_ASSERT_NOT:
676: case OP_ASSERTBACK:
677: case OP_ASSERTBACK_NOT:
678: do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
679: tcode += 1 + LINK_SIZE;
680: break;
681:
682: /* Skip over an option setting, changing the caseless flag */
683:
684: case OP_OPT:
685: caseless = (tcode[1] & PCRE_CASELESS) != 0;
686: tcode += 2;
687: break;
688:
689: /* BRAZERO does the bracket, but carries on. */
690:
691: case OP_BRAZERO:
692: case OP_BRAMINZERO:
693: if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
694: return SSB_FAIL;
695: /* =========================================================================
696: See the comment at the head of this function concerning the next line,
697: which was an old fudge for the benefit of OS/2.
698: dummy = 1;
699: ========================================================================= */
700: do tcode += GET(tcode,1); while (*tcode == OP_ALT);
701: tcode += 1 + LINK_SIZE;
702: break;
703:
704: /* SKIPZERO skips the bracket. */
705:
706: case OP_SKIPZERO:
1.2 misha 707: tcode++;
1.1 misha 708: do tcode += GET(tcode,1); while (*tcode == OP_ALT);
709: tcode += 1 + LINK_SIZE;
710: break;
711:
712: /* Single-char * or ? sets the bit and tries the next item */
713:
714: case OP_STAR:
715: case OP_MINSTAR:
716: case OP_POSSTAR:
717: case OP_QUERY:
718: case OP_MINQUERY:
719: case OP_POSQUERY:
1.3 misha 720: tcode = set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
1.1 misha 721: break;
722:
723: /* Single-char upto sets the bit and tries the next */
724:
725: case OP_UPTO:
726: case OP_MINUPTO:
727: case OP_POSUPTO:
1.3 misha 728: tcode = set_table_bit(start_bits, tcode + 3, caseless, cd, utf8);
1.1 misha 729: break;
730:
731: /* At least one single char sets the bit and stops */
732:
733: case OP_EXACT: /* Fall through */
734: tcode += 2;
735:
736: case OP_CHAR:
737: case OP_CHARNC:
738: case OP_PLUS:
739: case OP_MINPLUS:
740: case OP_POSPLUS:
1.3 misha 741: (void)set_table_bit(start_bits, tcode + 1, caseless, cd, utf8);
742: try_next = FALSE;
743: break;
744:
745: /* Special spacing and line-terminating items. These recognize specific
746: lists of characters. The difference between VSPACE and ANYNL is that the
747: latter can match the two-character CRLF sequence, but that is not
748: relevant for finding the first character, so their code here is
749: identical. */
750:
751: case OP_HSPACE:
752: SET_BIT(0x09);
753: SET_BIT(0x20);
754: if (utf8)
755: {
756: SET_BIT(0xC2); /* For U+00A0 */
757: SET_BIT(0xE1); /* For U+1680, U+180E */
758: SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
759: SET_BIT(0xE3); /* For U+3000 */
760: }
761: else SET_BIT(0xA0);
762: try_next = FALSE;
763: break;
764:
765: case OP_ANYNL:
766: case OP_VSPACE:
767: SET_BIT(0x0A);
768: SET_BIT(0x0B);
769: SET_BIT(0x0C);
770: SET_BIT(0x0D);
771: if (utf8)
772: {
773: SET_BIT(0xC2); /* For U+0085 */
774: SET_BIT(0xE2); /* For U+2028, U+2029 */
775: }
776: else SET_BIT(0x85);
1.1 misha 777: try_next = FALSE;
778: break;
779:
1.3 misha 780: /* Single character types set the bits and stop. Note that if PCRE_UCP
781: is set, we do not see these op codes because \d etc are converted to
782: properties. Therefore, these apply in the case when only characters less
783: than 256 are recognized to match the types. */
1.1 misha 784:
785: case OP_NOT_DIGIT:
1.3 misha 786: set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1.1 misha 787: try_next = FALSE;
788: break;
789:
790: case OP_DIGIT:
1.3 misha 791: set_type_bits(start_bits, cbit_digit, table_limit, cd);
1.1 misha 792: try_next = FALSE;
793: break;
794:
795: /* The cbit_space table has vertical tab as whitespace; we have to
1.3 misha 796: ensure it is set as not whitespace. */
1.1 misha 797:
798: case OP_NOT_WHITESPACE:
1.3 misha 799: set_nottype_bits(start_bits, cbit_space, table_limit, cd);
800: start_bits[1] |= 0x08;
1.1 misha 801: try_next = FALSE;
802: break;
803:
804: /* The cbit_space table has vertical tab as whitespace; we have to
1.3 misha 805: not set it from the table. */
1.1 misha 806:
807: case OP_WHITESPACE:
1.3 misha 808: c = start_bits[1]; /* Save in case it was already set */
809: set_type_bits(start_bits, cbit_space, table_limit, cd);
810: start_bits[1] = (start_bits[1] & ~0x08) | c;
1.1 misha 811: try_next = FALSE;
812: break;
813:
814: case OP_NOT_WORDCHAR:
1.3 misha 815: set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1.1 misha 816: try_next = FALSE;
817: break;
818:
819: case OP_WORDCHAR:
1.3 misha 820: set_type_bits(start_bits, cbit_word, table_limit, cd);
1.1 misha 821: try_next = FALSE;
822: break;
823:
824: /* One or more character type fudges the pointer and restarts, knowing
825: it will hit a single character type and stop there. */
826:
827: case OP_TYPEPLUS:
828: case OP_TYPEMINPLUS:
1.3 misha 829: case OP_TYPEPOSPLUS:
1.1 misha 830: tcode++;
831: break;
832:
833: case OP_TYPEEXACT:
834: tcode += 3;
835: break;
836:
837: /* Zero or more repeats of character types set the bits and then
838: try again. */
839:
840: case OP_TYPEUPTO:
841: case OP_TYPEMINUPTO:
842: case OP_TYPEPOSUPTO:
843: tcode += 2; /* Fall through */
844:
845: case OP_TYPESTAR:
846: case OP_TYPEMINSTAR:
847: case OP_TYPEPOSSTAR:
848: case OP_TYPEQUERY:
849: case OP_TYPEMINQUERY:
850: case OP_TYPEPOSQUERY:
851: switch(tcode[1])
852: {
1.3 misha 853: default:
1.1 misha 854: case OP_ANY:
855: case OP_ALLANY:
856: return SSB_FAIL;
857:
1.3 misha 858: case OP_HSPACE:
859: SET_BIT(0x09);
860: SET_BIT(0x20);
861: if (utf8)
862: {
863: SET_BIT(0xC2); /* For U+00A0 */
864: SET_BIT(0xE1); /* For U+1680, U+180E */
865: SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
866: SET_BIT(0xE3); /* For U+3000 */
867: }
868: else SET_BIT(0xA0);
869: break;
870:
871: case OP_ANYNL:
872: case OP_VSPACE:
873: SET_BIT(0x0A);
874: SET_BIT(0x0B);
875: SET_BIT(0x0C);
876: SET_BIT(0x0D);
877: if (utf8)
878: {
879: SET_BIT(0xC2); /* For U+0085 */
880: SET_BIT(0xE2); /* For U+2028, U+2029 */
881: }
882: else SET_BIT(0x85);
883: break;
884:
1.1 misha 885: case OP_NOT_DIGIT:
1.3 misha 886: set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1.1 misha 887: break;
888:
889: case OP_DIGIT:
1.3 misha 890: set_type_bits(start_bits, cbit_digit, table_limit, cd);
1.1 misha 891: break;
892:
893: /* The cbit_space table has vertical tab as whitespace; we have to
1.3 misha 894: ensure it gets set as not whitespace. */
1.1 misha 895:
896: case OP_NOT_WHITESPACE:
1.3 misha 897: set_nottype_bits(start_bits, cbit_space, table_limit, cd);
898: start_bits[1] |= 0x08;
1.1 misha 899: break;
900:
901: /* The cbit_space table has vertical tab as whitespace; we have to
1.3 misha 902: avoid setting it. */
1.1 misha 903:
904: case OP_WHITESPACE:
1.3 misha 905: c = start_bits[1]; /* Save in case it was already set */
906: set_type_bits(start_bits, cbit_space, table_limit, cd);
907: start_bits[1] = (start_bits[1] & ~0x08) | c;
1.1 misha 908: break;
909:
910: case OP_NOT_WORDCHAR:
1.3 misha 911: set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1.1 misha 912: break;
913:
914: case OP_WORDCHAR:
1.3 misha 915: set_type_bits(start_bits, cbit_word, table_limit, cd);
1.1 misha 916: break;
917: }
918:
919: tcode += 2;
920: break;
921:
922: /* Character class where all the information is in a bit map: set the
923: bits and either carry on or not, according to the repeat count. If it was
924: a negative class, and we are operating with UTF-8 characters, any byte
925: with a value >= 0xc4 is a potentially valid starter because it starts a
926: character with a value > 255. */
927:
928: case OP_NCLASS:
929: #ifdef SUPPORT_UTF8
930: if (utf8)
931: {
932: start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
933: memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
934: }
935: #endif
936: /* Fall through */
937:
938: case OP_CLASS:
939: {
940: tcode++;
941:
942: /* In UTF-8 mode, the bits in a bit map correspond to character
943: values, not to byte values. However, the bit map we are constructing is
944: for byte values. So we have to do a conversion for characters whose
945: value is > 127. In fact, there are only two possible starting bytes for
946: characters in the range 128 - 255. */
947:
948: #ifdef SUPPORT_UTF8
949: if (utf8)
950: {
951: for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
952: for (c = 128; c < 256; c++)
953: {
954: if ((tcode[c/8] && (1 << (c&7))) != 0)
955: {
956: int d = (c >> 6) | 0xc0; /* Set bit for this starter */
957: start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
958: c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
959: }
960: }
961: }
962:
963: /* In non-UTF-8 mode, the two bit maps are completely compatible. */
964:
965: else
966: #endif
967: {
968: for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
969: }
970:
971: /* Advance past the bit map, and act on what follows */
972:
973: tcode += 32;
974: switch (*tcode)
975: {
976: case OP_CRSTAR:
977: case OP_CRMINSTAR:
978: case OP_CRQUERY:
979: case OP_CRMINQUERY:
980: tcode++;
981: break;
982:
983: case OP_CRRANGE:
984: case OP_CRMINRANGE:
985: if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
986: else try_next = FALSE;
987: break;
988:
989: default:
990: try_next = FALSE;
991: break;
992: }
993: }
994: break; /* End of bitmap class handling */
995:
996: } /* End of switch */
997: } /* End of try_next loop */
998:
999: code += GET(code, 1); /* Advance to next branch */
1000: }
1001: while (*code == OP_ALT);
1002: return yield;
1003: }
1004:
1005:
1006:
1007: /*************************************************
1008: * Study a compiled expression *
1009: *************************************************/
1010:
1011: /* This function is handed a compiled expression that it must study to produce
1012: information that will speed up the matching. It returns a pcre_extra block
1013: which then gets handed back to pcre_exec().
1014:
1015: Arguments:
1016: re points to the compiled expression
1017: options contains option bits
1018: errorptr points to where to place error messages;
1019: set NULL unless error
1020:
1021: Returns: pointer to a pcre_extra block, with study_data filled in and the
1.3 misha 1022: appropriate flags set;
1.1 misha 1023: NULL on error or if no optimization possible
1024: */
1025:
1.2 misha 1026: PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1.1 misha 1027: pcre_study(const pcre *external_re, int options, const char **errorptr)
1028: {
1.3 misha 1029: int min;
1030: BOOL bits_set = FALSE;
1.1 misha 1031: uschar start_bits[32];
1032: pcre_extra *extra;
1033: pcre_study_data *study;
1034: const uschar *tables;
1035: uschar *code;
1036: compile_data compile_block;
1037: const real_pcre *re = (const real_pcre *)external_re;
1038:
1039: *errorptr = NULL;
1040:
1041: if (re == NULL || re->magic_number != MAGIC_NUMBER)
1042: {
1043: *errorptr = "argument is not a compiled regular expression";
1044: return NULL;
1045: }
1046:
1047: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1048: {
1049: *errorptr = "unknown or incorrect option bit(s) set";
1050: return NULL;
1051: }
1052:
1053: code = (uschar *)re + re->name_table_offset +
1054: (re->name_count * re->name_entry_size);
1055:
1056: /* For an anchored pattern, or an unanchored pattern that has a first char, or
1.3 misha 1057: a multiline pattern that matches only at "line starts", there is no point in
1058: seeking a list of starting bytes. */
1059:
1060: if ((re->options & PCRE_ANCHORED) == 0 &&
1061: (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1062: {
1063: /* Set the character tables in the block that is passed around */
1.1 misha 1064:
1.3 misha 1065: tables = re->tables;
1066: if (tables == NULL)
1067: (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1068: (void *)(&tables));
1069:
1070: compile_block.lcc = tables + lcc_offset;
1071: compile_block.fcc = tables + fcc_offset;
1072: compile_block.cbits = tables + cbits_offset;
1073: compile_block.ctypes = tables + ctypes_offset;
1074:
1075: /* See if we can find a fixed set of initial characters for the pattern. */
1076:
1077: memset(start_bits, 0, 32 * sizeof(uschar));
1078: bits_set = set_start_bits(code, start_bits,
1079: (re->options & PCRE_CASELESS) != 0, (re->options & PCRE_UTF8) != 0,
1080: &compile_block) == SSB_DONE;
1081: }
1082:
1083: /* Find the minimum length of subject string. */
1084:
1085: min = find_minlength(code, code, re->options);
1.1 misha 1086:
1.3 misha 1087: /* Return NULL if no optimization is possible. */
1.1 misha 1088:
1.3 misha 1089: if (!bits_set && min < 0) return NULL;
1.1 misha 1090:
1091: /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
1092: the latter, which is pointed to by the former, which may also get additional
1093: data set later by the calling program. At the moment, the size of
1094: pcre_study_data is fixed. We nevertheless save it in a field for returning via
1095: the pcre_fullinfo() function so that if it becomes variable in the future, we
1096: don't have to change that code. */
1097:
1098: extra = (pcre_extra *)(pcre_malloc)
1099: (sizeof(pcre_extra) + sizeof(pcre_study_data));
1100:
1101: if (extra == NULL)
1102: {
1103: *errorptr = "failed to get memory";
1104: return NULL;
1105: }
1106:
1107: study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
1108: extra->flags = PCRE_EXTRA_STUDY_DATA;
1109: extra->study_data = study;
1110:
1111: study->size = sizeof(pcre_study_data);
1.3 misha 1112: study->flags = 0;
1113:
1114: if (bits_set)
1115: {
1116: study->flags |= PCRE_STUDY_MAPPED;
1117: memcpy(study->start_bits, start_bits, sizeof(start_bits));
1118: }
1119:
1120: if (min >= 0)
1121: {
1122: study->flags |= PCRE_STUDY_MINLEN;
1123: study->minlength = min;
1124: }
1.1 misha 1125:
1126: return extra;
1127: }
1128:
1129: /* End of pcre_study.c */
E-mail: