Annotation of win32/pcre/pcre_study.c, revision 1.5
1.1 misha 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
1.5 ! misha 9: Copyright (c) 1997-2012 University of Cambridge
1.1 misha 10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40:
41: /* This module contains the external function pcre_study(), along with local
42: supporting functions. */
43:
44:
45: #ifdef HAVE_CONFIG_H
46: #include "config.h"
47: #endif
48:
49: #include "pcre_internal.h"
50:
1.3 misha 51: #define SET_BIT(c) start_bits[c/8] |= (1 << (c&7))
1.1 misha 52:
53: /* Returns from set_start_bits() */
54:
1.5 ! misha 55: enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
1.1 misha 56:
57:
1.3 misha 58:
59: /*************************************************
60: * Find the minimum subject length for a group *
61: *************************************************/
62:
63: /* Scan a parenthesized group and compute the minimum length of subject that
64: is needed to match it. This is a lower bound; it does not mean there is a
65: string of that length that matches. In UTF8 mode, the result is in characters
66: rather than bytes.
67:
68: Arguments:
1.5 ! misha 69: code pointer to start of group (the bracket)
! 70: startcode pointer to start of the whole pattern
! 71: options the compiling options
! 72: int RECURSE depth
1.3 misha 73:
74: Returns: the minimum length
1.5 ! misha 75: -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
1.3 misha 76: -2 internal error (missing capturing bracket)
1.5 ! misha 77: -3 internal error (opcode not listed)
1.3 misha 78: */
79:
80: static int
1.5 ! misha 81: find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
! 82: int recurse_depth)
1.3 misha 83: {
84: int length = -1;
1.5 ! misha 85: /* PCRE_UTF16 has the same value as PCRE_UTF8. */
! 86: BOOL utf = (options & PCRE_UTF8) != 0;
1.3 misha 87: BOOL had_recurse = FALSE;
88: register int branchlength = 0;
1.5 ! misha 89: register pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
1.3 misha 90:
1.5 ! misha 91: if (*code == OP_CBRA || *code == OP_SCBRA ||
! 92: *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
1.3 misha 93:
94: /* Scan along the opcodes for this branch. If we get to the end of the
95: branch, check the length against that of the other branches. */
96:
97: for (;;)
98: {
99: int d, min;
1.5 ! misha 100: pcre_uchar *cs, *ce;
1.3 misha 101: register int op = *cc;
102:
103: switch (op)
104: {
105: case OP_COND:
106: case OP_SCOND:
107:
108: /* If there is only one branch in a condition, the implied branch has zero
109: length, so we don't add anything. This covers the DEFINE "condition"
110: automatically. */
111:
112: cs = cc + GET(cc, 1);
113: if (*cs != OP_ALT)
114: {
115: cc = cs + 1 + LINK_SIZE;
116: break;
117: }
118:
119: /* Otherwise we can fall through and treat it the same as any other
120: subpattern. */
121:
122: case OP_CBRA:
123: case OP_SCBRA:
124: case OP_BRA:
125: case OP_SBRA:
1.5 ! misha 126: case OP_CBRAPOS:
! 127: case OP_SCBRAPOS:
! 128: case OP_BRAPOS:
! 129: case OP_SBRAPOS:
1.3 misha 130: case OP_ONCE:
1.5 ! misha 131: case OP_ONCE_NC:
! 132: d = find_minlength(cc, startcode, options, recurse_depth);
1.3 misha 133: if (d < 0) return d;
134: branchlength += d;
135: do cc += GET(cc, 1); while (*cc == OP_ALT);
136: cc += 1 + LINK_SIZE;
137: break;
138:
1.5 ! misha 139: /* ACCEPT makes things far too complicated; we have to give up. */
! 140:
! 141: case OP_ACCEPT:
! 142: case OP_ASSERT_ACCEPT:
! 143: return -1;
! 144:
1.3 misha 145: /* Reached end of a branch; if it's a ket it is the end of a nested
1.5 ! misha 146: call. If it's ALT it is an alternation in a nested call. If it is END it's
! 147: the end of the outer call. All can be handled by the same code. If an
! 148: ACCEPT was previously encountered, use the length that was in force at that
! 149: time, and pass back the shortest ACCEPT length. */
1.3 misha 150:
151: case OP_ALT:
152: case OP_KET:
153: case OP_KETRMAX:
154: case OP_KETRMIN:
1.5 ! misha 155: case OP_KETRPOS:
1.3 misha 156: case OP_END:
157: if (length < 0 || (!had_recurse && branchlength < length))
158: length = branchlength;
1.5 ! misha 159: if (op != OP_ALT) return length;
1.3 misha 160: cc += 1 + LINK_SIZE;
161: branchlength = 0;
162: had_recurse = FALSE;
163: break;
164:
165: /* Skip over assertive subpatterns */
166:
167: case OP_ASSERT:
168: case OP_ASSERT_NOT:
169: case OP_ASSERTBACK:
170: case OP_ASSERTBACK_NOT:
171: do cc += GET(cc, 1); while (*cc == OP_ALT);
172: /* Fall through */
173:
174: /* Skip over things that don't match chars */
175:
176: case OP_REVERSE:
177: case OP_CREF:
178: case OP_NCREF:
179: case OP_RREF:
180: case OP_NRREF:
181: case OP_DEF:
182: case OP_CALLOUT:
183: case OP_SOD:
184: case OP_SOM:
185: case OP_EOD:
186: case OP_EODN:
187: case OP_CIRC:
1.5 ! misha 188: case OP_CIRCM:
1.3 misha 189: case OP_DOLL:
1.5 ! misha 190: case OP_DOLLM:
1.3 misha 191: case OP_NOT_WORD_BOUNDARY:
192: case OP_WORD_BOUNDARY:
1.5 ! misha 193: cc += PRIV(OP_lengths)[*cc];
1.3 misha 194: break;
195:
196: /* Skip over a subpattern that has a {0} or {0,x} quantifier */
197:
198: case OP_BRAZERO:
199: case OP_BRAMINZERO:
1.5 ! misha 200: case OP_BRAPOSZERO:
1.3 misha 201: case OP_SKIPZERO:
1.5 ! misha 202: cc += PRIV(OP_lengths)[*cc];
1.3 misha 203: do cc += GET(cc, 1); while (*cc == OP_ALT);
204: cc += 1 + LINK_SIZE;
205: break;
206:
207: /* Handle literal characters and + repetitions */
208:
209: case OP_CHAR:
1.5 ! misha 210: case OP_CHARI:
1.3 misha 211: case OP_NOT:
1.5 ! misha 212: case OP_NOTI:
1.3 misha 213: case OP_PLUS:
1.5 ! misha 214: case OP_PLUSI:
1.3 misha 215: case OP_MINPLUS:
1.5 ! misha 216: case OP_MINPLUSI:
1.3 misha 217: case OP_POSPLUS:
1.5 ! misha 218: case OP_POSPLUSI:
1.3 misha 219: case OP_NOTPLUS:
1.5 ! misha 220: case OP_NOTPLUSI:
1.3 misha 221: case OP_NOTMINPLUS:
1.5 ! misha 222: case OP_NOTMINPLUSI:
1.3 misha 223: case OP_NOTPOSPLUS:
1.5 ! misha 224: case OP_NOTPOSPLUSI:
1.3 misha 225: branchlength++;
226: cc += 2;
1.5 ! misha 227: #ifdef SUPPORT_UTF
! 228: if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1.3 misha 229: #endif
230: break;
231:
232: case OP_TYPEPLUS:
233: case OP_TYPEMINPLUS:
234: case OP_TYPEPOSPLUS:
235: branchlength++;
236: cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
237: break;
238:
239: /* Handle exact repetitions. The count is already in characters, but we
240: need to skip over a multibyte character in UTF8 mode. */
241:
242: case OP_EXACT:
1.5 ! misha 243: case OP_EXACTI:
1.3 misha 244: case OP_NOTEXACT:
1.5 ! misha 245: case OP_NOTEXACTI:
1.3 misha 246: branchlength += GET2(cc,1);
1.5 ! misha 247: cc += 2 + IMM2_SIZE;
! 248: #ifdef SUPPORT_UTF
! 249: if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1.3 misha 250: #endif
251: break;
252:
253: case OP_TYPEEXACT:
254: branchlength += GET2(cc,1);
1.5 ! misha 255: cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
! 256: || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
1.3 misha 257: break;
258:
259: /* Handle single-char non-literal matchers */
260:
261: case OP_PROP:
262: case OP_NOTPROP:
263: cc += 2;
264: /* Fall through */
265:
266: case OP_NOT_DIGIT:
267: case OP_DIGIT:
268: case OP_NOT_WHITESPACE:
269: case OP_WHITESPACE:
270: case OP_NOT_WORDCHAR:
271: case OP_WORDCHAR:
272: case OP_ANY:
273: case OP_ALLANY:
274: case OP_EXTUNI:
275: case OP_HSPACE:
276: case OP_NOT_HSPACE:
277: case OP_VSPACE:
278: case OP_NOT_VSPACE:
279: branchlength++;
280: cc++;
281: break;
282:
1.5 ! misha 283: /* "Any newline" might match two characters, but it also might match just
! 284: one. */
1.3 misha 285:
286: case OP_ANYNL:
1.5 ! misha 287: branchlength += 1;
1.3 misha 288: cc++;
289: break;
290:
1.5 ! misha 291: /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
! 292: non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
! 293: appear, but leave the code, just in case.) */
1.3 misha 294:
295: case OP_ANYBYTE:
1.5 ! misha 296: #ifdef SUPPORT_UTF
! 297: if (utf) return -1;
1.3 misha 298: #endif
299: branchlength++;
300: cc++;
301: break;
302:
303: /* For repeated character types, we have to test for \p and \P, which have
304: an extra two bytes of parameters. */
305:
306: case OP_TYPESTAR:
307: case OP_TYPEMINSTAR:
308: case OP_TYPEQUERY:
309: case OP_TYPEMINQUERY:
310: case OP_TYPEPOSSTAR:
311: case OP_TYPEPOSQUERY:
312: if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
1.5 ! misha 313: cc += PRIV(OP_lengths)[op];
1.3 misha 314: break;
315:
316: case OP_TYPEUPTO:
317: case OP_TYPEMINUPTO:
318: case OP_TYPEPOSUPTO:
1.5 ! misha 319: if (cc[1 + IMM2_SIZE] == OP_PROP
! 320: || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
! 321: cc += PRIV(OP_lengths)[op];
1.3 misha 322: break;
323:
324: /* Check a class for variable quantification */
325:
1.5 ! misha 326: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
1.3 misha 327: case OP_XCLASS:
1.5 ! misha 328: cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
1.3 misha 329: /* Fall through */
330: #endif
331:
332: case OP_CLASS:
333: case OP_NCLASS:
1.5 ! misha 334: cc += PRIV(OP_lengths)[OP_CLASS];
1.3 misha 335:
336: switch (*cc)
337: {
338: case OP_CRPLUS:
339: case OP_CRMINPLUS:
340: branchlength++;
341: /* Fall through */
342:
343: case OP_CRSTAR:
344: case OP_CRMINSTAR:
345: case OP_CRQUERY:
346: case OP_CRMINQUERY:
347: cc++;
348: break;
349:
350: case OP_CRRANGE:
351: case OP_CRMINRANGE:
352: branchlength += GET2(cc,1);
1.5 ! misha 353: cc += 1 + 2 * IMM2_SIZE;
1.3 misha 354: break;
355:
356: default:
357: branchlength++;
358: break;
359: }
360: break;
361:
362: /* Backreferences and subroutine calls are treated in the same way: we find
363: the minimum length for the subpattern. A recursion, however, causes an
364: a flag to be set that causes the length of this branch to be ignored. The
365: logic is that a recursion can only make sense if there is another
366: alternation that stops the recursing. That will provide the minimum length
367: (when no recursion happens). A backreference within the group that it is
368: referencing behaves in the same way.
369:
370: If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
371: matches an empty string (by default it causes a matching failure), so in
372: that case we must set the minimum length to zero. */
373:
374: case OP_REF:
1.5 ! misha 375: case OP_REFI:
1.3 misha 376: if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
377: {
1.5 ! misha 378: ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
1.3 misha 379: if (cs == NULL) return -2;
380: do ce += GET(ce, 1); while (*ce == OP_ALT);
381: if (cc > cs && cc < ce)
382: {
383: d = 0;
384: had_recurse = TRUE;
385: }
1.5 ! misha 386: else
! 387: {
! 388: d = find_minlength(cs, startcode, options, recurse_depth);
! 389: }
1.3 misha 390: }
391: else d = 0;
1.5 ! misha 392: cc += 1 + IMM2_SIZE;
1.3 misha 393:
394: /* Handle repeated back references */
395:
396: switch (*cc)
397: {
398: case OP_CRSTAR:
399: case OP_CRMINSTAR:
400: case OP_CRQUERY:
401: case OP_CRMINQUERY:
402: min = 0;
403: cc++;
404: break;
405:
1.5 ! misha 406: case OP_CRPLUS:
! 407: case OP_CRMINPLUS:
! 408: min = 1;
! 409: cc++;
! 410: break;
! 411:
1.3 misha 412: case OP_CRRANGE:
413: case OP_CRMINRANGE:
414: min = GET2(cc, 1);
1.5 ! misha 415: cc += 1 + 2 * IMM2_SIZE;
1.3 misha 416: break;
417:
418: default:
419: min = 1;
420: break;
421: }
422:
423: branchlength += min * d;
424: break;
425:
1.5 ! misha 426: /* We can easily detect direct recursion, but not mutual recursion. This is
! 427: caught by a recursion depth count. */
! 428:
1.3 misha 429: case OP_RECURSE:
1.5 ! misha 430: cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
1.3 misha 431: do ce += GET(ce, 1); while (*ce == OP_ALT);
1.5 ! misha 432: if ((cc > cs && cc < ce) || recurse_depth > 10)
1.3 misha 433: had_recurse = TRUE;
434: else
1.5 ! misha 435: {
! 436: branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
! 437: }
1.3 misha 438: cc += 1 + LINK_SIZE;
439: break;
440:
441: /* Anything else does not or need not match a character. We can get the
442: item's length from the table, but for those that can match zero occurrences
1.5 ! misha 443: of a character, we must take special action for UTF-8 characters. As it
! 444: happens, the "NOT" versions of these opcodes are used at present only for
! 445: ASCII characters, so they could be omitted from this list. However, in
! 446: future that may change, so we include them here so as not to leave a
! 447: gotcha for a future maintainer. */
1.3 misha 448:
449: case OP_UPTO:
1.5 ! misha 450: case OP_UPTOI:
1.3 misha 451: case OP_NOTUPTO:
1.5 ! misha 452: case OP_NOTUPTOI:
1.3 misha 453: case OP_MINUPTO:
1.5 ! misha 454: case OP_MINUPTOI:
1.3 misha 455: case OP_NOTMINUPTO:
1.5 ! misha 456: case OP_NOTMINUPTOI:
1.3 misha 457: case OP_POSUPTO:
1.5 ! misha 458: case OP_POSUPTOI:
! 459: case OP_NOTPOSUPTO:
! 460: case OP_NOTPOSUPTOI:
! 461:
1.3 misha 462: case OP_STAR:
1.5 ! misha 463: case OP_STARI:
! 464: case OP_NOTSTAR:
! 465: case OP_NOTSTARI:
1.3 misha 466: case OP_MINSTAR:
1.5 ! misha 467: case OP_MINSTARI:
1.3 misha 468: case OP_NOTMINSTAR:
1.5 ! misha 469: case OP_NOTMINSTARI:
1.3 misha 470: case OP_POSSTAR:
1.5 ! misha 471: case OP_POSSTARI:
1.3 misha 472: case OP_NOTPOSSTAR:
1.5 ! misha 473: case OP_NOTPOSSTARI:
! 474:
1.3 misha 475: case OP_QUERY:
1.5 ! misha 476: case OP_QUERYI:
! 477: case OP_NOTQUERY:
! 478: case OP_NOTQUERYI:
1.3 misha 479: case OP_MINQUERY:
1.5 ! misha 480: case OP_MINQUERYI:
1.3 misha 481: case OP_NOTMINQUERY:
1.5 ! misha 482: case OP_NOTMINQUERYI:
1.3 misha 483: case OP_POSQUERY:
1.5 ! misha 484: case OP_POSQUERYI:
1.3 misha 485: case OP_NOTPOSQUERY:
1.5 ! misha 486: case OP_NOTPOSQUERYI:
! 487:
! 488: cc += PRIV(OP_lengths)[op];
! 489: #ifdef SUPPORT_UTF
! 490: if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1.3 misha 491: #endif
492: break;
493:
494: /* Skip these, but we need to add in the name length. */
495:
496: case OP_MARK:
497: case OP_PRUNE_ARG:
498: case OP_SKIP_ARG:
1.5 ! misha 499: case OP_THEN_ARG:
! 500: cc += PRIV(OP_lengths)[op] + cc[1];
1.4 misha 501: break;
502:
1.5 ! misha 503: /* The remaining opcodes are just skipped over. */
! 504:
! 505: case OP_CLOSE:
! 506: case OP_COMMIT:
! 507: case OP_FAIL:
! 508: case OP_PRUNE:
! 509: case OP_SET_SOM:
! 510: case OP_SKIP:
! 511: case OP_THEN:
! 512: cc += PRIV(OP_lengths)[op];
1.3 misha 513: break;
514:
1.5 ! misha 515: /* This should not occur: we list all opcodes explicitly so that when
! 516: new ones get added they are properly considered. */
1.3 misha 517:
518: default:
1.5 ! misha 519: return -3;
1.3 misha 520: }
521: }
522: /* Control never gets here */
523: }
524:
525:
526:
1.1 misha 527: /*************************************************
528: * Set a bit and maybe its alternate case *
529: *************************************************/
530:
1.3 misha 531: /* Given a character, set its first byte's bit in the table, and also the
532: corresponding bit for the other version of a letter if we are caseless. In
533: UTF-8 mode, for characters greater than 127, we can only do the caseless thing
534: when Unicode property support is available.
1.1 misha 535:
536: Arguments:
537: start_bits points to the bit map
1.3 misha 538: p points to the character
1.1 misha 539: caseless the caseless flag
540: cd the block with char table pointers
1.5 ! misha 541: utf TRUE for UTF-8 / UTF-16 mode
1.3 misha 542:
543: Returns: pointer after the character
544: */
545:
1.5 ! misha 546: static const pcre_uchar *
! 547: set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
! 548: compile_data *cd, BOOL utf)
1.3 misha 549: {
550: unsigned int c = *p;
551:
1.5 ! misha 552: #ifdef COMPILE_PCRE8
1.3 misha 553: SET_BIT(c);
554:
1.5 ! misha 555: #ifdef SUPPORT_UTF
! 556: if (utf && c > 127)
1.3 misha 557: {
558: GETCHARINC(c, p);
559: #ifdef SUPPORT_UCP
560: if (caseless)
561: {
1.5 ! misha 562: pcre_uchar buff[6];
1.3 misha 563: c = UCD_OTHERCASE(c);
1.5 ! misha 564: (void)PRIV(ord2utf)(c, buff);
1.3 misha 565: SET_BIT(buff[0]);
566: }
567: #endif
568: return p;
569: }
570: #endif
571:
572: /* Not UTF-8 mode, or character is less than 127. */
573:
574: if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
575: return p + 1;
1.5 ! misha 576: #endif
! 577:
! 578: #ifdef COMPILE_PCRE16
! 579: if (c > 0xff)
! 580: {
! 581: c = 0xff;
! 582: caseless = FALSE;
! 583: }
! 584: SET_BIT(c);
! 585:
! 586: #ifdef SUPPORT_UTF
! 587: if (utf && c > 127)
! 588: {
! 589: GETCHARINC(c, p);
! 590: #ifdef SUPPORT_UCP
! 591: if (caseless)
! 592: {
! 593: c = UCD_OTHERCASE(c);
! 594: if (c > 0xff)
! 595: c = 0xff;
! 596: SET_BIT(c);
! 597: }
! 598: #endif
! 599: return p;
! 600: }
! 601: #endif
! 602:
! 603: if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
! 604: return p + 1;
! 605: #endif
1.3 misha 606: }
607:
608:
609:
610: /*************************************************
611: * Set bits for a positive character type *
612: *************************************************/
613:
614: /* This function sets starting bits for a character type. In UTF-8 mode, we can
615: only do a direct setting for bytes less than 128, as otherwise there can be
616: confusion with bytes in the middle of UTF-8 characters. In a "traditional"
617: environment, the tables will only recognize ASCII characters anyway, but in at
618: least one Windows environment, some higher bytes bits were set in the tables.
619: So we deal with that case by considering the UTF-8 encoding.
620:
621: Arguments:
622: start_bits the starting bitmap
623: cbit type the type of character wanted
624: table_limit 32 for non-UTF-8; 16 for UTF-8
625: cd the block with char table pointers
626:
627: Returns: nothing
628: */
629:
630: static void
1.5 ! misha 631: set_type_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
1.3 misha 632: compile_data *cd)
633: {
634: register int c;
635: for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
1.5 ! misha 636: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1.3 misha 637: if (table_limit == 32) return;
638: for (c = 128; c < 256; c++)
639: {
640: if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
641: {
1.5 ! misha 642: pcre_uchar buff[6];
! 643: (void)PRIV(ord2utf)(c, buff);
1.3 misha 644: SET_BIT(buff[0]);
645: }
646: }
1.5 ! misha 647: #endif
1.3 misha 648: }
649:
650:
651: /*************************************************
652: * Set bits for a negative character type *
653: *************************************************/
654:
655: /* This function sets starting bits for a negative character type such as \D.
656: In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
657: otherwise there can be confusion with bytes in the middle of UTF-8 characters.
658: Unlike in the positive case, where we can set appropriate starting bits for
659: specific high-valued UTF-8 characters, in this case we have to set the bits for
660: all high-valued characters. The lowest is 0xc2, but we overkill by starting at
661: 0xc0 (192) for simplicity.
1.1 misha 662:
1.3 misha 663: Arguments:
664: start_bits the starting bitmap
665: cbit type the type of character wanted
666: table_limit 32 for non-UTF-8; 16 for UTF-8
667: cd the block with char table pointers
668:
669: Returns: nothing
1.1 misha 670: */
671:
672: static void
1.5 ! misha 673: set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
1.3 misha 674: compile_data *cd)
1.1 misha 675: {
1.3 misha 676: register int c;
677: for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
1.5 ! misha 678: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1.3 misha 679: if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
1.5 ! misha 680: #endif
1.1 misha 681: }
682:
683:
684:
685: /*************************************************
686: * Create bitmap of starting bytes *
687: *************************************************/
688:
689: /* This function scans a compiled unanchored expression recursively and
690: attempts to build a bitmap of the set of possible starting bytes. As time goes
691: by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
692: useful for parenthesized groups in patterns such as (a*)b where the group
693: provides some optional starting bytes but scanning must continue at the outer
694: level to find at least one mandatory byte. At the outermost level, this
695: function fails unless the result is SSB_DONE.
696:
697: Arguments:
698: code points to an expression
699: start_bits points to a 32-byte table, initialized to 0
1.5 ! misha 700: utf TRUE if in UTF-8 / UTF-16 mode
1.1 misha 701: cd the block with char table pointers
702:
703: Returns: SSB_FAIL => Failed to find any starting bytes
704: SSB_DONE => Found mandatory starting bytes
705: SSB_CONTINUE => Found optional starting bytes
1.5 ! misha 706: SSB_UNKNOWN => Hit an unrecognized opcode
1.1 misha 707: */
708:
709: static int
1.5 ! misha 710: set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
! 711: compile_data *cd)
1.1 misha 712: {
713: register int c;
714: int yield = SSB_DONE;
1.5 ! misha 715: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
! 716: int table_limit = utf? 16:32;
! 717: #else
! 718: int table_limit = 32;
! 719: #endif
1.1 misha 720:
721: #if 0
722: /* ========================================================================= */
723: /* The following comment and code was inserted in January 1999. In May 2006,
724: when it was observed to cause compiler warnings about unused values, I took it
725: out again. If anybody is still using OS/2, they will have to put it back
726: manually. */
727:
728: /* This next statement and the later reference to dummy are here in order to
729: trick the optimizer of the IBM C compiler for OS/2 into generating correct
730: code. Apparently IBM isn't going to fix the problem, and we would rather not
731: disable optimization (in this module it actually makes a big difference, and
732: the pcre module can use all the optimization it can get). */
733:
734: volatile int dummy;
735: /* ========================================================================= */
736: #endif
737:
738: do
739: {
740: BOOL try_next = TRUE;
1.5 ! misha 741: const pcre_uchar *tcode = code + 1 + LINK_SIZE;
! 742:
! 743: if (*code == OP_CBRA || *code == OP_SCBRA ||
! 744: *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
1.1 misha 745:
746: while (try_next) /* Loop for items in this branch */
747: {
748: int rc;
1.5 ! misha 749:
1.1 misha 750: switch(*tcode)
751: {
1.5 ! misha 752: /* If we reach something we don't understand, it means a new opcode has
! 753: been created that hasn't been added to this code. Hopefully this problem
! 754: will be discovered during testing. */
1.1 misha 755:
756: default:
1.5 ! misha 757: return SSB_UNKNOWN;
! 758:
! 759: /* Fail for a valid opcode that implies no starting bits. */
! 760:
! 761: case OP_ACCEPT:
! 762: case OP_ASSERT_ACCEPT:
! 763: case OP_ALLANY:
! 764: case OP_ANY:
! 765: case OP_ANYBYTE:
! 766: case OP_CIRC:
! 767: case OP_CIRCM:
! 768: case OP_CLOSE:
! 769: case OP_COMMIT:
! 770: case OP_COND:
! 771: case OP_CREF:
! 772: case OP_DEF:
! 773: case OP_DOLL:
! 774: case OP_DOLLM:
! 775: case OP_END:
! 776: case OP_EOD:
! 777: case OP_EODN:
! 778: case OP_EXTUNI:
! 779: case OP_FAIL:
! 780: case OP_MARK:
! 781: case OP_NCREF:
! 782: case OP_NOT:
! 783: case OP_NOTEXACT:
! 784: case OP_NOTEXACTI:
! 785: case OP_NOTI:
! 786: case OP_NOTMINPLUS:
! 787: case OP_NOTMINPLUSI:
! 788: case OP_NOTMINQUERY:
! 789: case OP_NOTMINQUERYI:
! 790: case OP_NOTMINSTAR:
! 791: case OP_NOTMINSTARI:
! 792: case OP_NOTMINUPTO:
! 793: case OP_NOTMINUPTOI:
! 794: case OP_NOTPLUS:
! 795: case OP_NOTPLUSI:
! 796: case OP_NOTPOSPLUS:
! 797: case OP_NOTPOSPLUSI:
! 798: case OP_NOTPOSQUERY:
! 799: case OP_NOTPOSQUERYI:
! 800: case OP_NOTPOSSTAR:
! 801: case OP_NOTPOSSTARI:
! 802: case OP_NOTPOSUPTO:
! 803: case OP_NOTPOSUPTOI:
! 804: case OP_NOTPROP:
! 805: case OP_NOTQUERY:
! 806: case OP_NOTQUERYI:
! 807: case OP_NOTSTAR:
! 808: case OP_NOTSTARI:
! 809: case OP_NOTUPTO:
! 810: case OP_NOTUPTOI:
! 811: case OP_NOT_HSPACE:
! 812: case OP_NOT_VSPACE:
! 813: case OP_NRREF:
! 814: case OP_PROP:
! 815: case OP_PRUNE:
! 816: case OP_PRUNE_ARG:
! 817: case OP_RECURSE:
! 818: case OP_REF:
! 819: case OP_REFI:
! 820: case OP_REVERSE:
! 821: case OP_RREF:
! 822: case OP_SCOND:
! 823: case OP_SET_SOM:
! 824: case OP_SKIP:
! 825: case OP_SKIP_ARG:
! 826: case OP_SOD:
! 827: case OP_SOM:
! 828: case OP_THEN:
! 829: case OP_THEN_ARG:
! 830: #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
! 831: case OP_XCLASS:
! 832: #endif
1.1 misha 833: return SSB_FAIL;
834:
1.5 ! misha 835: /* We can ignore word boundary tests. */
! 836:
! 837: case OP_WORD_BOUNDARY:
! 838: case OP_NOT_WORD_BOUNDARY:
! 839: tcode++;
! 840: break;
! 841:
1.1 misha 842: /* If we hit a bracket or a positive lookahead assertion, recurse to set
843: bits from within the subpattern. If it can't find anything, we have to
844: give up. If it finds some mandatory character(s), we are done for this
845: branch. Otherwise, carry on scanning after the subpattern. */
846:
847: case OP_BRA:
848: case OP_SBRA:
849: case OP_CBRA:
850: case OP_SCBRA:
1.5 ! misha 851: case OP_BRAPOS:
! 852: case OP_SBRAPOS:
! 853: case OP_CBRAPOS:
! 854: case OP_SCBRAPOS:
1.1 misha 855: case OP_ONCE:
1.5 ! misha 856: case OP_ONCE_NC:
1.1 misha 857: case OP_ASSERT:
1.5 ! misha 858: rc = set_start_bits(tcode, start_bits, utf, cd);
! 859: if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1.1 misha 860: if (rc == SSB_DONE) try_next = FALSE; else
861: {
862: do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
863: tcode += 1 + LINK_SIZE;
864: }
865: break;
866:
867: /* If we hit ALT or KET, it means we haven't found anything mandatory in
868: this branch, though we might have found something optional. For ALT, we
869: continue with the next alternative, but we have to arrange that the final
870: result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
871: return SSB_CONTINUE: if this is the top level, that indicates failure,
872: but after a nested subpattern, it causes scanning to continue. */
873:
874: case OP_ALT:
875: yield = SSB_CONTINUE;
876: try_next = FALSE;
877: break;
878:
879: case OP_KET:
880: case OP_KETRMAX:
881: case OP_KETRMIN:
1.5 ! misha 882: case OP_KETRPOS:
1.1 misha 883: return SSB_CONTINUE;
884:
885: /* Skip over callout */
886:
887: case OP_CALLOUT:
888: tcode += 2 + 2*LINK_SIZE;
889: break;
890:
891: /* Skip over lookbehind and negative lookahead assertions */
892:
893: case OP_ASSERT_NOT:
894: case OP_ASSERTBACK:
895: case OP_ASSERTBACK_NOT:
896: do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
897: tcode += 1 + LINK_SIZE;
898: break;
899:
900: /* BRAZERO does the bracket, but carries on. */
901:
902: case OP_BRAZERO:
903: case OP_BRAMINZERO:
1.5 ! misha 904: case OP_BRAPOSZERO:
! 905: rc = set_start_bits(++tcode, start_bits, utf, cd);
! 906: if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1.1 misha 907: /* =========================================================================
908: See the comment at the head of this function concerning the next line,
909: which was an old fudge for the benefit of OS/2.
910: dummy = 1;
911: ========================================================================= */
912: do tcode += GET(tcode,1); while (*tcode == OP_ALT);
913: tcode += 1 + LINK_SIZE;
914: break;
915:
916: /* SKIPZERO skips the bracket. */
917:
918: case OP_SKIPZERO:
1.2 misha 919: tcode++;
1.1 misha 920: do tcode += GET(tcode,1); while (*tcode == OP_ALT);
921: tcode += 1 + LINK_SIZE;
922: break;
923:
924: /* Single-char * or ? sets the bit and tries the next item */
925:
926: case OP_STAR:
927: case OP_MINSTAR:
928: case OP_POSSTAR:
929: case OP_QUERY:
930: case OP_MINQUERY:
931: case OP_POSQUERY:
1.5 ! misha 932: tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
! 933: break;
! 934:
! 935: case OP_STARI:
! 936: case OP_MINSTARI:
! 937: case OP_POSSTARI:
! 938: case OP_QUERYI:
! 939: case OP_MINQUERYI:
! 940: case OP_POSQUERYI:
! 941: tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1.1 misha 942: break;
943:
944: /* Single-char upto sets the bit and tries the next */
945:
946: case OP_UPTO:
947: case OP_MINUPTO:
948: case OP_POSUPTO:
1.5 ! misha 949: tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
! 950: break;
! 951:
! 952: case OP_UPTOI:
! 953: case OP_MINUPTOI:
! 954: case OP_POSUPTOI:
! 955: tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
1.1 misha 956: break;
957:
958: /* At least one single char sets the bit and stops */
959:
1.5 ! misha 960: case OP_EXACT:
! 961: tcode += IMM2_SIZE;
! 962: /* Fall through */
1.1 misha 963: case OP_CHAR:
964: case OP_PLUS:
965: case OP_MINPLUS:
966: case OP_POSPLUS:
1.5 ! misha 967: (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
! 968: try_next = FALSE;
! 969: break;
! 970:
! 971: case OP_EXACTI:
! 972: tcode += IMM2_SIZE;
! 973: /* Fall through */
! 974: case OP_CHARI:
! 975: case OP_PLUSI:
! 976: case OP_MINPLUSI:
! 977: case OP_POSPLUSI:
! 978: (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
1.3 misha 979: try_next = FALSE;
980: break;
981:
982: /* Special spacing and line-terminating items. These recognize specific
983: lists of characters. The difference between VSPACE and ANYNL is that the
984: latter can match the two-character CRLF sequence, but that is not
985: relevant for finding the first character, so their code here is
986: identical. */
987:
988: case OP_HSPACE:
989: SET_BIT(0x09);
990: SET_BIT(0x20);
1.5 ! misha 991: #ifdef SUPPORT_UTF
! 992: if (utf)
1.3 misha 993: {
1.5 ! misha 994: #ifdef COMPILE_PCRE8
1.3 misha 995: SET_BIT(0xC2); /* For U+00A0 */
996: SET_BIT(0xE1); /* For U+1680, U+180E */
997: SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
998: SET_BIT(0xE3); /* For U+3000 */
1.5 ! misha 999: #endif
! 1000: #ifdef COMPILE_PCRE16
! 1001: SET_BIT(0xA0);
! 1002: SET_BIT(0xFF); /* For characters > 255 */
! 1003: #endif
! 1004: }
! 1005: else
! 1006: #endif /* SUPPORT_UTF */
! 1007: {
! 1008: SET_BIT(0xA0);
! 1009: #ifdef COMPILE_PCRE16
! 1010: SET_BIT(0xFF); /* For characters > 255 */
! 1011: #endif
1.3 misha 1012: }
1013: try_next = FALSE;
1014: break;
1015:
1016: case OP_ANYNL:
1017: case OP_VSPACE:
1018: SET_BIT(0x0A);
1019: SET_BIT(0x0B);
1020: SET_BIT(0x0C);
1021: SET_BIT(0x0D);
1.5 ! misha 1022: #ifdef SUPPORT_UTF
! 1023: if (utf)
1.3 misha 1024: {
1.5 ! misha 1025: #ifdef COMPILE_PCRE8
1.3 misha 1026: SET_BIT(0xC2); /* For U+0085 */
1027: SET_BIT(0xE2); /* For U+2028, U+2029 */
1.5 ! misha 1028: #endif
! 1029: #ifdef COMPILE_PCRE16
! 1030: SET_BIT(0x85);
! 1031: SET_BIT(0xFF); /* For characters > 255 */
! 1032: #endif
! 1033: }
! 1034: else
! 1035: #endif /* SUPPORT_UTF */
! 1036: {
! 1037: SET_BIT(0x85);
! 1038: #ifdef COMPILE_PCRE16
! 1039: SET_BIT(0xFF); /* For characters > 255 */
! 1040: #endif
1.3 misha 1041: }
1.1 misha 1042: try_next = FALSE;
1043: break;
1044:
1.3 misha 1045: /* Single character types set the bits and stop. Note that if PCRE_UCP
1046: is set, we do not see these op codes because \d etc are converted to
1047: properties. Therefore, these apply in the case when only characters less
1048: than 256 are recognized to match the types. */
1.1 misha 1049:
1050: case OP_NOT_DIGIT:
1.3 misha 1051: set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1.1 misha 1052: try_next = FALSE;
1053: break;
1054:
1055: case OP_DIGIT:
1.3 misha 1056: set_type_bits(start_bits, cbit_digit, table_limit, cd);
1.1 misha 1057: try_next = FALSE;
1058: break;
1059:
1060: /* The cbit_space table has vertical tab as whitespace; we have to
1.3 misha 1061: ensure it is set as not whitespace. */
1.1 misha 1062:
1063: case OP_NOT_WHITESPACE:
1.3 misha 1064: set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1065: start_bits[1] |= 0x08;
1.1 misha 1066: try_next = FALSE;
1067: break;
1068:
1069: /* The cbit_space table has vertical tab as whitespace; we have to
1.3 misha 1070: not set it from the table. */
1.1 misha 1071:
1072: case OP_WHITESPACE:
1.3 misha 1073: c = start_bits[1]; /* Save in case it was already set */
1074: set_type_bits(start_bits, cbit_space, table_limit, cd);
1075: start_bits[1] = (start_bits[1] & ~0x08) | c;
1.1 misha 1076: try_next = FALSE;
1077: break;
1078:
1079: case OP_NOT_WORDCHAR:
1.3 misha 1080: set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1.1 misha 1081: try_next = FALSE;
1082: break;
1083:
1084: case OP_WORDCHAR:
1.3 misha 1085: set_type_bits(start_bits, cbit_word, table_limit, cd);
1.1 misha 1086: try_next = FALSE;
1087: break;
1088:
1089: /* One or more character type fudges the pointer and restarts, knowing
1090: it will hit a single character type and stop there. */
1091:
1092: case OP_TYPEPLUS:
1093: case OP_TYPEMINPLUS:
1.3 misha 1094: case OP_TYPEPOSPLUS:
1.1 misha 1095: tcode++;
1096: break;
1097:
1098: case OP_TYPEEXACT:
1.5 ! misha 1099: tcode += 1 + IMM2_SIZE;
1.1 misha 1100: break;
1101:
1102: /* Zero or more repeats of character types set the bits and then
1103: try again. */
1104:
1105: case OP_TYPEUPTO:
1106: case OP_TYPEMINUPTO:
1107: case OP_TYPEPOSUPTO:
1.5 ! misha 1108: tcode += IMM2_SIZE; /* Fall through */
1.1 misha 1109:
1110: case OP_TYPESTAR:
1111: case OP_TYPEMINSTAR:
1112: case OP_TYPEPOSSTAR:
1113: case OP_TYPEQUERY:
1114: case OP_TYPEMINQUERY:
1115: case OP_TYPEPOSQUERY:
1116: switch(tcode[1])
1117: {
1.3 misha 1118: default:
1.1 misha 1119: case OP_ANY:
1120: case OP_ALLANY:
1121: return SSB_FAIL;
1122:
1.3 misha 1123: case OP_HSPACE:
1124: SET_BIT(0x09);
1125: SET_BIT(0x20);
1.5 ! misha 1126: #ifdef COMPILE_PCRE8
! 1127: if (utf)
1.3 misha 1128: {
1.5 ! misha 1129: #ifdef COMPILE_PCRE8
1.3 misha 1130: SET_BIT(0xC2); /* For U+00A0 */
1131: SET_BIT(0xE1); /* For U+1680, U+180E */
1132: SET_BIT(0xE2); /* For U+2000 - U+200A, U+202F, U+205F */
1133: SET_BIT(0xE3); /* For U+3000 */
1.5 ! misha 1134: #endif
! 1135: #ifdef COMPILE_PCRE16
! 1136: SET_BIT(0xA0);
! 1137: SET_BIT(0xFF); /* For characters > 255 */
! 1138: #endif
1.3 misha 1139: }
1.5 ! misha 1140: else
! 1141: #endif /* SUPPORT_UTF */
! 1142: SET_BIT(0xA0);
1.3 misha 1143: break;
1144:
1145: case OP_ANYNL:
1146: case OP_VSPACE:
1147: SET_BIT(0x0A);
1148: SET_BIT(0x0B);
1149: SET_BIT(0x0C);
1150: SET_BIT(0x0D);
1.5 ! misha 1151: #ifdef COMPILE_PCRE8
! 1152: if (utf)
1.3 misha 1153: {
1.5 ! misha 1154: #ifdef COMPILE_PCRE8
1.3 misha 1155: SET_BIT(0xC2); /* For U+0085 */
1156: SET_BIT(0xE2); /* For U+2028, U+2029 */
1.5 ! misha 1157: #endif
! 1158: #ifdef COMPILE_PCRE16
! 1159: SET_BIT(0x85);
! 1160: SET_BIT(0xFF); /* For characters > 255 */
! 1161: #endif
1.3 misha 1162: }
1.5 ! misha 1163: else
! 1164: #endif /* SUPPORT_UTF */
! 1165: SET_BIT(0x85);
1.3 misha 1166: break;
1167:
1.1 misha 1168: case OP_NOT_DIGIT:
1.3 misha 1169: set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1.1 misha 1170: break;
1171:
1172: case OP_DIGIT:
1.3 misha 1173: set_type_bits(start_bits, cbit_digit, table_limit, cd);
1.1 misha 1174: break;
1175:
1176: /* The cbit_space table has vertical tab as whitespace; we have to
1.3 misha 1177: ensure it gets set as not whitespace. */
1.1 misha 1178:
1179: case OP_NOT_WHITESPACE:
1.3 misha 1180: set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1181: start_bits[1] |= 0x08;
1.1 misha 1182: break;
1183:
1184: /* The cbit_space table has vertical tab as whitespace; we have to
1.3 misha 1185: avoid setting it. */
1.1 misha 1186:
1187: case OP_WHITESPACE:
1.3 misha 1188: c = start_bits[1]; /* Save in case it was already set */
1189: set_type_bits(start_bits, cbit_space, table_limit, cd);
1190: start_bits[1] = (start_bits[1] & ~0x08) | c;
1.1 misha 1191: break;
1192:
1193: case OP_NOT_WORDCHAR:
1.3 misha 1194: set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1.1 misha 1195: break;
1196:
1197: case OP_WORDCHAR:
1.3 misha 1198: set_type_bits(start_bits, cbit_word, table_limit, cd);
1.1 misha 1199: break;
1200: }
1201:
1202: tcode += 2;
1203: break;
1204:
1205: /* Character class where all the information is in a bit map: set the
1206: bits and either carry on or not, according to the repeat count. If it was
1207: a negative class, and we are operating with UTF-8 characters, any byte
1208: with a value >= 0xc4 is a potentially valid starter because it starts a
1209: character with a value > 255. */
1210:
1211: case OP_NCLASS:
1.5 ! misha 1212: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
! 1213: if (utf)
1.1 misha 1214: {
1215: start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
1216: memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
1217: }
1218: #endif
1.5 ! misha 1219: #ifdef COMPILE_PCRE16
! 1220: SET_BIT(0xFF); /* For characters > 255 */
! 1221: #endif
1.1 misha 1222: /* Fall through */
1223:
1224: case OP_CLASS:
1225: {
1.5 ! misha 1226: pcre_uint8 *map;
1.1 misha 1227: tcode++;
1.5 ! misha 1228: map = (pcre_uint8 *)tcode;
1.1 misha 1229:
1230: /* In UTF-8 mode, the bits in a bit map correspond to character
1231: values, not to byte values. However, the bit map we are constructing is
1232: for byte values. So we have to do a conversion for characters whose
1233: value is > 127. In fact, there are only two possible starting bytes for
1234: characters in the range 128 - 255. */
1235:
1.5 ! misha 1236: #if defined SUPPORT_UTF && defined COMPILE_PCRE8
! 1237: if (utf)
1.1 misha 1238: {
1.5 ! misha 1239: for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1.1 misha 1240: for (c = 128; c < 256; c++)
1241: {
1.5 ! misha 1242: if ((map[c/8] && (1 << (c&7))) != 0)
1.1 misha 1243: {
1244: int d = (c >> 6) | 0xc0; /* Set bit for this starter */
1245: start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
1246: c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
1247: }
1248: }
1249: }
1250: else
1251: #endif
1252: {
1.5 ! misha 1253: /* In non-UTF-8 mode, the two bit maps are completely compatible. */
! 1254: for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1.1 misha 1255: }
1256:
1.5 ! misha 1257: /* Advance past the bit map, and act on what follows. For a zero
! 1258: minimum repeat, continue; otherwise stop processing. */
1.1 misha 1259:
1.5 ! misha 1260: tcode += 32 / sizeof(pcre_uchar);
1.1 misha 1261: switch (*tcode)
1262: {
1263: case OP_CRSTAR:
1264: case OP_CRMINSTAR:
1265: case OP_CRQUERY:
1266: case OP_CRMINQUERY:
1267: tcode++;
1268: break;
1269:
1270: case OP_CRRANGE:
1271: case OP_CRMINRANGE:
1.5 ! misha 1272: if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1.1 misha 1273: else try_next = FALSE;
1274: break;
1275:
1276: default:
1277: try_next = FALSE;
1278: break;
1279: }
1280: }
1281: break; /* End of bitmap class handling */
1282:
1283: } /* End of switch */
1284: } /* End of try_next loop */
1285:
1286: code += GET(code, 1); /* Advance to next branch */
1287: }
1288: while (*code == OP_ALT);
1289: return yield;
1290: }
1291:
1292:
1293:
1.5 ! misha 1294:
! 1295:
1.1 misha 1296: /*************************************************
1297: * Study a compiled expression *
1298: *************************************************/
1299:
1300: /* This function is handed a compiled expression that it must study to produce
1.5 ! misha 1301: information that will speed up the matching. It returns a pcre[16]_extra block
1.1 misha 1302: which then gets handed back to pcre_exec().
1303:
1304: Arguments:
1305: re points to the compiled expression
1306: options contains option bits
1307: errorptr points to where to place error messages;
1308: set NULL unless error
1309:
1.5 ! misha 1310: Returns: pointer to a pcre[16]_extra block, with study_data filled in and
! 1311: the appropriate flags set;
1.1 misha 1312: NULL on error or if no optimization possible
1313: */
1314:
1.5 ! misha 1315: #ifdef COMPILE_PCRE8
1.2 misha 1316: PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1.1 misha 1317: pcre_study(const pcre *external_re, int options, const char **errorptr)
1.5 ! misha 1318: #else
! 1319: PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
! 1320: pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
! 1321: #endif
1.1 misha 1322: {
1.3 misha 1323: int min;
1324: BOOL bits_set = FALSE;
1.5 ! misha 1325: pcre_uint8 start_bits[32];
! 1326: PUBL(extra) *extra = NULL;
1.1 misha 1327: pcre_study_data *study;
1.5 ! misha 1328: const pcre_uint8 *tables;
! 1329: pcre_uchar *code;
1.1 misha 1330: compile_data compile_block;
1.5 ! misha 1331: const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1.1 misha 1332:
1333: *errorptr = NULL;
1334:
1335: if (re == NULL || re->magic_number != MAGIC_NUMBER)
1336: {
1337: *errorptr = "argument is not a compiled regular expression";
1338: return NULL;
1339: }
1340:
1.5 ! misha 1341: if ((re->flags & PCRE_MODE) == 0)
! 1342: {
! 1343: #ifdef COMPILE_PCRE8
! 1344: *errorptr = "argument is compiled in 16 bit mode";
! 1345: #else
! 1346: *errorptr = "argument is compiled in 8 bit mode";
! 1347: #endif
! 1348: return NULL;
! 1349: }
! 1350:
1.1 misha 1351: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1352: {
1353: *errorptr = "unknown or incorrect option bit(s) set";
1354: return NULL;
1355: }
1356:
1.5 ! misha 1357: code = (pcre_uchar *)re + re->name_table_offset +
1.1 misha 1358: (re->name_count * re->name_entry_size);
1359:
1360: /* For an anchored pattern, or an unanchored pattern that has a first char, or
1.3 misha 1361: a multiline pattern that matches only at "line starts", there is no point in
1362: seeking a list of starting bytes. */
1363:
1364: if ((re->options & PCRE_ANCHORED) == 0 &&
1365: (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1366: {
1.5 ! misha 1367: int rc;
! 1368:
1.3 misha 1369: /* Set the character tables in the block that is passed around */
1.1 misha 1370:
1.3 misha 1371: tables = re->tables;
1.5 ! misha 1372:
! 1373: #ifdef COMPILE_PCRE8
1.3 misha 1374: if (tables == NULL)
1375: (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1376: (void *)(&tables));
1.5 ! misha 1377: #else
! 1378: if (tables == NULL)
! 1379: (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
! 1380: (void *)(&tables));
! 1381: #endif
1.3 misha 1382:
1383: compile_block.lcc = tables + lcc_offset;
1384: compile_block.fcc = tables + fcc_offset;
1385: compile_block.cbits = tables + cbits_offset;
1386: compile_block.ctypes = tables + ctypes_offset;
1387:
1388: /* See if we can find a fixed set of initial characters for the pattern. */
1389:
1.5 ! misha 1390: memset(start_bits, 0, 32 * sizeof(pcre_uint8));
! 1391: rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
! 1392: &compile_block);
! 1393: bits_set = rc == SSB_DONE;
! 1394: if (rc == SSB_UNKNOWN)
! 1395: {
! 1396: *errorptr = "internal error: opcode not recognized";
! 1397: return NULL;
! 1398: }
1.3 misha 1399: }
1400:
1401: /* Find the minimum length of subject string. */
1402:
1.5 ! misha 1403: switch(min = find_minlength(code, code, re->options, 0))
! 1404: {
! 1405: case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
! 1406: case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
! 1407: default: break;
! 1408: }
1.1 misha 1409:
1.5 ! misha 1410: /* If a set of starting bytes has been identified, or if the minimum length is
! 1411: greater than zero, or if JIT optimization has been requested, get a
! 1412: pcre[16]_extra block and a pcre_study_data block. The study data is put in the
! 1413: latter, which is pointed to by the former, which may also get additional data
! 1414: set later by the calling program. At the moment, the size of pcre_study_data
! 1415: is fixed. We nevertheless save it in a field for returning via the
! 1416: pcre_fullinfo() function so that if it becomes variable in the future,
! 1417: we don't have to change that code. */
! 1418:
! 1419: if (bits_set || min > 0
! 1420: #ifdef SUPPORT_JIT
! 1421: || (options & PCRE_STUDY_JIT_COMPILE) != 0
! 1422: #endif
! 1423: )
! 1424: {
! 1425: extra = (PUBL(extra) *)(PUBL(malloc))
! 1426: (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
! 1427: if (extra == NULL)
! 1428: {
! 1429: *errorptr = "failed to get memory";
! 1430: return NULL;
! 1431: }
! 1432:
! 1433: study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
! 1434: extra->flags = PCRE_EXTRA_STUDY_DATA;
! 1435: extra->study_data = study;
! 1436:
! 1437: study->size = sizeof(pcre_study_data);
! 1438: study->flags = 0;
! 1439:
! 1440: /* Set the start bits always, to avoid unset memory errors if the
! 1441: study data is written to a file, but set the flag only if any of the bits
! 1442: are set, to save time looking when none are. */
1.1 misha 1443:
1.5 ! misha 1444: if (bits_set)
! 1445: {
! 1446: study->flags |= PCRE_STUDY_MAPPED;
! 1447: memcpy(study->start_bits, start_bits, sizeof(start_bits));
! 1448: }
! 1449: else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
! 1450:
! 1451: #ifdef PCRE_DEBUG
! 1452: if (bits_set)
! 1453: {
! 1454: pcre_uint8 *ptr = start_bits;
! 1455: int i;
! 1456:
! 1457: printf("Start bits:\n");
! 1458: for (i = 0; i < 32; i++)
! 1459: printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
! 1460: }
! 1461: #endif
1.1 misha 1462:
1.5 ! misha 1463: /* Always set the minlength value in the block, because the JIT compiler
! 1464: makes use of it. However, don't set the bit unless the length is greater than
! 1465: zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
! 1466: checking the zero case. */
1.1 misha 1467:
1.5 ! misha 1468: if (min > 0)
! 1469: {
! 1470: study->flags |= PCRE_STUDY_MINLEN;
! 1471: study->minlength = min;
! 1472: }
! 1473: else study->minlength = 0;
1.1 misha 1474:
1.5 ! misha 1475: /* If JIT support was compiled and requested, attempt the JIT compilation.
! 1476: If no starting bytes were found, and the minimum length is zero, and JIT
! 1477: compilation fails, abandon the extra block and return NULL. */
! 1478:
! 1479: #ifdef SUPPORT_JIT
! 1480: extra->executable_jit = NULL;
! 1481: if ((options & PCRE_STUDY_JIT_COMPILE) != 0) PRIV(jit_compile)(re, extra);
! 1482: if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0)
! 1483: {
! 1484: #ifdef COMPILE_PCRE8
! 1485: pcre_free_study(extra);
! 1486: #endif
! 1487: #ifdef COMPILE_PCRE16
! 1488: pcre16_free_study(extra);
! 1489: #endif
! 1490: extra = NULL;
! 1491: }
! 1492: #endif
1.1 misha 1493: }
1494:
1.5 ! misha 1495: return extra;
! 1496: }
1.1 misha 1497:
1.3 misha 1498:
1.5 ! misha 1499: /*************************************************
! 1500: * Free the study data *
! 1501: *************************************************/
! 1502:
! 1503: /* This function frees the memory that was obtained by pcre_study().
1.3 misha 1504:
1.5 ! misha 1505: Argument: a pointer to the pcre[16]_extra block
! 1506: Returns: nothing
! 1507: */
1.1 misha 1508:
1.5 ! misha 1509: #ifdef COMPILE_PCRE8
! 1510: PCRE_EXP_DEFN void
! 1511: pcre_free_study(pcre_extra *extra)
! 1512: #else
! 1513: PCRE_EXP_DEFN void
! 1514: pcre16_free_study(pcre16_extra *extra)
! 1515: #endif
! 1516: {
! 1517: if (extra == NULL)
! 1518: return;
! 1519: #ifdef SUPPORT_JIT
! 1520: if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
! 1521: extra->executable_jit != NULL)
! 1522: PRIV(jit_free)(extra->executable_jit);
! 1523: #endif
! 1524: PUBL(free)(extra);
1.1 misha 1525: }
1526:
1527: /* End of pcre_study.c */
E-mail: