Annotation of win32/pcre/pcre_study.c, revision 1.2
1.1 misha 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /* PCRE is a library of functions to support regular expressions whose syntax
6: and semantics are as close as possible to those of the Perl 5 language.
7:
8: Written by Philip Hazel
9: Copyright (c) 1997-2008 University of Cambridge
10:
11: -----------------------------------------------------------------------------
12: Redistribution and use in source and binary forms, with or without
13: modification, are permitted provided that the following conditions are met:
14:
15: * Redistributions of source code must retain the above copyright notice,
16: this list of conditions and the following disclaimer.
17:
18: * Redistributions in binary form must reproduce the above copyright
19: notice, this list of conditions and the following disclaimer in the
20: documentation and/or other materials provided with the distribution.
21:
22: * Neither the name of the University of Cambridge nor the names of its
23: contributors may be used to endorse or promote products derived from
24: this software without specific prior written permission.
25:
26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36: POSSIBILITY OF SUCH DAMAGE.
37: -----------------------------------------------------------------------------
38: */
39:
40:
41: /* This module contains the external function pcre_study(), along with local
42: supporting functions. */
43:
44:
45: #ifdef HAVE_CONFIG_H
46: #include "config.h"
47: #endif
48:
49: #include "pcre_internal.h"
50:
51:
52: /* Returns from set_start_bits() */
53:
54: enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE };
55:
56:
57: /*************************************************
58: * Set a bit and maybe its alternate case *
59: *************************************************/
60:
61: /* Given a character, set its bit in the table, and also the bit for the other
62: version of a letter if we are caseless.
63:
64: Arguments:
65: start_bits points to the bit map
66: c is the character
67: caseless the caseless flag
68: cd the block with char table pointers
69:
70: Returns: nothing
71: */
72:
73: static void
74: set_bit(uschar *start_bits, unsigned int c, BOOL caseless, compile_data *cd)
75: {
76: start_bits[c/8] |= (1 << (c&7));
77: if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
78: start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
79: }
80:
81:
82:
83: /*************************************************
84: * Create bitmap of starting bytes *
85: *************************************************/
86:
87: /* This function scans a compiled unanchored expression recursively and
88: attempts to build a bitmap of the set of possible starting bytes. As time goes
89: by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
90: useful for parenthesized groups in patterns such as (a*)b where the group
91: provides some optional starting bytes but scanning must continue at the outer
92: level to find at least one mandatory byte. At the outermost level, this
93: function fails unless the result is SSB_DONE.
94:
95: Arguments:
96: code points to an expression
97: start_bits points to a 32-byte table, initialized to 0
98: caseless the current state of the caseless flag
99: utf8 TRUE if in UTF-8 mode
100: cd the block with char table pointers
101:
102: Returns: SSB_FAIL => Failed to find any starting bytes
103: SSB_DONE => Found mandatory starting bytes
104: SSB_CONTINUE => Found optional starting bytes
105: */
106:
107: static int
108: set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
109: BOOL utf8, compile_data *cd)
110: {
111: register int c;
112: int yield = SSB_DONE;
113:
114: #if 0
115: /* ========================================================================= */
116: /* The following comment and code was inserted in January 1999. In May 2006,
117: when it was observed to cause compiler warnings about unused values, I took it
118: out again. If anybody is still using OS/2, they will have to put it back
119: manually. */
120:
121: /* This next statement and the later reference to dummy are here in order to
122: trick the optimizer of the IBM C compiler for OS/2 into generating correct
123: code. Apparently IBM isn't going to fix the problem, and we would rather not
124: disable optimization (in this module it actually makes a big difference, and
125: the pcre module can use all the optimization it can get). */
126:
127: volatile int dummy;
128: /* ========================================================================= */
129: #endif
130:
131: do
132: {
133: const uschar *tcode = code + (((int)*code == OP_CBRA)? 3:1) + LINK_SIZE;
134: BOOL try_next = TRUE;
135:
136: while (try_next) /* Loop for items in this branch */
137: {
138: int rc;
139: switch(*tcode)
140: {
141: /* Fail if we reach something we don't understand */
142:
143: default:
144: return SSB_FAIL;
145:
146: /* If we hit a bracket or a positive lookahead assertion, recurse to set
147: bits from within the subpattern. If it can't find anything, we have to
148: give up. If it finds some mandatory character(s), we are done for this
149: branch. Otherwise, carry on scanning after the subpattern. */
150:
151: case OP_BRA:
152: case OP_SBRA:
153: case OP_CBRA:
154: case OP_SCBRA:
155: case OP_ONCE:
156: case OP_ASSERT:
157: rc = set_start_bits(tcode, start_bits, caseless, utf8, cd);
158: if (rc == SSB_FAIL) return SSB_FAIL;
159: if (rc == SSB_DONE) try_next = FALSE; else
160: {
161: do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
162: tcode += 1 + LINK_SIZE;
163: }
164: break;
165:
166: /* If we hit ALT or KET, it means we haven't found anything mandatory in
167: this branch, though we might have found something optional. For ALT, we
168: continue with the next alternative, but we have to arrange that the final
169: result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
170: return SSB_CONTINUE: if this is the top level, that indicates failure,
171: but after a nested subpattern, it causes scanning to continue. */
172:
173: case OP_ALT:
174: yield = SSB_CONTINUE;
175: try_next = FALSE;
176: break;
177:
178: case OP_KET:
179: case OP_KETRMAX:
180: case OP_KETRMIN:
181: return SSB_CONTINUE;
182:
183: /* Skip over callout */
184:
185: case OP_CALLOUT:
186: tcode += 2 + 2*LINK_SIZE;
187: break;
188:
189: /* Skip over lookbehind and negative lookahead assertions */
190:
191: case OP_ASSERT_NOT:
192: case OP_ASSERTBACK:
193: case OP_ASSERTBACK_NOT:
194: do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
195: tcode += 1 + LINK_SIZE;
196: break;
197:
198: /* Skip over an option setting, changing the caseless flag */
199:
200: case OP_OPT:
201: caseless = (tcode[1] & PCRE_CASELESS) != 0;
202: tcode += 2;
203: break;
204:
205: /* BRAZERO does the bracket, but carries on. */
206:
207: case OP_BRAZERO:
208: case OP_BRAMINZERO:
209: if (set_start_bits(++tcode, start_bits, caseless, utf8, cd) == SSB_FAIL)
210: return SSB_FAIL;
211: /* =========================================================================
212: See the comment at the head of this function concerning the next line,
213: which was an old fudge for the benefit of OS/2.
214: dummy = 1;
215: ========================================================================= */
216: do tcode += GET(tcode,1); while (*tcode == OP_ALT);
217: tcode += 1 + LINK_SIZE;
218: break;
219:
220: /* SKIPZERO skips the bracket. */
221:
222: case OP_SKIPZERO:
1.2 ! misha 223: tcode++;
1.1 misha 224: do tcode += GET(tcode,1); while (*tcode == OP_ALT);
225: tcode += 1 + LINK_SIZE;
226: break;
227:
228: /* Single-char * or ? sets the bit and tries the next item */
229:
230: case OP_STAR:
231: case OP_MINSTAR:
232: case OP_POSSTAR:
233: case OP_QUERY:
234: case OP_MINQUERY:
235: case OP_POSQUERY:
236: set_bit(start_bits, tcode[1], caseless, cd);
237: tcode += 2;
238: #ifdef SUPPORT_UTF8
239: if (utf8 && tcode[-1] >= 0xc0)
240: tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
241: #endif
242: break;
243:
244: /* Single-char upto sets the bit and tries the next */
245:
246: case OP_UPTO:
247: case OP_MINUPTO:
248: case OP_POSUPTO:
249: set_bit(start_bits, tcode[3], caseless, cd);
250: tcode += 4;
251: #ifdef SUPPORT_UTF8
252: if (utf8 && tcode[-1] >= 0xc0)
253: tcode += _pcre_utf8_table4[tcode[-1] & 0x3f];
254: #endif
255: break;
256:
257: /* At least one single char sets the bit and stops */
258:
259: case OP_EXACT: /* Fall through */
260: tcode += 2;
261:
262: case OP_CHAR:
263: case OP_CHARNC:
264: case OP_PLUS:
265: case OP_MINPLUS:
266: case OP_POSPLUS:
267: set_bit(start_bits, tcode[1], caseless, cd);
268: try_next = FALSE;
269: break;
270:
271: /* Single character type sets the bits and stops */
272:
273: case OP_NOT_DIGIT:
274: for (c = 0; c < 32; c++)
275: start_bits[c] |= ~cd->cbits[c+cbit_digit];
276: try_next = FALSE;
277: break;
278:
279: case OP_DIGIT:
280: for (c = 0; c < 32; c++)
281: start_bits[c] |= cd->cbits[c+cbit_digit];
282: try_next = FALSE;
283: break;
284:
285: /* The cbit_space table has vertical tab as whitespace; we have to
286: discard it. */
287:
288: case OP_NOT_WHITESPACE:
289: for (c = 0; c < 32; c++)
290: {
291: int d = cd->cbits[c+cbit_space];
292: if (c == 1) d &= ~0x08;
293: start_bits[c] |= ~d;
294: }
295: try_next = FALSE;
296: break;
297:
298: /* The cbit_space table has vertical tab as whitespace; we have to
299: discard it. */
300:
301: case OP_WHITESPACE:
302: for (c = 0; c < 32; c++)
303: {
304: int d = cd->cbits[c+cbit_space];
305: if (c == 1) d &= ~0x08;
306: start_bits[c] |= d;
307: }
308: try_next = FALSE;
309: break;
310:
311: case OP_NOT_WORDCHAR:
312: for (c = 0; c < 32; c++)
313: start_bits[c] |= ~cd->cbits[c+cbit_word];
314: try_next = FALSE;
315: break;
316:
317: case OP_WORDCHAR:
318: for (c = 0; c < 32; c++)
319: start_bits[c] |= cd->cbits[c+cbit_word];
320: try_next = FALSE;
321: break;
322:
323: /* One or more character type fudges the pointer and restarts, knowing
324: it will hit a single character type and stop there. */
325:
326: case OP_TYPEPLUS:
327: case OP_TYPEMINPLUS:
328: tcode++;
329: break;
330:
331: case OP_TYPEEXACT:
332: tcode += 3;
333: break;
334:
335: /* Zero or more repeats of character types set the bits and then
336: try again. */
337:
338: case OP_TYPEUPTO:
339: case OP_TYPEMINUPTO:
340: case OP_TYPEPOSUPTO:
341: tcode += 2; /* Fall through */
342:
343: case OP_TYPESTAR:
344: case OP_TYPEMINSTAR:
345: case OP_TYPEPOSSTAR:
346: case OP_TYPEQUERY:
347: case OP_TYPEMINQUERY:
348: case OP_TYPEPOSQUERY:
349: switch(tcode[1])
350: {
351: case OP_ANY:
352: case OP_ALLANY:
353: return SSB_FAIL;
354:
355: case OP_NOT_DIGIT:
356: for (c = 0; c < 32; c++)
357: start_bits[c] |= ~cd->cbits[c+cbit_digit];
358: break;
359:
360: case OP_DIGIT:
361: for (c = 0; c < 32; c++)
362: start_bits[c] |= cd->cbits[c+cbit_digit];
363: break;
364:
365: /* The cbit_space table has vertical tab as whitespace; we have to
366: discard it. */
367:
368: case OP_NOT_WHITESPACE:
369: for (c = 0; c < 32; c++)
370: {
371: int d = cd->cbits[c+cbit_space];
372: if (c == 1) d &= ~0x08;
373: start_bits[c] |= ~d;
374: }
375: break;
376:
377: /* The cbit_space table has vertical tab as whitespace; we have to
378: discard it. */
379:
380: case OP_WHITESPACE:
381: for (c = 0; c < 32; c++)
382: {
383: int d = cd->cbits[c+cbit_space];
384: if (c == 1) d &= ~0x08;
385: start_bits[c] |= d;
386: }
387: break;
388:
389: case OP_NOT_WORDCHAR:
390: for (c = 0; c < 32; c++)
391: start_bits[c] |= ~cd->cbits[c+cbit_word];
392: break;
393:
394: case OP_WORDCHAR:
395: for (c = 0; c < 32; c++)
396: start_bits[c] |= cd->cbits[c+cbit_word];
397: break;
398: }
399:
400: tcode += 2;
401: break;
402:
403: /* Character class where all the information is in a bit map: set the
404: bits and either carry on or not, according to the repeat count. If it was
405: a negative class, and we are operating with UTF-8 characters, any byte
406: with a value >= 0xc4 is a potentially valid starter because it starts a
407: character with a value > 255. */
408:
409: case OP_NCLASS:
410: #ifdef SUPPORT_UTF8
411: if (utf8)
412: {
413: start_bits[24] |= 0xf0; /* Bits for 0xc4 - 0xc8 */
414: memset(start_bits+25, 0xff, 7); /* Bits for 0xc9 - 0xff */
415: }
416: #endif
417: /* Fall through */
418:
419: case OP_CLASS:
420: {
421: tcode++;
422:
423: /* In UTF-8 mode, the bits in a bit map correspond to character
424: values, not to byte values. However, the bit map we are constructing is
425: for byte values. So we have to do a conversion for characters whose
426: value is > 127. In fact, there are only two possible starting bytes for
427: characters in the range 128 - 255. */
428:
429: #ifdef SUPPORT_UTF8
430: if (utf8)
431: {
432: for (c = 0; c < 16; c++) start_bits[c] |= tcode[c];
433: for (c = 128; c < 256; c++)
434: {
435: if ((tcode[c/8] && (1 << (c&7))) != 0)
436: {
437: int d = (c >> 6) | 0xc0; /* Set bit for this starter */
438: start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
439: c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
440: }
441: }
442: }
443:
444: /* In non-UTF-8 mode, the two bit maps are completely compatible. */
445:
446: else
447: #endif
448: {
449: for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
450: }
451:
452: /* Advance past the bit map, and act on what follows */
453:
454: tcode += 32;
455: switch (*tcode)
456: {
457: case OP_CRSTAR:
458: case OP_CRMINSTAR:
459: case OP_CRQUERY:
460: case OP_CRMINQUERY:
461: tcode++;
462: break;
463:
464: case OP_CRRANGE:
465: case OP_CRMINRANGE:
466: if (((tcode[1] << 8) + tcode[2]) == 0) tcode += 5;
467: else try_next = FALSE;
468: break;
469:
470: default:
471: try_next = FALSE;
472: break;
473: }
474: }
475: break; /* End of bitmap class handling */
476:
477: } /* End of switch */
478: } /* End of try_next loop */
479:
480: code += GET(code, 1); /* Advance to next branch */
481: }
482: while (*code == OP_ALT);
483: return yield;
484: }
485:
486:
487:
488: /*************************************************
489: * Study a compiled expression *
490: *************************************************/
491:
492: /* This function is handed a compiled expression that it must study to produce
493: information that will speed up the matching. It returns a pcre_extra block
494: which then gets handed back to pcre_exec().
495:
496: Arguments:
497: re points to the compiled expression
498: options contains option bits
499: errorptr points to where to place error messages;
500: set NULL unless error
501:
502: Returns: pointer to a pcre_extra block, with study_data filled in and the
503: appropriate flag set;
504: NULL on error or if no optimization possible
505: */
506:
1.2 ! misha 507: PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
1.1 misha 508: pcre_study(const pcre *external_re, int options, const char **errorptr)
509: {
510: uschar start_bits[32];
511: pcre_extra *extra;
512: pcre_study_data *study;
513: const uschar *tables;
514: uschar *code;
515: compile_data compile_block;
516: const real_pcre *re = (const real_pcre *)external_re;
517:
518: *errorptr = NULL;
519:
520: if (re == NULL || re->magic_number != MAGIC_NUMBER)
521: {
522: *errorptr = "argument is not a compiled regular expression";
523: return NULL;
524: }
525:
526: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
527: {
528: *errorptr = "unknown or incorrect option bit(s) set";
529: return NULL;
530: }
531:
532: code = (uschar *)re + re->name_table_offset +
533: (re->name_count * re->name_entry_size);
534:
535: /* For an anchored pattern, or an unanchored pattern that has a first char, or
536: a multiline pattern that matches only at "line starts", no further processing
537: at present. */
538:
539: if ((re->options & PCRE_ANCHORED) != 0 ||
540: (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
541: return NULL;
542:
543: /* Set the character tables in the block that is passed around */
544:
545: tables = re->tables;
546: if (tables == NULL)
547: (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
548: (void *)(&tables));
549:
550: compile_block.lcc = tables + lcc_offset;
551: compile_block.fcc = tables + fcc_offset;
552: compile_block.cbits = tables + cbits_offset;
553: compile_block.ctypes = tables + ctypes_offset;
554:
555: /* See if we can find a fixed set of initial characters for the pattern. */
556:
557: memset(start_bits, 0, 32 * sizeof(uschar));
558: if (set_start_bits(code, start_bits, (re->options & PCRE_CASELESS) != 0,
559: (re->options & PCRE_UTF8) != 0, &compile_block) != SSB_DONE) return NULL;
560:
561: /* Get a pcre_extra block and a pcre_study_data block. The study data is put in
562: the latter, which is pointed to by the former, which may also get additional
563: data set later by the calling program. At the moment, the size of
564: pcre_study_data is fixed. We nevertheless save it in a field for returning via
565: the pcre_fullinfo() function so that if it becomes variable in the future, we
566: don't have to change that code. */
567:
568: extra = (pcre_extra *)(pcre_malloc)
569: (sizeof(pcre_extra) + sizeof(pcre_study_data));
570:
571: if (extra == NULL)
572: {
573: *errorptr = "failed to get memory";
574: return NULL;
575: }
576:
577: study = (pcre_study_data *)((char *)extra + sizeof(pcre_extra));
578: extra->flags = PCRE_EXTRA_STUDY_DATA;
579: extra->study_data = study;
580:
581: study->size = sizeof(pcre_study_data);
582: study->options = PCRE_STUDY_MAPPED;
583: memcpy(study->start_bits, start_bits, sizeof(start_bits));
584:
585: return extra;
586: }
587:
588: /* End of pcre_study.c */
E-mail: