Annotation of parser3/src/lib/pcre/study.c, revision 1.1.14.1
1.1 paf 1: /*************************************************
2: * Perl-Compatible Regular Expressions *
3: *************************************************/
4:
5: /*
6: This is a library of functions to support regular expressions whose syntax
7: and semantics are as close as possible to those of the Perl 5 language. See
8: the file Tech.Notes for some information on the internals.
9:
10: Written by: Philip Hazel <ph10@cam.ac.uk>
11:
12: Copyright (c) 1997-1999 University of Cambridge
13:
14: -----------------------------------------------------------------------------
15: Permission is granted to anyone to use this software for any purpose on any
16: computer system, and to redistribute it freely, subject to the following
17: restrictions:
18:
19: 1. This software is distributed in the hope that it will be useful,
20: but WITHOUT ANY WARRANTY; without even the implied warranty of
21: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22:
23: 2. The origin of this software must not be misrepresented, either by
24: explicit claim or by omission.
25:
26: 3. Altered versions must be plainly marked as such, and must not be
27: misrepresented as being the original software.
28:
29: 4. If PCRE is embedded in any software that is released under the GNU
30: General Purpose Licence (GPL), then the terms of that licence shall
31: supersede any condition above with which it is incompatible.
32: -----------------------------------------------------------------------------
33: */
34:
35:
36: /* Include the internals header, which itself includes Standard C headers plus
37: the external pcre header. */
38:
39: #include "internal.h"
40:
41:
42:
43: /*************************************************
44: * Set a bit and maybe its alternate case *
45: *************************************************/
46:
47: /* Given a character, set its bit in the table, and also the bit for the other
48: version of a letter if we are caseless.
49:
50: Arguments:
51: start_bits points to the bit map
52: c is the character
53: caseless the caseless flag
54: cd the block with char table pointers
55:
56: Returns: nothing
57: */
58:
59: static void
60: set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
61: {
62: start_bits[c/8] |= (1 << (c&7));
63: if (caseless && (cd->ctypes[c] & ctype_letter) != 0)
64: start_bits[cd->fcc[c]/8] |= (1 << (cd->fcc[c]&7));
65: }
66:
67:
68:
69: /*************************************************
70: * Create bitmap of starting chars *
71: *************************************************/
72:
73: /* This function scans a compiled unanchored expression and attempts to build a
74: bitmap of the set of initial characters. If it can't, it returns FALSE. As time
75: goes by, we may be able to get more clever at doing this.
76:
77: Arguments:
78: code points to an expression
79: start_bits points to a 32-byte table, initialized to 0
80: caseless the current state of the caseless flag
81: cd the block with char table pointers
82:
83: Returns: TRUE if table built, FALSE otherwise
84: */
85:
86: static BOOL
87: set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
88: compile_data *cd)
89: {
90: register int c;
91:
92: /* This next statement and the later reference to dummy are here in order to
93: trick the optimizer of the IBM C compiler for OS/2 into generating correct
94: code. Apparently IBM isn't going to fix the problem, and we would rather not
95: disable optimization (in this module it actually makes a big difference, and
96: the pcre module can use all the optimization it can get). */
97:
98: volatile int dummy;
99:
100: do
101: {
102: const uschar *tcode = code + 3;
103: BOOL try_next = TRUE;
104:
105: while (try_next)
106: {
107: try_next = FALSE;
108:
109: /* If a branch starts with a bracket or a positive lookahead assertion,
110: recurse to set bits from within them. That's all for this branch. */
111:
112: if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
113: {
114: if (!set_start_bits(tcode, start_bits, caseless, cd))
115: return FALSE;
116: }
117:
118: else switch(*tcode)
119: {
120: default:
121: return FALSE;
122:
123: /* Skip over lookbehind and negative lookahead assertions */
124:
125: case OP_ASSERT_NOT:
126: case OP_ASSERTBACK:
127: case OP_ASSERTBACK_NOT:
128: try_next = TRUE;
129: do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
130: tcode += 3;
131: break;
132:
133: /* Skip over an option setting, changing the caseless flag */
134:
135: case OP_OPT:
136: caseless = (tcode[1] & PCRE_CASELESS) != 0;
137: tcode += 2;
138: try_next = TRUE;
139: break;
140:
141: /* BRAZERO does the bracket, but carries on. */
142:
143: case OP_BRAZERO:
144: case OP_BRAMINZERO:
145: if (!set_start_bits(++tcode, start_bits, caseless, cd))
146: return FALSE;
147: dummy = 1;
148: do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
149: tcode += 3;
150: try_next = TRUE;
151: break;
152:
153: /* Single-char * or ? sets the bit and tries the next item */
154:
155: case OP_STAR:
156: case OP_MINSTAR:
157: case OP_QUERY:
158: case OP_MINQUERY:
159: set_bit(start_bits, tcode[1], caseless, cd);
160: tcode += 2;
161: try_next = TRUE;
162: break;
163:
164: /* Single-char upto sets the bit and tries the next */
165:
166: case OP_UPTO:
167: case OP_MINUPTO:
168: set_bit(start_bits, tcode[3], caseless, cd);
169: tcode += 4;
170: try_next = TRUE;
171: break;
172:
173: /* At least one single char sets the bit and stops */
174:
175: case OP_EXACT: /* Fall through */
176: tcode++;
177:
178: case OP_CHARS: /* Fall through */
179: tcode++;
180:
181: case OP_PLUS:
182: case OP_MINPLUS:
183: set_bit(start_bits, tcode[1], caseless, cd);
184: break;
185:
186: /* Single character type sets the bits and stops */
187:
188: case OP_NOT_DIGIT:
189: for (c = 0; c < 32; c++)
190: start_bits[c] |= ~cd->cbits[c+cbit_digit];
191: break;
192:
193: case OP_DIGIT:
194: for (c = 0; c < 32; c++)
195: start_bits[c] |= cd->cbits[c+cbit_digit];
196: break;
197:
198: case OP_NOT_WHITESPACE:
199: for (c = 0; c < 32; c++)
200: start_bits[c] |= ~cd->cbits[c+cbit_space];
201: break;
202:
203: case OP_WHITESPACE:
204: for (c = 0; c < 32; c++)
205: start_bits[c] |= cd->cbits[c+cbit_space];
206: break;
207:
208: case OP_NOT_WORDCHAR:
209: for (c = 0; c < 32; c++)
210: start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);
211: break;
212:
213: case OP_WORDCHAR:
214: for (c = 0; c < 32; c++)
215: start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);
216: break;
217:
218: /* One or more character type fudges the pointer and restarts, knowing
219: it will hit a single character type and stop there. */
220:
221: case OP_TYPEPLUS:
222: case OP_TYPEMINPLUS:
223: tcode++;
224: try_next = TRUE;
225: break;
226:
227: case OP_TYPEEXACT:
228: tcode += 3;
229: try_next = TRUE;
230: break;
231:
232: /* Zero or more repeats of character types set the bits and then
233: try again. */
234:
235: case OP_TYPEUPTO:
236: case OP_TYPEMINUPTO:
237: tcode += 2; /* Fall through */
238:
239: case OP_TYPESTAR:
240: case OP_TYPEMINSTAR:
241: case OP_TYPEQUERY:
242: case OP_TYPEMINQUERY:
243: switch(tcode[1])
244: {
245: case OP_NOT_DIGIT:
246: for (c = 0; c < 32; c++)
247: start_bits[c] |= ~cd->cbits[c+cbit_digit];
248: break;
249:
250: case OP_DIGIT:
251: for (c = 0; c < 32; c++)
252: start_bits[c] |= cd->cbits[c+cbit_digit];
253: break;
254:
255: case OP_NOT_WHITESPACE:
256: for (c = 0; c < 32; c++)
257: start_bits[c] |= ~cd->cbits[c+cbit_space];
258: break;
259:
260: case OP_WHITESPACE:
261: for (c = 0; c < 32; c++)
262: start_bits[c] |= cd->cbits[c+cbit_space];
263: break;
264:
265: case OP_NOT_WORDCHAR:
266: for (c = 0; c < 32; c++)
267: start_bits[c] |= ~(cd->cbits[c] | cd->cbits[c+cbit_word]);
268: break;
269:
270: case OP_WORDCHAR:
271: for (c = 0; c < 32; c++)
272: start_bits[c] |= (cd->cbits[c] | cd->cbits[c+cbit_word]);
273: break;
274: }
275:
276: tcode += 2;
277: try_next = TRUE;
278: break;
279:
280: /* Character class: set the bits and either carry on or not,
281: according to the repeat count. */
282:
283: case OP_CLASS:
284: {
285: tcode++;
286: for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
287: tcode += 32;
288: switch (*tcode)
289: {
290: case OP_CRSTAR:
291: case OP_CRMINSTAR:
292: case OP_CRQUERY:
293: case OP_CRMINQUERY:
294: tcode++;
295: try_next = TRUE;
296: break;
297:
298: case OP_CRRANGE:
299: case OP_CRMINRANGE:
300: if (((tcode[1] << 8) + tcode[2]) == 0)
301: {
302: tcode += 5;
303: try_next = TRUE;
304: }
305: break;
306: }
307: }
308: break; /* End of class handling */
309:
310: } /* End of switch */
311: } /* End of try_next loop */
312:
313: code += (code[1] << 8) + code[2]; /* Advance to next branch */
314: }
315: while (*code == OP_ALT);
316: return TRUE;
317: }
318:
319:
320:
321: /*************************************************
322: * Study a compiled expression *
323: *************************************************/
324:
325: /* This function is handed a compiled expression that it must study to produce
326: information that will speed up the matching. It returns a pcre_extra block
327: which then gets handed back to pcre_exec().
328:
329: Arguments:
330: re points to the compiled expression
331: options contains option bits
332: errorptr points to where to place error messages;
333: set NULL unless error
334:
335: Returns: pointer to a pcre_extra block,
336: NULL on error or if no optimization possible
337: */
338:
339: pcre_extra *
1.1.14.1! paf 340: pcre_study(const pcre *external_re, int options, const char* *errorptr)
1.1 paf 341: {
342: uschar start_bits[32];
343: real_pcre_extra *extra;
344: const real_pcre *re = (const real_pcre *)external_re;
345: compile_data compile_block;
346:
347: *errorptr = NULL;
348:
349: if (re == NULL || re->magic_number != MAGIC_NUMBER)
350: {
351: *errorptr = "argument is not a compiled regular expression";
352: return NULL;
353: }
354:
355: if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
356: {
357: *errorptr = "unknown or incorrect option bit(s) set";
358: return NULL;
359: }
360:
361: /* For an anchored pattern, or an unchored pattern that has a first char, or a
362: multiline pattern that matches only at "line starts", no further processing at
363: present. */
364:
365: if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
366: return NULL;
367:
368: /* Set the character tables in the block which is passed around */
369:
370: compile_block.lcc = re->tables + lcc_offset;
371: compile_block.fcc = re->tables + fcc_offset;
372: compile_block.cbits = re->tables + cbits_offset;
373: compile_block.ctypes = re->tables + ctypes_offset;
374:
375: /* See if we can find a fixed set of initial characters for the pattern. */
376:
377: memset(start_bits, 0, 32 * sizeof(uschar));
378: if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,
379: &compile_block)) return NULL;
380:
381: /* Get an "extra" block and put the information therein. */
382:
383: extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
384:
385: if (extra == NULL)
386: {
387: *errorptr = "failed to get memory";
388: return NULL;
389: }
390:
391: extra->options = PCRE_STUDY_MAPPED;
392: memcpy(extra->start_bits, start_bits, sizeof(start_bits));
393:
394: return (pcre_extra *)extra;
395: }
396:
397: /* End of study.c */
E-mail: