Annotation of parser3/src/lib/pcre/internal.h, revision 1.1.18.2

1.1       paf         1: /*************************************************
                      2: *      Perl-Compatible Regular Expressions       *
                      3: *************************************************/
                      4: 
                      5: 
                      6: /* This is a library of functions to support regular expressions whose syntax
                      7: and semantics are as close as possible to those of the Perl 5 language. See
                      8: the file Tech.Notes for some information on the internals.
                      9: 
                     10: Written by: Philip Hazel <ph10@cam.ac.uk>
                     11: 
                     12:            Copyright (c) 1997-1999 University of Cambridge
                     13: 
                     14: -----------------------------------------------------------------------------
                     15: Permission is granted to anyone to use this software for any purpose on any
                     16: computer system, and to redistribute it freely, subject to the following
                     17: restrictions:
                     18: 
                     19: 1. This software is distributed in the hope that it will be useful,
                     20:    but WITHOUT ANY WARRANTY; without even the implied warranty of
                     21:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
                     22: 
                     23: 2. The origin of this software must not be misrepresented, either by
                     24:    explicit claim or by omission.
                     25: 
                     26: 3. Altered versions must be plainly marked as such, and must not be
                     27:    misrepresented as being the original software.
                     28: 
                     29: 4. If PCRE is embedded in any software that is released under the GNU
                     30:    General Purpose Licence (GPL), then the terms of that licence shall
                     31:    supersede any condition above with which it is incompatible.
                     32: -----------------------------------------------------------------------------
                     33: */
                     34: 
                     35: /* This header contains definitions that are shared between the different
                     36: modules, but which are not relevant to the outside. */
                     37: 
                     38: /* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
                     39: define a macro for memmove() if USE_BCOPY is defined. */
                     40: 
                     41: #ifdef USE_BCOPY
                     42: #undef  memmove        /* some systems may have a macro; drop it before redefining */
                     43: #define memmove(a, b, c) bcopy(b, a, c)  /* bcopy() takes (src, dst, n), hence a and b are swapped */
                     44: #endif
                     45: 
                     46: /* Standard C headers plus the external interface definition */
                     47: 
                     48: #include <ctype.h>
                     49: /* #include <limits.h> 
                     50: PAF@design.ru removed this, for it was used only for the stupid INT_MAX in
                     51: minimum searches; changed that to an out-of-reasonable-text-length value
                     52: of 10M
                     53: */
                     54: #define PCRE_MAX_POS (10*0x400*0x400)  /* 10 * 1024 * 1024 = 10485760 */
                     55: 
                     56: #include <stddef.h>
                     57: #include <stdio.h>
                     58: #include <stdlib.h>
                     59: #include <string.h>
                     60: #include "pcre.h"
                     61: 
                     62: /* In case there is no definition of offsetof() provided - though any proper
                     63: Standard C system should have one. */
                     64: 
                     65: #ifndef offsetof  /* fallback only: used when <stddef.h> did not define it */
                     66: #define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))  /* address of field in a p_type placed at address 0 */
                     67: #endif
                     68: 
                     69: /* These are the public options that can change during matching. */
                     70: 
                     71: #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
                     72: 
                     73: /* Private options flags start at the most significant end of the four bytes,
                     74: but skip the top bit so we can use ints for convenience without getting tangled
                     75: with negative values. The public options defined in pcre.h start at the least
                     76: significant end. Make sure they don't overlap, though now that we have expanded
                     77: to four bytes there is plenty of space. */
                     78: 
                     79: #define PCRE_FIRSTSET      0x40000000  /* first_char is set */
                     80: #define PCRE_REQCHSET      0x20000000  /* req_char is set */
1.1.18.2! paf        81: #define PCRE_STARTLINE     0x10000000  /* start after \n for multiline */
1.1       paf        82: #define PCRE_INGROUP       0x08000000  /* compiling inside a group */
                     83: #define PCRE_ICHANGED      0x04000000  /* i option changes within regex */
                     84: 
                     85: /* Options for the "extra" block produced by pcre_study(). */
                     86: 
                     87: #define PCRE_STUDY_MAPPED   0x01     /* a map of starting chars exists */
                     88: 
                     89: /* Masks for identifying the public options which are permitted at compile
                     90: time, run time or study time, respectively. */
                     91: 
                     92: #define PUBLIC_OPTIONS \
                     93:   (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
                     94:    PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY)
                     95: 
                     96: #define PUBLIC_EXEC_OPTIONS \
                     97:   (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY)
                     98: 
                     99: #define PUBLIC_STUDY_OPTIONS 0   /* None defined */
                    100: 
                    101: /* Magic number to provide a small check against being handed junk. */
                    102: 
                    103: #define MAGIC_NUMBER  0x50435245UL   /* 'PCRE' */
                    104: 
                    105: /* Miscellaneous definitions */
                    106: 
                    107: typedef int BOOL;
                    108: 
                    109: #ifndef FALSE
                    110: #define FALSE   0
                    111: #endif
                    112: #ifndef TRUE
                    113: #define TRUE    1
                    114: #endif
                    115: 
                    116: /* These are escaped items that aren't just an encoding of a particular data
                    117: value such as \n. They must have non-zero values, as check_escape() returns
                    118: their negation. Also, they must appear in the same order as in the opcode
                    119: definitions below, up to ESC_z. The final one must be ESC_REF as subsequent
                    120: values are used for \1, \2, \3, etc. There is a test in the code for an escape
                    121: greater than ESC_b and less than ESC_Z to detect the types that may be
                    122: repeated. If any new escapes are put in-between that don't consume a character,
                    123: that code will have to change. */
                    124: 
                    125: enum { ESC_A = 1, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, ESC_w,  /* starts at 1: values must be non-zero */
                    126:        ESC_Z, ESC_z, ESC_REF };  /* ESC_REF stays last: larger values stand for \1, \2, \3, ... */
                    127: 
                    128: /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
                    129: that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
                    130: OP_EOD must correspond in order to the list of escapes immediately above. */
                    131: 
                    132: enum {
                    133:   OP_END,            /* End of pattern */
                    134: 
                    135:   /* Values corresponding to backslashed metacharacters */
                    136: 
                    137:   OP_SOD,            /* Start of data: \A */
                    138:   OP_NOT_WORD_BOUNDARY,  /* \B */
                    139:   OP_WORD_BOUNDARY,      /* \b */
                    140:   OP_NOT_DIGIT,          /* \D */
                    141:   OP_DIGIT,              /* \d */
                    142:   OP_NOT_WHITESPACE,     /* \S */
                    143:   OP_WHITESPACE,         /* \s */
                    144:   OP_NOT_WORDCHAR,       /* \W */
                    145:   OP_WORDCHAR,           /* \w */
                    146:   OP_EODN,           /* End of data or \n at end of data: \Z. */
                    147:   OP_EOD,            /* End of data: \z */
                    148: 
                    149:   OP_OPT,            /* Set runtime options */
                    150:   OP_CIRC,           /* Start of line - varies with multiline switch */
                    151:   OP_DOLL,           /* End of line - varies with multiline switch */
                    152:   OP_ANY,            /* Match any character */
                    153:   OP_CHARS,          /* Match string of characters */
                    154:   OP_NOT,            /* Match anything but the following char */
                    155: 
                    156:   OP_STAR,           /* The maximizing and minimizing versions of */
                    157:   OP_MINSTAR,        /* all these opcodes must come in pairs, with */
                    158:   OP_PLUS,           /* the minimizing one second. */
                    159:   OP_MINPLUS,        /* This first set applies to single characters */
                    160:   OP_QUERY,
                    161:   OP_MINQUERY,
                    162:   OP_UPTO,           /* From 0 to n matches */
                    163:   OP_MINUPTO,
                    164:   OP_EXACT,          /* Exactly n matches */
                    165: 
                    166:   OP_NOTSTAR,        /* The maximizing and minimizing versions of */
                    167:   OP_NOTMINSTAR,     /* all these opcodes must come in pairs, with */
                    168:   OP_NOTPLUS,        /* the minimizing one second. */
                    169:   OP_NOTMINPLUS,     /* This set applies to "not" single characters */
                    170:   OP_NOTQUERY,
                    171:   OP_NOTMINQUERY,
                    172:   OP_NOTUPTO,        /* From 0 to n matches */
                    173:   OP_NOTMINUPTO,
                    174:   OP_NOTEXACT,       /* Exactly n matches */
                    175: 
                    176:   OP_TYPESTAR,       /* The maximizing and minimizing versions of */
                    177:   OP_TYPEMINSTAR,    /* all these opcodes must come in pairs, with */
                    178:   OP_TYPEPLUS,       /* the minimizing one second. These codes must */
                    179:   OP_TYPEMINPLUS,    /* be in exactly the same order as those above. */
                    180:   OP_TYPEQUERY,      /* This set applies to character types such as \d */
                    181:   OP_TYPEMINQUERY,
                    182:   OP_TYPEUPTO,       /* From 0 to n matches */
                    183:   OP_TYPEMINUPTO,
                    184:   OP_TYPEEXACT,      /* Exactly n matches */
                    185: 
                    186:   OP_CRSTAR,         /* The maximizing and minimizing versions of */
                    187:   OP_CRMINSTAR,      /* all these opcodes must come in pairs, with */
                    188:   OP_CRPLUS,         /* the minimizing one second. These codes must */
                    189:   OP_CRMINPLUS,      /* be in exactly the same order as those above. */
                    190:   OP_CRQUERY,        /* These are for character classes and back refs */
                    191:   OP_CRMINQUERY,
                    192:   OP_CRRANGE,        /* These are different to the three sets above. */
                    193:   OP_CRMINRANGE,
                    194: 
                    195:   OP_CLASS,          /* Match a character class */
                    196:   OP_REF,            /* Match a back reference */
                    197: 
                    198:   OP_ALT,            /* Start of alternation */
                    199:   OP_KET,            /* End of group that doesn't have an unbounded repeat */
                    200:   OP_KETRMAX,        /* These two must remain together and in this */
                    201:   OP_KETRMIN,        /* order. They are for groups that repeat for ever. */
                    202: 
                    203:   /* The assertions must come before ONCE and COND */
                    204: 
                    205:   OP_ASSERT,         /* Positive lookahead */
                    206:   OP_ASSERT_NOT,     /* Negative lookahead */
                    207:   OP_ASSERTBACK,     /* Positive lookbehind */
                    208:   OP_ASSERTBACK_NOT, /* Negative lookbehind */
                    209:   OP_REVERSE,        /* Move pointer back - used in lookbehind assertions */
                    210: 
                    211:   /* ONCE and COND must come after the assertions, with ONCE first, as there's
                    212:   a test for >= ONCE for a subpattern that isn't an assertion. */
                    213: 
                    214:   OP_ONCE,           /* Once matched, don't back up into the subpattern */
                    215:   OP_COND,           /* Conditional group */
                    216:   OP_CREF,           /* Used to hold an extraction string number */
                    217: 
                    218:   OP_BRAZERO,        /* These two must remain together and in this */
                    219:   OP_BRAMINZERO,     /* order. */
                    220: 
                    221:   OP_BRA             /* This and greater values are used for brackets that
                    222:                         extract substrings. */
                    223: };
                    224: 
                    225: /* The highest extraction number. This is limited by the number of opcodes
                    226: left after OP_BRA, i.e. 255 - OP_BRA. We actually set it somewhat lower. */
                    227: 
                    228: #define EXTRACT_MAX  99
                    229: 
                    230: /* The texts of compile-time error messages are defined as macros here so that
                    231: they can be accessed by the POSIX wrapper and converted into error codes.  Yes,
                    232: I could have used error codes in the first place, but didn't feel like changing
                    233: just to accommodate the POSIX wrapper. */
                    234: 
                    235: #define ERR1  "\\ at end of pattern"
                    236: #define ERR2  "\\c at end of pattern"
                    237: #define ERR3  "unrecognized character follows \\"
                    238: #define ERR4  "numbers out of order in {} quantifier"
                    239: #define ERR5  "number too big in {} quantifier"
                    240: #define ERR6  "missing terminating ] for character class"
                    241: #define ERR7  "invalid escape sequence in character class"
                    242: #define ERR8  "range out of order in character class"
                    243: #define ERR9  "nothing to repeat"
                    244: #define ERR10 "operand of unlimited repeat could match the empty string"
                    245: #define ERR11 "internal error: unexpected repeat"
                    246: #define ERR12 "unrecognized character after (?"
                    247: #define ERR13 "too many capturing parenthesized sub-patterns"
                    248: #define ERR14 "missing )"
                    249: #define ERR15 "back reference to non-existent subpattern"
                    250: #define ERR16 "erroffset passed as NULL"
                    251: #define ERR17 "unknown option bit(s) set"
                    252: #define ERR18 "missing ) after comment"
                    253: #define ERR19 "too many sets of parentheses"
                    254: #define ERR20 "regular expression too large"
                    255: #define ERR21 "failed to get memory"
                    256: #define ERR22 "unmatched parentheses"
                    257: #define ERR23 "internal error: code overflow"
                    258: #define ERR24 "unrecognized character after (?<"
                    259: #define ERR25 "lookbehind assertion is not fixed length"
                    260: #define ERR26 "malformed number after (?("
                    261: #define ERR27 "conditional group contains more than two branches"
                    262: #define ERR28 "assertion expected after (?("
                    263: 
                    264: /* All character handling must be done as unsigned characters. Otherwise there
                    265: are problems with top-bit-set characters and functions such as isspace().
                    266: However, we leave the interface to the outside world as char *, because that
                    267: should make things easier for callers. We define a short type for unsigned char
                    268: to save lots of typing. I tried "uchar", but it causes problems on Digital
                    269: Unix, where it is defined in sys/types, so use "uschar" instead. */
                    270: 
                    271: typedef unsigned char uschar;
                    272: 
                    273: /* The real format of the start of the pcre block; the actual code vector
                    274: runs on as long as necessary after the end. */
                    275: 
                    276: typedef struct real_pcre {
                    277:   unsigned long int magic_number;  /* presumably MAGIC_NUMBER in a valid block - TODO confirm */
                    278:   const unsigned char *tables;     /* character tables; presumably laid out per the *_offset macros - TODO confirm */
                    279:   unsigned long int options;       /* option bits; presumably public PCRE_* flags plus the private flags - TODO confirm */
                    280:   uschar top_bracket;              /* NOTE(review): presumably the highest bracket number used - confirm */
                    281:   uschar top_backref;              /* NOTE(review): presumably the highest back reference used - confirm */
                    282:   uschar first_char;               /* meaningful when PCRE_FIRSTSET ("first_char is set") applies */
                    283:   uschar req_char;                 /* meaningful when PCRE_REQCHSET ("req_char is set") applies */
                    284:   uschar code[1];                  /* start of the code vector, which runs on past the end of the struct */
                    285: } real_pcre;
                    286: 
                    287: /* The real format of the extra block returned by pcre_study(). */
                    288: 
                    289: typedef struct real_pcre_extra {
                    290:   uschar options;         /* study option bits, e.g. PCRE_STUDY_MAPPED */
                    291:   uschar start_bits[32];  /* 32 bytes = 256 bits; presumably the starting-char map that PCRE_STUDY_MAPPED flags - TODO confirm */
                    292: } real_pcre_extra;
                    293: 
                    294: 
                    295: /* Structure for passing "static" information around between the functions
                    296: doing the compiling, so that they are thread-safe. */
                    297: 
                    298: typedef struct compile_data {
                    299:   const uschar *lcc;            /* Points to lower casing table */
                    300:   const uschar *fcc;            /* Points to case-flipping table */
                    301:   const uschar *cbits;          /* Points to character type table; presumably the cbit_* class bitmaps - TODO confirm */
                    302:   const uschar *ctypes;         /* Points to table of type maps; presumably entries built from ctype_* bits - TODO confirm */
                    303: } compile_data;
                    304: 
                    305: /* Structure for passing "static" information around between the functions
                    306: doing the matching, so that they are thread-safe. */
                    307: 
                    308: typedef struct match_data {
                    309:   int    errorcode;             /* Error code for the match, as the name says */
                    310:   int   *offset_vector;         /* Offset vector */
                    311:   int    offset_end;            /* One past the end (presumably of offset_vector - TODO confirm) */
                    312:   int    offset_max;            /* The maximum usable for return data */
                    313:   const uschar *lcc;            /* Points to lower casing table */
                    314:   const uschar *ctypes;         /* Points to table of type maps */
                    315:   BOOL   offset_overflow;       /* Set if too many extractions */
                    316:   BOOL   notbol;                /* NOTBOL flag */
                    317:   BOOL   noteol;                /* NOTEOL flag */
                    318:   BOOL   endonly;               /* Dollar not before final \n */
                    319:   BOOL   notempty;              /* Empty string match not wanted */
                    320:   const uschar *start_subject;  /* Start of the subject string */
                    321:   const uschar *end_subject;    /* End of the subject string */
                    322:   const uschar *start_match;    /* Start of this match attempt */
                    323:   const uschar *end_match_ptr;  /* Subject position at end of match */
                    324:   int     end_offset_top;       /* Highwater mark at end of match */
                    325: } match_data;
                    326: 
                    327: /* Bit definitions for entries in the pcre_ctypes table. */
                    328: 
                    329: #define ctype_space   0x01
                    330: #define ctype_letter  0x02
                    331: #define ctype_digit   0x04
                    332: #define ctype_xdigit  0x08
                    333: #define ctype_word    0x10   /* alphameric or '_' */
                    334: #define ctype_meta    0x80   /* regexp meta char or zero (end pattern) */
                    335: 
                    336: /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
                    337: of bits for a class map. */
                    338: 
                    339: #define cbit_digit    0      /* for \d */
                    340: #define cbit_word    32      /* for \w */
                    341: #define cbit_space   64      /* for \s */
                    342: #define cbit_length  96      /* Length of the cbits table: three maps at offsets 32 apart */
                    343: 
                    344: /* Offsets of the various tables from the base tables pointer, and
                    345: total length. */
                    346: 
                    347: #define lcc_offset      0
                    348: #define fcc_offset    256  /* 256 bytes after lcc_offset */
                    349: #define cbits_offset  512  /* 256 bytes after fcc_offset */
                    350: #define ctypes_offset (cbits_offset + cbit_length)  /* 512 + 96 = 608 */
                    351: #define tables_length (ctypes_offset + 256)  /* 608 + 256 = 864 */
                    352: 
                    353: /* End of internal.h */

E-mail: