Annotation of parser3/src/pcre/internal.h, revision 1.2

1.1       paf         1: /*************************************************
                      2: *      Perl-Compatible Regular Expressions       *
                      3: *************************************************/
                      4: 
                      5: 
                      6: /* This is a library of functions to support regular expressions whose syntax
                      7: and semantics are as close as possible to those of the Perl 5 language. See
                      8: the file Tech.Notes for some information on the internals.
                      9: 
                     10: Written by: Philip Hazel <ph10@cam.ac.uk>
                     11: 
                     12:            Copyright (c) 1997-1999 University of Cambridge
                     13: 
                     14: -----------------------------------------------------------------------------
                     15: Permission is granted to anyone to use this software for any purpose on any
                     16: computer system, and to redistribute it freely, subject to the following
                     17: restrictions:
                     18: 
                     19: 1. This software is distributed in the hope that it will be useful,
                     20:    but WITHOUT ANY WARRANTY; without even the implied warranty of
                     21:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
                     22: 
                     23: 2. The origin of this software must not be misrepresented, either by
                     24:    explicit claim or by omission.
                     25: 
                     26: 3. Altered versions must be plainly marked as such, and must not be
                     27:    misrepresented as being the original software.
                     28: 
                     29: 4. If PCRE is embedded in any software that is released under the GNU
                     30:    General Purpose Licence (GPL), then the terms of that licence shall
                     31:    supersede any condition above with which it is incompatible.
                     32: -----------------------------------------------------------------------------
                     33: */
                     34: 
                     35: /* This header contains definitions that are shared between the different
                     36: modules, but which are not relevant to the outside. */
                     37: 
                     38: /* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
                     39: define a macro for memmove() if USE_BCOPY is defined. */
                     40: 
                     41: #ifdef USE_BCOPY
                     42: #undef  memmove        /* some systems may have a macro */
                     43: #define memmove(a, b, c) bcopy(b, a, c)
                     44: #endif
                     45: 
                     46: /* Standard C headers plus the external interface definition */
                     47: 
                     48: #include <ctype.h>
                     49: #include <limits.h>
                     50: #include <stddef.h>
                     51: #include <stdio.h>
                     52: #include <stdlib.h>
                     53: #include <string.h>
                     54: #include "pcre.h"
                     55: 
                     56: /* In case there is no definition of offsetof() provided - though any proper
                     57: Standard C system should have one. */
                     58: 
                     59: #ifndef offsetof
                     60: #define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
                     61: #endif
                     62: 
                     63: /* These are the public options that can change during matching. */
                     64: 
                     65: #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
                     66: 
                     67: /* Private options flags start at the most significant end of the four bytes,
                     68: but skip the top bit so we can use ints for convenience without getting tangled
                     69: with negative values. The public options defined in pcre.h start at the least
                     70: significant end. Make sure they don't overlap, though now that we have expanded
                     71: to four bytes there is plenty of space. */
                     72: 
                     73: #define PCRE_FIRSTSET      0x40000000  /* first_char is set */
                     74: #define PCRE_REQCHSET      0x20000000  /* req_char is set */
                     75: #define PCRE_STARTLINE     0x10000000  /* start after \n for multiline */
                     76: #define PCRE_INGROUP       0x08000000  /* compiling inside a group */
                     77: #define PCRE_ICHANGED      0x04000000  /* i option changes within regex */
                     78: 
                     79: /* Options for the "extra" block produced by pcre_study(). */
                     80: 
                     81: #define PCRE_STUDY_MAPPED   0x01     /* a map of starting chars exists */
                     82: 
                     83: /* Masks for identifying the public options which are permitted at compile
                     84: time, run time or study time, respectively. */
                     85: 
                     86: #define PUBLIC_OPTIONS \
                     87:   (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
                     88:    PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY)
                     89: 
                     90: #define PUBLIC_EXEC_OPTIONS \
                     91:   (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY)
                     92: 
                     93: #define PUBLIC_STUDY_OPTIONS 0   /* None defined */
                     94: 
                     95: /* Magic number to provide a small check against being handed junk. */
                     96: 
                     97: #define MAGIC_NUMBER  0x50435245UL   /* 'PCRE' */
                     98: 
                     99: /* Miscellaneous definitions */
                    100: 
                    101: typedef int BOOL;
                    102: 
1.2     ! paf       103: #ifndef FALSE
1.1       paf       104: #define FALSE   0
1.2     ! paf       105: #endif
        !           106: #ifndef TRUE
1.1       paf       107: #define TRUE    1
1.2     ! paf       108: #endif
1.1       paf       109: 
                    110: /* These are escaped items that aren't just an encoding of a particular data
                    111: value such as \n. They must have non-zero values, as check_escape() returns
                    112: their negation. Also, they must appear in the same order as in the opcode
                    113: definitions below, up to ESC_z. The final one must be ESC_REF as subsequent
                    114: values are used for \1, \2, \3, etc. There is a test in the code for an escape
                    115: greater than ESC_b and less than ESC_Z to detect the types that may be
                    116: repeated. If any new escapes are put in-between that don't consume a character,
                    117: that code will have to change. */
                    118: 
                    119: enum { ESC_A = 1, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, ESC_w,
                    120:        ESC_Z, ESC_z, ESC_REF };  /* ESC_REF stays last: later values stand for \1, \2, ... */
                    121: 
                    122: /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
                    123: that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
                    124: OP_EOD must correspond in order to the list of escapes immediately above. */
                    125: 
                    126: enum {
                    127:   OP_END,            /* End of pattern */
                    128: 
                    129:   /* Values corresponding to backslashed metacharacters */
                    130: 
                    131:   OP_SOD,            /* Start of data: \A */
                    132:   OP_NOT_WORD_BOUNDARY,  /* \B */
                    133:   OP_WORD_BOUNDARY,      /* \b */
                    134:   OP_NOT_DIGIT,          /* \D */
                    135:   OP_DIGIT,              /* \d */
                    136:   OP_NOT_WHITESPACE,     /* \S */
                    137:   OP_WHITESPACE,         /* \s */
                    138:   OP_NOT_WORDCHAR,       /* \W */
                    139:   OP_WORDCHAR,           /* \w */
                    140:   OP_EODN,           /* End of data or \n at end of data: \Z. */
                    141:   OP_EOD,            /* End of data: \z */
                    142: 
                    143:   OP_OPT,            /* Set runtime options */
                    144:   OP_CIRC,           /* Start of line - varies with multiline switch */
                    145:   OP_DOLL,           /* End of line - varies with multiline switch */
                    146:   OP_ANY,            /* Match any character */
                    147:   OP_CHARS,          /* Match string of characters */
                    148:   OP_NOT,            /* Match anything but the following char */
                    149: 
                    150:   OP_STAR,           /* The maximizing and minimizing versions of */
                    151:   OP_MINSTAR,        /* all these opcodes must come in pairs, with */
                    152:   OP_PLUS,           /* the minimizing one second. */
                    153:   OP_MINPLUS,        /* This first set applies to single characters */
                    154:   OP_QUERY,
                    155:   OP_MINQUERY,       /* Minimizing version of OP_QUERY */
                    156:   OP_UPTO,           /* From 0 to n matches */
                    157:   OP_MINUPTO,        /* Minimizing version of OP_UPTO */
                    158:   OP_EXACT,          /* Exactly n matches */
                    159: 
                    160:   OP_NOTSTAR,        /* The maximizing and minimizing versions of */
                    161:   OP_NOTMINSTAR,     /* all these opcodes must come in pairs, with */
                    162:   OP_NOTPLUS,        /* the minimizing one second. */
                    163:   OP_NOTMINPLUS,     /* This set applies to "not" single characters */
                    164:   OP_NOTQUERY,
                    165:   OP_NOTMINQUERY,    /* Minimizing version of OP_NOTQUERY */
                    166:   OP_NOTUPTO,        /* From 0 to n matches */
                    167:   OP_NOTMINUPTO,     /* Minimizing version of OP_NOTUPTO */
                    168:   OP_NOTEXACT,       /* Exactly n matches */
                    169: 
                    170:   OP_TYPESTAR,       /* The maximizing and minimizing versions of */
                    171:   OP_TYPEMINSTAR,    /* all these opcodes must come in pairs, with */
                    172:   OP_TYPEPLUS,       /* the minimizing one second. These codes must */
                    173:   OP_TYPEMINPLUS,    /* be in exactly the same order as those above. */
                    174:   OP_TYPEQUERY,      /* This set applies to character types such as \d */
                    175:   OP_TYPEMINQUERY,   /* Minimizing version of OP_TYPEQUERY */
                    176:   OP_TYPEUPTO,       /* From 0 to n matches */
                    177:   OP_TYPEMINUPTO,    /* Minimizing version of OP_TYPEUPTO */
                    178:   OP_TYPEEXACT,      /* Exactly n matches */
                    179: 
                    180:   OP_CRSTAR,         /* The maximizing and minimizing versions of */
                    181:   OP_CRMINSTAR,      /* all these opcodes must come in pairs, with */
                    182:   OP_CRPLUS,         /* the minimizing one second. These codes must */
                    183:   OP_CRMINPLUS,      /* be in exactly the same order as those above. */
                    184:   OP_CRQUERY,        /* These are for character classes and back refs */
                    185:   OP_CRMINQUERY,     /* Minimizing version of OP_CRQUERY */
                    186:   OP_CRRANGE,        /* These are different to the three sets above. */
                    187:   OP_CRMINRANGE,     /* Minimizing version of OP_CRRANGE */
                    188: 
                    189:   OP_CLASS,          /* Match a character class */
                    190:   OP_REF,            /* Match a back reference */
                    191: 
                    192:   OP_ALT,            /* Start of alternation */
                    193:   OP_KET,            /* End of group that doesn't have an unbounded repeat */
                    194:   OP_KETRMAX,        /* These two must remain together and in this */
                    195:   OP_KETRMIN,        /* order. They are for groups that repeat for ever. */
                    196: 
                    197:   /* The assertions must come before ONCE and COND */
                    198: 
                    199:   OP_ASSERT,         /* Positive lookahead */
                    200:   OP_ASSERT_NOT,     /* Negative lookahead */
                    201:   OP_ASSERTBACK,     /* Positive lookbehind */
                    202:   OP_ASSERTBACK_NOT, /* Negative lookbehind */
                    203:   OP_REVERSE,        /* Move pointer back - used in lookbehind assertions */
                    204: 
                    205:   /* ONCE and COND must come after the assertions, with ONCE first, as there's
                    206:   a test for >= ONCE for a subpattern that isn't an assertion. */
                    207: 
                    208:   OP_ONCE,           /* Once matched, don't back up into the subpattern */
                    209:   OP_COND,           /* Conditional group */
                    210:   OP_CREF,           /* Used to hold an extraction string number */
                    211: 
                    212:   OP_BRAZERO,        /* These two must remain together and in this */
                    213:   OP_BRAMINZERO,     /* order. */
                    214: 
                    215:   OP_BRA             /* This and greater values are used for brackets that
                    216:                         extract substrings. */
                    217: };
                    218: 
                    219: /* The highest extraction number. This is limited by the number of opcodes
                    220: left after OP_BRA, i.e. 255 - OP_BRA. We actually set it somewhat lower. */
                    221: 
                    222: #define EXTRACT_MAX  99
                    223: 
                    224: /* The texts of compile-time error messages are defined as macros here so that
                    225: they can be accessed by the POSIX wrapper and converted into error codes.  Yes,
                    226: I could have used error codes in the first place, but didn't feel like changing
                    227: just to accommodate the POSIX wrapper. */
                    228: 
                    229: #define ERR1  "\\ at end of pattern"
                    230: #define ERR2  "\\c at end of pattern"
                    231: #define ERR3  "unrecognized character follows \\"
                    232: #define ERR4  "numbers out of order in {} quantifier"
                    233: #define ERR5  "number too big in {} quantifier"
                    234: #define ERR6  "missing terminating ] for character class"
                    235: #define ERR7  "invalid escape sequence in character class"
                    236: #define ERR8  "range out of order in character class"
                    237: #define ERR9  "nothing to repeat"
                    238: #define ERR10 "operand of unlimited repeat could match the empty string"
                    239: #define ERR11 "internal error: unexpected repeat"
                    240: #define ERR12 "unrecognized character after (?"
                    241: #define ERR13 "too many capturing parenthesized sub-patterns"
                    242: #define ERR14 "missing )"
                    243: #define ERR15 "back reference to non-existent subpattern"
                    244: #define ERR16 "erroffset passed as NULL"
                    245: #define ERR17 "unknown option bit(s) set"
                    246: #define ERR18 "missing ) after comment"
                    247: #define ERR19 "too many sets of parentheses"
                    248: #define ERR20 "regular expression too large"
                    249: #define ERR21 "failed to get memory"
                    250: #define ERR22 "unmatched parentheses"
                    251: #define ERR23 "internal error: code overflow"
                    252: #define ERR24 "unrecognized character after (?<"
                    253: #define ERR25 "lookbehind assertion is not fixed length"
                    254: #define ERR26 "malformed number after (?("
                    255: #define ERR27 "conditional group contains more than two branches"
                    256: #define ERR28 "assertion expected after (?("
                    257: 
                    258: /* All character handling must be done as unsigned characters. Otherwise there
                    259: are problems with top-bit-set characters and functions such as isspace().
                    260: However, we leave the interface to the outside world as char *, because that
                    261: should make things easier for callers. We define a short type for unsigned char
                    262: to save lots of typing. I tried "uchar", but it causes problems on Digital
                    263: Unix, where it is defined in sys/types, so use "uschar" instead. */
                    264: 
                    265: typedef unsigned char uschar;
                    266: 
                    267: /* The real format of the start of the pcre block; the actual code vector
                    268: runs on as long as necessary after the end. */
                    269: 
                    270: typedef struct real_pcre {
                    271:   unsigned long int magic_number;  /* Presumably holds MAGIC_NUMBER, the check against junk input -- confirm */
                    272:   const unsigned char *tables;     /* Character tables; presumably the base that the *_offset macros index -- confirm */
                    273:   unsigned long int options;       /* Option bits; presumably public and private PCRE_ flags together -- confirm */
                    274:   uschar top_bracket;              /* NOTE(review): looks like the highest bracket number used -- confirm */
                    275:   uschar top_backref;              /* NOTE(review): looks like the highest back reference number -- confirm */
                    276:   uschar first_char;               /* Meaningful when PCRE_FIRSTSET is set */
                    277:   uschar req_char;                 /* Meaningful when PCRE_REQCHSET is set */
                    278:   uschar code[1];                  /* Start of the code vector, which runs on past the end of the struct */
                    279: } real_pcre;
                    280: 
                    281: /* The real format of the extra block returned by pcre_study(). */
                    282: 
                    283: typedef struct real_pcre_extra {
                    284:   uschar options;         /* Study option flags for this block, e.g. PCRE_STUDY_MAPPED */
                    285:   uschar start_bits[32];  /* 32 bytes = 256 bits; presumably the starting-char map that PCRE_STUDY_MAPPED announces -- confirm */
                    286: } real_pcre_extra;
                    287: 
                    288: 
                    289: /* Structure for passing "static" information around between the functions
                    290: doing the compiling, so that they are thread-safe. */
                    291: 
                    292: typedef struct compile_data {
                    293:   const uschar *lcc;            /* Points to lower casing table */
                    294:   const uschar *fcc;            /* Points to case-flipping table */
                    295:   const uschar *cbits;          /* Points to the class bitmap tables (see cbit_* offsets) */
                    296:   const uschar *ctypes;         /* Points to table of ctype_* flag entries */
                    297: } compile_data;
                    298: 
                    299: /* Structure for passing "static" information around between the functions
                    300: doing the matching, so that they are thread-safe. */
                    301: 
                    302: typedef struct match_data {
                    303:   int    errorcode;             /* Error code (no further detail visible in this header) */
                    304:   int   *offset_vector;         /* Offset vector */
                    305:   int    offset_end;            /* One past the end (presumably of offset_vector -- confirm) */
                    306:   int    offset_max;            /* The maximum usable for return data */
                    307:   const uschar *lcc;            /* Points to lower casing table */
                    308:   const uschar *ctypes;         /* Points to table of ctype_* flag entries */
                    309:   BOOL   offset_overflow;       /* Set if too many extractions */
                    310:   BOOL   notbol;                /* NOTBOL flag (cf. PCRE_NOTBOL) */
                    311:   BOOL   noteol;                /* NOTEOL flag (cf. PCRE_NOTEOL) */
                    312:   BOOL   endonly;               /* Dollar not before final \n (cf. PCRE_DOLLAR_ENDONLY) */
                    313:   BOOL   notempty;              /* Empty string match not wanted (cf. PCRE_NOTEMPTY) */
                    314:   const uschar *start_subject;  /* Start of the subject string */
                    315:   const uschar *end_subject;    /* End of the subject string */
                    316:   const uschar *start_match;    /* Start of this match attempt */
                    317:   const uschar *end_match_ptr;  /* Subject position at end match */
                    318:   int     end_offset_top;       /* Highwater mark at end of match */
                    319: } match_data;
                    320: 
                    321: /* Bit definitions for entries in the pcre_ctypes table. */
                    322: 
                    323: #define ctype_space   0x01
                    324: #define ctype_letter  0x02
                    325: #define ctype_digit   0x04
                    326: #define ctype_xdigit  0x08
                    327: #define ctype_word    0x10   /* alphameric or '_' */
                    328: #define ctype_meta    0x80   /* regexp meta char or zero (end pattern) */
                    329: 
                    330: /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
                    331: of bits for a class map. */
                    332: 
                    333: #define cbit_digit    0      /* for \d */
                    334: #define cbit_word    32      /* for \w */
                    335: #define cbit_space   64      /* for \s */
                    336: #define cbit_length  96      /* Length of the cbits table */
                    337: 
                    338: /* Offsets of the various tables from the base tables pointer, and
                    339: total length. */
                    340: 
                    341: #define lcc_offset      0
                    342: #define fcc_offset    256
                    343: #define cbits_offset  512
                    344: #define ctypes_offset (cbits_offset + cbit_length)
                    345: #define tables_length (ctypes_offset + 256)
                    346: 
                    347: /* End of internal.h */

E-mail: