Annotation of parser3/src/lib/pcre/internal.h, revision 1.1

1.1     ! paf         1: /*************************************************
        !             2: *      Perl-Compatible Regular Expressions       *
        !             3: *************************************************/
        !             4: 
        !             5: 
        !             6: /* This is a library of functions to support regular expressions whose syntax
        !             7: and semantics are as close as possible to those of the Perl 5 language. See
        !             8: the file Tech.Notes for some information on the internals.
        !             9: 
        !            10: Written by: Philip Hazel <ph10@cam.ac.uk>
        !            11: 
        !            12:            Copyright (c) 1997-1999 University of Cambridge
        !            13: 
        !            14: -----------------------------------------------------------------------------
        !            15: Permission is granted to anyone to use this software for any purpose on any
        !            16: computer system, and to redistribute it freely, subject to the following
        !            17: restrictions:
        !            18: 
        !            19: 1. This software is distributed in the hope that it will be useful,
        !            20:    but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            21:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
        !            22: 
        !            23: 2. The origin of this software must not be misrepresented, either by
        !            24:    explicit claim or by omission.
        !            25: 
        !            26: 3. Altered versions must be plainly marked as such, and must not be
        !            27:    misrepresented as being the original software.
        !            28: 
        !            29: 4. If PCRE is embedded in any software that is released under the GNU
        !            30:    General Purpose Licence (GPL), then the terms of that licence shall
        !            31:    supersede any condition above with which it is incompatible.
        !            32: -----------------------------------------------------------------------------
        !            33: */
        !            34: 
        !            35: /* This header contains definitions that are shared between the different
        !            36: modules, but which are not relevant to the outside. */
        !            37: 
        !            38: /* To cope with SunOS4 and other systems that lack memmove() but have bcopy(),
        !            39: define a macro for memmove() if USE_BCOPY is defined. */
        !            40: 
        !            41: #ifdef USE_BCOPY
        !            42: #undef  memmove        /* some systems may have a macro */
        !            43: #define memmove(a, b, c) bcopy(b, a, c)
        !            44: #endif
        !            45: 
        !            46: /* Standard C headers plus the external interface definition */
        !            47: 
        !            48: #include <ctype.h>
        !            49: /* #include <limits.h> 
        !            50: PAF@design.ru removed this, for it were used only for stupid MAX_INT for
        !            51: mimimum searches, changed that to some out-of-reasonable-text-length value
        !            52: of 10M
        !            53: */
        !            54: #define PCRE_MAX_POS (10*0x400*0x400)
        !            55: 
        !            56: #include <stddef.h>
        !            57: #include <stdio.h>
        !            58: #include <stdlib.h>
        !            59: #include <string.h>
        !            60: #include "pcre.h"
        !            61: 
        !            62: /* In case there is no definition of offsetof() provided - though any proper
        !            63: Standard C system should have one. */
        !            64: 
        !            65: #ifndef offsetof
        !            66: #define offsetof(p_type,field) ((size_t)&(((p_type *)0)->field))
        !            67: #endif
        !            68: 
        !            69: /* These are the public options that can change during matching. */
        !            70: 
        !            71: #define PCRE_IMS (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL)
        !            72: 
        !            73: /* Private options flags start at the most significant end of the four bytes,
        !            74: but skip the top bit so we can use ints for convenience without getting tangled
        !            75: with negative values. The public options defined in pcre.h start at the least
        !            76: significant end. Make sure they don't overlap, though now that we have expanded
        !            77: to four bytes there is plenty of space. */
        !            78: 
        !            79: #define PCRE_FIRSTSET      0x40000000  /* first_char is set */
        !            80: #define PCRE_REQCHSET      0x20000000  /* req_char is set */
        !            81: #define PCRE_STARTLINE     0x10000000  /* start after \n for multiline */
        !            82: #define PCRE_INGROUP       0x08000000  /* compiling inside a group */
        !            83: #define PCRE_ICHANGED      0x04000000  /* i option changes within regex */
        !            84: 
        !            85: /* Options for the "extra" block produced by pcre_study(). */
        !            86: 
        !            87: #define PCRE_STUDY_MAPPED   0x01     /* a map of starting chars exists */
        !            88: 
        !            89: /* Masks for identifying the public options which are permitted at compile
        !            90: time, run time or study time, respectively. */
        !            91: 
        !            92: #define PUBLIC_OPTIONS \
        !            93:   (PCRE_CASELESS|PCRE_EXTENDED|PCRE_ANCHORED|PCRE_MULTILINE| \
        !            94:    PCRE_DOTALL|PCRE_DOLLAR_ENDONLY|PCRE_EXTRA|PCRE_UNGREEDY)
        !            95: 
        !            96: #define PUBLIC_EXEC_OPTIONS \
        !            97:   (PCRE_ANCHORED|PCRE_NOTBOL|PCRE_NOTEOL|PCRE_NOTEMPTY)
        !            98: 
        !            99: #define PUBLIC_STUDY_OPTIONS 0   /* None defined */
        !           100: 
        !           101: /* Magic number to provide a small check against being handed junk. */
        !           102: 
        !           103: #define MAGIC_NUMBER  0x50435245UL   /* 'PCRE' */
        !           104: 
        !           105: /* Miscellaneous definitions */
        !           106: 
        !           107: typedef int BOOL;
        !           108: 
        !           109: #ifndef FALSE
        !           110: #define FALSE   0
        !           111: #endif
        !           112: #ifndef TRUE
        !           113: #define TRUE    1
        !           114: #endif
        !           115: 
        !           116: /* These are escaped items that aren't just an encoding of a particular data
        !           117: value such as \n. They must have non-zero values, as check_escape() returns
        !           118: their negation. Also, they must appear in the same order as in the opcode
        !           119: definitions below, up to ESC_z. The final one must be ESC_REF as subsequent
        !           120: values are used for \1, \2, \3, etc. There is a test in the code for an escape
        !           121: greater than ESC_b and less than ESC_X to detect the types that may be
        !           122: repeated. If any new escapes are put in-between that don't consume a character,
        !           123: that code will have to change. */
        !           124: 
        !           125: enum { ESC_A = 1, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, ESC_w,
        !           126:        ESC_Z, ESC_z, ESC_REF };
        !           127: 
        !           128: /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets
        !           129: that extract substrings. Starting from 1 (i.e. after OP_END), the values up to
        !           130: OP_EOD must correspond in order to the list of escapes immediately above. */
        !           131: 
        !           132: enum {
        !           133:   OP_END,            /* End of pattern */
        !           134: 
        !           135:   /* Values corresponding to backslashed metacharacters */
        !           136: 
        !           137:   OP_SOD,            /* Start of data: \A */
        !           138:   OP_NOT_WORD_BOUNDARY,  /* \B */
        !           139:   OP_WORD_BOUNDARY,      /* \b */
        !           140:   OP_NOT_DIGIT,          /* \D */
        !           141:   OP_DIGIT,              /* \d */
        !           142:   OP_NOT_WHITESPACE,     /* \S */
        !           143:   OP_WHITESPACE,         /* \s */
        !           144:   OP_NOT_WORDCHAR,       /* \W */
        !           145:   OP_WORDCHAR,           /* \w */
        !           146:   OP_EODN,           /* End of data or \n at end of data: \Z. */
        !           147:   OP_EOD,            /* End of data: \z */
        !           148: 
        !           149:   OP_OPT,            /* Set runtime options */
        !           150:   OP_CIRC,           /* Start of line - varies with multiline switch */
        !           151:   OP_DOLL,           /* End of line - varies with multiline switch */
        !           152:   OP_ANY,            /* Match any character */
        !           153:   OP_CHARS,          /* Match string of characters */
        !           154:   OP_NOT,            /* Match anything but the following char */
        !           155: 
        !           156:   OP_STAR,           /* The maximizing and minimizing versions of */
        !           157:   OP_MINSTAR,        /* all these opcodes must come in pairs, with */
        !           158:   OP_PLUS,           /* the minimizing one second. */
        !           159:   OP_MINPLUS,        /* This first set applies to single characters */
        !           160:   OP_QUERY,
        !           161:   OP_MINQUERY,
        !           162:   OP_UPTO,           /* From 0 to n matches */
        !           163:   OP_MINUPTO,
        !           164:   OP_EXACT,          /* Exactly n matches */
        !           165: 
        !           166:   OP_NOTSTAR,        /* The maximizing and minimizing versions of */
        !           167:   OP_NOTMINSTAR,     /* all these opcodes must come in pairs, with */
        !           168:   OP_NOTPLUS,        /* the minimizing one second. */
        !           169:   OP_NOTMINPLUS,     /* This first set applies to "not" single characters */
        !           170:   OP_NOTQUERY,
        !           171:   OP_NOTMINQUERY,
        !           172:   OP_NOTUPTO,        /* From 0 to n matches */
        !           173:   OP_NOTMINUPTO,
        !           174:   OP_NOTEXACT,       /* Exactly n matches */
        !           175: 
        !           176:   OP_TYPESTAR,       /* The maximizing and minimizing versions of */
        !           177:   OP_TYPEMINSTAR,    /* all these opcodes must come in pairs, with */
        !           178:   OP_TYPEPLUS,       /* the minimizing one second. These codes must */
        !           179:   OP_TYPEMINPLUS,    /* be in exactly the same order as those above. */
        !           180:   OP_TYPEQUERY,      /* This set applies to character types such as \d */
        !           181:   OP_TYPEMINQUERY,
        !           182:   OP_TYPEUPTO,       /* From 0 to n matches */
        !           183:   OP_TYPEMINUPTO,
        !           184:   OP_TYPEEXACT,      /* Exactly n matches */
        !           185: 
        !           186:   OP_CRSTAR,         /* The maximizing and minimizing versions of */
        !           187:   OP_CRMINSTAR,      /* all these opcodes must come in pairs, with */
        !           188:   OP_CRPLUS,         /* the minimizing one second. These codes must */
        !           189:   OP_CRMINPLUS,      /* be in exactly the same order as those above. */
        !           190:   OP_CRQUERY,        /* These are for character classes and back refs */
        !           191:   OP_CRMINQUERY,
        !           192:   OP_CRRANGE,        /* These are different to the three seta above. */
        !           193:   OP_CRMINRANGE,
        !           194: 
        !           195:   OP_CLASS,          /* Match a character class */
        !           196:   OP_REF,            /* Match a back reference */
        !           197: 
        !           198:   OP_ALT,            /* Start of alternation */
        !           199:   OP_KET,            /* End of group that doesn't have an unbounded repeat */
        !           200:   OP_KETRMAX,        /* These two must remain together and in this */
        !           201:   OP_KETRMIN,        /* order. They are for groups the repeat for ever. */
        !           202: 
        !           203:   /* The assertions must come before ONCE and COND */
        !           204: 
        !           205:   OP_ASSERT,         /* Positive lookahead */
        !           206:   OP_ASSERT_NOT,     /* Negative lookahead */
        !           207:   OP_ASSERTBACK,     /* Positive lookbehind */
        !           208:   OP_ASSERTBACK_NOT, /* Negative lookbehind */
        !           209:   OP_REVERSE,        /* Move pointer back - used in lookbehind assertions */
        !           210: 
        !           211:   /* ONCE and COND must come after the assertions, with ONCE first, as there's
        !           212:   a test for >= ONCE for a subpattern that isn't an assertion. */
        !           213: 
        !           214:   OP_ONCE,           /* Once matched, don't back up into the subpattern */
        !           215:   OP_COND,           /* Conditional group */
        !           216:   OP_CREF,           /* Used to hold an extraction string number */
        !           217: 
        !           218:   OP_BRAZERO,        /* These two must remain together and in this */
        !           219:   OP_BRAMINZERO,     /* order. */
        !           220: 
        !           221:   OP_BRA             /* This and greater values are used for brackets that
        !           222:                         extract substrings. */
        !           223: };
        !           224: 
        !           225: /* The highest extraction number. This is limited by the number of opcodes
        !           226: left after OP_BRA, i.e. 255 - OP_BRA. We actually set it somewhat lower. */
        !           227: 
        !           228: #define EXTRACT_MAX  99
        !           229: 
        !           230: /* The texts of compile-time error messages are defined as macros here so that
        !           231: they can be accessed by the POSIX wrapper and converted into error codes.  Yes,
        !           232: I could have used error codes in the first place, but didn't feel like changing
        !           233: just to accommodate the POSIX wrapper. */
        !           234: 
        !           235: #define ERR1  "\\ at end of pattern"
        !           236: #define ERR2  "\\c at end of pattern"
        !           237: #define ERR3  "unrecognized character follows \\"
        !           238: #define ERR4  "numbers out of order in {} quantifier"
        !           239: #define ERR5  "number too big in {} quantifier"
        !           240: #define ERR6  "missing terminating ] for character class"
        !           241: #define ERR7  "invalid escape sequence in character class"
        !           242: #define ERR8  "range out of order in character class"
        !           243: #define ERR9  "nothing to repeat"
        !           244: #define ERR10 "operand of unlimited repeat could match the empty string"
        !           245: #define ERR11 "internal error: unexpected repeat"
        !           246: #define ERR12 "unrecognized character after (?"
        !           247: #define ERR13 "too many capturing parenthesized sub-patterns"
        !           248: #define ERR14 "missing )"
        !           249: #define ERR15 "back reference to non-existent subpattern"
        !           250: #define ERR16 "erroffset passed as NULL"
        !           251: #define ERR17 "unknown option bit(s) set"
        !           252: #define ERR18 "missing ) after comment"
        !           253: #define ERR19 "too many sets of parentheses"
        !           254: #define ERR20 "regular expression too large"
        !           255: #define ERR21 "failed to get memory"
        !           256: #define ERR22 "unmatched parentheses"
        !           257: #define ERR23 "internal error: code overflow"
        !           258: #define ERR24 "unrecognized character after (?<"
        !           259: #define ERR25 "lookbehind assertion is not fixed length"
        !           260: #define ERR26 "malformed number after (?("
        !           261: #define ERR27 "conditional group contains more than two branches"
        !           262: #define ERR28 "assertion expected after (?("
        !           263: 
        !           264: /* All character handling must be done as unsigned characters. Otherwise there
        !           265: are problems with top-bit-set characters and functions such as isspace().
        !           266: However, we leave the interface to the outside world as char *, because that
        !           267: should make things easier for callers. We define a short type for unsigned char
        !           268: to save lots of typing. I tried "uchar", but it causes problems on Digital
        !           269: Unix, where it is defined in sys/types, so use "uschar" instead. */
        !           270: 
        !           271: typedef unsigned char uschar;
        !           272: 
        !           273: /* The real format of the start of the pcre block; the actual code vector
        !           274: runs on as long as necessary after the end. */
        !           275: 
        !           276: typedef struct real_pcre {
        !           277:   unsigned long int magic_number;
        !           278:   const unsigned char *tables;
        !           279:   unsigned long int options;
        !           280:   uschar top_bracket;
        !           281:   uschar top_backref;
        !           282:   uschar first_char;
        !           283:   uschar req_char;
        !           284:   uschar code[1];
        !           285: } real_pcre;
        !           286: 
        !           287: /* The real format of the extra block returned by pcre_study(). */
        !           288: 
        !           289: typedef struct real_pcre_extra {
        !           290:   uschar options;
        !           291:   uschar start_bits[32];
        !           292: } real_pcre_extra;
        !           293: 
        !           294: 
        !           295: /* Structure for passing "static" information around between the functions
        !           296: doing the compiling, so that they are thread-safe. */
        !           297: 
        !           298: typedef struct compile_data {
        !           299:   const uschar *lcc;            /* Points to lower casing table */
        !           300:   const uschar *fcc;            /* Points to case-flipping table */
        !           301:   const uschar *cbits;          /* Points to character type table */
        !           302:   const uschar *ctypes;         /* Points to table of type maps */
        !           303: } compile_data;
        !           304: 
        !           305: /* Structure for passing "static" information around between the functions
        !           306: doing the matching, so that they are thread-safe. */
        !           307: 
        !           308: typedef struct match_data {
        !           309:   int    errorcode;             /* As it says */
        !           310:   int   *offset_vector;         /* Offset vector */
        !           311:   int    offset_end;            /* One past the end */
        !           312:   int    offset_max;            /* The maximum usable for return data */
        !           313:   const uschar *lcc;            /* Points to lower casing table */
        !           314:   const uschar *ctypes;         /* Points to table of type maps */
        !           315:   BOOL   offset_overflow;       /* Set if too many extractions */
        !           316:   BOOL   notbol;                /* NOTBOL flag */
        !           317:   BOOL   noteol;                /* NOTEOL flag */
        !           318:   BOOL   endonly;               /* Dollar not before final \n */
        !           319:   BOOL   notempty;              /* Empty string match not wanted */
        !           320:   const uschar *start_subject;  /* Start of the subject string */
        !           321:   const uschar *end_subject;    /* End of the subject string */
        !           322:   const uschar *start_match;    /* Start of this match attempt */
        !           323:   const uschar *end_match_ptr;  /* Subject position at end match */
        !           324:   int     end_offset_top;       /* Highwater mark at end of match */
        !           325: } match_data;
        !           326: 
        !           327: /* Bit definitions for entries in the pcre_ctypes table. */
        !           328: 
        !           329: #define ctype_space   0x01
        !           330: #define ctype_letter  0x02
        !           331: #define ctype_digit   0x04
        !           332: #define ctype_xdigit  0x08
        !           333: #define ctype_word    0x10   /* alphameric or '_' */
        !           334: #define ctype_meta    0x80   /* regexp meta char or zero (end pattern) */
        !           335: 
        !           336: /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set
        !           337: of bits for a class map. */
        !           338: 
        !           339: #define cbit_digit    0      /* for \d */
        !           340: #define cbit_word    32      /* for \w */
        !           341: #define cbit_space   64      /* for \s */
        !           342: #define cbit_length  96      /* Length of the cbits table */
        !           343: 
        !           344: /* Offsets of the various tables from the base tables pointer, and
        !           345: total length. */
        !           346: 
        !           347: #define lcc_offset      0
        !           348: #define fcc_offset    256
        !           349: #define cbits_offset  512
        !           350: #define ctypes_offset (cbits_offset + cbit_length)
        !           351: #define tables_length (ctypes_offset + 256)
        !           352: 
        !           353: /* End of internal.h */

E-mail: