Annotation of parser3/src/pcre/pcre.c, revision 1.2

1.1       paf         1: /*************************************************
                      2: *      Perl-Compatible Regular Expressions       *
                      3: *************************************************/
                      4: 
                      5: /*
                      6: This is a library of functions to support regular expressions whose syntax
                      7: and semantics are as close as possible to those of the Perl 5 language. See
                      8: the file Tech.Notes for some information on the internals.
                      9: 
                     10: Written by: Philip Hazel <ph10@cam.ac.uk>
                     11: 
                     12:            Copyright (c) 1997-1999 University of Cambridge
                     13: 
                     14: -----------------------------------------------------------------------------
                     15: Permission is granted to anyone to use this software for any purpose on any
                     16: computer system, and to redistribute it freely, subject to the following
                     17: restrictions:
                     18: 
                     19: 1. This software is distributed in the hope that it will be useful,
                     20:    but WITHOUT ANY WARRANTY; without even the implied warranty of
                     21:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
                     22: 
                     23: 2. The origin of this software must not be misrepresented, either by
                     24:    explicit claim or by omission.
                     25: 
                     26: 3. Altered versions must be plainly marked as such, and must not be
                     27:    misrepresented as being the original software.
                     28: 
                     29: 4. If PCRE is embedded in any software that is released under the GNU
                     30:    General Purpose Licence (GPL), then the terms of that licence shall
                     31:    supersede any condition above with which it is incompatible.
                     32: -----------------------------------------------------------------------------
                     33: */
                     34: 
                     35: 
                     36: /* Define DEBUG to get debugging output on stdout. */
                     37: 
                     38: /* #define DEBUG */
                     39: 
                     40: /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
                     41: inline, and there are *still* stupid compilers about that don't like indented
                     42: pre-processor statements. I suppose it's only been 10 years... */
                     43: 
                     44: #ifdef DEBUG
                     45: #define DPRINTF(p) printf p
                     46: #else
                     47: #define DPRINTF(p) /*nothing*/
                     48: #endif
                     49: 
                     50: /* Include the internals header, which itself includes Standard C headers plus
                     51: the external pcre header. */
                     52: 
                     53: #include "internal.h"
                     54: 
                     55: 
                     56: /* Allow compilation as C++ source code, should anybody want to do that. */
                     57: 
                     58: #ifdef __cplusplus
                     59: #define class pcre_class
                     60: #endif
                     61: 
                     62: 
                     63: /* Number of items on the nested bracket stacks at compile time. This should
                     64: not be set greater than 200. */
                     65: 
                     66: #define BRASTACK_SIZE 200
                     67: 
                     68: 
                     69: /* Min and max values for the common repeats; for the maxima, 0 => infinity */
                     70: 
                     71: static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
                     72: static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
                     73: 
                     74: /* Text forms of OP_ values and things, for debugging (not all used) */
                     75: 
                     76: #ifdef DEBUG
                     77: static const char *OP_names[] = {
                     78:   "End", "\\A", "\\B", "\\b", "\\D", "\\d",
                     79:   "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
                     80:   "Opt", "^", "$", "Any", "chars", "not",
                     81:   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
                     82:   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
                     83:   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
                     84:   "*", "*?", "+", "+?", "?", "??", "{", "{",
                     85:   "class", "Ref",
                     86:   "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
                     87:   "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
                     88:   "Brazero", "Braminzero", "Bra"
                     89: };
                     90: #endif
                     91: 
                     92: /* Table for handling escaped characters in the range '0'-'z'. Positive returns
                     93: are simple data values; negative values are for special things like \d and so
                     94: on. Zero means further processing is needed (for things like \x), or the escape
                     95: is invalid. */
                     96: 
                     97: static const short int escapes[] = {
                     98:     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
                     99:     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
                    100:   '@', -ESC_A, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */
                    101:     0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
                    102:     0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
                    103:     0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
                    104:   '`',      7, -ESC_b,      0, -ESC_d,     27,   '\f',      0,   /* ` - g */
                    105:     0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */
                    106:     0,      0,   '\r', -ESC_s,   '\t',      0,      0, -ESC_w,   /* p - w */
                    107:     0,      0, -ESC_z                                            /* x - z */
                    108: };
                    109: 
                    110: /* Definition to allow mutual recursion */
                    111: 
                    112: static BOOL
                    113:   compile_regex(int, int, int *, uschar **, const uschar **, const char **,
                    114:     BOOL, int, int *, int *, compile_data *);
                    115: 
                    116: 
                    117: 
                    118: /*************************************************
                    119: *               Global variables                 *
                    120: *************************************************/
                    121: 
                    122: /* PCRE is thread-clean and doesn't use any global variables in the normal
                    123: sense. However, it calls memory allocation and free functions via the two
                    124: indirections below, which can be changed by the caller, but are shared
                    125: between all threads. */
                    126: 
                    127: void *(*pcre_malloc)(size_t) = malloc;
                    128: void  (*pcre_free)(void *) = free;
                    129: 
                    130: 
                    131: 
                    132: 
                    133: /*************************************************
                    134: *             Default character tables           *
                    135: *************************************************/
                    136: 
                    137: /* A default set of character tables is included in the PCRE binary. Its source
                    138: is built by the maketables auxiliary program, which uses the default C ctypes
                    139: functions, and put in the file chartables.c. These tables are used by PCRE
                    140: whenever the caller of pcre_compile() does not provide an alternate set of
                    141: tables. */
                    142: 
                    143: #include "chartables.c"
                    144: 
                    145: 
                    146: 
                    147: /*************************************************
                    148: *          Return version string                 *
                    149: *************************************************/
                    150: 
                    151: #define STRING(a)  # a
                    152: #define XSTRING(s) STRING(s)
                    153: 
                    154: const char *
                    155: pcre_version(void)
                    156: {
                    157: return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
                    158: }
                    159: 
                    160: 
                    161: 
                    162: 
                    163: /*************************************************
                    164: *       Return info about a compiled pattern     *
                    165: *************************************************/
                    166: 
                    167: /* This function picks potentially useful data out of the private
                    168: structure. The public options are passed back in an int - though the
                    169: re->options field has been expanded to a long int, all the public options
                    170: are at the low end of it, and so even on 16-bit systems this will still be OK.
                    171: Therefore, I haven't changed the API for pcre_info().
                    172: 
                    173: Arguments:
                    174:   external_re   points to compiled code
                    175:   optptr        where to pass back the options
                    176:   first_char    where to pass back the first character,
                    177:                 or -1 if multiline and all branches start ^,
                    178:                 or -2 otherwise
                    179: 
                    180: Returns:        number of identifying extraction brackets
                    181:                 or negative values on error
                    182: */
                    183: 
                    184: int
                    185: pcre_info(const pcre *external_re, int *optptr, int *first_char)
                    186: {
                    187: const real_pcre *re = (const real_pcre *)external_re;
                    188: if (re == NULL) return PCRE_ERROR_NULL;
                    189: if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
                    190: if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
                    191: if (first_char != NULL)
                    192:   *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
                    193:      ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
                    194: return re->top_bracket;
                    195: }
                    196: 
                    197: 
                    198: 
                    199: 
                    200: #ifdef DEBUG
                    201: /*************************************************
                    202: *        Debugging function to print chars       *
                    203: *************************************************/
                    204: 
                    205: /* Print a sequence of chars in printable format, stopping at the end of the
                    206: subject if requested.
                    207: 
                    208: Arguments:
                    209:   p           points to characters
                    210:   length      number to print
                    211:   is_subject  TRUE if printing from within md->start_subject
                    212:   md          pointer to matching data block, if is_subject is TRUE
                    213: 
                    214: Returns:     nothing
                    215: */
                    216: 
                    217: static void
                    218: pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
                    219: {
                    220: int c;
                    221: if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
                    222: while (length-- > 0)
                    223:   if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
                    224: }
                    225: #endif
                    226: 
                    227: 
                    228: 
                    229: 
                    230: /*************************************************
                    231: *            Handle escapes                      *
                    232: *************************************************/
                    233: 
                    234: /* This function is called when a \ has been encountered. It either returns a
                    235: positive value for a simple escape such as \n, or a negative value which
                    236: encodes one of the more complicated things such as \d. On entry, ptr is
                    237: pointing at the \. On exit, it is on the final character of the escape
                    238: sequence.
                    239: 
                    240: Arguments:
                    241:   ptrptr     points to the pattern position pointer
                    242:   errorptr   points to the pointer to the error message
                    243:   bracount   number of previous extracting brackets
                    244:   options    the options bits
                    245:   isclass    TRUE if inside a character class
                    246:   cd         pointer to char tables block
                    247: 
                    248: Returns:     zero or positive => a data character
                    249:              negative => a special escape sequence
                    250:              on error, errorptr is set
                    251: */
                    252: 
                    253: static int
                    254: check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
                    255:   int options, BOOL isclass, compile_data *cd)
                    256: {
                    257: const uschar *ptr = *ptrptr;
                    258: int c = *(++ptr) & 255;   /* Ensure > 0 on signed-char systems */
                    259: int i;
                    260: 
                    261: if (c == 0) *errorptr = ERR1;
                    262: 
                    263: /* Digits or letters may have special meaning; all others are literals. */
                    264: 
                    265: else if (c < '0' || c > 'z') {}
                    266: 
                    267: /* Do an initial lookup in a table. A non-zero result is something that can be
                    268: returned immediately. Otherwise further processing may be required. */
                    269: 
                    270: else if ((i = escapes[c - '0']) != 0) c = i;
                    271: 
                    272: /* Escapes that need further processing, or are illegal. */
                    273: 
                    274: else
                    275:   {
                    276:   const uschar *oldptr;
                    277:   switch (c)
                    278:     {
                    279:     /* The handling of escape sequences consisting of a string of digits
                    280:     starting with one that is not zero is not straightforward. By experiment,
                    281:     the way Perl works seems to be as follows:
                    282: 
                    283:     Outside a character class, the digits are read as a decimal number. If the
                    284:     number is less than 10, or if there are that many previous extracting
                    285:     left brackets, then it is a back reference. Otherwise, up to three octal
                    286:     digits are read to form an escaped byte. Thus \123 is likely to be octal
                    287:     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
                    288:     value is greater than 377, the least significant 8 bits are taken. Inside a
                    289:     character class, \ followed by a digit is always an octal number. */
                    290: 
                    291:     case '1': case '2': case '3': case '4': case '5':
                    292:     case '6': case '7': case '8': case '9':
                    293: 
                    294:     if (!isclass)
                    295:       {
                    296:       oldptr = ptr;
                    297:       c -= '0';
                    298:       while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
                    299:         c = c * 10 + *(++ptr) - '0';
                    300:       if (c < 10 || c <= bracount)
                    301:         {
                    302:         c = -(ESC_REF + c);
                    303:         break;
                    304:         }
                    305:       ptr = oldptr;      /* Put the pointer back and fall through */
                    306:       }
                    307: 
                    308:     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
                    309:     generates a binary zero byte and treats the digit as a following literal.
                    310:     Thus we have to pull back the pointer by one. */
                    311: 
                    312:     if ((c = *ptr) >= '8')
                    313:       {
                    314:       ptr--;
                    315:       c = 0;
                    316:       break;
                    317:       }
                    318: 
                    319:     /* \0 always starts an octal number, but we may drop through to here with a
                    320:     larger first octal digit */
                    321: 
                    322:     case '0':
                    323:     c -= '0';
                    324:     while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
                    325:       ptr[1] != '8' && ptr[1] != '9')
                    326:         c = c * 8 + *(++ptr) - '0';
                    327:     break;
                    328: 
                    329:     /* Special escapes not starting with a digit are straightforward */
                    330: 
                    331:     case 'x':
                    332:     c = 0;
                    333:     while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
                    334:       {
                    335:       ptr++;
                    336:       c = c * 16 + cd->lcc[*ptr] -
                    337:         (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
                    338:       }
                    339:     break;
                    340: 
                    341:     case 'c':
                    342:     c = *(++ptr);
                    343:     if (c == 0)
                    344:       {
                    345:       *errorptr = ERR2;
                    346:       return 0;
                    347:       }
                    348: 
                    349:     /* A letter is upper-cased; then the 0x40 bit is flipped */
                    350: 
                    351:     if (c >= 'a' && c <= 'z') c = cd->fcc[c];
                    352:     c ^= 0x40;
                    353:     break;
                    354: 
                    355:     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
                    356:     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
                    357:     for Perl compatibility, it is a literal. This code looks a bit odd, but
                    358:     there used to be some cases other than the default, and there may be again
                    359:     in future, so I haven't "optimized" it. */
                    360: 
                    361:     default:
                    362:     if ((options & PCRE_EXTRA) != 0) switch(c)
                    363:       {
                    364:       default:
                    365:       *errorptr = ERR3;
                    366:       break;
                    367:       }
                    368:     break;
                    369:     }
                    370:   }
                    371: 
                    372: *ptrptr = ptr;
                    373: return c;
                    374: }
                    375: 
                    376: 
                    377: 
                    378: /*************************************************
                    379: *            Check for counted repeat            *
                    380: *************************************************/
                    381: 
                    382: /* This function is called when a '{' is encountered in a place where it might
                    383: start a quantifier. It looks ahead to see if it really is a quantifier or not.
                    384: It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
                    385: where the ddds are digits.
                    386: 
                    387: Arguments:
                    388:   p         pointer to the first char after '{'
                    389:   cd        pointer to char tables block
                    390: 
                    391: Returns:    TRUE or FALSE
                    392: */
                    393: 
                    394: static BOOL
                    395: is_counted_repeat(const uschar *p, compile_data *cd)
                    396: {
                    397: if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
                    398: while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
                    399: if (*p == '}') return TRUE;
                    400: 
                    401: if (*p++ != ',') return FALSE;
                    402: if (*p == '}') return TRUE;
                    403: 
                    404: if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
                    405: while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
                    406: return (*p == '}');
                    407: }
                    408: 
                    409: 
                    410: 
                    411: /*************************************************
                    412: *         Read repeat counts                     *
                    413: *************************************************/
                    414: 
                    415: /* Read an item of the form {n,m} and return the values. This is called only
                    416: after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
                    417: so the syntax is guaranteed to be correct, but we need to check the values.
                    418: 
                    419: Arguments:
                    420:   p          pointer to first char after '{'
                    421:   minp       pointer to int for min
                    422:   maxp       pointer to int for max
                    423:              returned as -1 if no max
                    424:   errorptr   points to pointer to error message
                    425:   cd         pointer to character tables block
                    426: 
                    427: Returns:     pointer to '}' on success;
                    428:              current ptr on error, with errorptr set
                    429: */
                    430: 
                    431: static const uschar *
                    432: read_repeat_counts(const uschar *p, int *minp, int *maxp,
                    433:   const char **errorptr, compile_data *cd)
                    434: {
                    435: int min = 0;
                    436: int max = -1;
                    437: 
                    438: while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
                    439: 
                    440: if (*p == '}') max = min; else
                    441:   {
                    442:   if (*(++p) != '}')
                    443:     {
                    444:     max = 0;
                    445:     while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
                    446:     if (max < min)
                    447:       {
                    448:       *errorptr = ERR4;
                    449:       return p;
                    450:       }
                    451:     }
                    452:   }
                    453: 
                    454: /* Do paranoid checks, then fill in the required variables, and pass back the
                    455: pointer to the terminating '}'. */
                    456: 
                    457: if (min > 65535 || max > 65535)
                    458:   *errorptr = ERR5;
                    459: else
                    460:   {
                    461:   *minp = min;
                    462:   *maxp = max;
                    463:   }
                    464: return p;
                    465: }
                    466: 
                    467: 
                    468: 
                    469: /*************************************************
                    470: *        Find the fixed length of a pattern      *
                    471: *************************************************/
                    472: 
                    473: /* Scan a pattern and compute the fixed length of subject that will match it,
                    474: if the length is fixed. This is needed for dealing with backward assertions.
                    475: 
                    476: Arguments:
                    477:   code     points to the start of the pattern (the bracket)
                    478: 
                    479: Returns:   the fixed length, or -1 if there is no fixed length
                    480: */
                    481: 
                    482: static int
                    483: find_fixedlength(uschar *code)
                    484: {
                    485: int length = -1;
                    486: 
                    487: register int branchlength = 0;
                    488: register uschar *cc = code + 3;
                    489: 
                    490: /* Scan along the opcodes for this branch. If we get to the end of the
                    491: branch, check the length against that of the other branches. */
                    492: 
                    493: for (;;)
                    494:   {
                    495:   int d;
                    496:   register int op = *cc;
                    497:   if (op >= OP_BRA) op = OP_BRA;
                    498: 
                    499:   switch (op)
                    500:     {
                    501:     case OP_BRA:
                    502:     case OP_ONCE:
                    503:     case OP_COND:
                    504:     d = find_fixedlength(cc);
                    505:     if (d < 0) return -1;
                    506:     branchlength += d;
                    507:     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
                    508:     cc += 3;
                    509:     break;
                    510: 
                    511:     /* Reached end of a branch; if it's a ket it is the end of a nested
                    512:     call. If it's ALT it is an alternation in a nested call. If it is
                    513:     END it's the end of the outer call. All can be handled by the same code. */
                    514: 
                    515:     case OP_ALT:
                    516:     case OP_KET:
                    517:     case OP_KETRMAX:
                    518:     case OP_KETRMIN:
                    519:     case OP_END:
                    520:     if (length < 0) length = branchlength;
                    521:       else if (length != branchlength) return -1;
                    522:     if (*cc != OP_ALT) return length;
                    523:     cc += 3;
                    524:     branchlength = 0;
                    525:     break;
                    526: 
                    527:     /* Skip over assertive subpatterns */
                    528: 
                    529:     case OP_ASSERT:
                    530:     case OP_ASSERT_NOT:
                    531:     case OP_ASSERTBACK:
                    532:     case OP_ASSERTBACK_NOT:
                    533:     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
                    534:     cc += 3;
                    535:     break;
                    536: 
                    537:     /* Skip over things that don't match chars */
                    538: 
                    539:     case OP_REVERSE:
                    540:     cc++;
                    541:     /* Fall through */
                    542: 
                    543:     case OP_CREF:
                    544:     case OP_OPT:
                    545:     cc++;
                    546:     /* Fall through */
                    547: 
                    548:     case OP_SOD:
                    549:     case OP_EOD:
                    550:     case OP_EODN:
                    551:     case OP_CIRC:
                    552:     case OP_DOLL:
                    553:     case OP_NOT_WORD_BOUNDARY:
                    554:     case OP_WORD_BOUNDARY:
                    555:     cc++;
                    556:     break;
                    557: 
                    558:     /* Handle char strings */
                    559: 
                    560:     case OP_CHARS:
                    561:     branchlength += *(++cc);
                    562:     cc += *cc + 1;
                    563:     break;
                    564: 
                    565:     /* Handle exact repetitions */
                    566: 
                    567:     case OP_EXACT:
                    568:     case OP_TYPEEXACT:
                    569:     branchlength += (cc[1] << 8) + cc[2];
                    570:     cc += 4;
                    571:     break;
                    572: 
                    573:     /* Handle single-char matchers */
                    574: 
                    575:     case OP_NOT_DIGIT:
                    576:     case OP_DIGIT:
                    577:     case OP_NOT_WHITESPACE:
                    578:     case OP_WHITESPACE:
                    579:     case OP_NOT_WORDCHAR:
                    580:     case OP_WORDCHAR:
                    581:     case OP_ANY:
                    582:     branchlength++;
                    583:     cc++;
                    584:     break;
                    585: 
                    586: 
                    587:     /* Check a class for variable quantification */
                    588: 
                    589:     case OP_CLASS:
                    590:     cc += (*cc == OP_REF)? 2 : 33;
                    591: 
                    592:     switch (*cc)
                    593:       {
                    594:       case OP_CRSTAR:
                    595:       case OP_CRMINSTAR:
                    596:       case OP_CRQUERY:
                    597:       case OP_CRMINQUERY:
                    598:       return -1;
                    599: 
                    600:       case OP_CRRANGE:
                    601:       case OP_CRMINRANGE:
                    602:       if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
                    603:       branchlength += (cc[1] << 8) + cc[2];
                    604:       cc += 5;
                    605:       break;
                    606: 
                    607:       default:
                    608:       branchlength++;
                    609:       }
                    610:     break;
                    611: 
                    612:     /* Anything else is variable length */
                    613: 
                    614:     default:
                    615:     return -1;
                    616:     }
                    617:   }
                    618: /* Control never gets here */
                    619: }
                    620: 
                    621: 
                    622: 
                    623: 
                    624: /*************************************************
                    625: *           Compile one branch                   *
                    626: *************************************************/
                    627: 
                    628: /* Scan the pattern, compiling it into the code vector.
                    629: 
                    630: Arguments:
                    631:   options      the option bits
                    632:   brackets     points to number of brackets used
                    633:   codeptr      points to the pointer to the current code point
                    634:   ptrptr       points to the current pattern pointer
                    635:   errorptr     points to pointer to error message
                    636:   optchanged   set to the value of the last OP_OPT item compiled
                    637:   reqchar      set to the last literal character required, else -1
                    638:   countlits    set to count of mandatory literal characters
                    639:   cd           contains pointers to tables
                    640: 
                    641: Returns:       TRUE on success
                    642:                FALSE, with *errorptr set on error
                    643: */
                    644: 
                    645: static BOOL
                    646: compile_branch(int options, int *brackets, uschar **codeptr,
                    647:   const uschar **ptrptr, const char **errorptr, int *optchanged,
                    648:   int *reqchar, int *countlits, compile_data *cd)
                    649: {
                    650: int repeat_type, op_type;
                    651: int repeat_min, repeat_max;
                    652: int bravalue, length;
                    653: int greedy_default, greedy_non_default;
                    654: int prevreqchar;
                    655: int condcount = 0;
                    656: int subcountlits = 0;
                    657: register int c;
                    658: register uschar *code = *codeptr;
                    659: uschar *tempcode;
                    660: const uschar *ptr = *ptrptr;
                    661: const uschar *tempptr;
                    662: uschar *previous = NULL;
                    663: uschar class[32];
                    664: 
                    665: /* Set up the default and non-default settings for greediness */
                    666: 
                    667: greedy_default = ((options & PCRE_UNGREEDY) != 0);
                    668: greedy_non_default = greedy_default ^ 1;
                    669: 
                    670: /* Initialize no required char, and count of literals */
                    671: 
                    672: *reqchar = prevreqchar = -1;
                    673: *countlits = 0;
                    674: 
                    675: /* Switch on next character until the end of the branch */
                    676: 
                    677: for (;; ptr++)
                    678:   {
                    679:   BOOL negate_class;
                    680:   int class_charcount;
                    681:   int class_lastchar;
                    682:   int newoptions;
                    683:   int condref;
                    684:   int subreqchar;
                    685: 
                    686:   c = *ptr;
                    687:   if ((options & PCRE_EXTENDED) != 0)
                    688:     {
                    689:     if ((cd->ctypes[c] & ctype_space) != 0) continue;
                    690:     if (c == '#')
                    691:       {
                    692:       while ((c = *(++ptr)) != 0 && c != '\n');
                    693:       continue;
                    694:       }
                    695:     }
                    696: 
                    697:   switch(c)
                    698:     {
                    699:     /* The branch terminates at end of string, |, or ). */
                    700: 
                    701:     case 0:
                    702:     case '|':
                    703:     case ')':
                    704:     *codeptr = code;
                    705:     *ptrptr = ptr;
                    706:     return TRUE;
                    707: 
                    708:     /* Handle single-character metacharacters */
                    709: 
                    710:     case '^':
                    711:     previous = NULL;
                    712:     *code++ = OP_CIRC;
                    713:     break;
                    714: 
                    715:     case '$':
                    716:     previous = NULL;
                    717:     *code++ = OP_DOLL;
                    718:     break;
                    719: 
                    720:     case '.':
                    721:     previous = code;
                    722:     *code++ = OP_ANY;
                    723:     break;
                    724: 
                    725:     /* Character classes. These always build a 32-byte bitmap of the permitted
                    726:     characters, except in the special case where there is only one character.
                    727:     For negated classes, we build the map as usual, then invert it at the end.
                    728:     */
                    729: 
                    730:     case '[':
                    731:     previous = code;
                    732:     *code++ = OP_CLASS;
                    733: 
                    734:     /* If the first character is '^', set the negation flag and skip it. */
                    735: 
                    736:     if ((c = *(++ptr)) == '^')
                    737:       {
                    738:       negate_class = TRUE;
                    739:       c = *(++ptr);
                    740:       }
                    741:     else negate_class = FALSE;
                    742: 
                    743:     /* Keep a count of chars so that we can optimize the case of just a single
                    744:     character. */
                    745: 
                    746:     class_charcount = 0;
                    747:     class_lastchar = -1;
                    748: 
                    749:     /* Initialize the 32-char bit map to all zeros. We have to build the
                    750:     map in a temporary bit of store, in case the class contains only 1
                    751:     character, because in that case the compiled code doesn't use the
                    752:     bit map. */
                    753: 
                    754:     memset(class, 0, 32 * sizeof(uschar));
                    755: 
                    756:     /* Process characters until ] is reached. By writing this as a "do" it
                    757:     means that an initial ] is taken as a data character. */
                    758: 
                    759:     do
                    760:       {
                    761:       if (c == 0)
                    762:         {
                    763:         *errorptr = ERR6;
                    764:         goto FAILED;
                    765:         }
                    766: 
                    767:       /* Backslash may introduce a single character, or it may introduce one
                    768:       of the specials, which just set a flag. Escaped items are checked for
                    769:       validity in the pre-compiling pass. The sequence \b is a special case.
                    770:       Inside a class (and only there) it is treated as backspace. Elsewhere
                    771:       it marks a word boundary. Other escapes have preset maps ready to
                    772:       or into the one we are building. We assume they have more than one
                    773:       character in them, so set class_charcount bigger than one. */
                    774: 
                    775:       if (c == '\\')
                    776:         {
                    777:         c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
                    778:         if (-c == ESC_b) c = '\b';
                    779:         else if (c < 0)
                    780:           {
                    781:           register const uschar *cbits = cd->cbits;
                    782:           class_charcount = 10;
                    783:           switch (-c)
                    784:             {
                    785:             case ESC_d:
                    786:             for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
                    787:             continue;
                    788: 
                    789:             case ESC_D:
                    790:             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
                    791:             continue;
                    792: 
                    793:             case ESC_w:
                    794:             for (c = 0; c < 32; c++)
                    795:               class[c] |= (cbits[c+cbit_digit] | cbits[c+cbit_word]);
                    796:             continue;
                    797: 
                    798:             case ESC_W:
                    799:             for (c = 0; c < 32; c++)
                    800:               class[c] |= ~(cbits[c+cbit_digit] | cbits[c+cbit_word]);
                    801:             continue;
                    802: 
                    803:             case ESC_s:
                    804:             for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
                    805:             continue;
                    806: 
                    807:             case ESC_S:
                    808:             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
                    809:             continue;
                    810: 
                    811:             default:
                    812:             *errorptr = ERR7;
                    813:             goto FAILED;
                    814:             }
                    815:           }
                    816:         /* Fall through if single character */
                    817:         }
                    818: 
                    819:       /* A single character may be followed by '-' to form a range. However,
                    820:       Perl does not permit ']' to be the end of the range. A '-' character
                    821:       here is treated as a literal. */
                    822: 
                    823:       if (ptr[1] == '-' && ptr[2] != ']')
                    824:         {
                    825:         int d;
                    826:         ptr += 2;
                    827:         d = *ptr;
                    828: 
                    829:         if (d == 0)
                    830:           {
                    831:           *errorptr = ERR6;
                    832:           goto FAILED;
                    833:           }
                    834: 
                    835:         /* The second part of a range can be a single-character escape, but
                    836:         not any of the other escapes. */
                    837: 
                    838:         if (d == '\\')
                    839:           {
                    840:           d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
                    841:           if (d < 0)
                    842:             {
                    843:             if (d == -ESC_b) d = '\b'; else
                    844:               {
                    845:               *errorptr = ERR7;
                    846:               goto FAILED;
                    847:               }
                    848:             }
                    849:           }
                    850: 
                    851:         if (d < c)
                    852:           {
                    853:           *errorptr = ERR8;
                    854:           goto FAILED;
                    855:           }
                    856: 
                    857:         for (; c <= d; c++)
                    858:           {
                    859:           class[c/8] |= (1 << (c&7));
                    860:           if ((options & PCRE_CASELESS) != 0)
                    861:             {
                    862:             int uc = cd->fcc[c];           /* flip case */
                    863:             class[uc/8] |= (1 << (uc&7));
                    864:             }
                    865:           class_charcount++;                /* in case a one-char range */
                    866:           class_lastchar = c;
                    867:           }
                    868:         continue;   /* Go get the next char in the class */
                    869:         }
                    870: 
                    871:       /* Handle a lone single character - we can get here for a normal
                    872:       non-escape char, or after \ that introduces a single character. */
                    873: 
                    874:       class [c/8] |= (1 << (c&7));
                    875:       if ((options & PCRE_CASELESS) != 0)
                    876:         {
                    877:         c = cd->fcc[c];   /* flip case */
                    878:         class[c/8] |= (1 << (c&7));
                    879:         }
                    880:       class_charcount++;
                    881:       class_lastchar = c;
                    882:       }
                    883: 
                    884:     /* Loop until ']' reached; the check for end of string happens inside the
                    885:     loop. This "while" is the end of the "do" above. */
                    886: 
                    887:     while ((c = *(++ptr)) != ']');
                    888: 
                    889:     /* If class_charcount is 1 and class_lastchar is not negative, we saw
                    890:     precisely one character. This doesn't need the whole 32-byte bit map.
                    891:     We turn it into a 1-character OP_CHARS if the class is positive, or
                    892:     OP_NOT if the class is negated. */
                    893: 
                    894:     if (class_charcount == 1 && class_lastchar >= 0)
                    895:       {
                    896:       if (negate_class)
                    897:         {
                    898:         code[-1] = OP_NOT;
                    899:         }
                    900:       else
                    901:         {
                    902:         code[-1] = OP_CHARS;
                    903:         *code++ = 1;
                    904:         }
                    905:       *code++ = class_lastchar;
                    906:       }
                    907: 
                    908:     /* Otherwise, negate the 32-byte map if necessary, and copy it into
                    909:     the code vector. */
                    910: 
                    911:     else
                    912:       {
                    913:       if (negate_class)
                    914:         for (c = 0; c < 32; c++) code[c] = ~class[c];
                    915:       else
                    916:         memcpy(code, class, 32);
                    917:       code += 32;
                    918:       }
                    919:     break;
                    920: 
                    921:     /* Various kinds of repeat */
                    922: 
                    923:     case '{':
                    924:     if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
                    925:     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
                    926:     if (*errorptr != NULL) goto FAILED;
                    927:     goto REPEAT;
                    928: 
                    929:     case '*':
                    930:     repeat_min = 0;
                    931:     repeat_max = -1;
                    932:     goto REPEAT;
                    933: 
                    934:     case '+':
                    935:     repeat_min = 1;
                    936:     repeat_max = -1;
                    937:     goto REPEAT;
                    938: 
                    939:     case '?':
                    940:     repeat_min = 0;
                    941:     repeat_max = 1;
                    942: 
                    943:     REPEAT:
                    944:     if (previous == NULL)
                    945:       {
                    946:       *errorptr = ERR9;
                    947:       goto FAILED;
                    948:       }
                    949: 
                    950:     /* If the next character is '?' this is a minimizing repeat, by default,
                    951:     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
                    952:     next character. */
                    953: 
                    954:     if (ptr[1] == '?')
                    955:       { repeat_type = greedy_non_default; ptr++; }
                    956:     else repeat_type = greedy_default;
                    957: 
                    958:     /* If previous was a string of characters, chop off the last one and use it
                    959:     as the subject of the repeat. If there was only one character, we can
                    960:     abolish the previous item altogether. A repeat with a zero minimum wipes
                    961:     out any reqchar setting, backing up to the previous value. We must also
                    962:     adjust the countlits value. */
                    963: 
                    964:     if (*previous == OP_CHARS)
                    965:       {
                    966:       int len = previous[1];
                    967: 
                    968:       if (repeat_min == 0) *reqchar = prevreqchar;
                    969:       *countlits += repeat_min - 1;
                    970: 
                    971:       if (len == 1)
                    972:         {
                    973:         c = previous[2];
                    974:         code = previous;
                    975:         }
                    976:       else
                    977:         {
                    978:         c = previous[len+1];
                    979:         previous[1]--;
                    980:         code--;
                    981:         }
                    982:       op_type = 0;                 /* Use single-char op codes */
                    983:       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
                    984:       }
                    985: 
                    986:     /* If previous was a single negated character ([^a] or similar), we use
                    987:     one of the special opcodes, replacing it. The code is shared with single-
                    988:     character repeats by adding a suitable offset into repeat_type. */
                    989: 
                    990:     else if ((int)*previous == OP_NOT)
                    991:       {
                    992:       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
                    993:       c = previous[1];
                    994:       code = previous;
                    995:       goto OUTPUT_SINGLE_REPEAT;
                    996:       }
                    997: 
                    998:     /* If previous was a character type match (\d or similar), abolish it and
                    999:     create a suitable repeat item. The code is shared with single-character
                   1000:     repeats by adding a suitable offset into repeat_type. */
                   1001: 
                   1002:     else if ((int)*previous < OP_EODN || *previous == OP_ANY)
                   1003:       {
                   1004:       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
                   1005:       c = *previous;
                   1006:       code = previous;
                   1007: 
                   1008:       OUTPUT_SINGLE_REPEAT:
                   1009: 
                   1010:       /* If the maximum is zero then the minimum must also be zero; Perl allows
                   1011:       this case, so we do too - by simply omitting the item altogether. */
                   1012: 
                   1013:       if (repeat_max == 0) goto END_REPEAT;
                   1014: 
                   1015:       /* Combine the op_type with the repeat_type */
                   1016: 
                   1017:       repeat_type += op_type;
                   1018: 
                   1019:       /* A minimum of zero is handled either as the special case * or ?, or as
                   1020:       an UPTO, with the maximum given. */
                   1021: 
                   1022:       if (repeat_min == 0)
                   1023:         {
                   1024:         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
                   1025:           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
                   1026:         else
                   1027:           {
                   1028:           *code++ = OP_UPTO + repeat_type;
                   1029:           *code++ = repeat_max >> 8;
                   1030:           *code++ = (repeat_max & 255);
                   1031:           }
                   1032:         }
                   1033: 
                   1034:       /* The case {1,} is handled as the special case + */
                   1035: 
                   1036:       else if (repeat_min == 1 && repeat_max == -1)
                   1037:         *code++ = OP_PLUS + repeat_type;
                   1038: 
                   1039:       /* The case {n,n} is just an EXACT, while the general case {n,m} is
                   1040:       handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
                   1041: 
                   1042:       else
                   1043:         {
                   1044:         if (repeat_min != 1)
                   1045:           {
                   1046:           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
                   1047:           *code++ = repeat_min >> 8;
                   1048:           *code++ = (repeat_min & 255);
                   1049:           }
                   1050: 
                   1051:         /* If the minimum is 1 and the previous item was a character string,
                   1052:         we either have to put back the item that got cancelled if the string
                   1053:         length was 1, or add the character back onto the end of a longer
                   1054:         string. For a character type nothing need be done; it will just get
                   1055:         put back naturally. Note that the final character is always going to
                   1056:         get added below. */
                   1057: 
                   1058:         else if (*previous == OP_CHARS)
                   1059:           {
                   1060:           if (code == previous) code += 2; else previous[1]++;
                   1061:           }
                   1062: 
                   1063:         /*  For a single negated character we also have to put back the
                   1064:         item that got cancelled. */
                   1065: 
                   1066:         else if (*previous == OP_NOT) code++;
                   1067: 
                   1068:         /* If the maximum is unlimited, insert an OP_STAR. */
                   1069: 
                   1070:         if (repeat_max < 0)
                   1071:           {
                   1072:           *code++ = c;
                   1073:           *code++ = OP_STAR + repeat_type;
                   1074:           }
                   1075: 
                   1076:         /* Else insert an UPTO if the max is greater than the min. */
                   1077: 
                   1078:         else if (repeat_max != repeat_min)
                   1079:           {
                   1080:           *code++ = c;
                   1081:           repeat_max -= repeat_min;
                   1082:           *code++ = OP_UPTO + repeat_type;
                   1083:           *code++ = repeat_max >> 8;
                   1084:           *code++ = (repeat_max & 255);
                   1085:           }
                   1086:         }
                   1087: 
                   1088:       /* The character or character type itself comes last in all cases. */
                   1089: 
                   1090:       *code++ = c;
                   1091:       }
                   1092: 
                   1093:     /* If previous was a character class or a back reference, we put the repeat
                   1094:     stuff after it, but just skip the item if the repeat was {0,0}. */
                   1095: 
                   1096:     else if (*previous == OP_CLASS || *previous == OP_REF)
                   1097:       {
                   1098:       if (repeat_max == 0)
                   1099:         {
                   1100:         code = previous;
                   1101:         goto END_REPEAT;
                   1102:         }
                   1103:       if (repeat_min == 0 && repeat_max == -1)
                   1104:         *code++ = OP_CRSTAR + repeat_type;
                   1105:       else if (repeat_min == 1 && repeat_max == -1)
                   1106:         *code++ = OP_CRPLUS + repeat_type;
                   1107:       else if (repeat_min == 0 && repeat_max == 1)
                   1108:         *code++ = OP_CRQUERY + repeat_type;
                   1109:       else
                   1110:         {
                   1111:         *code++ = OP_CRRANGE + repeat_type;
                   1112:         *code++ = repeat_min >> 8;
                   1113:         *code++ = repeat_min & 255;
                   1114:         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
                   1115:         *code++ = repeat_max >> 8;
                   1116:         *code++ = repeat_max & 255;
                   1117:         }
                   1118:       }
                   1119: 
                   1120:     /* If previous was a bracket group, we may have to replicate it in certain
                   1121:     cases. */
                   1122: 
                   1123:     else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
                   1124:              (int)*previous == OP_COND)
                   1125:       {
                   1126:       register int i;
                   1127:       int ketoffset = 0;
                   1128:       int len = code - previous;
                   1129:       uschar *bralink = NULL;
                   1130: 
                   1131:       /* If the maximum repeat count is unlimited, find the end of the bracket
                   1132:       by scanning through from the start, and compute the offset back to it
                   1133:       from the current code pointer. There may be an OP_OPT setting following
                   1134:       the final KET, so we can't find the end just by going back from the code
                   1135:       pointer. */
                   1136: 
                   1137:       if (repeat_max == -1)
                   1138:         {
                   1139:         register uschar *ket = previous;
                   1140:         do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
                   1141:         ketoffset = code - ket;
                   1142:         }
                   1143: 
                   1144:       /* The case of a zero minimum is special because of the need to stick
                   1145:       OP_BRAZERO in front of it, and because the group appears once in the
                   1146:       data, whereas in other cases it appears the minimum number of times. For
                   1147:       this reason, it is simplest to treat this case separately, as otherwise
                   1148:       the code gets far too messy. There are several special subcases when the
                   1149:       minimum is zero. */
                   1150: 
                   1151:       if (repeat_min == 0)
                   1152:         {
                   1153:         /* If we set up a required char from the bracket, we must back off
                   1154:         to the previous value and reset the countlits value too. */
                   1155: 
                   1156:         if (subcountlits > 0)
                   1157:           {
                   1158:           *reqchar = prevreqchar;
                   1159:           *countlits -= subcountlits;
                   1160:           }
                   1161: 
                   1162:         /* If the maximum is also zero, we just omit the group from the output
                   1163:         altogether. */
                   1164: 
                   1165:         if (repeat_max == 0)
                   1166:           {
                   1167:           code = previous;
                   1168:           goto END_REPEAT;
                   1169:           }
                   1170: 
                   1171:         /* If the maximum is 1 or unlimited, we just have to stick in the
                   1172:         BRAZERO and do no more at this point. */
                   1173: 
                   1174:         if (repeat_max <= 1)
                   1175:           {
                   1176:           memmove(previous+1, previous, len);
                   1177:           code++;
                   1178:           *previous++ = OP_BRAZERO + repeat_type;
                   1179:           }
                   1180: 
                   1181:         /* If the maximum is greater than 1 and limited, we have to replicate
                   1182:         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
                   1183:         The first one has to be handled carefully because it's the original
                   1184:         copy, which has to be moved up. The remainder can be handled by code
                   1185:         that is common with the non-zero minimum case below. We just have to
                   1186:         adjust the value of repeat_max, since one less copy is required. */
                   1187: 
                   1188:         else
                   1189:           {
                   1190:           int offset;
                   1191:           memmove(previous+4, previous, len);
                   1192:           code += 4;
                   1193:           *previous++ = OP_BRAZERO + repeat_type;
                   1194:           *previous++ = OP_BRA;
                   1195: 
                   1196:           /* We chain together the bracket offset fields that have to be
                   1197:           filled in later when the ends of the brackets are reached. */
                   1198: 
                   1199:           offset = (bralink == NULL)? 0 : previous - bralink;
                   1200:           bralink = previous;
                   1201:           *previous++ = offset >> 8;
                   1202:           *previous++ = offset & 255;
                   1203:           }
                   1204: 
                   1205:         repeat_max--;
                   1206:         }
                   1207: 
                   1208:       /* If the minimum is greater than zero, replicate the group as many
                   1209:       times as necessary, and adjust the maximum to the number of subsequent
                   1210:       copies that we need. */
                   1211: 
                   1212:       else
                   1213:         {
                   1214:         for (i = 1; i < repeat_min; i++)
                   1215:           {
                   1216:           memcpy(code, previous, len);
                   1217:           code += len;
                   1218:           }
                   1219:         if (repeat_max > 0) repeat_max -= repeat_min;
                   1220:         }
                   1221: 
                   1222:       /* This code is common to both the zero and non-zero minimum cases. If
                   1223:       the maximum is limited, it replicates the group in a nested fashion,
                   1224:       remembering the bracket starts on a stack. In the case of a zero minimum,
                   1225:       the first one was set up above. In all cases the repeat_max now specifies
                   1226:       the number of additional copies needed. */
                   1227: 
                   1228:       if (repeat_max >= 0)
                   1229:         {
                   1230:         for (i = repeat_max - 1; i >= 0; i--)
                   1231:           {
                   1232:           *code++ = OP_BRAZERO + repeat_type;
                   1233: 
                   1234:           /* All but the final copy start a new nesting, maintaining the
                   1235:           chain of brackets outstanding. */
                   1236: 
                   1237:           if (i != 0)
                   1238:             {
                   1239:             int offset;
                   1240:             *code++ = OP_BRA;
                   1241:             offset = (bralink == NULL)? 0 : code - bralink;
                   1242:             bralink = code;
                   1243:             *code++ = offset >> 8;
                   1244:             *code++ = offset & 255;
                   1245:             }
                   1246: 
                   1247:           memcpy(code, previous, len);
                   1248:           code += len;
                   1249:           }
                   1250: 
                   1251:         /* Now chain through the pending brackets, and fill in their length
                   1252:         fields (which are holding the chain links pro tem). */
                   1253: 
                   1254:         while (bralink != NULL)
                   1255:           {
                   1256:           int oldlinkoffset;
                   1257:           int offset = code - bralink + 1;
                   1258:           uschar *bra = code - offset;
                   1259:           oldlinkoffset = (bra[1] << 8) + bra[2];
                   1260:           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
                   1261:           *code++ = OP_KET;
                   1262:           *code++ = bra[1] = offset >> 8;
                   1263:           *code++ = bra[2] = (offset & 255);
                   1264:           }
                   1265:         }
                   1266: 
                   1267:       /* If the maximum is unlimited, set a repeater in the final copy. We
                   1268:       can't just offset backwards from the current code point, because we
                   1269:       don't know if there's been an options resetting after the ket. The
                   1270:       correct offset was computed above. */
                   1271: 
                   1272:       else code[-ketoffset] = OP_KETRMAX + repeat_type;
                   1273:       }
                   1274: 
                   1275:     /* Else there's some kind of shambles */
                   1276: 
                   1277:     else
                   1278:       {
                   1279:       *errorptr = ERR11;
                   1280:       goto FAILED;
                   1281:       }
                   1282: 
                   1283:     /* In all cases we no longer have a previous item. */
                   1284: 
                   1285:     END_REPEAT:
                   1286:     previous = NULL;
                   1287:     break;
                   1288: 
                   1289: 
                   1290:     /* Start of nested bracket sub-expression, or comment or lookahead or
                   1291:     lookbehind or option setting or condition. First deal with special things
                   1292:     that can come after a bracket; all are introduced by ?, and the appearance
                   1293:     of any of them means that this is not a referencing group. They were
                   1294:     checked for validity in the first pass over the string, so we don't have to
                   1295:     check for syntax errors here.  */
                   1296: 
                   1297:     case '(':
                   1298:     newoptions = options;
                   1299:     condref = -1;
                   1300: 
                   1301:     if (*(++ptr) == '?')
                   1302:       {
                   1303:       int set, unset;
                   1304:       int *optset;
                   1305: 
                   1306:       switch (*(++ptr))
                   1307:         {
                   1308:         case '#':                 /* Comment; skip to ket */
                   1309:         ptr++;
                   1310:         while (*ptr != ')') ptr++;
                   1311:         continue;
                   1312: 
                   1313:         case ':':                 /* Non-extracting bracket */
                   1314:         bravalue = OP_BRA;
                   1315:         ptr++;
                   1316:         break;
                   1317: 
                   1318:         case '(':
                   1319:         bravalue = OP_COND;       /* Conditional group */
                   1320:         if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
                   1321:           {
                   1322:           condref = *ptr - '0';
                   1323:           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
                   1324:           ptr++;
                   1325:           }
                   1326:         else ptr--;
                   1327:         break;
                   1328: 
                   1329:         case '=':                 /* Positive lookahead */
                   1330:         bravalue = OP_ASSERT;
                   1331:         ptr++;
                   1332:         break;
                   1333: 
                   1334:         case '!':                 /* Negative lookahead */
                   1335:         bravalue = OP_ASSERT_NOT;
                   1336:         ptr++;
                   1337:         break;
                   1338: 
                   1339:         case '<':                 /* Lookbehinds */
                   1340:         switch (*(++ptr))
                   1341:           {
                   1342:           case '=':               /* Positive lookbehind */
                   1343:           bravalue = OP_ASSERTBACK;
                   1344:           ptr++;
                   1345:           break;
                   1346: 
                   1347:           case '!':               /* Negative lookbehind */
                   1348:           bravalue = OP_ASSERTBACK_NOT;
                   1349:           ptr++;
                   1350:           break;
                   1351: 
                   1352:           default:                /* Syntax error */
                   1353:           *errorptr = ERR24;
                   1354:           goto FAILED;
                   1355:           }
                   1356:         break;
                   1357: 
                   1358:         case '>':                 /* One-time brackets */
                   1359:         bravalue = OP_ONCE;
                   1360:         ptr++;
                   1361:         break;
                   1362: 
                   1363:         default:                  /* Option setting */
                   1364:         set = unset = 0;
                   1365:         optset = &set;
                   1366: 
                   1367:         while (*ptr != ')' && *ptr != ':')
                   1368:           {
                   1369:           switch (*ptr++)
                   1370:             {
                   1371:             case '-': optset = &unset; break;
                   1372: 
                   1373:             case 'i': *optset |= PCRE_CASELESS; break;
                   1374:             case 'm': *optset |= PCRE_MULTILINE; break;
                   1375:             case 's': *optset |= PCRE_DOTALL; break;
                   1376:             case 'x': *optset |= PCRE_EXTENDED; break;
                   1377:             case 'U': *optset |= PCRE_UNGREEDY; break;
                   1378:             case 'X': *optset |= PCRE_EXTRA; break;
                   1379: 
                   1380:             default:
                   1381:             *errorptr = ERR12;
                   1382:             goto FAILED;
                   1383:             }
                   1384:           }
                   1385: 
                   1386:         /* Set up the changed option bits, but don't change anything yet. */
                   1387: 
                   1388:         newoptions = (options | set) & (~unset);
                   1389: 
                   1390:         /* If the options ended with ')' this is not the start of a nested
                   1391:         group with option changes, so the options change at this level. At top
                   1392:         level there is nothing else to be done (the options will in fact have
                   1393:         been set from the start of compiling as a result of the first pass) but
                   1394:         at an inner level we must compile code to change the ims options if
                   1395:         necessary, and pass the new setting back so that it can be put at the
                   1396:         start of any following branches, and when this group ends, a resetting
                   1397:         item can be compiled. */
                   1398: 
                   1399:         if (*ptr == ')')
                   1400:           {
                   1401:           if ((options & PCRE_INGROUP) != 0 &&
                   1402:               (options & PCRE_IMS) != (newoptions & PCRE_IMS))
                   1403:             {
                   1404:             *code++ = OP_OPT;
                   1405:             *code++ = *optchanged = newoptions & PCRE_IMS;
                   1406:             }
                   1407:           options = newoptions;  /* Change options at this level */
                   1408:           previous = NULL;       /* This item can't be repeated */
                   1409:           continue;              /* It is complete */
                   1410:           }
                   1411: 
                   1412:         /* If the options ended with ':' we are heading into a nested group
                   1413:         with possible change of options. Such groups are non-capturing and are
                   1414:         not assertions of any kind. All we need to do is skip over the ':';
                   1415:         the newoptions value is handled below. */
                   1416: 
                   1417:         bravalue = OP_BRA;
                   1418:         ptr++;
                   1419:         }
                   1420:       }
                   1421: 
                   1422:     /* Else we have a referencing group; adjust the opcode. */
                   1423: 
                   1424:     else
                   1425:       {
                   1426:       if (++(*brackets) > EXTRACT_MAX)
                   1427:         {
                   1428:         *errorptr = ERR13;
                   1429:         goto FAILED;
                   1430:         }
                   1431:       bravalue = OP_BRA + *brackets;
                   1432:       }
                   1433: 
                   1434:     /* Process nested bracketed re. Assertions may not be repeated, but other
                   1435:     kinds can be. We copy code into a non-register variable in order to be able
                   1436:     to pass its address because some compilers complain otherwise. Pass in a
                   1437:     new setting for the ims options if they have changed. */
                   1438: 
                   1439:     previous = (bravalue >= OP_ONCE)? code : NULL;
                   1440:     *code = bravalue;
                   1441:     tempcode = code;
                   1442: 
                   1443:     if (!compile_regex(
                   1444:          options | PCRE_INGROUP,       /* Set for all nested groups */
                   1445:          ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
                   1446:            newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
                   1447:          brackets,                     /* Bracket level */
                   1448:          &tempcode,                    /* Where to put code (updated) */
                   1449:          &ptr,                         /* Input pointer (updated) */
                   1450:          errorptr,                     /* Where to put an error message */
                   1451:          (bravalue == OP_ASSERTBACK ||
                   1452:           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
                   1453:          condref,                      /* Condition reference number */
                   1454:          &subreqchar,                  /* For possible last char */
                   1455:          &subcountlits,                /* For literal count */
                   1456:          cd))                          /* Tables block */
                   1457:       goto FAILED;
                   1458: 
                   1459:     /* At the end of compiling, code is still pointing to the start of the
                   1460:     group, while tempcode has been updated to point past the end of the group
                   1461:     and any option resetting that may follow it. The pattern pointer (ptr)
                   1462:     is on the bracket. */
                   1463: 
                   1464:     /* If this is a conditional bracket, check that there are no more than
                   1465:     two branches in the group. */
                   1466: 
                   1467:     if (bravalue == OP_COND)
                   1468:       {
                   1469:       uschar *tc = code;
                   1470:       condcount = 0;
                   1471: 
                   1472:       do {
                   1473:          condcount++;
                   1474:          tc += (tc[1] << 8) | tc[2];
                   1475:          }
                   1476:       while (*tc != OP_KET);
                   1477: 
                   1478:       if (condcount > 2)
                   1479:         {
                   1480:         *errorptr = ERR27;
                   1481:         goto FAILED;
                   1482:         }
                   1483:       }
                   1484: 
                   1485:     /* Handle updating of the required character. If the subpattern didn't
                   1486:     set one, leave it as it was. Otherwise, update it for normal brackets of
                   1487:     all kinds, forward assertions, and conditions with two branches. Don't
                   1488:     update the literal count for forward assertions, however. If the bracket
                   1489:     is followed by a quantifier with zero repeat, we have to back off. Hence
                   1490:     the definition of prevreqchar and subcountlits outside the main loop so
                   1491:     that they can be accessed for the back off. */
                   1492: 
                   1493:     if (subreqchar > 0 &&
                   1494:          (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
                   1495:          (bravalue == OP_COND && condcount == 2)))
                   1496:       {
                   1497:       prevreqchar = *reqchar;
                   1498:       *reqchar = subreqchar;
                   1499:       if (bravalue != OP_ASSERT) *countlits += subcountlits;
                   1500:       }
                   1501: 
                   1502:     /* Now update the main code pointer to the end of the group. */
                   1503: 
                   1504:     code = tempcode;
                   1505: 
                   1506:     /* Error if hit end of pattern */
                   1507: 
                   1508:     if (*ptr != ')')
                   1509:       {
                   1510:       *errorptr = ERR14;
                   1511:       goto FAILED;
                   1512:       }
                   1513:     break;
                   1514: 
                   1515:     /* Check \ for being a real metacharacter; if not, fall through and handle
                   1516:     it as a data character at the start of a string. Escape items are checked
                   1517:     for validity in the pre-compiling pass. */
                   1518: 
                   1519:     case '\\':
                   1520:     tempptr = ptr;
                   1521:     c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
                   1522: 
                   1523:     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
                   1524:     are arranged to be the negation of the corresponding OP_values. For the
                   1525:     back references, the values are ESC_REF plus the reference number. Only
                   1526:     back references and those types that consume a character may be repeated.
                   1527:     We can test for values between ESC_b and ESC_Z for the latter; this may
                   1528:     have to change if any new ones are ever created. */
                   1529: 
                   1530:     if (c < 0)
                   1531:       {
                   1532:       if (-c >= ESC_REF)
                   1533:         {
                   1534:         previous = code;
                   1535:         *code++ = OP_REF;
                   1536:         *code++ = -c - ESC_REF;
                   1537:         }
                   1538:       else
                   1539:         {
                   1540:         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
                   1541:         *code++ = -c;
                   1542:         }
                   1543:       continue;
                   1544:       }
                   1545: 
                   1546:     /* Data character: reset and fall through */
                   1547: 
                   1548:     ptr = tempptr;
                   1549:     c = '\\';
                   1550: 
                   1551:     /* Handle a run of data characters until a metacharacter is encountered.
                   1552:     The first character is guaranteed not to be whitespace or # when the
                   1553:     extended flag is set. */
                   1554: 
                   1555:     NORMAL_CHAR:
                   1556:     default:
                   1557:     previous = code;
                   1558:     *code = OP_CHARS;
                   1559:     code += 2;
                   1560:     length = 0;
                   1561: 
                   1562:     do
                   1563:       {
                   1564:       if ((options & PCRE_EXTENDED) != 0)
                   1565:         {
                   1566:         if ((cd->ctypes[c] & ctype_space) != 0) continue;
                   1567:         if (c == '#')
                   1568:           {
                   1569:           while ((c = *(++ptr)) != 0 && c != '\n');
                   1570:           if (c == 0) break;
                   1571:           continue;
                   1572:           }
                   1573:         }
                   1574: 
                   1575:       /* Backslash may introduce a data char or a metacharacter. Escaped items
                   1576:       are checked for validity in the pre-compiling pass. Stop the string
                   1577:       before a metaitem. */
                   1578: 
                   1579:       if (c == '\\')
                   1580:         {
                   1581:         tempptr = ptr;
                   1582:         c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
                   1583:         if (c < 0) { ptr = tempptr; break; }
                   1584:         }
                   1585: 
                   1586:       /* Ordinary character or single-char escape */
                   1587: 
                   1588:       *code++ = c;
                   1589:       length++;
                   1590:       }
                   1591: 
                   1592:     /* This "while" is the end of the "do" above. */
                   1593: 
                   1594:     while (length < 255 && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
                   1595: 
                   1596:     /* Update the last character and the count of literals */
                   1597: 
                   1598:     prevreqchar = (length > 1)? code[-2] : *reqchar;
                   1599:     *reqchar = code[-1];
                   1600:     *countlits += length;
                   1601: 
                   1602:     /* Compute the length and set it in the data vector, and advance to
                   1603:     the next state. */
                   1604: 
                   1605:     previous[1] = length;
                   1606:     if (length < 255) ptr--;
                   1607:     break;
                   1608:     }
                   1609:   }                   /* end of big loop */
                   1610: 
                   1611: /* Control never reaches here by falling through, only by a goto for all the
                   1612: error states. Pass back the position in the pattern so that it can be displayed
                   1613: to the user for diagnosing the error. */
                   1614: 
                   1615: FAILED:
                   1616: *ptrptr = ptr;
                   1617: return FALSE;
                   1618: }
                   1619: 
                   1620: 
                   1621: 
                   1622: 
                   1623: /*************************************************
                   1624: *     Compile sequence of alternatives           *
                   1625: *************************************************/
                   1626: 
                   1627: /* On entry, ptr is pointing past the bracket character, but on return
                   1628: it points to the closing bracket, or vertical bar, or end of string.
                   1629: The code variable is pointing at the byte into which the BRA operator has been
                   1630: stored. If the ims options are changed at the start (for a (?ims: group) or
                   1631: during any branch, we need to insert an OP_OPT item at the start of every
                   1632: following branch to ensure they get set correctly at run time, and also pass
                   1633: the new options into every subsequent branch compile.
                   1634: 
                         Each alternative is compiled by compile_branch(). After each branch the
                         two length bytes that follow the opening BRA (or the preceding OP_ALT)
                         are filled in with the distance to the end of that branch. When the
                         branch is not followed by '|', an OP_KET carrying the distance back to
                         the start of the bracket is appended and the function returns.

                   1635: Arguments:
                   1636:   options     the option bits
                   1637:   optchanged  new ims options to set as if (?ims) were at the start, or -1
                   1638:                for no change
                   1639:   brackets    -> int containing the number of extracting brackets used
                   1640:   codeptr     -> the address of the current code pointer
                   1641:   ptrptr      -> the address of the current pattern pointer
                   1642:   errorptr    -> pointer to error message
                   1643:   lookbehind  TRUE if this is a lookbehind assertion
                   1644:   condref     > 0 for OP_CREF setting at start of conditional group
                   1645:   reqchar     -> place to put the last required character, or a negative number
                                 (set to -2 here when the branches do not all agree on one)
                   1646:   countlits   -> place to put the shortest literal count of any branch
                                 (starts at PCRE_MAX_POS and is only ever lowered)
                   1647:   cd          points to the data block with tables pointers
                   1648: 
                   1649: Returns:      TRUE on success
                                       (*codeptr and *ptrptr are then updated past the compiled
                                       group and to the terminating pattern character);
                                       FALSE on failure, with *ptrptr set to the position
                                       reached in the pattern
                   1650: */
                   1651: 
                   1652: static BOOL
                   1653: compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
                   1654:   const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
                   1655:   int *reqchar, int *countlits, compile_data *cd)
                   1656: {
                   1657: const uschar *ptr = *ptrptr;
                   1658: uschar *code = *codeptr;
                   1659: uschar *last_branch = code;        /* Start of the branch being compiled */
                   1660: uschar *start_bracket = code;      /* Start of the whole bracketed item */
                   1661: uschar *reverse_count = NULL;      /* Only set and used when lookbehind */
                   1662: int oldoptions = options & PCRE_IMS; /* ims bits to restore after the group */
                   1663: int branchreqchar, branchcountlits;
                   1664: 
                         /* Initialise the outputs: -1 means "no required character seen yet", and
                         the literal count starts at its maximum so that any branch can lower it.
                         Then step over the opcode byte and the two length bytes that are filled
                         in once the end of the first branch is known. */

                   1665: *reqchar = -1;
1.2     ! paf      1666: *countlits = PCRE_MAX_POS;
1.1       paf      1667: code += 3;
                   1668: 
                   1669: /* At the start of a reference-based conditional group, insert the reference
                   1670: number as an OP_CREF item. */
                   1671: 
                   1672: if (condref > 0)
                   1673:   {
                   1674:   *code++ = OP_CREF;
                   1675:   *code++ = condref;
                   1676:   }
                   1677: 
                   1678: /* Loop for each alternative branch */
                   1679: 
                   1680: for (;;)
                   1681:   {
                   1682:   int length;
                   1683: 
                   1684:   /* Handle change of options */
                   1685: 
                         /* optchanged is also passed by address to compile_branch() below, so a
                         change made inside one branch is re-emitted at the start of each
                         following branch. */

                   1686:   if (optchanged >= 0)
                   1687:     {
                   1688:     *code++ = OP_OPT;
                   1689:     *code++ = optchanged;
                   1690:     options = (options & ~PCRE_IMS) | optchanged;
                   1691:     }
                   1692: 
                   1693:   /* Set up dummy OP_REVERSE if lookbehind assertion */
                   1694: 
                         /* The two zero bytes are placeholders; reverse_count remembers where
                         they are so the real length can be stored after the branch has been
                         compiled. */

                   1695:   if (lookbehind)
                   1696:     {
                   1697:     *code++ = OP_REVERSE;
                   1698:     reverse_count = code;
                   1699:     *code++ = 0;
                   1700:     *code++ = 0;
                   1701:     }
                   1702: 
                   1703:   /* Now compile the branch */
                   1704: 
                         /* On failure, hand the current pattern position back to the caller
                         before returning. */

                   1705:   if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
                   1706:       &branchreqchar, &branchcountlits, cd))
                   1707:     {
                   1708:     *ptrptr = ptr;
                   1709:     return FALSE;
                   1710:     }
                   1711: 
                   1712:   /* Fill in the length of the last branch */
                   1713: 
                         /* The length is stored as two bytes, high byte first, immediately after
                         the opcode at last_branch. */

                   1714:   length = code - last_branch;
                   1715:   last_branch[1] = length >> 8;
                   1716:   last_branch[2] = length & 255;
                   1717: 
                   1718:   /* Save the last required character if all branches have the same; a current
                   1719:   value of -1 means unset, while -2 means "previous branch had no last required
                   1720:   char".  */
                   1721: 
                         /* Once *reqchar has become -2 it is never changed again: a branch
                         without a required character, or one with a different character,
                         rules it out for the whole group. */

                   1722:   if (*reqchar != -2)
                   1723:     {
                   1724:     if (branchreqchar >= 0)
                   1725:       {
                   1726:       if (*reqchar == -1) *reqchar = branchreqchar;
                   1727:       else if (*reqchar != branchreqchar) *reqchar = -2;
                   1728:       }
                   1729:     else *reqchar = -2;
                   1730:     }
                   1731: 
                   1732:   /* Keep the shortest literal count */
                   1733: 
                   1734:   if (branchcountlits < *countlits) *countlits = branchcountlits;
                   1735:   DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
                   1736: 
                   1737:   /* If lookbehind, check that this branch matches a fixed-length string,
                   1738:   and put the length into the OP_REVERSE item. Temporarily mark the end of
                   1739:   the branch with OP_END. */
                   1740: 
                         /* The OP_END byte is written at *code without advancing code, so it is
                         overwritten by the OP_KET or OP_ALT stored below. A negative result from
                         find_fixedlength() is reported as ERR25. */

                   1741:   if (lookbehind)
                   1742:     {
                   1743:     *code = OP_END;
                   1744:     length = find_fixedlength(last_branch);
                   1745:     DPRINTF(("fixed length = %d\n", length));
                   1746:     if (length < 0)
                   1747:       {
                   1748:       *errorptr = ERR25;
                   1749:       *ptrptr = ptr;
                   1750:       return FALSE;
                   1751:       }
                   1752:     reverse_count[0] = (length >> 8);
                   1753:     reverse_count[1] = length & 255;
                   1754:     }
                   1755: 
                   1756:   /* Reached end of expression, either ')' or end of pattern. Insert a
                   1757:   terminating ket and the length of the whole bracketed item, and return,
                   1758:   leaving the pointer at the terminating char. If any of the ims options
                   1759:   were changed inside the group, compile a resetting op-code following. */
                   1760: 
                         /* The length stored after OP_KET is the distance from the start of the
                         bracket to the OP_KET itself. The resetting OP_OPT restores the ims bits
                         that were in force on entry (oldoptions). */

                   1761:   if (*ptr != '|')
                   1762:     {
                   1763:     length = code - start_bracket;
                   1764:     *code++ = OP_KET;
                   1765:     *code++ = length >> 8;
                   1766:     *code++ = length & 255;
                   1767:     if (optchanged >= 0)
                   1768:       {
                   1769:       *code++ = OP_OPT;
                   1770:       *code++ = oldoptions;
                   1771:       }
                   1772:     *codeptr = code;
                   1773:     *ptrptr = ptr;
                   1774:     return TRUE;
                   1775:     }
                   1776: 
                   1777:   /* Another branch follows; insert an "or" node and advance the pointer. */
                   1778: 
                         /* The OP_ALT becomes the new last_branch; its two length bytes are
                         skipped here and filled in after the next branch has been compiled. */

                   1779:   *code = OP_ALT;
                   1780:   last_branch = code;
                   1781:   code += 3;
                   1782:   ptr++;
                   1783:   }
                   1784: /* Control never reaches here */
                   1785: }
                   1786: 
                   1787: 
                   1788: 
                   1789: 
                   1790: /*************************************************
                   1791: *      Find first significant op code            *
                   1792: *************************************************/
                   1793: 
                   1794: /* This is called by several functions that scan a compiled expression looking
                   1795: for a fixed first character, or an anchoring op code etc. It skips over things
                   1796: that do not influence this. For one application, a change of caseless option is
                   1797: important.
                   1798: 
                   1799: Arguments:
                   1800:   code       pointer to the start of the group
                   1801:   options    pointer to external options
                   1802:   optbit     the option bit whose changing is significant, or
                   1803:              zero if none are
                   1804:   optstop    TRUE to return on option change, otherwise change the options
                   1805:                value and continue
                   1806: 
                   1807: Returns:     pointer to the first significant opcode
                   1808: */
                   1809: 
                   1810: static const uschar*
                   1811: first_significant_code(const uschar *code, int *options, int optbit,
                   1812:   BOOL optstop)
                   1813: {
                   1814: for (;;)
                   1815:   {
                   1816:   switch ((int)*code)
                   1817:     {
                   1818:     case OP_OPT:
                   1819:     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
                   1820:       {
                   1821:       if (optstop) return code;
                   1822:       *options = (int)code[1];
                   1823:       }
                   1824:     code += 2;
                   1825:     break;
                   1826: 
                   1827:     case OP_CREF:
                   1828:     code += 2;
                   1829:     break;
                   1830: 
                   1831:     case OP_WORD_BOUNDARY:
                   1832:     case OP_NOT_WORD_BOUNDARY:
                   1833:     code++;
                   1834:     break;
                   1835: 
                   1836:     case OP_ASSERT_NOT:
                   1837:     case OP_ASSERTBACK:
                   1838:     case OP_ASSERTBACK_NOT:
                   1839:     do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
                   1840:     code += 3;
                   1841:     break;
                   1842: 
                   1843:     default:
                   1844:     return code;
                   1845:     }
                   1846:   }
                   1847: /* Control never reaches here */
                   1848: }
                   1849: 
                   1850: 
                   1851: 
                   1852: 
                   1853: /*************************************************
                   1854: *          Check for anchored expression         *
                   1855: *************************************************/
                   1856: 
                   1857: /* Try to find out if this is an anchored regular expression. Consider each
                   1858: alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
                   1859: all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
                   1860: it's anchored. However, if this is a multiline pattern, then only OP_SOD
                   1861: counts, since OP_CIRC can match in the middle.
                   1862: 
                   1863: A branch is also implicitly anchored if it starts with .* and DOTALL is set,
                   1864: because that will try the rest of the pattern at all possible matching points,
                   1865: so there is no point trying them again.
                   1866: 
                   1867: Arguments:
                   1868:   code       points to start of expression (the bracket)
                   1869:   options    points to the options setting
                   1870: 
                   1871: Returns:     TRUE or FALSE
                   1872: */
                   1873: 
                   1874: static BOOL
                   1875: is_anchored(register const uschar *code, int *options)
                   1876: {
                   1877: do {
                   1878:    const uschar *scode = first_significant_code(code + 3, options,
                   1879:      PCRE_MULTILINE, FALSE);
                   1880:    register int op = *scode;
                   1881:    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
                   1882:      { if (!is_anchored(scode, options)) return FALSE; }
                   1883:    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
                   1884:             (*options & PCRE_DOTALL) != 0)
                   1885:      { if (scode[1] != OP_ANY) return FALSE; }
                   1886:    else if (op != OP_SOD &&
                   1887:            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
                   1888:      return FALSE;
                   1889:    code += (code[1] << 8) + code[2];
                   1890:    }
                   1891: while (*code == OP_ALT);
                   1892: return TRUE;
                   1893: }
                   1894: 
                   1895: 
                   1896: 
                   1897: /*************************************************
                   1898: *         Check for starting with ^ or .*        *
                   1899: *************************************************/
                   1900: 
                   1901: /* This is called to find out if every branch starts with ^ or .* so that
                   1902: "first char" processing can be done to speed things up in multiline
                   1903: matching and for non-DOTALL patterns that start with .* (which must start at
                   1904: the beginning or after \n).
                   1905: 
                   1906: Argument:  points to start of expression (the bracket)
                   1907: Returns:   TRUE or FALSE
                   1908: */
                   1909: 
                   1910: static BOOL
                   1911: is_startline(const uschar *code)
                   1912: {
                   1913: do {
                   1914:    const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
                   1915:    register int op = *scode;
                   1916:    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
                   1917:      { if (!is_startline(scode)) return FALSE; }
                   1918:    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
                   1919:      { if (scode[1] != OP_ANY) return FALSE; }
                   1920:    else if (op != OP_CIRC) return FALSE;
                   1921:    code += (code[1] << 8) + code[2];
                   1922:    }
                   1923: while (*code == OP_ALT);
                   1924: return TRUE;
                   1925: }
                   1926: 
                   1927: 
                   1928: 
                   1929: /*************************************************
                   1930: *          Check for fixed first char            *
                   1931: *************************************************/
                   1932: 
                   1933: /* Try to find out if there is a fixed first character. This is called for
                   1934: unanchored expressions, as it speeds up their processing quite considerably.
                   1935: Consider each alternative branch. If they all start with the same char, or with
                   1936: a bracket all of whose alternatives start with the same char (recurse ad lib),
                   1937: then we return that char, otherwise -1.
                   1938: 
                   1939: Arguments:
                   1940:   code       points to start of expression (the bracket)
                   1941:   options    pointer to the options (used to check casing changes)
                   1942: 
                   1943: Returns:     -1 or the fixed first char
                   1944: */
                   1945: 
                   1946: static int
                   1947: find_firstchar(const uschar *code, int *options)
                   1948: {
                   1949: register int c = -1;
                   1950: do {
                   1951:    int d;
                   1952:    const uschar *scode = first_significant_code(code + 3, options,
                   1953:      PCRE_CASELESS, TRUE);
                   1954:    register int op = *scode;
                   1955: 
                   1956:    if (op >= OP_BRA) op = OP_BRA;
                   1957: 
                   1958:    switch(op)
                   1959:      {
                   1960:      default:
                   1961:      return -1;
                   1962: 
                   1963:      case OP_BRA:
                   1964:      case OP_ASSERT:
                   1965:      case OP_ONCE:
                   1966:      case OP_COND:
                   1967:      if ((d = find_firstchar(scode, options)) < 0) return -1;
                   1968:      if (c < 0) c = d; else if (c != d) return -1;
                   1969:      break;
                   1970: 
                   1971:      case OP_EXACT:       /* Fall through */
                   1972:      scode++;
                   1973: 
                   1974:      case OP_CHARS:       /* Fall through */
                   1975:      scode++;
                   1976: 
                   1977:      case OP_PLUS:
                   1978:      case OP_MINPLUS:
                   1979:      if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
                   1980:      break;
                   1981:      }
                   1982: 
                   1983:    code += (code[1] << 8) + code[2];
                   1984:    }
                   1985: while (*code == OP_ALT);
                   1986: return c;
                   1987: }
                   1988: 
                   1989: 
                   1990: 
                   1991: 
                   1992: 
                   1993: /*************************************************
                   1994: *        Compile a Regular Expression            *
                   1995: *************************************************/
                   1996: 
                   1997: /* This function takes a string and returns a pointer to a block of store
                   1998: holding a compiled version of the expression.
                   1999: 
                   2000: Arguments:
                   2001:   pattern      the regular expression
                   2002:   options      various option bits
                   2003:   errorptr     pointer to pointer to error text
                   2004:   erroroffset  ptr offset in pattern where error was detected
                   2005:   tables       pointer to character tables or NULL
                   2006: 
                   2007: Returns:       pointer to compiled data block, or NULL on error,
                   2008:                with errorptr and erroroffset set
                   2009: */
                   2010: 
                   2011: pcre *
                   2012: pcre_compile(const char *pattern, int options, const char **errorptr,
                   2013:   int *erroroffset, const unsigned char *tables)
                   2014: {
                   2015: real_pcre *re;
                   2016: int length = 3;      /* For initial BRA plus length */
                   2017: int runlength;
                   2018: int c, size, reqchar, countlits;
                   2019: int bracount = 0;
                   2020: int top_backref = 0;
                   2021: int branch_extra = 0;
                   2022: int branch_newextra;
                   2023: unsigned int brastackptr = 0;
                   2024: uschar *code;
                   2025: const uschar *ptr;
                   2026: compile_data compile_block;
                   2027: int brastack[BRASTACK_SIZE];
                   2028: uschar bralenstack[BRASTACK_SIZE];
                   2029: 
                   2030: #ifdef DEBUG
                   2031: uschar *code_base, *code_end;
                   2032: #endif
                   2033: 
                   2034: /* We can't pass back an error message if errorptr is NULL; I guess the best we
                   2035: can do is just return NULL. */
                   2036: 
                   2037: if (errorptr == NULL) return NULL;
                   2038: *errorptr = NULL;
                   2039: 
                   2040: /* However, we can give a message for this error */
                   2041: 
                   2042: if (erroroffset == NULL)
                   2043:   {
                   2044:   *errorptr = ERR16;
                   2045:   return NULL;
                   2046:   }
                   2047: *erroroffset = 0;
                   2048: 
                   2049: if ((options & ~PUBLIC_OPTIONS) != 0)
                   2050:   {
                   2051:   *errorptr = ERR17;
                   2052:   return NULL;
                   2053:   }
                   2054: 
                   2055: /* Set up pointers to the individual character tables */
                   2056: 
                   2057: if (tables == NULL) tables = pcre_default_tables;
                   2058: compile_block.lcc = tables + lcc_offset;
                   2059: compile_block.fcc = tables + fcc_offset;
                   2060: compile_block.cbits = tables + cbits_offset;
                   2061: compile_block.ctypes = tables + ctypes_offset;
                   2062: 
                   2063: /* Reflect pattern for debugging output */
                   2064: 
                   2065: DPRINTF(("------------------------------------------------------------------\n"));
                   2066: DPRINTF(("%s\n", pattern));
                   2067: 
                   2068: /* The first thing to do is to make a pass over the pattern to compute the
                   2069: amount of store required to hold the compiled code. This does not have to be
                   2070: perfect as long as errors are overestimates. At the same time we can detect any
                   2071: internal flag settings. Make an attempt to correct for any counted white space
                   2072: if an "extended" flag setting appears late in the pattern. We can't be so
                   2073: clever for #-comments. */
                   2074: 
                   2075: ptr = (const uschar *)(pattern - 1);
                   2076: while ((c = *(++ptr)) != 0)
                   2077:   {
                   2078:   int min, max;
                   2079:   int class_charcount;
                   2080: 
                   2081:   if ((options & PCRE_EXTENDED) != 0)
                   2082:     {
                   2083:     if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
                   2084:     if (c == '#')
                   2085:       {
                   2086:       while ((c = *(++ptr)) != 0 && c != '\n');
                   2087:       continue;
                   2088:       }
                   2089:     }
                   2090: 
                   2091:   switch(c)
                   2092:     {
                   2093:     /* A backslashed item may be an escaped "normal" character or a
                   2094:     character type. For a "normal" character, put the pointers and
                   2095:     character back so that tests for whitespace etc. in the input
                   2096:     are done correctly. */
                   2097: 
                   2098:     case '\\':
                   2099:       {
                   2100:       const uschar *save_ptr = ptr;
                   2101:       c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
                   2102:       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
                   2103:       if (c >= 0)
                   2104:         {
                   2105:         ptr = save_ptr;
                   2106:         c = '\\';
                   2107:         goto NORMAL_CHAR;
                   2108:         }
                   2109:       }
                   2110:     length++;
                   2111: 
                   2112:     /* A back reference needs an additional char, plus either one or 5
                   2113:     bytes for a repeat. We also need to keep the value of the highest
                   2114:     back reference. */
                   2115: 
                   2116:     if (c <= -ESC_REF)
                   2117:       {
                   2118:       int refnum = -c - ESC_REF;
                   2119:       if (refnum > top_backref) top_backref = refnum;
                   2120:       length++;   /* For single back reference */
                   2121:       if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
                   2122:         {
                   2123:         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
                   2124:         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
                   2125:         if ((min == 0 && (max == 1 || max == -1)) ||
                   2126:           (min == 1 && max == -1))
                   2127:             length++;
                   2128:         else length += 5;
                   2129:         if (ptr[1] == '?') ptr++;
                   2130:         }
                   2131:       }
                   2132:     continue;
                   2133: 
                   2134:     case '^':
                   2135:     case '.':
                   2136:     case '$':
                   2137:     case '*':     /* These repeats won't be after brackets; */
                   2138:     case '+':     /* those are handled separately */
                   2139:     case '?':
                   2140:     length++;
                   2141:     continue;
                   2142: 
                   2143:     /* This covers the cases of repeats after a single char, metachar, class,
                   2144:     or back reference. */
                   2145: 
                   2146:     case '{':
                   2147:     if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
                   2148:     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
                   2149:     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
                   2150:     if ((min == 0 && (max == 1 || max == -1)) ||
                   2151:       (min == 1 && max == -1))
                   2152:         length++;
                   2153:     else
                   2154:       {
                   2155:       length--;   /* Uncount the original char or metachar */
                   2156:       if (min == 1) length++; else if (min > 0) length += 4;
                   2157:       if (max > 0) length += 4; else length += 2;
                   2158:       }
                   2159:     if (ptr[1] == '?') ptr++;
                   2160:     continue;
                   2161: 
                   2162:     /* An alternation contains an offset to the next branch or ket. If any ims
                   2163:     options changed in the previous branch(es), and/or if we are in a
                   2164:     lookbehind assertion, extra space will be needed at the start of the
                   2165:     branch. This is handled by branch_extra. */
                   2166: 
                   2167:     case '|':
                   2168:     length += 3 + branch_extra;
                   2169:     continue;
                   2170: 
                   2171:     /* A character class uses 33 characters. Don't worry about character types
                   2172:     that aren't allowed in classes - they'll get picked up during the compile.
                   2173:     A character class that contains only one character uses 2 or 3 bytes,
                   2174:     depending on whether it is negated or not. Notice this where we can. */
                   2175: 
                   2176:     case '[':
                   2177:     class_charcount = 0;
                   2178:     if (*(++ptr) == '^') ptr++;
                   2179:     do
                   2180:       {
                   2181:       if (*ptr == '\\')
                   2182:         {
                   2183:         int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
                   2184:           &compile_block);
                   2185:         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
                   2186:         if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
                   2187:         }
                   2188:       else class_charcount++;
                   2189:       ptr++;
                   2190:       }
                   2191:     while (*ptr != 0 && *ptr != ']');
                   2192: 
                   2193:     /* Repeats for negated single chars are handled by the general code */
                   2194: 
                   2195:     if (class_charcount == 1) length += 3; else
                   2196:       {
                   2197:       length += 33;
                   2198: 
                   2199:       /* A repeat needs either 1 or 5 bytes. */
                   2200: 
                   2201:       if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
                   2202:         {
                   2203:         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
                   2204:         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
                   2205:         if ((min == 0 && (max == 1 || max == -1)) ||
                   2206:           (min == 1 && max == -1))
                   2207:             length++;
                   2208:         else length += 5;
                   2209:         if (ptr[1] == '?') ptr++;
                   2210:         }
                   2211:       }
                   2212:     continue;
                   2213: 
                   2214:     /* Brackets may be genuine groups or special things */
                   2215: 
                   2216:     case '(':
                   2217:     branch_newextra = 0;
                   2218: 
                   2219:     /* Handle special forms of bracket, which all start (? */
                   2220: 
                   2221:     if (ptr[1] == '?')
                   2222:       {
                   2223:       int set, unset;
                   2224:       int *optset;
                   2225: 
                   2226:       switch (c = ptr[2])
                   2227:         {
                   2228:         /* Skip over comments entirely */
                   2229:         case '#':
                   2230:         ptr += 3;
                   2231:         while (*ptr != 0 && *ptr != ')') ptr++;
                   2232:         if (*ptr == 0)
                   2233:           {
                   2234:           *errorptr = ERR18;
                   2235:           goto PCRE_ERROR_RETURN;
                   2236:           }
                   2237:         continue;
                   2238: 
                   2239:         /* Non-referencing groups and lookaheads just move the pointer on, and
                   2240:         then behave like a non-special bracket, except that they don't increment
                   2241:         the count of extracting brackets. Ditto for the "once only" bracket,
                   2242:         which is in Perl from version 5.005. */
                   2243: 
                   2244:         case ':':
                   2245:         case '=':
                   2246:         case '!':
                   2247:         case '>':
                   2248:         ptr += 2;
                   2249:         break;
                   2250: 
                   2251:         /* Lookbehinds are in Perl from version 5.005 */
                   2252: 
                   2253:         case '<':
                   2254:         if (ptr[3] == '=' || ptr[3] == '!')
                   2255:           {
                   2256:           ptr += 3;
                   2257:           branch_newextra = 3;
                   2258:           length += 3;         /* For the first branch */
                   2259:           break;
                   2260:           }
                   2261:         *errorptr = ERR24;
                   2262:         goto PCRE_ERROR_RETURN;
                   2263: 
                   2264:         /* Conditionals are in Perl from version 5.005. The bracket must either
                   2265:         be followed by a number (for bracket reference) or by an assertion
                   2266:         group. */
                   2267: 
                   2268:         case '(':
                   2269:         if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
                   2270:           {
                   2271:           ptr += 4;
                   2272:           length += 2;
                   2273:           while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
                   2274:           if (*ptr != ')')
                   2275:             {
                   2276:             *errorptr = ERR26;
                   2277:             goto PCRE_ERROR_RETURN;
                   2278:             }
                   2279:           }
                   2280:         else   /* An assertion must follow */
                   2281:           {
                   2282:           ptr++;   /* Can treat like ':' as far as spacing is concerned */
                   2283: 
                   2284:           if (ptr[2] != '?' || strchr("=!<", ptr[3]) == NULL)
                   2285:             {
                   2286:             ptr += 2;    /* To get right offset in message */
                   2287:             *errorptr = ERR28;
                   2288:             goto PCRE_ERROR_RETURN;
                   2289:             }
                   2290:           }
                   2291:         break;
                   2292: 
                   2293:         /* Else loop checking valid options until ) is met. Anything else is an
                   2294:         error. If we are without any brackets, i.e. at top level, the settings
                   2295:         act as if specified in the options, so massage the options immediately.
                   2296:         This is for backward compatibility with Perl 5.004. */
                   2297: 
                   2298:         default:
                   2299:         set = unset = 0;
                   2300:         optset = &set;
                   2301:         ptr += 2;
                   2302: 
                   2303:         for (;; ptr++)
                   2304:           {
                   2305:           c = *ptr;
                   2306:           switch (c)
                   2307:             {
                   2308:             case 'i':
                   2309:             *optset |= PCRE_CASELESS;
                   2310:             continue;
                   2311: 
                   2312:             case 'm':
                   2313:             *optset |= PCRE_MULTILINE;
                   2314:             continue;
                   2315: 
                   2316:             case 's':
                   2317:             *optset |= PCRE_DOTALL;
                   2318:             continue;
                   2319: 
                   2320:             case 'x':
                   2321:             *optset |= PCRE_EXTENDED;
                   2322:             continue;
                   2323: 
                   2324:             case 'X':
                   2325:             *optset |= PCRE_EXTRA;
                   2326:             continue;
                   2327: 
                   2328:             case 'U':
                   2329:             *optset |= PCRE_UNGREEDY;
                   2330:             continue;
                   2331: 
                   2332:             case '-':
                   2333:             optset = &unset;
                   2334:             continue;
                   2335: 
                   2336:             /* A termination by ')' indicates an options-setting-only item;
                   2337:             this is global at top level; otherwise nothing is done here and
                   2338:             it is handled during the compiling process on a per-bracket-group
                   2339:             basis. */
                   2340: 
                   2341:             case ')':
                   2342:             if (brastackptr == 0)
                   2343:               {
                   2344:               options = (options | set) & (~unset);
                   2345:               set = unset = 0;     /* To save length */
                   2346:               }
                   2347:             /* Fall through */
                   2348: 
                   2349:             /* A termination by ':' indicates the start of a nested group with
                   2350:             the given options set. This is again handled at compile time, but
                   2351:             we must allow for compiled space if any of the ims options are
                   2352:             set. We also have to allow for resetting space at the end of
                   2353:             the group, which is why 4 is added to the length and not just 2.
                   2354:             If there are several changes of options within the same group, this
                   2355:             will lead to an over-estimate on the length, but this shouldn't
                   2356:             matter very much. We also have to allow for resetting options at
                   2357:             the start of any alternations, which we do by setting
                   2358:             branch_newextra to 2. Finally, we record whether the case-dependent
                   2359:             flag ever changes within the regex. This is used by the "required
                   2360:             character" code. */
                   2361: 
                   2362:             case ':':
                   2363:             if (((set|unset) & PCRE_IMS) != 0)
                   2364:               {
                   2365:               length += 4;
                   2366:               branch_newextra = 2;
                   2367:               if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
                   2368:               }
                   2369:             goto END_OPTIONS;
                   2370: 
                   2371:             /* Unrecognized option character */
                   2372: 
                   2373:             default:
                   2374:             *errorptr = ERR12;
                   2375:             goto PCRE_ERROR_RETURN;
                   2376:             }
                   2377:           }
                   2378: 
                   2379:         /* If we hit a closing bracket, that's it - this is a freestanding
                   2380:         option-setting. We need to ensure that branch_extra is updated if
                   2381:         necessary. The only values branch_newextra can have here are 0 or 2.
                   2382:         If the value is 2, then branch_extra must either be 2 or 5, depending
                   2383:         on whether this is a lookbehind group or not. */
                   2384: 
                   2385:         END_OPTIONS:
                   2386:         if (c == ')')
                   2387:           {
                   2388:           if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
                   2389:             branch_extra += branch_newextra;
                   2390:           continue;
                   2391:           }
                   2392: 
                   2393:         /* If options were terminated by ':' control comes here. Fall through
                   2394:         to handle the group below. */
                   2395:         }
                   2396:       }
                   2397: 
                   2398:     /* Extracting brackets must be counted so we can process escapes in a
                   2399:     Perlish way. */
                   2400: 
                   2401:     else bracount++;
                   2402: 
                   2403:     /* Non-special forms of bracket. Save length for computing whole length
                   2404:     at end if there's a repeat that requires duplication of the group. Also
                   2405:     save the current value of branch_extra, and start the new group with
                   2406:     the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
                   2407:     for a lookbehind assertion. */
                   2408: 
                   2409:     if (brastackptr >= sizeof(brastack)/sizeof(int))
                   2410:       {
                   2411:       *errorptr = ERR19;
                   2412:       goto PCRE_ERROR_RETURN;
                   2413:       }
                   2414: 
                   2415:     bralenstack[brastackptr] = branch_extra;
                   2416:     branch_extra = branch_newextra;
                   2417: 
                   2418:     brastack[brastackptr++] = length;
                   2419:     length += 3;
                   2420:     continue;
                   2421: 
                   2422:     /* Handle ket. Look for subsequent max/min; for certain sets of values we
                   2423:     have to replicate this bracket up to that many times. If brastackptr is
                   2424:     0 this is an unmatched bracket which will generate an error, but take care
                   2425:     not to try to access brastack[-1] when computing the length and restoring
                   2426:     the branch_extra value. */
                   2427: 
                   2428:     case ')':
                   2429:     length += 3;
                   2430:       {
                   2431:       int minval = 1;
                   2432:       int maxval = 1;
                   2433:       int duplength;
                   2434: 
                   2435:       if (brastackptr > 0)
                   2436:         {
                   2437:         duplength = length - brastack[--brastackptr];
                   2438:         branch_extra = bralenstack[brastackptr];
                   2439:         }
                   2440:       else duplength = 0;
                   2441: 
                   2442:       /* Leave ptr at the final char; for read_repeat_counts this happens
                   2443:       automatically; for the others we need an increment. */
                   2444: 
                   2445:       if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
                   2446:         {
                   2447:         ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
                   2448:           &compile_block);
                   2449:         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
                   2450:         }
                   2451:       else if (c == '*') { minval = 0; maxval = -1; ptr++; }
                   2452:       else if (c == '+') { maxval = -1; ptr++; }
                   2453:       else if (c == '?') { minval = 0; ptr++; }
                   2454: 
                   2455:       /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
                   2456:       group, and if the maximum is greater than zero, we have to replicate
                   2457:       maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
                   2458:       bracket set - hence the 7. */
                   2459: 
                   2460:       if (minval == 0)
                   2461:         {
                   2462:         length++;
                   2463:         if (maxval > 0) length += (maxval - 1) * (duplength + 7);
                   2464:         }
                   2465: 
                   2466:       /* When the minimum is greater than zero, we have to replicate up to
                   2467:       minval-1 times, with no additions required in the copies. Then, if
                   2468:       there is a limited maximum we have to replicate up to maxval-1 times
                   2469:       allowing for a BRAZERO item before each optional copy and nesting
                   2470:       brackets for all but one of the optional copies. */
                   2471: 
                   2472:       else
                   2473:         {
                   2474:         length += (minval - 1) * duplength;
                   2475:         if (maxval > minval)   /* Need this test as maxval=-1 means no limit */
                   2476:           length += (maxval - minval) * (duplength + 7) - 6;
                   2477:         }
                   2478:       }
                   2479:     continue;
                   2480: 
                   2481:     /* Non-special character. For a run of such characters the length required
                   2482:     is the number of characters + 2, except that the maximum run length is 255.
                   2483:     We won't get a skipped space or a non-data escape or the start of a #
                   2484:     comment as the first character, so the length can't be zero. */
                   2485: 
                   2486:     NORMAL_CHAR:
                   2487:     default:
                   2488:     length += 2;
                   2489:     runlength = 0;
                   2490:     do
                   2491:       {
                   2492:       if ((options & PCRE_EXTENDED) != 0)
                   2493:         {
                   2494:         if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
                   2495:         if (c == '#')
                   2496:           {
                   2497:           while ((c = *(++ptr)) != 0 && c != '\n');
                   2498:           continue;
                   2499:           }
                   2500:         }
                   2501: 
                   2502:       /* Backslash may introduce a data char or a metacharacter; stop the
                   2503:       string before the latter. */
                   2504: 
                   2505:       if (c == '\\')
                   2506:         {
                   2507:         const uschar *saveptr = ptr;
                   2508:         c = check_escape(&ptr, errorptr, bracount, options, FALSE,
                   2509:           &compile_block);
                   2510:         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
                   2511:         if (c < 0) { ptr = saveptr; break; }
                   2512:         }
                   2513: 
                   2514:       /* Ordinary character or single-char escape */
                   2515: 
                   2516:       runlength++;
                   2517:       }
                   2518: 
                   2519:     /* This "while" is the end of the "do" above. */
                   2520: 
                   2521:     while (runlength < 255 &&
                   2522:       (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
                   2523: 
                   2524:     ptr--;
                   2525:     length += runlength;
                   2526:     continue;
                   2527:     }
                   2528:   }
                   2529: 
                   2530: length += 4;    /* For final KET and END */
                   2531: 
                   2532: if (length > 65539)
                   2533:   {
                   2534:   *errorptr = ERR20;
                   2535:   return NULL;
                   2536:   }
                   2537: 
                   2538: /* Compute the size of data block needed and get it, either from malloc or
                   2539: externally provided function. We specify "code[0]" in the offsetof() expression
                   2540: rather than just "code", because it has been reported that one broken compiler
                   2541: fails on "code" because it is also an independent variable. It should make no
                   2542: difference to the value of the offsetof(). */
                   2543: 
                   2544: size = length + offsetof(real_pcre, code[0]);
                   2545: re = (real_pcre *)(pcre_malloc)(size);
                   2546: 
                   2547: if (re == NULL)
                   2548:   {
                   2549:   *errorptr = ERR21;
                   2550:   return NULL;
                   2551:   }
                   2552: 
                   2553: /* Put in the magic number and the options. */
                   2554: 
                   2555: re->magic_number = MAGIC_NUMBER;
                   2556: re->options = options;
                   2557: re->tables = tables;
                   2558: 
                   2559: /* Set up a starting, non-extracting bracket, then compile the expression. On
                   2560: error, *errorptr will be set non-NULL, so we don't need to look at the result
                   2561: of the function here. */
                   2562: 
                   2563: ptr = (const uschar *)pattern;
                   2564: code = re->code;
                   2565: *code = OP_BRA;
                   2566: bracount = 0;
                   2567: (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
                   2568:   &reqchar, &countlits, &compile_block);
                   2569: re->top_bracket = bracount;
                   2570: re->top_backref = top_backref;
                   2571: 
                   2572: /* If not reached end of pattern on success, there's an excess bracket. */
                   2573: 
                   2574: if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
                   2575: 
                   2576: /* Fill in the terminating state and check for disastrous overflow, but
                   2577: if debugging, leave the test till after things are printed out. */
                   2578: 
                   2579: *code++ = OP_END;
                   2580: 
                   2581: #ifndef DEBUG
                   2582: if (code - re->code > length) *errorptr = ERR23;
                   2583: #endif
                   2584: 
                   2585: /* Give an error if there's a back reference to a non-existent capturing
                   2586: subpattern. */
                   2587: 
                   2588: if (top_backref > re->top_bracket) *errorptr = ERR15;
                   2589: 
                   2590: /* Failed to compile */
                   2591: 
                   2592: if (*errorptr != NULL)
                   2593:   {
                   2594:   (pcre_free)(re);
                   2595:   PCRE_ERROR_RETURN:
                   2596:   *erroroffset = ptr - (const uschar *)pattern;
                   2597:   return NULL;
                   2598:   }
                   2599: 
                   2600: /* If the anchored option was not passed, set flag if we can determine that the
                   2601: pattern is anchored by virtue of ^ characters or \A or anything else (such as
                   2602: starting with .* when DOTALL is set).
                   2603: 
                   2604: Otherwise, see if we can determine what the first character has to be, because
                   2605: that speeds up unanchored matches no end. If not, see if we can set the
                   2606: PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
                   2607: start with ^. and also when all branches start with .* for non-DOTALL matches.
                   2608: */
                   2609: 
                   2610: if ((options & PCRE_ANCHORED) == 0)
                   2611:   {
                   2612:   int temp_options = options;
                   2613:   if (is_anchored(re->code, &temp_options))
                   2614:     re->options |= PCRE_ANCHORED;
                   2615:   else
                   2616:     {
                   2617:     int ch = find_firstchar(re->code, &temp_options);
                   2618:     if (ch >= 0)
                   2619:       {
                   2620:       re->first_char = ch;
                   2621:       re->options |= PCRE_FIRSTSET;
                   2622:       }
                   2623:     else if (is_startline(re->code))
                   2624:       re->options |= PCRE_STARTLINE;
                   2625:     }
                   2626:   }
                   2627: 
                   2628: /* Save the last required character if there are at least two literal
                   2629: characters on all paths, or if there is no first character setting. */
                   2630: 
                   2631: if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
                   2632:   {
                   2633:   re->req_char = reqchar;
                   2634:   re->options |= PCRE_REQCHSET;
                   2635:   }
                   2636: 
                   2637: /* Print out the compiled data for debugging */
                   2638: 
                   2639: #ifdef DEBUG
                   2640: 
                   2641: printf("Length = %d top_bracket = %d top_backref = %d\n",
                   2642:   length, re->top_bracket, re->top_backref);
                   2643: 
                   2644: if (re->options != 0)
                   2645:   {
                   2646:   printf("%s%s%s%s%s%s%s%s%s\n",
                   2647:     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
                   2648:     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
                   2649:     ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
                   2650:     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
                   2651:     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
                   2652:     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
                   2653:     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
                   2654:     ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
                   2655:     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
                   2656:   }
                   2657: 
                   2658: if ((re->options & PCRE_FIRSTSET) != 0)
                   2659:   {
                   2660:   if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
                   2661:     else printf("First char = \\x%02x\n", re->first_char);
                   2662:   }
                   2663: 
                   2664: if ((re->options & PCRE_REQCHSET) != 0)
                   2665:   {
                   2666:   if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
                   2667:     else printf("Req char = \\x%02x\n", re->req_char);
                   2668:   }
                   2669: 
                   2670: code_end = code;
                   2671: code_base = code = re->code;
                   2672: 
                   2673: while (code < code_end)
                   2674:   {
                   2675:   int charlength;
                   2676: 
                   2677:   printf("%3d ", code - code_base);
                   2678: 
                   2679:   if (*code >= OP_BRA)
                   2680:     {
                   2681:     printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
                   2682:     code += 2;
                   2683:     }
                   2684: 
                   2685:   else switch(*code)
                   2686:     {
                   2687:     case OP_OPT:
                   2688:     printf(" %.2x %s", code[1], OP_names[*code]);
                   2689:     code++;
                   2690:     break;
                   2691: 
                   2692:     case OP_COND:
                   2693:     printf("%3d Cond", (code[1] << 8) + code[2]);
                   2694:     code += 2;
                   2695:     break;
                   2696: 
                   2697:     case OP_CREF:
                   2698:     printf(" %.2d %s", code[1], OP_names[*code]);
                   2699:     code++;
                   2700:     break;
                   2701: 
                   2702:     case OP_CHARS:
                   2703:     charlength = *(++code);
                   2704:     printf("%3d ", charlength);
                   2705:     while (charlength-- > 0)
                   2706:       if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
                   2707:     break;
                   2708: 
                   2709:     case OP_KETRMAX:
                   2710:     case OP_KETRMIN:
                   2711:     case OP_ALT:
                   2712:     case OP_KET:
                   2713:     case OP_ASSERT:
                   2714:     case OP_ASSERT_NOT:
                   2715:     case OP_ASSERTBACK:
                   2716:     case OP_ASSERTBACK_NOT:
                   2717:     case OP_ONCE:
                   2718:     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
                   2719:     code += 2;
                   2720:     break;
                   2721: 
                   2722:     case OP_REVERSE:
                   2723:     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
                   2724:     code += 2;
                   2725:     break;
                   2726: 
                   2727:     case OP_STAR:
                   2728:     case OP_MINSTAR:
                   2729:     case OP_PLUS:
                   2730:     case OP_MINPLUS:
                   2731:     case OP_QUERY:
                   2732:     case OP_MINQUERY:
                   2733:     case OP_TYPESTAR:
                   2734:     case OP_TYPEMINSTAR:
                   2735:     case OP_TYPEPLUS:
                   2736:     case OP_TYPEMINPLUS:
                   2737:     case OP_TYPEQUERY:
                   2738:     case OP_TYPEMINQUERY:
                   2739:     if (*code >= OP_TYPESTAR)
                   2740:       printf("    %s", OP_names[code[1]]);
                   2741:     else if (isprint(c = code[1])) printf("    %c", c);
                   2742:       else printf("    \\x%02x", c);
                   2743:     printf("%s", OP_names[*code++]);
                   2744:     break;
                   2745: 
                   2746:     case OP_EXACT:
                   2747:     case OP_UPTO:
                   2748:     case OP_MINUPTO:
                   2749:     if (isprint(c = code[3])) printf("    %c{", c);
                   2750:       else printf("    \\x%02x{", c);
                   2751:     if (*code != OP_EXACT) printf("0,");
                   2752:     printf("%d}", (code[1] << 8) + code[2]);
                   2753:     if (*code == OP_MINUPTO) printf("?");
                   2754:     code += 3;
                   2755:     break;
                   2756: 
                   2757:     case OP_TYPEEXACT:
                   2758:     case OP_TYPEUPTO:
                   2759:     case OP_TYPEMINUPTO:
                   2760:     printf("    %s{", OP_names[code[3]]);
                   2761:     if (*code != OP_TYPEEXACT) printf(",");
                   2762:     printf("%d}", (code[1] << 8) + code[2]);
                   2763:     if (*code == OP_TYPEMINUPTO) printf("?");
                   2764:     code += 3;
                   2765:     break;
                   2766: 
                   2767:     case OP_NOT:
                   2768:     if (isprint(c = *(++code))) printf("    [^%c]", c);
                   2769:       else printf("    [^\\x%02x]", c);
                   2770:     break;
                   2771: 
                   2772:     case OP_NOTSTAR:
                   2773:     case OP_NOTMINSTAR:
                   2774:     case OP_NOTPLUS:
                   2775:     case OP_NOTMINPLUS:
                   2776:     case OP_NOTQUERY:
                   2777:     case OP_NOTMINQUERY:
                   2778:     if (isprint(c = code[1])) printf("    [^%c]", c);
                   2779:       else printf("    [^\\x%02x]", c);
                   2780:     printf("%s", OP_names[*code++]);
                   2781:     break;
                   2782: 
                   2783:     case OP_NOTEXACT:
                   2784:     case OP_NOTUPTO:
                   2785:     case OP_NOTMINUPTO:
                   2786:     if (isprint(c = code[3])) printf("    [^%c]{", c);
                   2787:       else printf("    [^\\x%02x]{", c);
                   2788:     if (*code != OP_NOTEXACT) printf(",");
                   2789:     printf("%d}", (code[1] << 8) + code[2]);
                   2790:     if (*code == OP_NOTMINUPTO) printf("?");
                   2791:     code += 3;
                   2792:     break;
                   2793: 
                   2794:     case OP_REF:
                   2795:     printf("    \\%d", *(++code));
                   2796:     code ++;
                   2797:     goto CLASS_REF_REPEAT;
                   2798: 
                   2799:     case OP_CLASS:
                   2800:       {
                   2801:       int i, min, max;
                   2802:       code++;
                   2803:       printf("    [");
                   2804: 
                   2805:       for (i = 0; i < 256; i++)
                   2806:         {
                   2807:         if ((code[i/8] & (1 << (i&7))) != 0)
                   2808:           {
                   2809:           int j;
                   2810:           for (j = i+1; j < 256; j++)
                   2811:             if ((code[j/8] & (1 << (j&7))) == 0) break;
                   2812:           if (i == '-' || i == ']') printf("\\");
                   2813:           if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
                   2814:           if (--j > i)
                   2815:             {
                   2816:             printf("-");
                   2817:             if (j == '-' || j == ']') printf("\\");
                   2818:             if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
                   2819:             }
                   2820:           i = j;
                   2821:           }
                   2822:         }
                   2823:       printf("]");
                   2824:       code += 32;
                   2825: 
                   2826:       CLASS_REF_REPEAT:
                   2827: 
                   2828:       switch(*code)
                   2829:         {
                   2830:         case OP_CRSTAR:
                   2831:         case OP_CRMINSTAR:
                   2832:         case OP_CRPLUS:
                   2833:         case OP_CRMINPLUS:
                   2834:         case OP_CRQUERY:
                   2835:         case OP_CRMINQUERY:
                   2836:         printf("%s", OP_names[*code]);
                   2837:         break;
                   2838: 
                   2839:         case OP_CRRANGE:
                   2840:         case OP_CRMINRANGE:
                   2841:         min = (code[1] << 8) + code[2];
                   2842:         max = (code[3] << 8) + code[4];
                   2843:         if (max == 0) printf("{%d,}", min);
                   2844:         else printf("{%d,%d}", min, max);
                   2845:         if (*code == OP_CRMINRANGE) printf("?");
                   2846:         code += 4;
                   2847:         break;
                   2848: 
                   2849:         default:
                   2850:         code--;
                   2851:         }
                   2852:       }
                   2853:     break;
                   2854: 
                   2855:     /* Anything else is just a one-node item */
                   2856: 
                   2857:     default:
                   2858:     printf("    %s", OP_names[*code]);
                   2859:     break;
                   2860:     }
                   2861: 
                   2862:   code++;
                   2863:   printf("\n");
                   2864:   }
                   2865: printf("------------------------------------------------------------------\n");
                   2866: 
                   2867: /* This check is done here in the debugging case so that the code that
                   2868: was compiled can be seen. */
                   2869: 
                   2870: if (code - re->code > length)
                   2871:   {
                   2872:   *errorptr = ERR23;
                   2873:   (pcre_free)(re);
                   2874:   *erroroffset = ptr - (uschar *)pattern;
                   2875:   return NULL;
                   2876:   }
                   2877: #endif
                   2878: 
                   2879: return (pcre *)re;
                   2880: }
                   2881: 
                   2882: 
                   2883: 
                   2884: /*************************************************
                   2885: *          Match a back-reference                *
                   2886: *************************************************/
                   2887: 
                   2888: /* If a back reference hasn't been set, the length that is passed is greater
                   2889: than the number of characters left in the string, so the match fails.
                   2890: 
                   2891: Arguments:
                   2892:   offset      index into the offset vector
                   2893:   eptr        points into the subject
                   2894:   length      length to be matched
                   2895:   md          points to match data block
                   2896:   ims         the ims flags
                   2897: 
                   2898: Returns:      TRUE if matched
                   2899: */
                   2900: 
                   2901: static BOOL
                   2902: match_ref(int offset, register const uschar *eptr, int length, match_data *md,
                   2903:   unsigned long int ims)
                   2904: {
                   2905: const uschar *p = md->start_subject + md->offset_vector[offset];
                   2906: 
                   2907: #ifdef DEBUG
                   2908: if (eptr >= md->end_subject)
                   2909:   printf("matching subject <null>");
                   2910: else
                   2911:   {
                   2912:   printf("matching subject ");
                   2913:   pchars(eptr, length, TRUE, md);
                   2914:   }
                   2915: printf(" against backref ");
                   2916: pchars(p, length, FALSE, md);
                   2917: printf("\n");
                   2918: #endif
                   2919: 
                   2920: /* Always fail if not enough characters left */
                   2921: 
                   2922: if (length > md->end_subject - eptr) return FALSE;
                   2923: 
                   2924: /* Separate the caselesss case for speed */
                   2925: 
                   2926: if ((ims & PCRE_CASELESS) != 0)
                   2927:   {
                   2928:   while (length-- > 0)
                   2929:     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
                   2930:   }
                   2931: else
                   2932:   { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
                   2933: 
                   2934: return TRUE;
                   2935: }
                   2936: 
                   2937: 
                   2938: 
                   2939: /*************************************************
                   2940: *         Match from current position            *
                   2941: *************************************************/
                   2942: 
                   2943: /* On entry ecode points to the first opcode, and eptr to the first character
                   2944: in the subject string, while eptrb holds the value of eptr at the start of the
                   2945: last bracketed group - used for breaking infinite loops matching zero-length
                   2946: strings.
                   2947: 
                   2948: Arguments:
                   2949:    eptr        pointer in subject
                   2950:    ecode       position in code
                   2951:    offset_top  current top pointer
                   2952:    md          pointer to "static" info for the match
                   2953:    ims         current /i, /m, and /s options
                   2954:    condassert  TRUE if called to check a condition assertion
                   2955:    eptrb       eptr at start of last bracket
                   2956: 
                   2957: Returns:       TRUE if matched
                   2958: */
                   2959: 
                   2960: static BOOL
                   2961: match(register const uschar *eptr, register const uschar *ecode,
                   2962:   int offset_top, match_data *md, unsigned long int ims, BOOL condassert,
                   2963:   const uschar *eptrb)
                   2964: {
                   2965: unsigned long int original_ims = ims;   /* Save for resetting on ')' */
                   2966: 
                   2967: for (;;)
                   2968:   {
                   2969:   int op = (int)*ecode;
                   2970:   int min, max, ctype;
                   2971:   register int i;
                   2972:   register int c;
                   2973:   BOOL minimize = FALSE;
                   2974: 
                   2975:   /* Opening capturing bracket. If there is space in the offset vector, save
                   2976:   the current subject position in the working slot at the top of the vector. We
                   2977:   mustn't change the current values of the data slot, because they may be set
                   2978:   from a previous iteration of this group, and be referred to by a reference
                   2979:   inside the group.
                   2980: 
                   2981:   If the bracket fails to match, we need to restore this value and also the
                   2982:   values of the final offsets, in case they were set by a previous iteration of
                   2983:   the same bracket.
                   2984: 
                   2985:   If there isn't enough space in the offset vector, treat this as if it were a
                   2986:   non-capturing bracket. Don't worry about setting the flag for the error case
                   2987:   here; that is handled in the code for KET. */
                   2988: 
                   2989:   if (op > OP_BRA)
                   2990:     {
                   2991:     int number = op - OP_BRA;
                   2992:     int offset = number << 1;
                   2993: 
                   2994: #ifdef DEBUG
                   2995:     printf("start bracket %d subject=", number);
                   2996:     pchars(eptr, 16, TRUE, md);
                   2997:     printf("\n");
                   2998: #endif
                   2999: 
                   3000:     if (offset < md->offset_max)
                   3001:       {
                   3002:       int save_offset1 = md->offset_vector[offset];
                   3003:       int save_offset2 = md->offset_vector[offset+1];
                   3004:       int save_offset3 = md->offset_vector[md->offset_end - number];
                   3005: 
                   3006:       DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
                   3007:       md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
                   3008: 
                   3009:       do
                   3010:         {
                   3011:         if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
                   3012:         ecode += (ecode[1] << 8) + ecode[2];
                   3013:         }
                   3014:       while (*ecode == OP_ALT);
                   3015: 
                   3016:       DPRINTF(("bracket %d failed\n", number));
                   3017: 
                   3018:       md->offset_vector[offset] = save_offset1;
                   3019:       md->offset_vector[offset+1] = save_offset2;
                   3020:       md->offset_vector[md->offset_end - number] = save_offset3;
                   3021:       return FALSE;
                   3022:       }
                   3023: 
                   3024:     /* Insufficient room for saving captured contents */
                   3025: 
                   3026:     else op = OP_BRA;
                   3027:     }
                   3028: 
                   3029:   /* Other types of node can be handled by a switch */
                   3030: 
                   3031:   switch(op)
                   3032:     {
                   3033:     case OP_BRA:     /* Non-capturing bracket: optimized */
                   3034:     DPRINTF(("start bracket 0\n"));
                   3035:     do
                   3036:       {
                   3037:       if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
                   3038:       ecode += (ecode[1] << 8) + ecode[2];
                   3039:       }
                   3040:     while (*ecode == OP_ALT);
                   3041:     DPRINTF(("bracket 0 failed\n"));
                   3042:     return FALSE;
                   3043: 
                   3044:     /* Conditional group: compilation checked that there are no more than
                   3045:     two branches. If the condition is false, skipping the first branch takes us
                   3046:     past the end if there is only one branch, but that's OK because that is
                   3047:     exactly what going to the ket would do. */
                   3048: 
                   3049:     case OP_COND:
                   3050:     if (ecode[3] == OP_CREF)         /* Condition is extraction test */
                   3051:       {
                   3052:       int offset = ecode[4] << 1;    /* Doubled reference number */
                   3053:       return match(eptr,
                   3054:         ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
                   3055:           5 : 3 + (ecode[1] << 8) + ecode[2]),
                   3056:         offset_top, md, ims, FALSE, eptr);
                   3057:       }
                   3058: 
                   3059:     /* The condition is an assertion. Call match() to evaluate it - setting
                   3060:     the final argument TRUE causes it to stop at the end of an assertion. */
                   3061: 
                   3062:     else
                   3063:       {
                   3064:       if (match(eptr, ecode+3, offset_top, md, ims, TRUE, NULL))
                   3065:         {
                   3066:         ecode += 3 + (ecode[4] << 8) + ecode[5];
                   3067:         while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
                   3068:         }
                   3069:       else ecode += (ecode[1] << 8) + ecode[2];
                   3070:       return match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr);
                   3071:       }
                   3072:     /* Control never reaches here */
                   3073: 
                   3074:     /* Skip over conditional reference data if encountered (should not be) */
                   3075: 
                   3076:     case OP_CREF:
                   3077:     ecode += 2;
                   3078:     break;
                   3079: 
                   3080:     /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
                   3081:     an empty string - recursion will then try other alternatives, if any. */
                   3082: 
                   3083:     case OP_END:
                   3084:     if (md->notempty && eptr == md->start_match) return FALSE;
                   3085:     md->end_match_ptr = eptr;          /* Record where we ended */
                   3086:     md->end_offset_top = offset_top;   /* and how many extracts were taken */
                   3087:     return TRUE;
                   3088: 
                   3089:     /* Change option settings */
                   3090: 
                   3091:     case OP_OPT:
                   3092:     ims = ecode[1];
                   3093:     ecode += 2;
                   3094:     DPRINTF(("ims set to %02lx\n", ims));
                   3095:     break;
                   3096: 
                   3097:     /* Assertion brackets. Check the alternative branches in turn - the
                   3098:     matching won't pass the KET for an assertion. If any one branch matches,
                   3099:     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
                   3100:     start of each branch to move the current point backwards, so the code at
                   3101:     this level is identical to the lookahead case. */
                   3102: 
                   3103:     case OP_ASSERT:
                   3104:     case OP_ASSERTBACK:
                   3105:     do
                   3106:       {
                   3107:       if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) break;
                   3108:       ecode += (ecode[1] << 8) + ecode[2];
                   3109:       }
                   3110:     while (*ecode == OP_ALT);
                   3111:     if (*ecode == OP_KET) return FALSE;
                   3112: 
                   3113:     /* If checking an assertion for a condition, return TRUE. */
                   3114: 
                   3115:     if (condassert) return TRUE;
                   3116: 
                   3117:     /* Continue from after the assertion, updating the offsets high water
                   3118:     mark, since extracts may have been taken during the assertion. */
                   3119: 
                   3120:     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
                   3121:     ecode += 3;
                   3122:     offset_top = md->end_offset_top;
                   3123:     continue;
                   3124: 
                   3125:     /* Negative assertion: all branches must fail to match */
                   3126: 
                   3127:     case OP_ASSERT_NOT:
                   3128:     case OP_ASSERTBACK_NOT:
                   3129:     do
                   3130:       {
                   3131:       if (match(eptr, ecode+3, offset_top, md, ims, FALSE, NULL)) return FALSE;
                   3132:       ecode += (ecode[1] << 8) + ecode[2];
                   3133:       }
                   3134:     while (*ecode == OP_ALT);
                   3135: 
                   3136:     if (condassert) return TRUE;
                   3137:     ecode += 3;
                   3138:     continue;
                   3139: 
                   3140:     /* Move the subject pointer back. This occurs only at the start of
                   3141:     each branch of a lookbehind assertion. If we are too close to the start to
                   3142:     move back, this match function fails. */
                   3143: 
                   3144:     case OP_REVERSE:
                   3145:     eptr -= (ecode[1] << 8) + ecode[2];
                   3146:     if (eptr < md->start_subject) return FALSE;
                   3147:     ecode += 3;
                   3148:     break;
                   3149: 
                   3150: 
                   3151:     /* "Once" brackets are like assertion brackets except that after a match,
                   3152:     the point in the subject string is not moved back. Thus there can never be
                   3153:     a move back into the brackets. Check the alternative branches in turn - the
                   3154:     matching won't pass the KET for this kind of subpattern. If any one branch
                   3155:     matches, we carry on as at the end of a normal bracket, leaving the subject
                   3156:     pointer. */
                   3157: 
                   3158:     case OP_ONCE:
                   3159:       {
                   3160:       const uschar *prev = ecode;
                   3161: 
                   3162:       do
                   3163:         {
                   3164:         if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) break;
                   3165:         ecode += (ecode[1] << 8) + ecode[2];
                   3166:         }
                   3167:       while (*ecode == OP_ALT);
                   3168: 
                   3169:       /* If hit the end of the group (which could be repeated), fail */
                   3170: 
                   3171:       if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
                   3172: 
                   3173:       /* Continue as from after the assertion, updating the offsets high water
                   3174:       mark, since extracts may have been taken. */
                   3175: 
                   3176:       do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
                   3177: 
                   3178:       offset_top = md->end_offset_top;
                   3179:       eptr = md->end_match_ptr;
                   3180: 
                   3181:       /* For a non-repeating ket, just continue at this level. This also
                   3182:       happens for a repeating ket if no characters were matched in the group.
                   3183:       This is the forcible breaking of infinite loops as implemented in Perl
                   3184:       5.005. If there is an options reset, it will get obeyed in the normal
                   3185:       course of events. */
                   3186: 
                   3187:       if (*ecode == OP_KET || eptr == eptrb)
                   3188:         {
                   3189:         ecode += 3;
                   3190:         break;
                   3191:         }
                   3192: 
                   3193:       /* The repeating kets try the rest of the pattern or restart from the
                   3194:       preceding bracket, in the appropriate order. We need to reset any options
                   3195:       that changed within the bracket before re-running it, so check the next
                   3196:       opcode. */
                   3197: 
                   3198:       if (ecode[3] == OP_OPT)
                   3199:         {
                   3200:         ims = (ims & ~PCRE_IMS) | ecode[4];
                   3201:         DPRINTF(("ims set to %02lx at group repeat\n", ims));
                   3202:         }
                   3203: 
                   3204:       if (*ecode == OP_KETRMIN)
                   3205:         {
                   3206:         if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
                   3207:             match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
                   3208:         }
                   3209:       else  /* OP_KETRMAX */
                   3210:         {
                   3211:         if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
                   3212:             match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
                   3213:         }
                   3214:       }
                   3215:     return FALSE;
                   3216: 
                   3217:     /* An alternation is the end of a branch; scan along to find the end of the
                   3218:     bracketed group and go to there. */
                   3219: 
                   3220:     case OP_ALT:
                   3221:     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
                   3222:     break;
                   3223: 
                   3224:     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
                   3225:     that it may occur zero times. It may repeat infinitely, or not at all -
                   3226:     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
                   3227:     repeat limits are compiled as a number of copies, with the optional ones
                   3228:     preceded by BRAZERO or BRAMINZERO. */
                   3229: 
                   3230:     case OP_BRAZERO:
                   3231:       {
                   3232:       const uschar *next = ecode+1;
                   3233:       if (match(eptr, next, offset_top, md, ims, FALSE, eptr)) return TRUE;
                   3234:       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
                   3235:       ecode = next + 3;
                   3236:       }
                   3237:     break;
                   3238: 
                   3239:     case OP_BRAMINZERO:
                   3240:       {
                   3241:       const uschar *next = ecode+1;
                   3242:       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
                   3243:       if (match(eptr, next+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
                   3244:       ecode++;
                   3245:       }
                   3246:     break;
                   3247: 
                   3248:     /* End of a group, repeated or non-repeating. If we are at the end of
                   3249:     an assertion "group", stop matching and return TRUE, but record the
                   3250:     current high water mark for use by positive assertions. Do this also
                   3251:     for the "once" (not-backup up) groups. */
                   3252: 
                   3253:     case OP_KET:
                   3254:     case OP_KETRMIN:
                   3255:     case OP_KETRMAX:
                   3256:       {
                   3257:       const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
                   3258: 
                   3259:       if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
                   3260:           *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
                   3261:           *prev == OP_ONCE)
                   3262:         {
                   3263:         md->end_match_ptr = eptr;      /* For ONCE */
                   3264:         md->end_offset_top = offset_top;
                   3265:         return TRUE;
                   3266:         }
                   3267: 
                   3268:       /* In all other cases except a conditional group we have to check the
                   3269:       group number back at the start and if necessary complete handling an
                   3270:       extraction by setting the offsets and bumping the high water mark. */
                   3271: 
                   3272:       if (*prev != OP_COND)
                   3273:         {
                   3274:         int number = *prev - OP_BRA;
                   3275:         int offset = number << 1;
                   3276: 
                   3277:         DPRINTF(("end bracket %d\n", number));
                   3278: 
                   3279:         if (number > 0)
                   3280:           {
                   3281:           if (offset >= md->offset_max) md->offset_overflow = TRUE; else
                   3282:             {
                   3283:             md->offset_vector[offset] =
                   3284:               md->offset_vector[md->offset_end - number];
                   3285:             md->offset_vector[offset+1] = eptr - md->start_subject;
                   3286:             if (offset_top <= offset) offset_top = offset + 2;
                   3287:             }
                   3288:           }
                   3289:         }
                   3290: 
                   3291:       /* Reset the value of the ims flags, in case they got changed during
                   3292:       the group. */
                   3293: 
                   3294:       ims = original_ims;
                   3295:       DPRINTF(("ims reset to %02lx\n", ims));
                   3296: 
                   3297:       /* For a non-repeating ket, just continue at this level. This also
                   3298:       happens for a repeating ket if no characters were matched in the group.
                   3299:       This is the forcible breaking of infinite loops as implemented in Perl
                   3300:       5.005. If there is an options reset, it will get obeyed in the normal
                   3301:       course of events. */
                   3302: 
                   3303:       if (*ecode == OP_KET || eptr == eptrb)
                   3304:         {
                   3305:         ecode += 3;
                   3306:         break;
                   3307:         }
                   3308: 
                   3309:       /* The repeating kets try the rest of the pattern or restart from the
                   3310:       preceding bracket, in the appropriate order. */
                   3311: 
                   3312:       if (*ecode == OP_KETRMIN)
                   3313:         {
                   3314:         if (match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr) ||
                   3315:             match(eptr, prev, offset_top, md, ims, FALSE, eptr)) return TRUE;
                   3316:         }
                   3317:       else  /* OP_KETRMAX */
                   3318:         {
                   3319:         if (match(eptr, prev, offset_top, md, ims, FALSE, eptr) ||
                   3320:             match(eptr, ecode+3, offset_top, md, ims, FALSE, eptr)) return TRUE;
                   3321:         }
                   3322:       }
                   3323:     return FALSE;
                   3324: 
                   3325:     /* Start of subject unless notbol, or after internal newline if multiline */
                   3326: 
                   3327:     case OP_CIRC:
                   3328:     if (md->notbol && eptr == md->start_subject) return FALSE;
                   3329:     if ((ims & PCRE_MULTILINE) != 0)
                   3330:       {
                   3331:       if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
                   3332:       ecode++;
                   3333:       break;
                   3334:       }
                   3335:     /* ... else fall through */
                   3336: 
                   3337:     /* Start of subject assertion */
                   3338: 
                   3339:     case OP_SOD:
                   3340:     if (eptr != md->start_subject) return FALSE;
                   3341:     ecode++;
                   3342:     break;
                   3343: 
                   3344:     /* Assert before internal newline if multiline, or before a terminating
                   3345:     newline unless endonly is set, else end of subject unless noteol is set. */
                   3346: 
                   3347:     case OP_DOLL:
                   3348:     if ((ims & PCRE_MULTILINE) != 0)
                   3349:       {
                   3350:       if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
                   3351:         else { if (md->noteol) return FALSE; }
                   3352:       ecode++;
                   3353:       break;
                   3354:       }
                   3355:     else
                   3356:       {
                   3357:       if (md->noteol) return FALSE;
                   3358:       if (!md->endonly)
                   3359:         {
                   3360:         if (eptr < md->end_subject - 1 ||
                   3361:            (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
                   3362: 
                   3363:         ecode++;
                   3364:         break;
                   3365:         }
                   3366:       }
                   3367:     /* ... else fall through */
                   3368: 
                   3369:     /* End of subject assertion (\z) */
                   3370: 
                   3371:     case OP_EOD:
                   3372:     if (eptr < md->end_subject) return FALSE;
                   3373:     ecode++;
                   3374:     break;
                   3375: 
                   3376:     /* End of subject or ending \n assertion (\Z) */
                   3377: 
                   3378:     case OP_EODN:
                   3379:     if (eptr < md->end_subject - 1 ||
                   3380:        (eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
                   3381:     ecode++;
                   3382:     break;
                   3383: 
                   3384:     /* Word boundary assertions */
                   3385: 
                   3386:     case OP_NOT_WORD_BOUNDARY:
                   3387:     case OP_WORD_BOUNDARY:
                   3388:       {
                   3389:       BOOL prev_is_word = (eptr != md->start_subject) &&
                   3390:         ((md->ctypes[eptr[-1]] & ctype_word) != 0);
                   3391:       BOOL cur_is_word = (eptr < md->end_subject) &&
                   3392:         ((md->ctypes[*eptr] & ctype_word) != 0);
                   3393:       if ((*ecode++ == OP_WORD_BOUNDARY)?
                   3394:            cur_is_word == prev_is_word : cur_is_word != prev_is_word)
                   3395:         return FALSE;
                   3396:       }
                   3397:     break;
                   3398: 
                   3399:     /* Match a single character type; inline for speed */
                   3400: 
                   3401:     case OP_ANY:
                   3402:     if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
                   3403:       return FALSE;
                   3404:     if (eptr++ >= md->end_subject) return FALSE;
                   3405:     ecode++;
                   3406:     break;
                   3407: 
                   3408:     case OP_NOT_DIGIT:
                   3409:     if (eptr >= md->end_subject ||
                   3410:        (md->ctypes[*eptr++] & ctype_digit) != 0)
                   3411:       return FALSE;
                   3412:     ecode++;
                   3413:     break;
                   3414: 
                   3415:     case OP_DIGIT:
                   3416:     if (eptr >= md->end_subject ||
                   3417:        (md->ctypes[*eptr++] & ctype_digit) == 0)
                   3418:       return FALSE;
                   3419:     ecode++;
                   3420:     break;
                   3421: 
                   3422:     case OP_NOT_WHITESPACE:
                   3423:     if (eptr >= md->end_subject ||
                   3424:        (md->ctypes[*eptr++] & ctype_space) != 0)
                   3425:       return FALSE;
                   3426:     ecode++;
                   3427:     break;
                   3428: 
                   3429:     case OP_WHITESPACE:
                   3430:     if (eptr >= md->end_subject ||
                   3431:        (md->ctypes[*eptr++] & ctype_space) == 0)
                   3432:       return FALSE;
                   3433:     ecode++;
                   3434:     break;
                   3435: 
                   3436:     case OP_NOT_WORDCHAR:
                   3437:     if (eptr >= md->end_subject ||
                   3438:        (md->ctypes[*eptr++] & ctype_word) != 0)
                   3439:       return FALSE;
                   3440:     ecode++;
                   3441:     break;
                   3442: 
                   3443:     case OP_WORDCHAR:
                   3444:     if (eptr >= md->end_subject ||
                   3445:        (md->ctypes[*eptr++] & ctype_word) == 0)
                   3446:       return FALSE;
                   3447:     ecode++;
                   3448:     break;
                   3449: 
                   3450:     /* Match a back reference, possibly repeatedly. Look past the end of the
                   3451:     item to see if there is repeat information following. The code is similar
                   3452:     to that for character classes, but repeated for efficiency. Then obey
                   3453:     similar code to character type repeats - written out again for speed.
                   3454:     However, if the referenced string is the empty string, always treat
                   3455:     it as matched, any number of times (otherwise there could be infinite
                   3456:     loops). */
                   3457: 
                   3458:     case OP_REF:
                   3459:       {
                   3460:       int length;
                   3461:       int offset = ecode[1] << 1;                /* Doubled reference number */
                   3462:       ecode += 2;                                /* Advance past the item */
                   3463: 
                   3464:       /* If the reference is unset, set the length to be longer than the amount
                   3465:       of subject left; this ensures that every attempt at a match fails. We
                   3466:       can't just fail here, because of the possibility of quantifiers with zero
                   3467:       minima. */
                   3468: 
                   3469:       length = (offset >= offset_top || md->offset_vector[offset] < 0)?
                   3470:         md->end_subject - eptr + 1 :
                   3471:         md->offset_vector[offset+1] - md->offset_vector[offset];
                   3472: 
                   3473:       /* Set up for repetition, or handle the non-repeated case */
                   3474: 
                   3475:       switch (*ecode)
                   3476:         {
                   3477:         case OP_CRSTAR:
                   3478:         case OP_CRMINSTAR:
                   3479:         case OP_CRPLUS:
                   3480:         case OP_CRMINPLUS:
                   3481:         case OP_CRQUERY:
                   3482:         case OP_CRMINQUERY:
                   3483:         c = *ecode++ - OP_CRSTAR;
                   3484:         minimize = (c & 1) != 0;
                   3485:         min = rep_min[c];                 /* Pick up values from tables; */
                   3486:         max = rep_max[c];                 /* zero for max => infinity */
1.2     ! paf      3487:         if (max == 0) max = PCRE_MAX_POS;
1.1       paf      3488:         break;
                   3489: 
                   3490:         case OP_CRRANGE:
                   3491:         case OP_CRMINRANGE:
                   3492:         minimize = (*ecode == OP_CRMINRANGE);
                   3493:         min = (ecode[1] << 8) + ecode[2];
                   3494:         max = (ecode[3] << 8) + ecode[4];
1.2     ! paf      3495:         if (max == 0) max = PCRE_MAX_POS;
1.1       paf      3496:         ecode += 5;
                   3497:         break;
                   3498: 
                   3499:         default:               /* No repeat follows */
                   3500:         if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
                   3501:         eptr += length;
                   3502:         continue;              /* With the main loop */
                   3503:         }
                   3504: 
                   3505:       /* If the length of the reference is zero, just continue with the
                   3506:       main loop. */
                   3507: 
                   3508:       if (length == 0) continue;
                   3509: 
                   3510:       /* First, ensure the minimum number of matches are present. We get back
                   3511:       the length of the reference string explicitly rather than passing the
                   3512:       address of eptr, so that eptr can be a register variable. */
                   3513: 
                   3514:       for (i = 1; i <= min; i++)
                   3515:         {
                   3516:         if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
                   3517:         eptr += length;
                   3518:         }
                   3519: 
                   3520:       /* If min = max, continue at the same level without recursion.
                   3521:       They are not both allowed to be zero. */
                   3522: 
                   3523:       if (min == max) continue;
                   3524: 
                   3525:       /* If minimizing, keep trying and advancing the pointer */
                   3526: 
                   3527:       if (minimize)
                   3528:         {
                   3529:         for (i = min;; i++)
                   3530:           {
                   3531:           if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
                   3532:             return TRUE;
                   3533:           if (i >= max || !match_ref(offset, eptr, length, md, ims))
                   3534:             return FALSE;
                   3535:           eptr += length;
                   3536:           }
                   3537:         /* Control never gets here */
                   3538:         }
                   3539: 
                   3540:       /* If maximizing, find the longest string and work backwards */
                   3541: 
                   3542:       else
                   3543:         {
                   3544:         const uschar *pp = eptr;
                   3545:         for (i = min; i < max; i++)
                   3546:           {
                   3547:           if (!match_ref(offset, eptr, length, md, ims)) break;
                   3548:           eptr += length;
                   3549:           }
                   3550:         while (eptr >= pp)
                   3551:           {
                   3552:           if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
                   3553:             return TRUE;
                   3554:           eptr -= length;
                   3555:           }
                   3556:         return FALSE;
                   3557:         }
                   3558:       }
                   3559:     /* Control never gets here */
                   3560: 
                   3561: 
                   3562: 
                   3563:     /* Match a character class, possibly repeatedly. Look past the end of the
                   3564:     item to see if there is repeat information following. Then obey similar
                   3565:     code to character type repeats - written out again for speed. */
                   3566: 
                   3567:     case OP_CLASS:
                   3568:       {
                   3569:       const uschar *data = ecode + 1;  /* Save for matching */
                   3570:       ecode += 33;                     /* Advance past the item */
                   3571: 
                   3572:       switch (*ecode)
                   3573:         {
                   3574:         case OP_CRSTAR:
                   3575:         case OP_CRMINSTAR:
                   3576:         case OP_CRPLUS:
                   3577:         case OP_CRMINPLUS:
                   3578:         case OP_CRQUERY:
                   3579:         case OP_CRMINQUERY:
                   3580:         c = *ecode++ - OP_CRSTAR;
                   3581:         minimize = (c & 1) != 0;
                   3582:         min = rep_min[c];                 /* Pick up values from tables; */
                   3583:         max = rep_max[c];                 /* zero for max => infinity */
1.2     ! paf      3584:         if (max == 0) max = PCRE_MAX_POS;
1.1       paf      3585:         break;
                   3586: 
                   3587:         case OP_CRRANGE:
                   3588:         case OP_CRMINRANGE:
                   3589:         minimize = (*ecode == OP_CRMINRANGE);
                   3590:         min = (ecode[1] << 8) + ecode[2];
                   3591:         max = (ecode[3] << 8) + ecode[4];
1.2     ! paf      3592:         if (max == 0) max = PCRE_MAX_POS;
1.1       paf      3593:         ecode += 5;
                   3594:         break;
                   3595: 
                   3596:         default:               /* No repeat follows */
                   3597:         min = max = 1;
                   3598:         break;
                   3599:         }
                   3600: 
                   3601:       /* First, ensure the minimum number of matches are present. */
                   3602: 
                   3603:       for (i = 1; i <= min; i++)
                   3604:         {
                   3605:         if (eptr >= md->end_subject) return FALSE;
                   3606:         c = *eptr++;
                   3607:         if ((data[c/8] & (1 << (c&7))) != 0) continue;
                   3608:         return FALSE;
                   3609:         }
                   3610: 
                   3611:       /* If max == min we can continue with the main loop without the
                   3612:       need to recurse. */
                   3613: 
                   3614:       if (min == max) continue;
                   3615: 
                   3616:       /* If minimizing, keep testing the rest of the expression and advancing
                   3617:       the pointer while it matches the class. */
                   3618: 
                   3619:       if (minimize)
                   3620:         {
                   3621:         for (i = min;; i++)
                   3622:           {
                   3623:           if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
                   3624:             return TRUE;
                   3625:           if (i >= max || eptr >= md->end_subject) return FALSE;
                   3626:           c = *eptr++;
                   3627:           if ((data[c/8] & (1 << (c&7))) != 0) continue;
                   3628:           return FALSE;
                   3629:           }
                   3630:         /* Control never gets here */
                   3631:         }
                   3632: 
                   3633:       /* If maximizing, find the longest possible run, then work backwards. */
                   3634: 
                   3635:       else
                   3636:         {
                   3637:         const uschar *pp = eptr;
                   3638:         for (i = min; i < max; eptr++, i++)
                   3639:           {
                   3640:           if (eptr >= md->end_subject) break;
                   3641:           c = *eptr;
                   3642:           if ((data[c/8] & (1 << (c&7))) != 0) continue;
                   3643:           break;
                   3644:           }
                   3645: 
                   3646:         while (eptr >= pp)
                   3647:           if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
                   3648:             return TRUE;
                   3649:         return FALSE;
                   3650:         }
                   3651:       }
                   3652:     /* Control never gets here */
                   3653: 
                   3654:     /* Match a run of characters */
                   3655: 
                   3656:     case OP_CHARS:
                   3657:       {
                   3658:       register int length = ecode[1];
                   3659:       ecode += 2;
                   3660: 
                   3661: #ifdef DEBUG    /* Sigh. Some compilers never learn. */
                   3662:       if (eptr >= md->end_subject)
                   3663:         printf("matching subject <null> against pattern ");
                   3664:       else
                   3665:         {
                   3666:         printf("matching subject ");
                   3667:         pchars(eptr, length, TRUE, md);
                   3668:         printf(" against pattern ");
                   3669:         }
                   3670:       pchars(ecode, length, FALSE, md);
                   3671:       printf("\n");
                   3672: #endif
                   3673: 
                   3674:       if (length > md->end_subject - eptr) return FALSE;
                   3675:       if ((ims & PCRE_CASELESS) != 0)
                   3676:         {
                   3677:         while (length-- > 0)
                   3678:           if (md->lcc[*ecode++] != md->lcc[*eptr++])
                   3679:             return FALSE;
                   3680:         }
                   3681:       else
                   3682:         {
                   3683:         while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
                   3684:         }
                   3685:       }
                   3686:     break;
                   3687: 
                   3688:     /* Match a single character repeatedly; different opcodes share code. */
                   3689: 
                   3690:     case OP_EXACT:
                   3691:     min = max = (ecode[1] << 8) + ecode[2];
                   3692:     ecode += 3;
                   3693:     goto REPEATCHAR;
                   3694: 
                   3695:     case OP_UPTO:
                   3696:     case OP_MINUPTO:
                   3697:     min = 0;
                   3698:     max = (ecode[1] << 8) + ecode[2];
                   3699:     minimize = *ecode == OP_MINUPTO;
                   3700:     ecode += 3;
                   3701:     goto REPEATCHAR;
                   3702: 
                   3703:     case OP_STAR:
                   3704:     case OP_MINSTAR:
                   3705:     case OP_PLUS:
                   3706:     case OP_MINPLUS:
                   3707:     case OP_QUERY:
                   3708:     case OP_MINQUERY:
                   3709:     c = *ecode++ - OP_STAR;
                   3710:     minimize = (c & 1) != 0;
                   3711:     min = rep_min[c];                 /* Pick up values from tables; */
                   3712:     max = rep_max[c];                 /* zero for max => infinity */
1.2     ! paf      3713:     if (max == 0) max = PCRE_MAX_POS;
1.1       paf      3714: 
                   3715:     /* Common code for all repeated single-character matches. We can give
                   3716:     up quickly if there are fewer than the minimum number of characters left in
                   3717:     the subject. */
                   3718: 
                   3719:     REPEATCHAR:
                   3720:     if (min > md->end_subject - eptr) return FALSE;
                   3721:     c = *ecode++;
                   3722: 
                   3723:     /* The code is duplicated for the caseless and caseful cases, for speed,
                   3724:     since matching characters is likely to be quite common. First, ensure the
                   3725:     minimum number of matches are present. If min = max, continue at the same
                   3726:     level without recursing. Otherwise, if minimizing, keep trying the rest of
                   3727:     the expression and advancing one matching character if failing, up to the
                   3728:     maximum. Alternatively, if maximizing, find the maximum number of
                   3729:     characters and work backwards. */
                   3730: 
                   3731:     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
                   3732:       max, eptr));
                   3733: 
                   3734:     if ((ims & PCRE_CASELESS) != 0)
                   3735:       {
                   3736:       c = md->lcc[c];
                   3737:       for (i = 1; i <= min; i++)
                   3738:         if (c != md->lcc[*eptr++]) return FALSE;
                   3739:       if (min == max) continue;
                   3740:       if (minimize)
                   3741:         {
                   3742:         for (i = min;; i++)
                   3743:           {
                   3744:           if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
                   3745:             return TRUE;
                   3746:           if (i >= max || eptr >= md->end_subject ||
                   3747:               c != md->lcc[*eptr++])
                   3748:             return FALSE;
                   3749:           }
                   3750:         /* Control never gets here */
                   3751:         }
                   3752:       else
                   3753:         {
                   3754:         const uschar *pp = eptr;
                   3755:         for (i = min; i < max; i++)
                   3756:           {
                   3757:           if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
                   3758:           eptr++;
                   3759:           }
                   3760:         while (eptr >= pp)
                   3761:           if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
                   3762:             return TRUE;
                   3763:         return FALSE;
                   3764:         }
                   3765:       /* Control never gets here */
                   3766:       }
                   3767: 
                   3768:     /* Caseful comparisons */
                   3769: 
                   3770:     else
                   3771:       {
                   3772:       for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
                   3773:       if (min == max) continue;
                   3774:       if (minimize)
                   3775:         {
                   3776:         for (i = min;; i++)
                   3777:           {
                   3778:           if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
                   3779:             return TRUE;
                   3780:           if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
                   3781:           }
                   3782:         /* Control never gets here */
                   3783:         }
                   3784:       else
                   3785:         {
                   3786:         const uschar *pp = eptr;
                   3787:         for (i = min; i < max; i++)
                   3788:           {
                   3789:           if (eptr >= md->end_subject || c != *eptr) break;
                   3790:           eptr++;
                   3791:           }
                   3792:         while (eptr >= pp)
                   3793:          if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
                   3794:            return TRUE;
                   3795:         return FALSE;
                   3796:         }
                   3797:       }
                   3798:     /* Control never gets here */
                   3799: 
                   3800:     /* Match a negated single character */
                   3801: 
                   3802:     case OP_NOT:
                   3803:     if (eptr >= md->end_subject) return FALSE;
                   3804:     ecode++;
                   3805:     if ((ims & PCRE_CASELESS) != 0)
                   3806:       {
                   3807:       if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
                   3808:       }
                   3809:     else
                   3810:       {
                   3811:       if (*ecode++ == *eptr++) return FALSE;
                   3812:       }
                   3813:     break;
                   3814: 
                   3815:     /* Match a negated single character repeatedly. This is almost a repeat of
                   3816:     the code for a repeated single character, but I haven't found a nice way of
                   3817:     commoning these up that doesn't require a test of the positive/negative
                   3818:     option for each character match. Maybe that wouldn't add very much to the
                   3819:     time taken, but character matching *is* what this is all about... */
                   3820: 
                   3821:     case OP_NOTEXACT:
                   3822:     min = max = (ecode[1] << 8) + ecode[2];
                   3823:     ecode += 3;
                   3824:     goto REPEATNOTCHAR;
                   3825: 
                   3826:     case OP_NOTUPTO:
                   3827:     case OP_NOTMINUPTO:
                   3828:     min = 0;
                   3829:     max = (ecode[1] << 8) + ecode[2];
                   3830:     minimize = *ecode == OP_NOTMINUPTO;
                   3831:     ecode += 3;
                   3832:     goto REPEATNOTCHAR;
                   3833: 
                   3834:     case OP_NOTSTAR:
                   3835:     case OP_NOTMINSTAR:
                   3836:     case OP_NOTPLUS:
                   3837:     case OP_NOTMINPLUS:
                   3838:     case OP_NOTQUERY:
                   3839:     case OP_NOTMINQUERY:
                   3840:     c = *ecode++ - OP_NOTSTAR;
                   3841:     minimize = (c & 1) != 0;
                   3842:     min = rep_min[c];                 /* Pick up values from tables; */
                   3843:     max = rep_max[c];                 /* zero for max => infinity */
1.2     ! paf      3844:     if (max == 0) max = PCRE_MAX_POS;
1.1       paf      3845: 
                   3846:     /* Common code for all repeated negated single-character matches. We can
                   3847:     give up quickly if there are fewer than the minimum number of characters
                   3848:     left in the subject. */
                   3849: 
                   3850:     REPEATNOTCHAR:
                   3851:     if (min > md->end_subject - eptr) return FALSE;
                   3852:     c = *ecode++;
                   3853: 
                   3854:     /* The code is duplicated for the caseless and caseful cases, for speed,
                   3855:     since matching characters is likely to be quite common. First, ensure the
                   3856:     minimum number of matches are present. If min = max, continue at the same
                   3857:     level without recursing. Otherwise, if minimizing, keep trying the rest of
                   3858:     the expression and advancing one matching character if failing, up to the
                   3859:     maximum. Alternatively, if maximizing, find the maximum number of
                   3860:     characters and work backwards. */
                   3861: 
                   3862:     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
                   3863:       max, eptr));
                   3864: 
                   3865:     if ((ims & PCRE_CASELESS) != 0)
                   3866:       {
                   3867:       c = md->lcc[c];
                   3868:       for (i = 1; i <= min; i++)
                   3869:         if (c == md->lcc[*eptr++]) return FALSE;
                   3870:       if (min == max) continue;
                   3871:       if (minimize)
                   3872:         {
                   3873:         for (i = min;; i++)
                   3874:           {
                   3875:           if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
                   3876:             return TRUE;
                   3877:           if (i >= max || eptr >= md->end_subject ||
                   3878:               c == md->lcc[*eptr++])
                   3879:             return FALSE;
                   3880:           }
                   3881:         /* Control never gets here */
                   3882:         }
                   3883:       else
                   3884:         {
                   3885:         const uschar *pp = eptr;
                   3886:         for (i = min; i < max; i++)
                   3887:           {
                   3888:           if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
                   3889:           eptr++;
                   3890:           }
                   3891:         while (eptr >= pp)
                   3892:           if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
                   3893:             return TRUE;
                   3894:         return FALSE;
                   3895:         }
                   3896:       /* Control never gets here */
                   3897:       }
                   3898: 
                   3899:     /* Caseful comparisons */
                   3900: 
                   3901:     else
                   3902:       {
                   3903:       for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
                   3904:       if (min == max) continue;
                   3905:       if (minimize)
                   3906:         {
                   3907:         for (i = min;; i++)
                   3908:           {
                   3909:           if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb))
                   3910:             return TRUE;
                   3911:           if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
                   3912:           }
                   3913:         /* Control never gets here */
                   3914:         }
                   3915:       else
                   3916:         {
                   3917:         const uschar *pp = eptr;
                   3918:         for (i = min; i < max; i++)
                   3919:           {
                   3920:           if (eptr >= md->end_subject || c == *eptr) break;
                   3921:           eptr++;
                   3922:           }
                   3923:         while (eptr >= pp)
                   3924:          if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
                   3925:            return TRUE;
                   3926:         return FALSE;
                   3927:         }
                   3928:       }
                   3929:     /* Control never gets here */
                   3930: 
                   3931:     /* Match a single character type repeatedly; several different opcodes
                   3932:     share code. This is very similar to the code for single characters, but we
                   3933:     repeat it in the interests of efficiency. */
                   3934: 
                   3935:     case OP_TYPEEXACT:
                   3936:     min = max = (ecode[1] << 8) + ecode[2];
                   3937:     minimize = TRUE;
                   3938:     ecode += 3;
                   3939:     goto REPEATTYPE;
                   3940: 
                   3941:     case OP_TYPEUPTO:
                   3942:     case OP_TYPEMINUPTO:
                   3943:     min = 0;
                   3944:     max = (ecode[1] << 8) + ecode[2];
                   3945:     minimize = *ecode == OP_TYPEMINUPTO;
                   3946:     ecode += 3;
                   3947:     goto REPEATTYPE;
                   3948: 
                   3949:     case OP_TYPESTAR:
                   3950:     case OP_TYPEMINSTAR:
                   3951:     case OP_TYPEPLUS:
                   3952:     case OP_TYPEMINPLUS:
                   3953:     case OP_TYPEQUERY:
                   3954:     case OP_TYPEMINQUERY:
                   3955:     c = *ecode++ - OP_TYPESTAR;
                   3956:     minimize = (c & 1) != 0;
                   3957:     min = rep_min[c];                 /* Pick up values from tables; */
                   3958:     max = rep_max[c];                 /* zero for max => infinity */
1.2     ! paf      3959:     if (max == 0) max = PCRE_MAX_POS;
1.1       paf      3960: 
                   3961:     /* Common code for all repeated single character type matches */
                   3962: 
                   3963:     REPEATTYPE:
                   3964:     ctype = *ecode++;      /* Code for the character type */
                   3965: 
                   3966:     /* First, ensure the minimum number of matches are present. Use inline
                   3967:     code for maximizing the speed, and do the type test once at the start
                   3968:     (i.e. keep it out of the loop). Also test that there are at least the
                   3969:     minimum number of characters before we start. */
                   3970: 
                   3971:     if (min > md->end_subject - eptr) return FALSE;
                   3972:     if (min > 0) switch(ctype)
                   3973:       {
                   3974:       case OP_ANY:
                   3975:       if ((ims & PCRE_DOTALL) == 0)
                   3976:         { for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
                   3977:       else eptr += min;
                   3978:       break;
                   3979: 
                   3980:       case OP_NOT_DIGIT:
                   3981:       for (i = 1; i <= min; i++)
                   3982:         if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
                   3983:       break;
                   3984: 
                   3985:       case OP_DIGIT:
                   3986:       for (i = 1; i <= min; i++)
                   3987:         if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
                   3988:       break;
                   3989: 
                   3990:       case OP_NOT_WHITESPACE:
                   3991:       for (i = 1; i <= min; i++)
                   3992:         if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
                   3993:       break;
                   3994: 
                   3995:       case OP_WHITESPACE:
                   3996:       for (i = 1; i <= min; i++)
                   3997:         if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
                   3998:       break;
                   3999: 
                   4000:       case OP_NOT_WORDCHAR:
                   4001:       for (i = 1; i <= min; i++)
                   4002:         if ((md->ctypes[*eptr++] & ctype_word) != 0)
                   4003:           return FALSE;
                   4004:       break;
                   4005: 
                   4006:       case OP_WORDCHAR:
                   4007:       for (i = 1; i <= min; i++)
                   4008:         if ((md->ctypes[*eptr++] & ctype_word) == 0)
                   4009:           return FALSE;
                   4010:       break;
                   4011:       }
                   4012: 
                   4013:     /* If min = max, continue at the same level without recursing */
                   4014: 
                   4015:     if (min == max) continue;
                   4016: 
                   4017:     /* If minimizing, we have to test the rest of the pattern before each
                   4018:     subsequent match. */
                   4019: 
                   4020:     if (minimize)
                   4021:       {
                   4022:       for (i = min;; i++)
                   4023:         {
                   4024:         if (match(eptr, ecode, offset_top, md, ims, FALSE, eptrb)) return TRUE;
                   4025:         if (i >= max || eptr >= md->end_subject) return FALSE;
                   4026: 
                   4027:         c = *eptr++;
                   4028:         switch(ctype)
                   4029:           {
                   4030:           case OP_ANY:
                   4031:           if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
                   4032:           break;
                   4033: 
                   4034:           case OP_NOT_DIGIT:
                   4035:           if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
                   4036:           break;
                   4037: 
                   4038:           case OP_DIGIT:
                   4039:           if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
                   4040:           break;
                   4041: 
                   4042:           case OP_NOT_WHITESPACE:
                   4043:           if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
                   4044:           break;
                   4045: 
                   4046:           case OP_WHITESPACE:
                   4047:           if  ((md->ctypes[c] & ctype_space) == 0) return FALSE;
                   4048:           break;
                   4049: 
                   4050:           case OP_NOT_WORDCHAR:
                   4051:           if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
                   4052:           break;
                   4053: 
                   4054:           case OP_WORDCHAR:
                   4055:           if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
                   4056:           break;
                   4057:           }
                   4058:         }
                   4059:       /* Control never gets here */
                   4060:       }
                   4061: 
                   4062:     /* If maximizing it is worth using inline code for speed, doing the type
                   4063:     test once at the start (i.e. keep it out of the loop). */
                   4064: 
                   4065:     else
                   4066:       {
                   4067:       const uschar *pp = eptr;
                   4068:       switch(ctype)
                   4069:         {
                   4070:         case OP_ANY:
                   4071:         if ((ims & PCRE_DOTALL) == 0)
                   4072:           {
                   4073:           for (i = min; i < max; i++)
                   4074:             {
                   4075:             if (eptr >= md->end_subject || *eptr == '\n') break;
                   4076:             eptr++;
                   4077:             }
                   4078:           }
                   4079:         else
                   4080:           {
                   4081:           c = max - min;
                   4082:           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
                   4083:           eptr += c;
                   4084:           }
                   4085:         break;
                   4086: 
                   4087:         case OP_NOT_DIGIT:
                   4088:         for (i = min; i < max; i++)
                   4089:           {
                   4090:           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
                   4091:             break;
                   4092:           eptr++;
                   4093:           }
                   4094:         break;
                   4095: 
                   4096:         case OP_DIGIT:
                   4097:         for (i = min; i < max; i++)
                   4098:           {
                   4099:           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
                   4100:             break;
                   4101:           eptr++;
                   4102:           }
                   4103:         break;
                   4104: 
                   4105:         case OP_NOT_WHITESPACE:
                   4106:         for (i = min; i < max; i++)
                   4107:           {
                   4108:           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
                   4109:             break;
                   4110:           eptr++;
                   4111:           }
                   4112:         break;
                   4113: 
                   4114:         case OP_WHITESPACE:
                   4115:         for (i = min; i < max; i++)
                   4116:           {
                   4117:           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
                   4118:             break;
                   4119:           eptr++;
                   4120:           }
                   4121:         break;
                   4122: 
                   4123:         case OP_NOT_WORDCHAR:
                   4124:         for (i = min; i < max; i++)
                   4125:           {
                   4126:           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
                   4127:             break;
                   4128:           eptr++;
                   4129:           }
                   4130:         break;
                   4131: 
                   4132:         case OP_WORDCHAR:
                   4133:         for (i = min; i < max; i++)
                   4134:           {
                   4135:           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
                   4136:             break;
                   4137:           eptr++;
                   4138:           }
                   4139:         break;
                   4140:         }
                   4141: 
                   4142:       while (eptr >= pp)
                   4143:         if (match(eptr--, ecode, offset_top, md, ims, FALSE, eptrb))
                   4144:           return TRUE;
                   4145:       return FALSE;
                   4146:       }
                   4147:     /* Control never gets here */
                   4148: 
                   4149:     /* There's been some horrible disaster. */
                   4150: 
                   4151:     default:
                   4152:     DPRINTF(("Unknown opcode %d\n", *ecode));
                   4153:     md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
                   4154:     return FALSE;
                   4155:     }
                   4156: 
                   4157:   /* Do not stick any code in here without much thought; it is assumed
                   4158:   that "continue" in the code above comes out to here to repeat the main
                   4159:   loop. */
                   4160: 
                   4161:   }             /* End of main loop */
                   4162: /* Control never reaches here */
                   4163: }
                   4164: 
                   4165: 
                   4166: 
                   4167: 
                   4168: /*************************************************
                   4169: *         Execute a Regular Expression           *
                   4170: *************************************************/
                   4171: 
                   4172: /* This function applies a compiled re to a subject string and picks out
                   4173: portions of the string if it matches. Two elements in the vector are set for
                   4174: each substring: the offsets to the start and end of the substring.
                   4175: 
                   4176: Arguments:
                   4177:   external_re     points to the compiled expression
                   4178:   external_extra  points to "hints" from pcre_study() or is NULL
                   4179:   subject         points to the subject string
                   4180:   length          length of subject string (may contain binary zeros)
                   4181:   start_offset    where to start in the subject string
                   4182:   options         option bits
                   4183:   offsets         points to a vector of ints to be filled in with offsets
                   4184:   offsetcount     the number of elements in the vector
                   4185: 
                   4186: Returns:          > 0 => success; value is the number of elements filled in
                   4187:                   = 0 => success, but offsets is not big enough
                   4188:                    -1 => failed to match
                   4189:                  < -1 => some kind of unexpected problem
                   4190: */
                   4191: 
                   4192: int
                   4193: pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
                   4194:   const char *subject, int length, int start_offset, int options, int *offsets,
                   4195:   int offsetcount)
                   4196: {
                   4197: int resetcount, ocount;
                   4198: int first_char = -1;
                   4199: int req_char = -1;
                   4200: int req_char2 = -1;
                   4201: unsigned long int ims = 0;
                   4202: match_data match_block;
                   4203: const uschar *start_bits = NULL;
                   4204: const uschar *start_match = (const uschar *)subject + start_offset;
                   4205: const uschar *end_subject;
                   4206: const uschar *req_char_ptr = start_match - 1;
                   4207: const real_pcre *re = (const real_pcre *)external_re;
                   4208: const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
                   4209: BOOL using_temporary_offsets = FALSE;
                   4210: BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
                   4211: BOOL startline = (re->options & PCRE_STARTLINE) != 0;
                   4212: 
                   4213: if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
                   4214: 
                   4215: if (re == NULL || subject == NULL ||
                   4216:    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
                   4217: if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
                   4218: 
                   4219: match_block.start_subject = (const uschar *)subject;
                   4220: match_block.end_subject = match_block.start_subject + length;
                   4221: end_subject = match_block.end_subject;
                   4222: 
                   4223: match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
                   4224: 
                   4225: match_block.notbol = (options & PCRE_NOTBOL) != 0;
                   4226: match_block.noteol = (options & PCRE_NOTEOL) != 0;
                   4227: match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
                   4228: 
                   4229: match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
                   4230: 
                   4231: match_block.lcc = re->tables + lcc_offset;
                   4232: match_block.ctypes = re->tables + ctypes_offset;
                   4233: 
                   4234: /* The ims options can vary during the matching as a result of the presence
                   4235: of (?ims) items in the pattern. They are kept in a local variable so that
                   4236: restoring at the exit of a group is easy. */
                   4237: 
                   4238: ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
                   4239: 
                   4240: /* If the expression has got more back references than the offsets supplied can
                   4241: hold, we get a temporary bit of working store to use during the matching.
                   4242: Otherwise, we can use the vector supplied, rounding down its size to a multiple
                   4243: of 3. */
                   4244: 
                   4245: ocount = offsetcount - (offsetcount % 3);
                   4246: 
                   4247: if (re->top_backref > 0 && re->top_backref >= ocount/3)
                   4248:   {
                   4249:   ocount = re->top_backref * 3 + 3;
                   4250:   match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
                   4251:   if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
                   4252:   using_temporary_offsets = TRUE;
                   4253:   DPRINTF(("Got memory to hold back references\n"));
                   4254:   }
                   4255: else match_block.offset_vector = offsets;
                   4256: 
                   4257: match_block.offset_end = ocount;
                   4258: match_block.offset_max = (2*ocount)/3;
                   4259: match_block.offset_overflow = FALSE;
                   4260: 
                   4261: /* Compute the minimum number of offsets that we need to reset each time. Doing
                   4262: this makes a huge difference to execution time when there aren't many brackets
                   4263: in the pattern. */
                   4264: 
                   4265: resetcount = 2 + re->top_bracket * 2;
                   4266: if (resetcount > offsetcount) resetcount = ocount;
                   4267: 
                   4268: /* Reset the working variable associated with each extraction. These should
                   4269: never be used unless previously set, but they get saved and restored, and so we
                   4270: initialize them to avoid reading uninitialized locations. */
                   4271: 
                   4272: if (match_block.offset_vector != NULL)
                   4273:   {
                   4274:   register int *iptr = match_block.offset_vector + ocount;
                   4275:   register int *iend = iptr - resetcount/2 + 1;
                   4276:   while (--iptr >= iend) *iptr = -1;
                   4277:   }
                   4278: 
                   4279: /* Set up the first character to match, if available. The first_char value is
                   4280: never set for an anchored regular expression, but the anchoring may be forced
                   4281: at run time, so we have to test for anchoring. The first char may be unset for
                   4282: an unanchored pattern, of course. If there's no first char and the pattern was
                   4283: studied, there may be a bitmap of possible first characters. */
                   4284: 
                   4285: if (!anchored)
                   4286:   {
                   4287:   if ((re->options & PCRE_FIRSTSET) != 0)
                   4288:     {
                   4289:     first_char = re->first_char;
                   4290:     if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
                   4291:     }
                   4292:   else
                   4293:     if (!startline && extra != NULL &&
                   4294:       (extra->options & PCRE_STUDY_MAPPED) != 0)
                   4295:         start_bits = extra->start_bits;
                   4296:   }
                   4297: 
                   4298: /* For anchored or unanchored matches, there may be a "last known required
                   4299: character" set. If the PCRE_CASELESS is set, implying that the match starts
                   4300: caselessly, or if there are any changes of this flag within the regex, set up
                   4301: both cases of the character. Otherwise set the two values the same, which will
                   4302: avoid duplicate testing (which takes significant time). This covers the vast
                   4303: majority of cases. It will be suboptimal when the case flag changes in a regex
                   4304: and the required character in fact is caseful. */
                   4305: 
                   4306: if ((re->options & PCRE_REQCHSET) != 0)
                   4307:   {
                   4308:   req_char = re->req_char;
                   4309:   req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
                   4310:     (re->tables + fcc_offset)[req_char] : req_char;
                   4311:   }
                   4312: 
                   4313: /* Loop for handling unanchored repeated matching attempts; for anchored regexs
                   4314: the loop runs just once. */
                   4315: 
                   4316: do
                   4317:   {
                   4318:   int rc;
                   4319:   register int *iptr = match_block.offset_vector;
                   4320:   register int *iend = iptr + resetcount;
                   4321: 
                   4322:   /* Reset the maximum number of extractions we might see. */
                   4323: 
                   4324:   while (iptr < iend) *iptr++ = -1;
                   4325: 
                   4326:   /* Advance to a unique first char if possible */
                   4327: 
                   4328:   if (first_char >= 0)
                   4329:     {
                   4330:     if ((ims & PCRE_CASELESS) != 0)
                   4331:       while (start_match < end_subject &&
                   4332:              match_block.lcc[*start_match] != first_char)
                   4333:         start_match++;
                   4334:     else
                   4335:       while (start_match < end_subject && *start_match != first_char)
                   4336:         start_match++;
                   4337:     }
                   4338: 
                   4339:   /* Or to just after \n for a multiline match if possible */
                   4340: 
                   4341:   else if (startline)
                   4342:     {
                   4343:     if (start_match > match_block.start_subject + start_offset)
                   4344:       {
                   4345:       while (start_match < end_subject && start_match[-1] != '\n')
                   4346:         start_match++;
                   4347:       }
                   4348:     }
                   4349: 
                   4350:   /* Or to a non-unique first char after study */
                   4351: 
                   4352:   else if (start_bits != NULL)
                   4353:     {
                   4354:     while (start_match < end_subject)
                   4355:       {
                   4356:       register int c = *start_match;
                   4357:       if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
                   4358:       }
                   4359:     }
                   4360: 
                   4361: #ifdef DEBUG  /* Sigh. Some compilers never learn. */
                   4362:   printf(">>>> Match against: ");
                   4363:   pchars(start_match, end_subject - start_match, TRUE, &match_block);
                   4364:   printf("\n");
                   4365: #endif
                   4366: 
                   4367:   /* If req_char is set, we know that that character must appear in the subject
                   4368:   for the match to succeed. If the first character is set, req_char must be
                   4369:   later in the subject; otherwise the test starts at the match point. This
                   4370:   optimization can save a huge amount of backtracking in patterns with nested
                   4371:   unlimited repeats that aren't going to match. We don't know what the state of
                   4372:   case matching may be when this character is hit, so test for it in both its
                   4373:   cases if necessary. However, the different cased versions will not be set up
                   4374:   unless PCRE_CASELESS was given or the casing state changes within the regex.
                   4375:   Writing separate code makes it go faster, as does using an autoincrement and
                   4376:   backing off on a match. */
                   4377: 
                   4378:   if (req_char >= 0)
                   4379:     {
                   4380:     register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
                   4381: 
                   4382:     /* We don't need to repeat the search if we haven't yet reached the
                   4383:     place we found it at last time. */
                   4384: 
                   4385:     if (p > req_char_ptr)
                   4386:       {
                   4387:       /* Do a single test if no case difference is set up */
                   4388: 
                   4389:       if (req_char == req_char2)
                   4390:         {
                   4391:         while (p < end_subject)
                   4392:           {
                   4393:           if (*p++ == req_char) { p--; break; }
                   4394:           }
                   4395:         }
                   4396: 
                   4397:       /* Otherwise test for either case */
                   4398: 
                   4399:       else
                   4400:         {
                   4401:         while (p < end_subject)
                   4402:           {
                   4403:           register int pp = *p++;
                   4404:           if (pp == req_char || pp == req_char2) { p--; break; }
                   4405:           }
                   4406:         }
                   4407: 
                   4408:       /* If we can't find the required character, break the matching loop */
                   4409: 
                   4410:       if (p >= end_subject) break;
                   4411: 
                   4412:       /* If we have found the required character, save the point where we
                   4413:       found it, so that we don't search again next time round the loop if
                   4414:       the start hasn't passed this character yet. */
                   4415: 
                   4416:       req_char_ptr = p;
                   4417:       }
                   4418:     }
                   4419: 
                   4420:   /* When a match occurs, substrings will be set for all internal extractions;
                   4421:   we just need to set up the whole thing as substring 0 before returning. If
                   4422:   there were too many extractions, set the return code to zero. In the case
                   4423:   where we had to get some local store to hold offsets for backreferences, copy
                   4424:   those back references that we can. In this case there need not be overflow
                   4425:   if certain parts of the pattern were not used. */
                   4426: 
                   4427:   match_block.start_match = start_match;
                   4428:   if (!match(start_match, re->code, 2, &match_block, ims, FALSE, start_match))
                   4429:     continue;
                   4430: 
                   4431:   /* Copy the offset information from temporary store if necessary */
                   4432: 
                   4433:   if (using_temporary_offsets)
                   4434:     {
                   4435:     if (offsetcount >= 4)
                   4436:       {
                   4437:       memcpy(offsets + 2, match_block.offset_vector + 2,
                   4438:         (offsetcount - 2) * sizeof(int));
                   4439:       DPRINTF(("Copied offsets from temporary memory\n"));
                   4440:       }
                   4441:     if (match_block.end_offset_top > offsetcount)
                   4442:       match_block.offset_overflow = TRUE;
                   4443: 
                   4444:     DPRINTF(("Freeing temporary memory\n"));
                   4445:     (pcre_free)(match_block.offset_vector);
                   4446:     }
                   4447: 
                   4448:   rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
                   4449: 
                   4450:   if (match_block.offset_end < 2) rc = 0; else
                   4451:     {
                   4452:     offsets[0] = start_match - match_block.start_subject;
                   4453:     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
                   4454:     }
                   4455: 
                   4456:   DPRINTF((">>>> returning %d\n", rc));
                   4457:   return rc;
                   4458:   }
                   4459: 
                   4460: /* This "while" is the end of the "do" above */
                   4461: 
                   4462: while (!anchored &&
                   4463:        match_block.errorcode == PCRE_ERROR_NOMATCH &&
                   4464:        start_match++ < end_subject);
                   4465: 
                   4466: if (using_temporary_offsets)
                   4467:   {
                   4468:   DPRINTF(("Freeing temporary memory\n"));
                   4469:   (pcre_free)(match_block.offset_vector);
                   4470:   }
                   4471: 
                   4472: DPRINTF((">>>> returning %d\n", match_block.errorcode));
                   4473: 
                   4474: return match_block.errorcode;
                   4475: }
                   4476: 
                   4477: /* End of pcre.c */

E-mail: