Annotation of win32/pcre/pcre_dfa_exec.c, revision 1.2

1.1       misha       1: /*************************************************
                      2: *      Perl-Compatible Regular Expressions       *
                      3: *************************************************/
                      4: 
                      5: /* PCRE is a library of functions to support regular expressions whose syntax
                      6: and semantics are as close as possible to those of the Perl 5 language.
                      7: 
                      8:                        Written by Philip Hazel
                      9:            Copyright (c) 1997-2008 University of Cambridge
                     10: 
                     11: -----------------------------------------------------------------------------
                     12: Redistribution and use in source and binary forms, with or without
                     13: modification, are permitted provided that the following conditions are met:
                     14: 
                     15:     * Redistributions of source code must retain the above copyright notice,
                     16:       this list of conditions and the following disclaimer.
                     17: 
                     18:     * Redistributions in binary form must reproduce the above copyright
                     19:       notice, this list of conditions and the following disclaimer in the
                     20:       documentation and/or other materials provided with the distribution.
                     21: 
                     22:     * Neither the name of the University of Cambridge nor the names of its
                     23:       contributors may be used to endorse or promote products derived from
                     24:       this software without specific prior written permission.
                     25: 
                     26: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
                     27: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     28: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     29: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
                     30: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
                     31: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
                     32: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
                     33: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
                     34: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
                     35: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
                     36: POSSIBILITY OF SUCH DAMAGE.
                     37: -----------------------------------------------------------------------------
                     38: */
                     39: 
                     40: 
                     41: /* This module contains the external function pcre_dfa_exec(), which is an
                     42: alternative matching function that uses a sort of DFA algorithm (not a true
                     43: FSM). This is NOT Perl-compatible, but it has advantages in certain
                     44: applications. */
                     45: 
                     46: 
                     47: #ifdef HAVE_CONFIG_H
                     48: #include "config.h"
                     49: #endif
                     50: 
                     51: #define NLBLOCK md             /* Block containing newline information */
                     52: #define PSSTART start_subject  /* Field containing processed string start */
                     53: #define PSEND   end_subject    /* Field containing processed string end */
                     54: 
                     55: #include "pcre_internal.h"
                     56: 
                     57: 
                     58: /* For use to indent debugging output */
                     59: 
                     60: #define SP "                   "   /* passed to %.*s with precision rlevel*2-2 in the DPRINTF calls */
                     61: 
                     62: 
                     63: 
                     64: /*************************************************
                     65: *      Code parameters and static tables         *
                     66: *************************************************/
                     67: 
                     68: /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
                     69: into others, under special conditions. A gap of 20 between the blocks should be
                     70: enough. The resulting opcodes don't have to be less than 256 because they are
                     71: never stored, so we push them well clear of the normal opcodes. */
                     72: 
                     73: #define OP_PROP_EXTRA       300   /* first block; each following block starts 20 higher */
                     74: #define OP_EXTUNI_EXTRA     320
                     75: #define OP_ANYNL_EXTRA      340
                     76: #define OP_HSPACE_EXTRA     360
                     77: #define OP_VSPACE_EXTRA     380   /* last block */
                     78: 
                     79: 
                     80: /* This table identifies those opcodes that are followed immediately by a
                     81: character that is to be tested in some way. This makes it possible to
                     82: centralize the loading of these characters. In the case of Type * etc, the
                     83: "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
                     84: small value. ***NOTE*** If the start of this table is modified, the two tables
                     85: that follow must also be modified. */
                     86: 
                     87: static const uschar coptable[] = {   /* one entry per opcode; 0 = no inline character; a nonzero value is presumably the offset from the opcode to that character - TODO confirm at the lookup site */
                     88:   0,                             /* End                                    */
                     89:   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
                     90:   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
                     91:   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
                     92:   0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
                     93:   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
                     94:   0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
                     95:   1,                             /* Char                                   */
                     96:   1,                             /* Charnc                                 */
                     97:   1,                             /* not                                    */
                     98:   /* Positive single-char repeats                                          */
                     99:   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
                    100:   3, 3, 3,                       /* upto, minupto, exact                   */
                    101:   1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
                    102:   /* Negative single-char repeats - only for chars < 256                   */
                    103:   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
                    104:   3, 3, 3,                       /* NOT upto, minupto, exact               */
                    105:   1, 1, 1, 3,                    /* NOT *+, ++, ?+, upto+                  */
                    106:   /* Positive type repeats                                                 */
                    107:   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
                    108:   3, 3, 3,                       /* Type upto, minupto, exact              */
                    109:   1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
                    110:   /* Character class & ref repeats                                         */
                    111:   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
                    112:   0, 0,                          /* CRRANGE, CRMINRANGE                    */
                    113:   0,                             /* CLASS                                  */
                    114:   0,                             /* NCLASS                                 */
                    115:   0,                             /* XCLASS - variable length               */
                    116:   0,                             /* REF                                    */
                    117:   0,                             /* RECURSE                                */
                    118:   0,                             /* CALLOUT                                */
                    119:   0,                             /* Alt                                    */
                    120:   0,                             /* Ket                                    */
                    121:   0,                             /* KetRmax                                */
                    122:   0,                             /* KetRmin                                */
                    123:   0,                             /* Assert                                 */
                    124:   0,                             /* Assert not                             */
                    125:   0,                             /* Assert behind                          */
                    126:   0,                             /* Assert behind not                      */
                    127:   0,                             /* Reverse                                */
                    128:   0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
                    129:   0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
                    130:   0,                             /* CREF                                   */
                    131:   0,                             /* RREF                                   */
                    132:   0,                             /* DEF                                    */
                    133:   0, 0,                          /* BRAZERO, BRAMINZERO                    */
                    134:   0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
                    135:   0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */
                    136: };
                    137: 
                    138: /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
                    139: and \w */
                    140: 
                    141: static const uschar toptable1[] = {   /* ctype bit associated with each type opcode; leading entries follow the coptable order - presumably used as a mask on the character's ctypes entry, confirm at use */
                    142:   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
                    143:   ctype_digit, ctype_digit,       /* \D, \d */
                    144:   ctype_space, ctype_space,       /* \S, \s */
                    145:   ctype_word,  ctype_word,        /* \W, \w */
                    146:   0, 0                            /* OP_ANY, OP_ALLANY */
                    147: };
                    148: 
                    149: static const uschar toptable2[] = {   /* companion to toptable1: the negated types carry the ctype bit, the positive ones 0 - presumably combined with the masked value to flip the sense, confirm at use */
                    150:   0, 0, 0, 0, 0, 0,               /* End, \A, \G, \K, \B, \b */
                    151:   ctype_digit, 0,                 /* \D, \d */
                    152:   ctype_space, 0,                 /* \S, \s */
                    153:   ctype_word,  0,                 /* \W, \w */
                    154:   1, 1                            /* OP_ANY, OP_ALLANY */
                    155: };
                    156: 
                    157: 
                    158: /* Structure for holding data about a particular state, which is in effect the
                    159: current data for an active path through the match tree. It must consist
                    160: entirely of ints because the working vector we are passed, and which we put
                    161: these structures in, is a vector of ints. */
                    162: 
                    163: typedef struct stateblock {
                    164:   int offset;                     /* Offset to opcode; a negative value marks a held-off state (see data) */
                    165:   int count;                      /* Count for repeats */
                    166:   int ims;                        /* ims flag bits */
                    167:   int data;                       /* Extra data for some states, e.g. characters still to skip while offset is negative */
                    168: } stateblock;
                    169: 
                    170: #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))   /* number of workspace ints occupied by one stateblock */
                    171: 
                    172: 
                    173: #ifdef DEBUG
                    174: /*************************************************
                    175: *             Print character string             *
                    176: *************************************************/
                    177: 
                    178: /* Character string printing function for debugging.
                    179: 
                    180: Arguments:
                    181:   p            points to string
                    182:   length       number of bytes
                    183:   f            where to print
                    184: 
                    185: Returns:       nothing
                    186: */
                    187: 
                    188: static void
                    189: pchars(unsigned char *p, int length, FILE *f)
                    190: {
                    191: int c;                            /* byte currently being printed */
                    192: while (length-- > 0)              /* prints exactly length bytes; no check for a terminating NUL */
                    193:   {
                    194:   if (isprint(c = *(p++)))
                    195:     fprintf(f, "%c", c);          /* printable byte: emitted unchanged */
                    196:   else
                    197:     fprintf(f, "\\x%02x", c);     /* anything else: backslash, x, two hex digits */
                    198:   }
                    199: }
                    200: #endif
                    201: 
                    202: 
                    203: 
                    204: /*************************************************
                    205: *    Execute a Regular Expression - DFA engine   *
                    206: *************************************************/
                    207: 
                    208: /* This internal function applies a compiled pattern to a subject string,
                    209: starting at a given point, using a DFA engine. This function is called from the
                    210: external one, possibly multiple times if the pattern is not anchored. The
                    211: function calls itself recursively for some kinds of subpattern.
                    212: 
                    213: Arguments:
                    214:   md                the match_data block with fixed information
                    215:   this_start_code   the opening bracket of this subexpression's code
                    216:   current_subject   where we currently are in the subject string
                    217:   start_offset      start offset in the subject string
                    218:   offsets           vector to contain the matching string offsets
                    219:   offsetcount       size of same
                    220:   workspace         vector of workspace
                    221:   wscount           size of same
                    222:   ims               the current ims flags
                    223:   rlevel            function call recursion level
                    224:   recursing         regex recursive call level
                    225: 
                    226: Returns:            > 0 => number of match offset pairs placed in offsets
                    227:                     = 0 => offsets overflowed; longest matches are present
                    228:                      -1 => failed to match
                    229:                    < -1 => some kind of unexpected problem
                    230: 
                    231: The following macros are used for adding states to the two state vectors (one
                    232: for the current character, one for the following character). */
                    233: 
                    234: #define ADD_ACTIVE(x,y) /* append a state (offset x, count y) to the active list; data is left untouched */ \
                    235:   if (active_count++ < wscount) \
                    236:     { \
                    237:     next_active_state->offset = (x); \
                    238:     next_active_state->count  = (y); \
                    239:     next_active_state->ims    = ims; /* ims is the enclosing function's variable */ \
                    240:     next_active_state++; \
                    241:     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
                    242:     } \
                    243:   else return PCRE_ERROR_DFA_WSSIZE   /* list full: the enclosing function returns this error */
                    244: 
                    245: #define ADD_ACTIVE_DATA(x,y,z) /* as ADD_ACTIVE, but also stores z in the data field */ \
                    246:   if (active_count++ < wscount) \
                    247:     { \
                    248:     next_active_state->offset = (x); \
                    249:     next_active_state->count  = (y); \
                    250:     next_active_state->ims    = ims; /* ims is the enclosing function's variable */ \
                    251:     next_active_state->data   = (z); \
                    252:     next_active_state++; \
                    253:     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
                    254:     } \
                    255:   else return PCRE_ERROR_DFA_WSSIZE   /* list full: the enclosing function returns this error */
                    256: 
                    257: #define ADD_NEW(x,y) /* append a state (offset x, count y) to the new list; data is left untouched */ \
                    258:   if (new_count++ < wscount) \
                    259:     { \
                    260:     next_new_state->offset = (x); \
                    261:     next_new_state->count  = (y); \
                    262:     next_new_state->ims    = ims; /* ims is the enclosing function's variable */ \
                    263:     next_new_state++; \
                    264:     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
                    265:     } \
                    266:   else return PCRE_ERROR_DFA_WSSIZE   /* list full: the enclosing function returns this error */
                    267: 
                    268: #define ADD_NEW_DATA(x,y,z) /* as ADD_NEW, but also stores z in the data field */ \
                    269:   if (new_count++ < wscount) \
                    270:     { \
                    271:     next_new_state->offset = (x); \
                    272:     next_new_state->count  = (y); \
                    273:     next_new_state->ims    = ims; /* ims is the enclosing function's variable */ \
                    274:     next_new_state->data   = (z); \
                    275:     next_new_state++; \
                    276:     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
                    277:     } \
                    278:   else return PCRE_ERROR_DFA_WSSIZE   /* list full: the enclosing function returns this error */
                    279: 
                    280: /* And now, here is the code */
                    281: 
                    282: static int
                    283: internal_dfa_exec(
                    284:   dfa_match_data *md,
                    285:   const uschar *this_start_code,
                    286:   const uschar *current_subject,
                    287:   int start_offset,
                    288:   int *offsets,
                    289:   int offsetcount,
                    290:   int *workspace,
                    291:   int wscount,
                    292:   int ims,
                    293:   int  rlevel,
                    294:   int  recursing)
                    295: {
                    296: stateblock *active_states, *new_states, *temp_states;
                    297: stateblock *next_active_state, *next_new_state;
                    298: 
                    299: const uschar *ctypes, *lcc, *fcc;
                    300: const uschar *ptr;
                    301: const uschar *end_code, *first_op;
                    302: 
                    303: int active_count, new_count, match_count;
                    304: 
                    305: /* Some fields in the md block are frequently referenced, so we load them into
                    306: independent variables in the hope that this will perform better. */
                    307: 
                    308: const uschar *start_subject = md->start_subject;
                    309: const uschar *end_subject = md->end_subject;
                    310: const uschar *start_code = md->start_code;
                    311: 
                    312: #ifdef SUPPORT_UTF8
                    313: BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
                    314: #else
                    315: BOOL utf8 = FALSE;
                    316: #endif
                    317: 
                    318: rlevel++;
                    319: offsetcount &= (-2);
                    320: 
                    321: wscount -= 2;
                    322: wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
                    323:           (2 * INTS_PER_STATEBLOCK);
                    324: 
                    325: DPRINTF(("\n%.*s---------------------\n"
                    326:   "%.*sCall to internal_dfa_exec f=%d r=%d\n",
                    327:   rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
                    328: 
                    329: ctypes = md->tables + ctypes_offset;
                    330: lcc = md->tables + lcc_offset;
                    331: fcc = md->tables + fcc_offset;
                    332: 
                    333: match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
                    334: 
                    335: active_states = (stateblock *)(workspace + 2);
                    336: next_new_state = new_states = active_states + wscount;
                    337: new_count = 0;
                    338: 
                    339: first_op = this_start_code + 1 + LINK_SIZE +
                    340:   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
                    341: 
                    342: /* The first thing in any (sub) pattern is a bracket of some sort. Push all
                    343: the alternative states onto the list, and find out where the end is. This
                    344: makes it possible to use this function recursively, when we want to stop at a
                    345: matching internal ket rather than at the end.
                    346: 
                    347: If the first opcode in the first alternative is OP_REVERSE, we are dealing with
                    348: a backward assertion. In that case, we have to find out the maximum amount to
                    349: move back, and set up each alternative appropriately. */
                    350: 
                    351: if (*first_op == OP_REVERSE)
                    352:   {
                    353:   int max_back = 0;
                    354:   int gone_back;
                    355: 
                    356:   end_code = this_start_code;
                    357:   do
                    358:     {
                    359:     int back = GET(end_code, 2+LINK_SIZE);
                    360:     if (back > max_back) max_back = back;
                    361:     end_code += GET(end_code, 1);
                    362:     }
                    363:   while (*end_code == OP_ALT);
                    364: 
                    365:   /* If we can't go back the amount required for the longest lookbehind
                    366:   pattern, go back as far as we can; some alternatives may still be viable. */
                    367: 
                    368: #ifdef SUPPORT_UTF8
                    369:   /* In character mode we have to step back character by character */
                    370: 
                    371:   if (utf8)
                    372:     {
                    373:     for (gone_back = 0; gone_back < max_back; gone_back++)
                    374:       {
                    375:       if (current_subject <= start_subject) break;
                    376:       current_subject--;
                    377:       while (current_subject > start_subject &&
                    378:              (*current_subject & 0xc0) == 0x80)
                    379:         current_subject--;
                    380:       }
                    381:     }
                    382:   else
                    383: #endif
                    384: 
                    385:   /* In byte-mode we can do this quickly. */
                    386: 
                    387:     {
                    388:     gone_back = (current_subject - max_back < start_subject)?
                    389:       current_subject - start_subject : max_back;
                    390:     current_subject -= gone_back;
                    391:     }
                    392: 
                    393:   /* Now we can process the individual branches. */
                    394: 
                    395:   end_code = this_start_code;
                    396:   do
                    397:     {
                    398:     int back = GET(end_code, 2+LINK_SIZE);
                    399:     if (back <= gone_back)
                    400:       {
                    401:       int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
                    402:       ADD_NEW_DATA(-bstate, 0, gone_back - back);
                    403:       }
                    404:     end_code += GET(end_code, 1);
                    405:     }
                    406:   while (*end_code == OP_ALT);
                    407:  }
                    408: 
                    409: /* This is the code for a "normal" subpattern (not a backward assertion). The
                    410: start of a whole pattern is always one of these. If we are at the top level,
                    411: we may be asked to restart matching from the same point that we reached for a
                    412: previous partial match. We still have to scan through the top-level branches to
                    413: find the end state. */
                    414: 
                    415: else
                    416:   {
                    417:   end_code = this_start_code;
                    418: 
                    419:   /* Restarting */
                    420: 
                    421:   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
                    422:     {
                    423:     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
                    424:     new_count = workspace[1];
                    425:     if (!workspace[0])
                    426:       memcpy(new_states, active_states, new_count * sizeof(stateblock));
                    427:     }
                    428: 
                    429:   /* Not restarting */
                    430: 
                    431:   else
                    432:     {
                    433:     int length = 1 + LINK_SIZE +
                    434:       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
                    435:     do
                    436:       {
                    437:       ADD_NEW(end_code - start_code + length, 0);
                    438:       end_code += GET(end_code, 1);
                    439:       length = 1 + LINK_SIZE;
                    440:       }
                    441:     while (*end_code == OP_ALT);
                    442:     }
                    443:   }
                    444: 
                    445: workspace[0] = 0;    /* Bit indicating which vector is current */
                    446: 
                    447: DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
                    448: 
                    449: /* Loop for scanning the subject */
                    450: 
                    451: ptr = current_subject;
                    452: for (;;)
                    453:   {
                    454:   int i, j;
                    455:   int clen, dlen;
                    456:   unsigned int c, d;
                    457: 
                    458:   /* Make the new state list into the active state list and empty the
                    459:   new state list. */
                    460: 
                    461:   temp_states = active_states;
                    462:   active_states = new_states;
                    463:   new_states = temp_states;
                    464:   active_count = new_count;
                    465:   new_count = 0;
                    466: 
                    467:   workspace[0] ^= 1;              /* Remember for the restarting feature */
                    468:   workspace[1] = active_count;
                    469: 
                    470: #ifdef DEBUG
                    471:   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
                    472:   pchars((uschar *)ptr, strlen((char *)ptr), stdout);
                    473:   printf("\"\n");
                    474: 
                    475:   printf("%.*sActive states: ", rlevel*2-2, SP);
                    476:   for (i = 0; i < active_count; i++)
                    477:     printf("%d/%d ", active_states[i].offset, active_states[i].count);
                    478:   printf("\n");
                    479: #endif
                    480: 
                    481:   /* Set the pointers for adding new states */
                    482: 
                    483:   next_active_state = active_states + active_count;
                    484:   next_new_state = new_states;
                    485: 
                    486:   /* Load the current character from the subject outside the loop, as many
                    487:   different states may want to look at it, and we assume that at least one
                    488:   will. */
                    489: 
                    490:   if (ptr < end_subject)
                    491:     {
                    492:     clen = 1;        /* Number of bytes in the character */
                    493: #ifdef SUPPORT_UTF8
                    494:     if (utf8) { GETCHARLEN(c, ptr, clen); } else
                    495: #endif  /* SUPPORT_UTF8 */
                    496:     c = *ptr;
                    497:     }
                    498:   else
                    499:     {
                    500:     clen = 0;        /* This indicates the end of the subject */
                    501:     c = NOTACHAR;    /* This value should never actually be used */
                    502:     }
                    503: 
                    504:   /* Scan up the active states and act on each one. The result of an action
                    505:   may be to add more states to the currently active list (e.g. on hitting a
                    506:   parenthesis) or it may be to put states on the new list, for considering
                    507:   when we move the character pointer on. */
                    508: 
                    509:   for (i = 0; i < active_count; i++)
                    510:     {
                    511:     stateblock *current_state = active_states + i;
                    512:     const uschar *code;
                    513:     int state_offset = current_state->offset;
                    514:     int count, codevalue;
                    515: 
                    516: #ifdef DEBUG
                    517:     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
                    518:     if (clen == 0) printf("EOL\n");
                    519:       else if (c > 32 && c < 127) printf("'%c'\n", c);
                    520:         else printf("0x%02x\n", c);
                    521: #endif
                    522: 
                    523:     /* This variable is referred to implicitly in the ADD_xxx macros. */
                    524: 
                    525:     ims = current_state->ims;
                    526: 
                    527:     /* A negative offset is a special case meaning "hold off going to this
                    528:     (negated) state until the number of characters in the data field have
                    529:     been skipped". */
                    530: 
                    531:     if (state_offset < 0)
                    532:       {
                    533:       if (current_state->data > 0)
                    534:         {
                    535:         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
                    536:         ADD_NEW_DATA(state_offset, current_state->count,
                    537:           current_state->data - 1);
                    538:         continue;
                    539:         }
                    540:       else
                    541:         {
                    542:         current_state->offset = state_offset = -state_offset;
                    543:         }
                    544:       }
                    545: 
                    546:     /* Check for a duplicate state with the same count, and skip if found. */
                    547: 
                    548:     for (j = 0; j < i; j++)
                    549:       {
                    550:       if (active_states[j].offset == state_offset &&
                    551:           active_states[j].count == current_state->count)
                    552:         {
                    553:         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
                    554:         goto NEXT_ACTIVE_STATE;
                    555:         }
                    556:       }
                    557: 
                    558:     /* The state offset is the offset to the opcode */
                    559: 
                    560:     code = start_code + state_offset;
                    561:     codevalue = *code;
                    562: 
                    563:     /* If this opcode is followed by an inline character, load it. It is
                    564:     tempting to test for the presence of a subject character here, but that
                    565:     is wrong, because sometimes zero repetitions of the subject are
                    566:     permitted.
                    567: 
                    568:     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
                    569:     argument that is not a data character - but is always one byte long. We
                    570:     have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
                    571:     this case. To keep the other cases fast, convert these ones to new opcodes.
                    572:     */
                    573: 
                    574:     if (coptable[codevalue] > 0)
                    575:       {
                    576:       dlen = 1;
                    577: #ifdef SUPPORT_UTF8
                    578:       if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
                    579: #endif  /* SUPPORT_UTF8 */
                    580:       d = code[coptable[codevalue]];
                    581:       if (codevalue >= OP_TYPESTAR)
                    582:         {
                    583:         switch(d)
                    584:           {
                    585:           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
                    586:           case OP_NOTPROP:
                    587:           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
                    588:           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
                    589:           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
                    590:           case OP_NOT_HSPACE:
                    591:           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
                    592:           case OP_NOT_VSPACE:
                    593:           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
                    594:           default: break;
                    595:           }
                    596:         }
                    597:       }
                    598:     else
                    599:       {
                    600:       dlen = 0;         /* Not strictly necessary, but compilers moan */
                    601:       d = NOTACHAR;     /* if these variables are not set. */
                    602:       }
                    603: 
                    604: 
                    605:     /* Now process the individual opcodes */
                    606: 
                    607:     switch (codevalue)
                    608:       {
                    609: 
                    610: /* ========================================================================== */
                    611:       /* Reached a closing bracket. If not at the end of the pattern, carry
                    612:       on with the next opcode. Otherwise, unless we have an empty string and
                    613:       PCRE_NOTEMPTY is set, save the match data, shifting up all previous
                    614:       matches so we always have the longest first. */
                    615: 
                    616:       case OP_KET:
                    617:       case OP_KETRMIN:
                    618:       case OP_KETRMAX:
                    619:       if (code != end_code)
                    620:         {
                    621:         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
                    622:         if (codevalue != OP_KET)
                    623:           {
                    624:           ADD_ACTIVE(state_offset - GET(code, 1), 0);
                    625:           }
                    626:         }
                    627:       else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
                    628:         {
                    629:         if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
                    630:           else if (match_count > 0 && ++match_count * 2 >= offsetcount)
                    631:             match_count = 0;
                    632:         count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
                    633:         if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
                    634:         if (offsetcount >= 2)
                    635:           {
                    636:           offsets[0] = current_subject - start_subject;
                    637:           offsets[1] = ptr - start_subject;
                    638:           DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
                    639:             offsets[1] - offsets[0], current_subject));
                    640:           }
                    641:         if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
                    642:           {
                    643:           DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
                    644:             "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
                    645:             match_count, rlevel*2-2, SP));
                    646:           return match_count;
                    647:           }
                    648:         }
                    649:       break;
                    650: 
                    651: /* ========================================================================== */
                    652:       /* These opcodes add to the current list of states without looking
                    653:       at the current character. */
                    654: 
                    655:       /*-----------------------------------------------------------------*/
                    656:       case OP_ALT:
                    657:       do { code += GET(code, 1); } while (*code == OP_ALT);
                    658:       ADD_ACTIVE(code - start_code, 0);
                    659:       break;
                    660: 
                    661:       /*-----------------------------------------------------------------*/
                    662:       case OP_BRA:
                    663:       case OP_SBRA:
                    664:       do
                    665:         {
                    666:         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
                    667:         code += GET(code, 1);
                    668:         }
                    669:       while (*code == OP_ALT);
                    670:       break;
                    671: 
                    672:       /*-----------------------------------------------------------------*/
                    673:       case OP_CBRA:
                    674:       case OP_SCBRA:
                    675:       ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);
                    676:       code += GET(code, 1);
                    677:       while (*code == OP_ALT)
                    678:         {
                    679:         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);
                    680:         code += GET(code, 1);
                    681:         }
                    682:       break;
                    683: 
                    684:       /*-----------------------------------------------------------------*/
                    685:       case OP_BRAZERO:
                    686:       case OP_BRAMINZERO:
                    687:       ADD_ACTIVE(state_offset + 1, 0);
                    688:       code += 1 + GET(code, 2);
                    689:       while (*code == OP_ALT) code += GET(code, 1);
                    690:       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
                    691:       break;
                    692: 
                    693:       /*-----------------------------------------------------------------*/
                    694:       case OP_SKIPZERO:
                    695:       code += 1 + GET(code, 2);
                    696:       while (*code == OP_ALT) code += GET(code, 1);
                    697:       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
                    698:       break;
                    699: 
                    700:       /*-----------------------------------------------------------------*/
                    701:       case OP_CIRC:
                    702:       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
                    703:           ((ims & PCRE_MULTILINE) != 0 &&
                    704:             ptr != end_subject &&
                    705:             WAS_NEWLINE(ptr)))
                    706:         { ADD_ACTIVE(state_offset + 1, 0); }
                    707:       break;
                    708: 
                    709:       /*-----------------------------------------------------------------*/
                    710:       case OP_EOD:
                    711:       if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
                    712:       break;
                    713: 
                    714:       /*-----------------------------------------------------------------*/
                    715:       case OP_OPT:
                    716:       ims = code[1];
                    717:       ADD_ACTIVE(state_offset + 2, 0);
                    718:       break;
                    719: 
                    720:       /*-----------------------------------------------------------------*/
                    721:       case OP_SOD:
                    722:       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
                    723:       break;
                    724: 
                    725:       /*-----------------------------------------------------------------*/
                    726:       case OP_SOM:
                    727:       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
                    728:       break;
                    729: 
                    730: 
                    731: /* ========================================================================== */
                    732:       /* These opcodes inspect the next subject character, and sometimes
                    733:       the previous one as well, but do not have an argument. The variable
                    734:       clen contains the length of the current character and is zero if we are
                    735:       at the end of the subject. */
                    736: 
                    737:       /*-----------------------------------------------------------------*/
                    738:       case OP_ANY:
                    739:       if (clen > 0 && !IS_NEWLINE(ptr))
                    740:         { ADD_NEW(state_offset + 1, 0); }
                    741:       break;
                    742: 
                    743:       /*-----------------------------------------------------------------*/
                    744:       case OP_ALLANY:
                    745:       if (clen > 0)
                    746:         { ADD_NEW(state_offset + 1, 0); }
                    747:       break;
                    748: 
                    749:       /*-----------------------------------------------------------------*/
                    750:       case OP_EODN:
                    751:       if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
                    752:         { ADD_ACTIVE(state_offset + 1, 0); }
                    753:       break;
                    754: 
                    755:       /*-----------------------------------------------------------------*/
                    756:       case OP_DOLL:
                    757:       if ((md->moptions & PCRE_NOTEOL) == 0)
                    758:         {
                    759:         if (clen == 0 ||
                    760:             (IS_NEWLINE(ptr) &&
                    761:                ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
                    762:             ))
                    763:           { ADD_ACTIVE(state_offset + 1, 0); }
                    764:         }
                    765:       else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
                    766:         { ADD_ACTIVE(state_offset + 1, 0); }
                    767:       break;
                    768: 
                    769:       /*-----------------------------------------------------------------*/
                    770: 
                    771:       case OP_DIGIT:
                    772:       case OP_WHITESPACE:
                    773:       case OP_WORDCHAR:
                    774:       if (clen > 0 && c < 256 &&
                    775:             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
                    776:         { ADD_NEW(state_offset + 1, 0); }
                    777:       break;
                    778: 
                    779:       /*-----------------------------------------------------------------*/
                    780:       case OP_NOT_DIGIT:
                    781:       case OP_NOT_WHITESPACE:
                    782:       case OP_NOT_WORDCHAR:
                    783:       if (clen > 0 && (c >= 256 ||
                    784:             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
                    785:         { ADD_NEW(state_offset + 1, 0); }
                    786:       break;
                    787: 
                    788:       /*-----------------------------------------------------------------*/
                    789:       case OP_WORD_BOUNDARY:
                    790:       case OP_NOT_WORD_BOUNDARY:
                    791:         {
                    792:         int left_word, right_word;
                    793: 
                    794:         if (ptr > start_subject)
                    795:           {
                    796:           const uschar *temp = ptr - 1;
                    797: #ifdef SUPPORT_UTF8
                    798:           if (utf8) BACKCHAR(temp);
                    799: #endif
                    800:           GETCHARTEST(d, temp);
                    801:           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
                    802:           }
                    803:         else left_word = 0;
                    804: 
                    805:         if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
                    806:           else right_word = 0;
                    807: 
                    808:         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
                    809:           { ADD_ACTIVE(state_offset + 1, 0); }
                    810:         }
                    811:       break;
                    812: 
                    813: 
                    814:       /*-----------------------------------------------------------------*/
                    815:       /* Check the next character by Unicode property. We will get here only
                    816:       if the support is in the binary; otherwise a compile-time error occurs.
                    817:       */
                    818: 
                    819: #ifdef SUPPORT_UCP
                    820:       case OP_PROP:
                    821:       case OP_NOTPROP:
                    822:       if (clen > 0)
                    823:         {
                    824:         BOOL OK;
1.2     ! misha     825:         const ucd_record * prop = GET_UCD(c);
1.1       misha     826:         switch(code[1])
                    827:           {
                    828:           case PT_ANY:
                    829:           OK = TRUE;
                    830:           break;
                    831: 
                    832:           case PT_LAMP:
1.2     ! misha     833:           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1.1       misha     834:           break;
                    835: 
                    836:           case PT_GC:
1.2     ! misha     837:           OK = _pcre_ucp_gentype[prop->chartype] == code[2];
1.1       misha     838:           break;
                    839: 
                    840:           case PT_PC:
1.2     ! misha     841:           OK = prop->chartype == code[2];
1.1       misha     842:           break;
                    843: 
                    844:           case PT_SC:
1.2     ! misha     845:           OK = prop->script == code[2];
1.1       misha     846:           break;
                    847: 
                    848:           /* Should never occur, but keep compilers from grumbling. */
                    849: 
                    850:           default:
                    851:           OK = codevalue != OP_PROP;
                    852:           break;
                    853:           }
                    854: 
                    855:         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
                    856:         }
                    857:       break;
                    858: #endif
                    859: 
                    860: 
                    861: 
                    862: /* ========================================================================== */
                    863:       /* These opcodes likewise inspect the subject character, but have an
                    864:       argument that is not a data character. It is one of these opcodes:
                    865:       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
                    866:       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
                    867: 
                    868:       case OP_TYPEPLUS:
                    869:       case OP_TYPEMINPLUS:
                    870:       case OP_TYPEPOSPLUS:
                    871:       count = current_state->count;  /* Already matched */
                    872:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
                    873:       if (clen > 0)
                    874:         {
                    875:         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
                    876:             (c < 256 &&
                    877:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                    878:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
                    879:           {
                    880:           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
                    881:             {
                    882:             active_count--;            /* Remove non-match possibility */
                    883:             next_active_state--;
                    884:             }
                    885:           count++;
                    886:           ADD_NEW(state_offset, count);
                    887:           }
                    888:         }
                    889:       break;
                    890: 
                    891:       /*-----------------------------------------------------------------*/
                    892:       case OP_TYPEQUERY:
                    893:       case OP_TYPEMINQUERY:
                    894:       case OP_TYPEPOSQUERY:
                    895:       ADD_ACTIVE(state_offset + 2, 0);
                    896:       if (clen > 0)
                    897:         {
                    898:         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
                    899:             (c < 256 &&
                    900:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                    901:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
                    902:           {
                    903:           if (codevalue == OP_TYPEPOSQUERY)
                    904:             {
                    905:             active_count--;            /* Remove non-match possibility */
                    906:             next_active_state--;
                    907:             }
                    908:           ADD_NEW(state_offset + 2, 0);
                    909:           }
                    910:         }
                    911:       break;
                    912: 
                    913:       /*-----------------------------------------------------------------*/
                    914:       case OP_TYPESTAR:
                    915:       case OP_TYPEMINSTAR:
                    916:       case OP_TYPEPOSSTAR:
                    917:       ADD_ACTIVE(state_offset + 2, 0);
                    918:       if (clen > 0)
                    919:         {
                    920:         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
                    921:             (c < 256 &&
                    922:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                    923:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
                    924:           {
                    925:           if (codevalue == OP_TYPEPOSSTAR)
                    926:             {
                    927:             active_count--;            /* Remove non-match possibility */
                    928:             next_active_state--;
                    929:             }
                    930:           ADD_NEW(state_offset, 0);
                    931:           }
                    932:         }
                    933:       break;
                    934: 
                    935:       /*-----------------------------------------------------------------*/
                    936:       case OP_TYPEEXACT:
                    937:       count = current_state->count;  /* Number already matched */
                    938:       if (clen > 0)
                    939:         {
                    940:         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
                    941:             (c < 256 &&
                    942:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                    943:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
                    944:           {
                    945:           if (++count >= GET2(code, 1))
                    946:             { ADD_NEW(state_offset + 4, 0); }
                    947:           else
                    948:             { ADD_NEW(state_offset, count); }
                    949:           }
                    950:         }
                    951:       break;
                    952: 
                    953:       /*-----------------------------------------------------------------*/
                    954:       case OP_TYPEUPTO:
                    955:       case OP_TYPEMINUPTO:
                    956:       case OP_TYPEPOSUPTO:
                    957:       ADD_ACTIVE(state_offset + 4, 0);
                    958:       count = current_state->count;  /* Number already matched */
                    959:       if (clen > 0)
                    960:         {
                    961:         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
                    962:             (c < 256 &&
                    963:               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
                    964:               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
                    965:           {
                    966:           if (codevalue == OP_TYPEPOSUPTO)
                    967:             {
                    968:             active_count--;           /* Remove non-match possibility */
                    969:             next_active_state--;
                    970:             }
                    971:           if (++count >= GET2(code, 1))
                    972:             { ADD_NEW(state_offset + 4, 0); }
                    973:           else
                    974:             { ADD_NEW(state_offset, count); }
                    975:           }
                    976:         }
                    977:       break;
                    978: 
                    979: /* ========================================================================== */
                    980:       /* These are virtual opcodes that are used when something like
                    981:       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, or OP_EXTUNI as its
                    982:       argument. It keeps the code above fast for the other cases. The argument
                    983:       is in the d variable. */
                    984: 
                    985: #ifdef SUPPORT_UCP
                    986:       case OP_PROP_EXTRA + OP_TYPEPLUS:
                    987:       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
                    988:       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
                    989:       count = current_state->count;           /* Already matched */
                    990:       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
                    991:       if (clen > 0)
                    992:         {
                    993:         BOOL OK;
1.2     ! misha     994:         const ucd_record * prop = GET_UCD(c);
1.1       misha     995:         switch(code[2])
                    996:           {
                    997:           case PT_ANY:
                    998:           OK = TRUE;
                    999:           break;
                   1000: 
                   1001:           case PT_LAMP:
1.2     ! misha    1002:           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1.1       misha    1003:           break;
                   1004: 
                   1005:           case PT_GC:
1.2     ! misha    1006:           OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1.1       misha    1007:           break;
                   1008: 
                   1009:           case PT_PC:
1.2     ! misha    1010:           OK = prop->chartype == code[3];
1.1       misha    1011:           break;
                   1012: 
                   1013:           case PT_SC:
1.2     ! misha    1014:           OK = prop->script == code[3];
1.1       misha    1015:           break;
                   1016: 
                   1017:           /* Should never occur, but keep compilers from grumbling. */
                   1018: 
                   1019:           default:
                   1020:           OK = codevalue != OP_PROP;
                   1021:           break;
                   1022:           }
                   1023: 
                   1024:         if (OK == (d == OP_PROP))
                   1025:           {
                   1026:           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
                   1027:             {
                   1028:             active_count--;           /* Remove non-match possibility */
                   1029:             next_active_state--;
                   1030:             }
                   1031:           count++;
                   1032:           ADD_NEW(state_offset, count);
                   1033:           }
                   1034:         }
                   1035:       break;
                   1036: 
                   1037:       /*-----------------------------------------------------------------*/
                   1038:       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
                   1039:       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
                   1040:       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
                   1041:       count = current_state->count;  /* Already matched */
                   1042:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1.2     ! misha    1043:       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1.1       misha    1044:         {
                   1045:         const uschar *nptr = ptr + clen;
                   1046:         int ncount = 0;
                   1047:         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
                   1048:           {
                   1049:           active_count--;           /* Remove non-match possibility */
                   1050:           next_active_state--;
                   1051:           }
                   1052:         while (nptr < end_subject)
                   1053:           {
                   1054:           int nd;
                   1055:           int ndlen = 1;
                   1056:           GETCHARLEN(nd, nptr, ndlen);
1.2     ! misha    1057:           if (UCD_CATEGORY(nd) != ucp_M) break;
1.1       misha    1058:           ncount++;
                   1059:           nptr += ndlen;
                   1060:           }
                   1061:         count++;
                   1062:         ADD_NEW_DATA(-state_offset, count, ncount);
                   1063:         }
                   1064:       break;
                   1065: #endif
                   1066: 
                   1067:       /*-----------------------------------------------------------------*/
                   1068:       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
                   1069:       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
                   1070:       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
                   1071:       count = current_state->count;  /* Already matched */
                   1072:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
                   1073:       if (clen > 0)
                   1074:         {
                   1075:         int ncount = 0;
                   1076:         switch (c)
                   1077:           {
                   1078:           case 0x000b:
                   1079:           case 0x000c:
                   1080:           case 0x0085:
                   1081:           case 0x2028:
                   1082:           case 0x2029:
                   1083:           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
                   1084:           goto ANYNL01;
                   1085: 
                   1086:           case 0x000d:
                   1087:           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
                   1088:           /* Fall through */
                   1089: 
                   1090:           ANYNL01:
                   1091:           case 0x000a:
                   1092:           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
                   1093:             {
                   1094:             active_count--;           /* Remove non-match possibility */
                   1095:             next_active_state--;
                   1096:             }
                   1097:           count++;
                   1098:           ADD_NEW_DATA(-state_offset, count, ncount);
                   1099:           break;
                   1100: 
                   1101:           default:
                   1102:           break;
                   1103:           }
                   1104:         }
                   1105:       break;
                   1106: 
                   1107:       /*-----------------------------------------------------------------*/
                   1108:       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
                   1109:       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
                   1110:       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
                   1111:       count = current_state->count;  /* Already matched */
                   1112:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
                   1113:       if (clen > 0)
                   1114:         {
                   1115:         BOOL OK;
                   1116:         switch (c)
                   1117:           {
                   1118:           case 0x000a:
                   1119:           case 0x000b:
                   1120:           case 0x000c:
                   1121:           case 0x000d:
                   1122:           case 0x0085:
                   1123:           case 0x2028:
                   1124:           case 0x2029:
                   1125:           OK = TRUE;
                   1126:           break;
                   1127: 
                   1128:           default:
                   1129:           OK = FALSE;
                   1130:           break;
                   1131:           }
                   1132: 
                   1133:         if (OK == (d == OP_VSPACE))
                   1134:           {
                   1135:           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
                   1136:             {
                   1137:             active_count--;           /* Remove non-match possibility */
                   1138:             next_active_state--;
                   1139:             }
                   1140:           count++;
                   1141:           ADD_NEW_DATA(-state_offset, count, 0);
                   1142:           }
                   1143:         }
                   1144:       break;
                   1145: 
                   1146:       /*-----------------------------------------------------------------*/
                   1147:       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
                   1148:       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
                   1149:       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
                   1150:       count = current_state->count;  /* Already matched */
                   1151:       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
                   1152:       if (clen > 0)
                   1153:         {
                   1154:         BOOL OK;
                   1155:         switch (c)
                   1156:           {
                   1157:           case 0x09:      /* HT */
                   1158:           case 0x20:      /* SPACE */
                   1159:           case 0xa0:      /* NBSP */
                   1160:           case 0x1680:    /* OGHAM SPACE MARK */
                   1161:           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
                   1162:           case 0x2000:    /* EN QUAD */
                   1163:           case 0x2001:    /* EM QUAD */
                   1164:           case 0x2002:    /* EN SPACE */
                   1165:           case 0x2003:    /* EM SPACE */
                   1166:           case 0x2004:    /* THREE-PER-EM SPACE */
                   1167:           case 0x2005:    /* FOUR-PER-EM SPACE */
                   1168:           case 0x2006:    /* SIX-PER-EM SPACE */
                   1169:           case 0x2007:    /* FIGURE SPACE */
                   1170:           case 0x2008:    /* PUNCTUATION SPACE */
                   1171:           case 0x2009:    /* THIN SPACE */
                   1172:           case 0x200A:    /* HAIR SPACE */
                   1173:           case 0x202f:    /* NARROW NO-BREAK SPACE */
                   1174:           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
                   1175:           case 0x3000:    /* IDEOGRAPHIC SPACE */
                   1176:           OK = TRUE;
                   1177:           break;
                   1178: 
                   1179:           default:
                   1180:           OK = FALSE;
                   1181:           break;
                   1182:           }
                   1183: 
                   1184:         if (OK == (d == OP_HSPACE))
                   1185:           {
                   1186:           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
                   1187:             {
                   1188:             active_count--;           /* Remove non-match possibility */
                   1189:             next_active_state--;
                   1190:             }
                   1191:           count++;
                   1192:           ADD_NEW_DATA(-state_offset, count, 0);
                   1193:           }
                   1194:         }
                   1195:       break;
                   1196: 
                   1197:       /*-----------------------------------------------------------------*/
                   1198: #ifdef SUPPORT_UCP
                   1199:       case OP_PROP_EXTRA + OP_TYPEQUERY:
                   1200:       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
                   1201:       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
                   1202:       count = 4;
                   1203:       goto QS1;
                   1204: 
                   1205:       case OP_PROP_EXTRA + OP_TYPESTAR:
                   1206:       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
                   1207:       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
                   1208:       count = 0;
                   1209: 
                   1210:       QS1:
                   1211: 
                   1212:       ADD_ACTIVE(state_offset + 4, 0);
                   1213:       if (clen > 0)
                   1214:         {
                   1215:         BOOL OK;
1.2     ! misha    1216:         const ucd_record * prop = GET_UCD(c);
1.1       misha    1217:         switch(code[2])
                   1218:           {
                   1219:           case PT_ANY:
                   1220:           OK = TRUE;
                   1221:           break;
                   1222: 
                   1223:           case PT_LAMP:
1.2     ! misha    1224:           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1.1       misha    1225:           break;
                   1226: 
                   1227:           case PT_GC:
1.2     ! misha    1228:           OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1.1       misha    1229:           break;
                   1230: 
                   1231:           case PT_PC:
1.2     ! misha    1232:           OK = prop->chartype == code[3];
1.1       misha    1233:           break;
                   1234: 
                   1235:           case PT_SC:
1.2     ! misha    1236:           OK = prop->script == code[3];
1.1       misha    1237:           break;
                   1238: 
                   1239:           /* Should never occur, but keep compilers from grumbling. */
                   1240: 
                   1241:           default:
                   1242:           OK = codevalue != OP_PROP;
                   1243:           break;
                   1244:           }
                   1245: 
                   1246:         if (OK == (d == OP_PROP))
                   1247:           {
                   1248:           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
                   1249:               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
                   1250:             {
                   1251:             active_count--;           /* Remove non-match possibility */
                   1252:             next_active_state--;
                   1253:             }
                   1254:           ADD_NEW(state_offset + count, 0);
                   1255:           }
                   1256:         }
                   1257:       break;
                   1258: 
                   1259:       /*-----------------------------------------------------------------*/
                   1260:       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
                   1261:       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
                   1262:       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
                   1263:       count = 2;
                   1264:       goto QS2;
                   1265: 
                   1266:       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
                   1267:       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
                   1268:       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
                   1269:       count = 0;
                   1270: 
                   1271:       QS2:
                   1272: 
                   1273:       ADD_ACTIVE(state_offset + 2, 0);
1.2     ! misha    1274:       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1.1       misha    1275:         {
                   1276:         const uschar *nptr = ptr + clen;
                   1277:         int ncount = 0;
                   1278:         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
                   1279:             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
                   1280:           {
                   1281:           active_count--;           /* Remove non-match possibility */
                   1282:           next_active_state--;
                   1283:           }
                   1284:         while (nptr < end_subject)
                   1285:           {
                   1286:           int nd;
                   1287:           int ndlen = 1;
                   1288:           GETCHARLEN(nd, nptr, ndlen);
1.2     ! misha    1289:           if (UCD_CATEGORY(nd) != ucp_M) break;
1.1       misha    1290:           ncount++;
                   1291:           nptr += ndlen;
                   1292:           }
                   1293:         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
                   1294:         }
                   1295:       break;
                   1296: #endif
                   1297: 
                   1298:       /*-----------------------------------------------------------------*/
                   1299:       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
                   1300:       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
                   1301:       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
                   1302:       count = 2;
                   1303:       goto QS3;
                   1304: 
                   1305:       case OP_ANYNL_EXTRA + OP_TYPESTAR:
                   1306:       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
                   1307:       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
                   1308:       count = 0;
                   1309: 
                   1310:       QS3:
                   1311:       ADD_ACTIVE(state_offset + 2, 0);
                   1312:       if (clen > 0)
                   1313:         {
                   1314:         int ncount = 0;
                   1315:         switch (c)
                   1316:           {
                   1317:           case 0x000b:
                   1318:           case 0x000c:
                   1319:           case 0x0085:
                   1320:           case 0x2028:
                   1321:           case 0x2029:
                   1322:           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
                   1323:           goto ANYNL02;
                   1324: 
                   1325:           case 0x000d:
                   1326:           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
                   1327:           /* Fall through */
                   1328: 
                   1329:           ANYNL02:
                   1330:           case 0x000a:
                   1331:           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
                   1332:               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
                   1333:             {
                   1334:             active_count--;           /* Remove non-match possibility */
                   1335:             next_active_state--;
                   1336:             }
                   1337:           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
                   1338:           break;
                   1339: 
                   1340:           default:
                   1341:           break;
                   1342:           }
                   1343:         }
                   1344:       break;
                   1345: 
                   1346:       /*-----------------------------------------------------------------*/
                   1347:       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
                   1348:       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
                   1349:       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
                   1350:       count = 2;
                   1351:       goto QS4;
                   1352: 
                   1353:       case OP_VSPACE_EXTRA + OP_TYPESTAR:
                   1354:       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
                   1355:       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
                   1356:       count = 0;
                   1357: 
                   1358:       QS4:
                   1359:       ADD_ACTIVE(state_offset + 2, 0);
                   1360:       if (clen > 0)
                   1361:         {
                   1362:         BOOL OK;
                   1363:         switch (c)
                   1364:           {
                   1365:           case 0x000a:
                   1366:           case 0x000b:
                   1367:           case 0x000c:
                   1368:           case 0x000d:
                   1369:           case 0x0085:
                   1370:           case 0x2028:
                   1371:           case 0x2029:
                   1372:           OK = TRUE;
                   1373:           break;
                   1374: 
                   1375:           default:
                   1376:           OK = FALSE;
                   1377:           break;
                   1378:           }
                   1379:         if (OK == (d == OP_VSPACE))
                   1380:           {
                   1381:           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
                   1382:               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
                   1383:             {
                   1384:             active_count--;           /* Remove non-match possibility */
                   1385:             next_active_state--;
                   1386:             }
                   1387:           ADD_NEW_DATA(-(state_offset + count), 0, 0);
                   1388:           }
                   1389:         }
                   1390:       break;
                   1391: 
                   1392:       /*-----------------------------------------------------------------*/
                   1393:       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
                   1394:       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
                   1395:       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
                   1396:       count = 2;
                   1397:       goto QS5;
                   1398: 
                   1399:       case OP_HSPACE_EXTRA + OP_TYPESTAR:
                   1400:       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
                   1401:       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
                   1402:       count = 0;
                   1403: 
                   1404:       QS5:
                   1405:       ADD_ACTIVE(state_offset + 2, 0);
                   1406:       if (clen > 0)
                   1407:         {
                   1408:         BOOL OK;
                   1409:         switch (c)
                   1410:           {
                   1411:           case 0x09:      /* HT */
                   1412:           case 0x20:      /* SPACE */
                   1413:           case 0xa0:      /* NBSP */
                   1414:           case 0x1680:    /* OGHAM SPACE MARK */
                   1415:           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
                   1416:           case 0x2000:    /* EN QUAD */
                   1417:           case 0x2001:    /* EM QUAD */
                   1418:           case 0x2002:    /* EN SPACE */
                   1419:           case 0x2003:    /* EM SPACE */
                   1420:           case 0x2004:    /* THREE-PER-EM SPACE */
                   1421:           case 0x2005:    /* FOUR-PER-EM SPACE */
                   1422:           case 0x2006:    /* SIX-PER-EM SPACE */
                   1423:           case 0x2007:    /* FIGURE SPACE */
                   1424:           case 0x2008:    /* PUNCTUATION SPACE */
                   1425:           case 0x2009:    /* THIN SPACE */
                   1426:           case 0x200A:    /* HAIR SPACE */
                   1427:           case 0x202f:    /* NARROW NO-BREAK SPACE */
                   1428:           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
                   1429:           case 0x3000:    /* IDEOGRAPHIC SPACE */
                   1430:           OK = TRUE;
                   1431:           break;
                   1432: 
                   1433:           default:
                   1434:           OK = FALSE;
                   1435:           break;
                   1436:           }
                   1437: 
                   1438:         if (OK == (d == OP_HSPACE))
                   1439:           {
                   1440:           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
                   1441:               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
                   1442:             {
                   1443:             active_count--;           /* Remove non-match possibility */
                   1444:             next_active_state--;
                   1445:             }
                   1446:           ADD_NEW_DATA(-(state_offset + count), 0, 0);
                   1447:           }
                   1448:         }
                   1449:       break;
                   1450: 
                   1451:       /*-----------------------------------------------------------------*/
                   1452: #ifdef SUPPORT_UCP
                   1453:       case OP_PROP_EXTRA + OP_TYPEEXACT:
                   1454:       case OP_PROP_EXTRA + OP_TYPEUPTO:
                   1455:       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
                   1456:       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
                   1457:       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
                   1458:         { ADD_ACTIVE(state_offset + 6, 0); }
                   1459:       count = current_state->count;  /* Number already matched */
                   1460:       if (clen > 0)
                   1461:         {
                   1462:         BOOL OK;
1.2     ! misha    1463:         const ucd_record * prop = GET_UCD(c);
1.1       misha    1464:         switch(code[4])
                   1465:           {
                   1466:           case PT_ANY:
                   1467:           OK = TRUE;
                   1468:           break;
                   1469: 
                   1470:           case PT_LAMP:
1.2     ! misha    1471:           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1.1       misha    1472:           break;
                   1473: 
                   1474:           case PT_GC:
1.2     ! misha    1475:           OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1.1       misha    1476:           break;
                   1477: 
                   1478:           case PT_PC:
1.2     ! misha    1479:           OK = prop->chartype == code[5];
1.1       misha    1480:           break;
                   1481: 
                   1482:           case PT_SC:
1.2     ! misha    1483:           OK = prop->script == code[5];
1.1       misha    1484:           break;
                   1485: 
                   1486:           /* Should never occur, but keep compilers from grumbling. */
                   1487: 
                   1488:           default:
                   1489:           OK = codevalue != OP_PROP;
                   1490:           break;
                   1491:           }
                   1492: 
                   1493:         if (OK == (d == OP_PROP))
                   1494:           {
                   1495:           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
                   1496:             {
                   1497:             active_count--;           /* Remove non-match possibility */
                   1498:             next_active_state--;
                   1499:             }
                   1500:           if (++count >= GET2(code, 1))
                   1501:             { ADD_NEW(state_offset + 6, 0); }
                   1502:           else
                   1503:             { ADD_NEW(state_offset, count); }
                   1504:           }
                   1505:         }
                   1506:       break;
                   1507: 
                   1508:       /*-----------------------------------------------------------------*/
                   1509:       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
                   1510:       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
                   1511:       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
                   1512:       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
                   1513:       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
                   1514:         { ADD_ACTIVE(state_offset + 4, 0); }
                   1515:       count = current_state->count;  /* Number already matched */
1.2     ! misha    1516:       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1.1       misha    1517:         {
                   1518:         const uschar *nptr = ptr + clen;
                   1519:         int ncount = 0;
                   1520:         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
                   1521:           {
                   1522:           active_count--;           /* Remove non-match possibility */
                   1523:           next_active_state--;
                   1524:           }
                   1525:         while (nptr < end_subject)
                   1526:           {
                   1527:           int nd;
                   1528:           int ndlen = 1;
                   1529:           GETCHARLEN(nd, nptr, ndlen);
1.2     ! misha    1530:           if (UCD_CATEGORY(nd) != ucp_M) break;
1.1       misha    1531:           ncount++;
                   1532:           nptr += ndlen;
                   1533:           }
                   1534:         if (++count >= GET2(code, 1))
                   1535:           { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
                   1536:         else
                   1537:           { ADD_NEW_DATA(-state_offset, count, ncount); }
                   1538:         }
                   1539:       break;
                   1540: #endif
                   1541: 
                   1542:       /*-----------------------------------------------------------------*/
                   1543:       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
                   1544:       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
                   1545:       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
                   1546:       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
                   1547:       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
                   1548:         { ADD_ACTIVE(state_offset + 4, 0); }
                   1549:       count = current_state->count;  /* Number already matched */
                   1550:       if (clen > 0)
                   1551:         {
                   1552:         int ncount = 0;
                   1553:         switch (c)
                   1554:           {
                   1555:           case 0x000b:
                   1556:           case 0x000c:
                   1557:           case 0x0085:
                   1558:           case 0x2028:
                   1559:           case 0x2029:
                   1560:           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
                   1561:           goto ANYNL03;
                   1562: 
                   1563:           case 0x000d:
                   1564:           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
                   1565:           /* Fall through */
                   1566: 
                   1567:           ANYNL03:
                   1568:           case 0x000a:
                   1569:           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
                   1570:             {
                   1571:             active_count--;           /* Remove non-match possibility */
                   1572:             next_active_state--;
                   1573:             }
                   1574:           if (++count >= GET2(code, 1))
                   1575:             { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
                   1576:           else
                   1577:             { ADD_NEW_DATA(-state_offset, count, ncount); }
                   1578:           break;
                   1579: 
                   1580:           default:
                   1581:           break;
                   1582:           }
                   1583:         }
                   1584:       break;
                   1585: 
                   1586:       /*-----------------------------------------------------------------*/
                   1587:       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
                   1588:       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
                   1589:       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
                   1590:       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
                   1591:       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
                   1592:         { ADD_ACTIVE(state_offset + 4, 0); }
                   1593:       count = current_state->count;  /* Number already matched */
                   1594:       if (clen > 0)
                   1595:         {
                   1596:         BOOL OK;
                   1597:         switch (c)
                   1598:           {
                   1599:           case 0x000a:
                   1600:           case 0x000b:
                   1601:           case 0x000c:
                   1602:           case 0x000d:
                   1603:           case 0x0085:
                   1604:           case 0x2028:
                   1605:           case 0x2029:
                   1606:           OK = TRUE;
                   1607:           break;
                   1608: 
                   1609:           default:
                   1610:           OK = FALSE;
                   1611:           }
                   1612: 
                   1613:         if (OK == (d == OP_VSPACE))
                   1614:           {
                   1615:           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
                   1616:             {
                   1617:             active_count--;           /* Remove non-match possibility */
                   1618:             next_active_state--;
                   1619:             }
                   1620:           if (++count >= GET2(code, 1))
                   1621:             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
                   1622:           else
                   1623:             { ADD_NEW_DATA(-state_offset, count, 0); }
                   1624:           }
                   1625:         }
                   1626:       break;
                   1627: 
                   1628:       /*-----------------------------------------------------------------*/
                   1629:       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
                   1630:       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
                   1631:       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
                   1632:       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
                   1633:       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
                   1634:         { ADD_ACTIVE(state_offset + 4, 0); }
                   1635:       count = current_state->count;  /* Number already matched */
                   1636:       if (clen > 0)
                   1637:         {
                   1638:         BOOL OK;
                   1639:         switch (c)
                   1640:           {
                   1641:           case 0x09:      /* HT */
                   1642:           case 0x20:      /* SPACE */
                   1643:           case 0xa0:      /* NBSP */
                   1644:           case 0x1680:    /* OGHAM SPACE MARK */
                   1645:           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
                   1646:           case 0x2000:    /* EN QUAD */
                   1647:           case 0x2001:    /* EM QUAD */
                   1648:           case 0x2002:    /* EN SPACE */
                   1649:           case 0x2003:    /* EM SPACE */
                   1650:           case 0x2004:    /* THREE-PER-EM SPACE */
                   1651:           case 0x2005:    /* FOUR-PER-EM SPACE */
                   1652:           case 0x2006:    /* SIX-PER-EM SPACE */
                   1653:           case 0x2007:    /* FIGURE SPACE */
                   1654:           case 0x2008:    /* PUNCTUATION SPACE */
                   1655:           case 0x2009:    /* THIN SPACE */
                   1656:           case 0x200A:    /* HAIR SPACE */
                   1657:           case 0x202f:    /* NARROW NO-BREAK SPACE */
                   1658:           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
                   1659:           case 0x3000:    /* IDEOGRAPHIC SPACE */
                   1660:           OK = TRUE;
                   1661:           break;
                   1662: 
                   1663:           default:
                   1664:           OK = FALSE;
                   1665:           break;
                   1666:           }
                   1667: 
                   1668:         if (OK == (d == OP_HSPACE))
                   1669:           {
                   1670:           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
                   1671:             {
                   1672:             active_count--;           /* Remove non-match possibility */
                   1673:             next_active_state--;
                   1674:             }
                   1675:           if (++count >= GET2(code, 1))
                   1676:             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
                   1677:           else
                   1678:             { ADD_NEW_DATA(-state_offset, count, 0); }
                   1679:           }
                   1680:         }
                   1681:       break;
                   1682: 
                   1683: /* ========================================================================== */
                   1684:       /* These opcodes are followed by a character that is usually compared
                   1685:       to the current subject character; it is loaded into d. We still get
                   1686:       here even if there is no subject character, because in some cases zero
                   1687:       repetitions are permitted. */
                   1688: 
                   1689:       /*-----------------------------------------------------------------*/
                   1690:       case OP_CHAR:
                   1691:       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
                   1692:       break;
                   1693: 
                   1694:       /*-----------------------------------------------------------------*/
                   1695:       case OP_CHARNC:
                   1696:       if (clen == 0) break;
                   1697: 
                   1698: #ifdef SUPPORT_UTF8
                   1699:       if (utf8)
                   1700:         {
                   1701:         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
                   1702:           {
                   1703:           unsigned int othercase;
                   1704:           if (c < 128) othercase = fcc[c]; else
                   1705: 
                   1706:           /* If we have Unicode property support, we can use it to test the
                   1707:           other case of the character. */
                   1708: 
                   1709: #ifdef SUPPORT_UCP
1.2     ! misha    1710:           othercase = UCD_OTHERCASE(c);
1.1       misha    1711: #else
                   1712:           othercase = NOTACHAR;
                   1713: #endif
                   1714: 
                   1715:           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
                   1716:           }
                   1717:         }
                   1718:       else
                   1719: #endif  /* SUPPORT_UTF8 */
                   1720: 
                   1721:       /* Non-UTF-8 mode */
                   1722:         {
                   1723:         if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
                   1724:         }
                   1725:       break;
                   1726: 
                   1727: 
                   1728: #ifdef SUPPORT_UCP
                   1729:       /*-----------------------------------------------------------------*/
                   1730:       /* This is a tricky one because it can match more than one character.
                   1731:       Find out how many characters to skip, and then set up a negative state
                   1732:       to wait for them to pass before continuing. */
                   1733: 
                   1734:       case OP_EXTUNI:
1.2     ! misha    1735:       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1.1       misha    1736:         {
                   1737:         const uschar *nptr = ptr + clen;
                   1738:         int ncount = 0;
                   1739:         while (nptr < end_subject)
                   1740:           {
                   1741:           int nclen = 1;
                   1742:           GETCHARLEN(c, nptr, nclen);
1.2     ! misha    1743:           if (UCD_CATEGORY(c) != ucp_M) break;
1.1       misha    1744:           ncount++;
                   1745:           nptr += nclen;
                   1746:           }
                   1747:         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
                   1748:         }
                   1749:       break;
                   1750: #endif
                   1751: 
                   1752:       /*-----------------------------------------------------------------*/
                   1753:       /* This is tricky like EXTUNI because it too can match more than one
                   1754:       character (when CR is followed by LF). In this case, set up a negative
                   1755:       state to wait for one character to pass before continuing. */
                   1756: 
                   1757:       case OP_ANYNL:
                   1758:       if (clen > 0) switch(c)
                   1759:         {
                   1760:         case 0x000b:
                   1761:         case 0x000c:
                   1762:         case 0x0085:
                   1763:         case 0x2028:
                   1764:         case 0x2029:
                   1765:         if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
                   1766: 
                   1767:         case 0x000a:
                   1768:         ADD_NEW(state_offset + 1, 0);
                   1769:         break;
                   1770: 
                   1771:         case 0x000d:
                   1772:         if (ptr + 1 < end_subject && ptr[1] == 0x0a)
                   1773:           {
                   1774:           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
                   1775:           }
                   1776:         else
                   1777:           {
                   1778:           ADD_NEW(state_offset + 1, 0);
                   1779:           }
                   1780:         break;
                   1781:         }
                   1782:       break;
                   1783: 
                   1784:       /*-----------------------------------------------------------------*/
                   1785:       case OP_NOT_VSPACE:
                   1786:       if (clen > 0) switch(c)
                   1787:         {
                   1788:         case 0x000a:
                   1789:         case 0x000b:
                   1790:         case 0x000c:
                   1791:         case 0x000d:
                   1792:         case 0x0085:
                   1793:         case 0x2028:
                   1794:         case 0x2029:
                   1795:         break;
                   1796: 
                   1797:         default:
                   1798:         ADD_NEW(state_offset + 1, 0);
                   1799:         break;
                   1800:         }
                   1801:       break;
                   1802: 
                   1803:       /*-----------------------------------------------------------------*/
                   1804:       case OP_VSPACE:
                   1805:       if (clen > 0) switch(c)
                   1806:         {
                   1807:         case 0x000a:
                   1808:         case 0x000b:
                   1809:         case 0x000c:
                   1810:         case 0x000d:
                   1811:         case 0x0085:
                   1812:         case 0x2028:
                   1813:         case 0x2029:
                   1814:         ADD_NEW(state_offset + 1, 0);
                   1815:         break;
                   1816: 
                   1817:         default: break;
                   1818:         }
                   1819:       break;
                   1820: 
                   1821:       /*-----------------------------------------------------------------*/
                   1822:       case OP_NOT_HSPACE:
                   1823:       if (clen > 0) switch(c)
                   1824:         {
                   1825:         case 0x09:      /* HT */
                   1826:         case 0x20:      /* SPACE */
                   1827:         case 0xa0:      /* NBSP */
                   1828:         case 0x1680:    /* OGHAM SPACE MARK */
                   1829:         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
                   1830:         case 0x2000:    /* EN QUAD */
                   1831:         case 0x2001:    /* EM QUAD */
                   1832:         case 0x2002:    /* EN SPACE */
                   1833:         case 0x2003:    /* EM SPACE */
                   1834:         case 0x2004:    /* THREE-PER-EM SPACE */
                   1835:         case 0x2005:    /* FOUR-PER-EM SPACE */
                   1836:         case 0x2006:    /* SIX-PER-EM SPACE */
                   1837:         case 0x2007:    /* FIGURE SPACE */
                   1838:         case 0x2008:    /* PUNCTUATION SPACE */
                   1839:         case 0x2009:    /* THIN SPACE */
                   1840:         case 0x200A:    /* HAIR SPACE */
                   1841:         case 0x202f:    /* NARROW NO-BREAK SPACE */
                   1842:         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
                   1843:         case 0x3000:    /* IDEOGRAPHIC SPACE */
                   1844:         break;
                   1845: 
                   1846:         default:
                   1847:         ADD_NEW(state_offset + 1, 0);
                   1848:         break;
                   1849:         }
                   1850:       break;
                   1851: 
                   1852:       /*-----------------------------------------------------------------*/
                   1853:       case OP_HSPACE:
                   1854:       if (clen > 0) switch(c)
                   1855:         {
                   1856:         case 0x09:      /* HT */
                   1857:         case 0x20:      /* SPACE */
                   1858:         case 0xa0:      /* NBSP */
                   1859:         case 0x1680:    /* OGHAM SPACE MARK */
                   1860:         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
                   1861:         case 0x2000:    /* EN QUAD */
                   1862:         case 0x2001:    /* EM QUAD */
                   1863:         case 0x2002:    /* EN SPACE */
                   1864:         case 0x2003:    /* EM SPACE */
                   1865:         case 0x2004:    /* THREE-PER-EM SPACE */
                   1866:         case 0x2005:    /* FOUR-PER-EM SPACE */
                   1867:         case 0x2006:    /* SIX-PER-EM SPACE */
                   1868:         case 0x2007:    /* FIGURE SPACE */
                   1869:         case 0x2008:    /* PUNCTUATION SPACE */
                   1870:         case 0x2009:    /* THIN SPACE */
                   1871:         case 0x200A:    /* HAIR SPACE */
                   1872:         case 0x202f:    /* NARROW NO-BREAK SPACE */
                   1873:         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
                   1874:         case 0x3000:    /* IDEOGRAPHIC SPACE */
                   1875:         ADD_NEW(state_offset + 1, 0);
                   1876:         break;
                   1877:         }
                   1878:       break;
                   1879: 
                   1880:       /*-----------------------------------------------------------------*/
                   1881:       /* Match a negated single character. This is only used for one-byte
                   1882:       characters, that is, we know that d < 256. The character we are
                   1883:       checking (c) can be multibyte. */
                   1884: 
                   1885:       case OP_NOT:
                   1886:       if (clen > 0)
                   1887:         {
                   1888:         unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
                   1889:         if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
                   1890:         }
                   1891:       break;
                   1892: 
                   1893:       /*-----------------------------------------------------------------*/
                   1894:       case OP_PLUS:
                   1895:       case OP_MINPLUS:
                   1896:       case OP_POSPLUS:
                   1897:       case OP_NOTPLUS:
                   1898:       case OP_NOTMINPLUS:
                   1899:       case OP_NOTPOSPLUS:
                   1900:       count = current_state->count;  /* Already matched */
                   1901:       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
                   1902:       if (clen > 0)
                   1903:         {
                   1904:         unsigned int otherd = NOTACHAR;
                   1905:         if ((ims & PCRE_CASELESS) != 0)
                   1906:           {
                   1907: #ifdef SUPPORT_UTF8
                   1908:           if (utf8 && d >= 128)
                   1909:             {
                   1910: #ifdef SUPPORT_UCP
1.2     ! misha    1911:             otherd = UCD_OTHERCASE(d);
1.1       misha    1912: #endif  /* SUPPORT_UCP */
                   1913:             }
                   1914:           else
                   1915: #endif  /* SUPPORT_UTF8 */
                   1916:           otherd = fcc[d];
                   1917:           }
                   1918:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
                   1919:           {
                   1920:           if (count > 0 &&
                   1921:               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
                   1922:             {
                   1923:             active_count--;             /* Remove non-match possibility */
                   1924:             next_active_state--;
                   1925:             }
                   1926:           count++;
                   1927:           ADD_NEW(state_offset, count);
                   1928:           }
                   1929:         }
                   1930:       break;
                   1931: 
                   1932:       /*-----------------------------------------------------------------*/
                   1933:       case OP_QUERY:
                   1934:       case OP_MINQUERY:
                   1935:       case OP_POSQUERY:
                   1936:       case OP_NOTQUERY:
                   1937:       case OP_NOTMINQUERY:
                   1938:       case OP_NOTPOSQUERY:
                   1939:       ADD_ACTIVE(state_offset + dlen + 1, 0);
                   1940:       if (clen > 0)
                   1941:         {
                   1942:         unsigned int otherd = NOTACHAR;
                   1943:         if ((ims & PCRE_CASELESS) != 0)
                   1944:           {
                   1945: #ifdef SUPPORT_UTF8
                   1946:           if (utf8 && d >= 128)
                   1947:             {
                   1948: #ifdef SUPPORT_UCP
1.2     ! misha    1949:             otherd = UCD_OTHERCASE(d);
1.1       misha    1950: #endif  /* SUPPORT_UCP */
                   1951:             }
                   1952:           else
                   1953: #endif  /* SUPPORT_UTF8 */
                   1954:           otherd = fcc[d];
                   1955:           }
                   1956:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
                   1957:           {
                   1958:           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
                   1959:             {
                   1960:             active_count--;            /* Remove non-match possibility */
                   1961:             next_active_state--;
                   1962:             }
                   1963:           ADD_NEW(state_offset + dlen + 1, 0);
                   1964:           }
                   1965:         }
                   1966:       break;
                   1967: 
                   1968:       /*-----------------------------------------------------------------*/
                   1969:       case OP_STAR:
                   1970:       case OP_MINSTAR:
                   1971:       case OP_POSSTAR:
                   1972:       case OP_NOTSTAR:
                   1973:       case OP_NOTMINSTAR:
                   1974:       case OP_NOTPOSSTAR:
                   1975:       ADD_ACTIVE(state_offset + dlen + 1, 0);
                   1976:       if (clen > 0)
                   1977:         {
                   1978:         unsigned int otherd = NOTACHAR;
                   1979:         if ((ims & PCRE_CASELESS) != 0)
                   1980:           {
                   1981: #ifdef SUPPORT_UTF8
                   1982:           if (utf8 && d >= 128)
                   1983:             {
                   1984: #ifdef SUPPORT_UCP
1.2     ! misha    1985:             otherd = UCD_OTHERCASE(d);
1.1       misha    1986: #endif  /* SUPPORT_UCP */
                   1987:             }
                   1988:           else
                   1989: #endif  /* SUPPORT_UTF8 */
                   1990:           otherd = fcc[d];
                   1991:           }
                   1992:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
                   1993:           {
                   1994:           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
                   1995:             {
                   1996:             active_count--;            /* Remove non-match possibility */
                   1997:             next_active_state--;
                   1998:             }
                   1999:           ADD_NEW(state_offset, 0);
                   2000:           }
                   2001:         }
                   2002:       break;
                   2003: 
                   2004:       /*-----------------------------------------------------------------*/
                   2005:       case OP_EXACT:
                   2006:       case OP_NOTEXACT:
                   2007:       count = current_state->count;  /* Number already matched */
                   2008:       if (clen > 0)
                   2009:         {
                   2010:         unsigned int otherd = NOTACHAR;
                   2011:         if ((ims & PCRE_CASELESS) != 0)
                   2012:           {
                   2013: #ifdef SUPPORT_UTF8
                   2014:           if (utf8 && d >= 128)
                   2015:             {
                   2016: #ifdef SUPPORT_UCP
1.2     ! misha    2017:             otherd = UCD_OTHERCASE(d);
1.1       misha    2018: #endif  /* SUPPORT_UCP */
                   2019:             }
                   2020:           else
                   2021: #endif  /* SUPPORT_UTF8 */
                   2022:           otherd = fcc[d];
                   2023:           }
                   2024:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
                   2025:           {
                   2026:           if (++count >= GET2(code, 1))
                   2027:             { ADD_NEW(state_offset + dlen + 3, 0); }
                   2028:           else
                   2029:             { ADD_NEW(state_offset, count); }
                   2030:           }
                   2031:         }
                   2032:       break;
                   2033: 
                   2034:       /*-----------------------------------------------------------------*/
                   2035:       case OP_UPTO:
                   2036:       case OP_MINUPTO:
                   2037:       case OP_POSUPTO:
                   2038:       case OP_NOTUPTO:
                   2039:       case OP_NOTMINUPTO:
                   2040:       case OP_NOTPOSUPTO:
                   2041:       ADD_ACTIVE(state_offset + dlen + 3, 0);
                   2042:       count = current_state->count;  /* Number already matched */
                   2043:       if (clen > 0)
                   2044:         {
                   2045:         unsigned int otherd = NOTACHAR;
                   2046:         if ((ims & PCRE_CASELESS) != 0)
                   2047:           {
                   2048: #ifdef SUPPORT_UTF8
                   2049:           if (utf8 && d >= 128)
                   2050:             {
                   2051: #ifdef SUPPORT_UCP
1.2     ! misha    2052:             otherd = UCD_OTHERCASE(d);
1.1       misha    2053: #endif  /* SUPPORT_UCP */
                   2054:             }
                   2055:           else
                   2056: #endif  /* SUPPORT_UTF8 */
                   2057:           otherd = fcc[d];
                   2058:           }
                   2059:         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
                   2060:           {
                   2061:           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
                   2062:             {
                   2063:             active_count--;             /* Remove non-match possibility */
                   2064:             next_active_state--;
                   2065:             }
                   2066:           if (++count >= GET2(code, 1))
                   2067:             { ADD_NEW(state_offset + dlen + 3, 0); }
                   2068:           else
                   2069:             { ADD_NEW(state_offset, count); }
                   2070:           }
                   2071:         }
                   2072:       break;
                   2073: 
                   2074: 
                   2075: /* ========================================================================== */
                   2076:       /* These are the class-handling opcodes */
                   2077: 
                   2078:       case OP_CLASS:
                   2079:       case OP_NCLASS:
                   2080:       case OP_XCLASS:
                   2081:         {
                   2082:         BOOL isinclass = FALSE;
                   2083:         int next_state_offset;
                   2084:         const uschar *ecode;
                   2085: 
                   2086:         /* For a simple class, there is always just a 32-byte table, and we
                   2087:         can set isinclass from it. */
                   2088: 
                   2089:         if (codevalue != OP_XCLASS)
                   2090:           {
                   2091:           ecode = code + 33;
                   2092:           if (clen > 0)
                   2093:             {
                   2094:             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
                   2095:               ((code[1 + c/8] & (1 << (c&7))) != 0);
                   2096:             }
                   2097:           }
                   2098: 
                   2099:         /* An extended class may have a table or a list of single characters,
                   2100:         ranges, or both, and it may be positive or negative. There's a
                   2101:         function that sorts all this out. */
                   2102: 
                   2103:         else
                   2104:          {
                   2105:          ecode = code + GET(code, 1);
                   2106:          if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
                   2107:          }
                   2108: 
                   2109:         /* At this point, isinclass is set for all kinds of class, and ecode
                   2110:         points to the byte after the end of the class. If there is a
                   2111:         quantifier, this is where it will be. */
                   2112: 
                   2113:         next_state_offset = ecode - start_code;
                   2114: 
                   2115:         switch (*ecode)
                   2116:           {
                   2117:           case OP_CRSTAR:
                   2118:           case OP_CRMINSTAR:
                   2119:           ADD_ACTIVE(next_state_offset + 1, 0);
                   2120:           if (isinclass) { ADD_NEW(state_offset, 0); }
                   2121:           break;
                   2122: 
                   2123:           case OP_CRPLUS:
                   2124:           case OP_CRMINPLUS:
                   2125:           count = current_state->count;  /* Already matched */
                   2126:           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
                   2127:           if (isinclass) { count++; ADD_NEW(state_offset, count); }
                   2128:           break;
                   2129: 
                   2130:           case OP_CRQUERY:
                   2131:           case OP_CRMINQUERY:
                   2132:           ADD_ACTIVE(next_state_offset + 1, 0);
                   2133:           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
                   2134:           break;
                   2135: 
                   2136:           case OP_CRRANGE:
                   2137:           case OP_CRMINRANGE:
                   2138:           count = current_state->count;  /* Already matched */
                   2139:           if (count >= GET2(ecode, 1))
                   2140:             { ADD_ACTIVE(next_state_offset + 5, 0); }
                   2141:           if (isinclass)
                   2142:             {
                   2143:             int max = GET2(ecode, 3);
                   2144:             if (++count >= max && max != 0)   /* Max 0 => no limit */
                   2145:               { ADD_NEW(next_state_offset + 5, 0); }
                   2146:             else
                   2147:               { ADD_NEW(state_offset, count); }
                   2148:             }
                   2149:           break;
                   2150: 
                   2151:           default:
                   2152:           if (isinclass) { ADD_NEW(next_state_offset, 0); }
                   2153:           break;
                   2154:           }
                   2155:         }
                   2156:       break;
                   2157: 
                   2158: /* ========================================================================== */
                   2159:       /* These are the opcodes for fancy brackets of various kinds. We have
                   2160:       to use recursion in order to handle them. The "always failing" assertion
                   2161:       (?!) is optimised when compiling to OP_FAIL, so we have to support that,
                   2162:       though the other "backtracking verbs" are not supported. */
                   2163: 
                   2164:       case OP_FAIL:
                   2165:       break;
                   2166: 
                   2167:       case OP_ASSERT:
                   2168:       case OP_ASSERT_NOT:
                   2169:       case OP_ASSERTBACK:
                   2170:       case OP_ASSERTBACK_NOT:
                   2171:         {
                   2172:         int rc;
                   2173:         int local_offsets[2];
                   2174:         int local_workspace[1000];
                   2175:         const uschar *endasscode = code + GET(code, 1);
                   2176: 
                   2177:         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
                   2178: 
                   2179:         rc = internal_dfa_exec(
                   2180:           md,                                   /* static match data */
                   2181:           code,                                 /* this subexpression's code */
                   2182:           ptr,                                  /* where we currently are */
                   2183:           ptr - start_subject,                  /* start offset */
                   2184:           local_offsets,                        /* offset vector */
                   2185:           sizeof(local_offsets)/sizeof(int),    /* size of same */
                   2186:           local_workspace,                      /* workspace vector */
                   2187:           sizeof(local_workspace)/sizeof(int),  /* size of same */
                   2188:           ims,                                  /* the current ims flags */
                   2189:           rlevel,                               /* function recursion level */
                   2190:           recursing);                           /* pass on regex recursion */
                   2191: 
                   2192:         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
                   2193:             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
                   2194:         }
                   2195:       break;
                   2196: 
                   2197:       /*-----------------------------------------------------------------*/
                   2198:       case OP_COND:
                   2199:       case OP_SCOND:
                   2200:         {
                   2201:         int local_offsets[1000];
                   2202:         int local_workspace[1000];
                   2203:         int condcode = code[LINK_SIZE+1];
                   2204: 
                   2205:         /* Back reference conditions are not supported */
                   2206: 
                   2207:         if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
                   2208: 
                   2209:         /* The DEFINE condition is always false */
                   2210: 
                   2211:         if (condcode == OP_DEF)
                   2212:           {
                   2213:           ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0);
                   2214:           }
                   2215: 
                   2216:         /* The only supported version of OP_RREF is for the value RREF_ANY,
                   2217:         which means "test if in any recursion". We can't test for specifically
                   2218:         recursed groups. */
                   2219: 
                   2220:         else if (condcode == OP_RREF)
                   2221:           {
                   2222:           int value = GET2(code, LINK_SIZE+2);
                   2223:           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
                   2224:           if (recursing > 0) { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
                   2225:             else { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
                   2226:           }
                   2227: 
                   2228:         /* Otherwise, the condition is an assertion */
                   2229: 
                   2230:         else
                   2231:           {
                   2232:           int rc;
                   2233:           const uschar *asscode = code + LINK_SIZE + 1;
                   2234:           const uschar *endasscode = asscode + GET(asscode, 1);
                   2235: 
                   2236:           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
                   2237: 
                   2238:           rc = internal_dfa_exec(
                   2239:             md,                                   /* fixed match data */
                   2240:             asscode,                              /* this subexpression's code */
                   2241:             ptr,                                  /* where we currently are */
                   2242:             ptr - start_subject,                  /* start offset */
                   2243:             local_offsets,                        /* offset vector */
                   2244:             sizeof(local_offsets)/sizeof(int),    /* size of same */
                   2245:             local_workspace,                      /* workspace vector */
                   2246:             sizeof(local_workspace)/sizeof(int),  /* size of same */
                   2247:             ims,                                  /* the current ims flags */
                   2248:             rlevel,                               /* function recursion level */
                   2249:             recursing);                           /* pass on regex recursion */
                   2250: 
                   2251:           if ((rc >= 0) ==
                   2252:                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
                   2253:             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
                   2254:           else
                   2255:             { ADD_ACTIVE(state_offset + GET(code, 1) + LINK_SIZE + 1, 0); }
                   2256:           }
                   2257:         }
                   2258:       break;
                   2259: 
                   2260:       /*-----------------------------------------------------------------*/
                   2261:       case OP_RECURSE:
                   2262:         {
                   2263:         int local_offsets[1000];
                   2264:         int local_workspace[1000];
                   2265:         int rc;
                   2266: 
                   2267:         DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
                   2268:           recursing + 1));
                   2269: 
                   2270:         rc = internal_dfa_exec(
                   2271:           md,                                   /* fixed match data */
                   2272:           start_code + GET(code, 1),            /* this subexpression's code */
                   2273:           ptr,                                  /* where we currently are */
                   2274:           ptr - start_subject,                  /* start offset */
                   2275:           local_offsets,                        /* offset vector */
                   2276:           sizeof(local_offsets)/sizeof(int),    /* size of same */
                   2277:           local_workspace,                      /* workspace vector */
                   2278:           sizeof(local_workspace)/sizeof(int),  /* size of same */
                   2279:           ims,                                  /* the current ims flags */
                   2280:           rlevel,                               /* function recursion level */
                   2281:           recursing + 1);                       /* regex recurse level */
                   2282: 
                   2283:         DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
                   2284:           recursing + 1, rc));
                   2285: 
                   2286:         /* Ran out of internal offsets */
                   2287: 
                   2288:         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
                   2289: 
                   2290:         /* For each successful matched substring, set up the next state with a
                   2291:         count of characters to skip before trying it. Note that the count is in
                   2292:         characters, not bytes. */
                   2293: 
                   2294:         if (rc > 0)
                   2295:           {
                   2296:           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
                   2297:             {
                   2298:             const uschar *p = start_subject + local_offsets[rc];
                   2299:             const uschar *pp = start_subject + local_offsets[rc+1];
                   2300:             int charcount = local_offsets[rc+1] - local_offsets[rc];
                   2301:             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
                   2302:             if (charcount > 0)
                   2303:               {
                   2304:               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
                   2305:               }
                   2306:             else
                   2307:               {
                   2308:               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
                   2309:               }
                   2310:             }
                   2311:           }
                   2312:         else if (rc != PCRE_ERROR_NOMATCH) return rc;
                   2313:         }
                   2314:       break;
                   2315: 
                   2316:       /*-----------------------------------------------------------------*/
                   2317:       case OP_ONCE:
                   2318:         {
                   2319:         int local_offsets[2];
                   2320:         int local_workspace[1000];
                   2321: 
                   2322:         int rc = internal_dfa_exec(
                   2323:           md,                                   /* fixed match data */
                   2324:           code,                                 /* this subexpression's code */
                   2325:           ptr,                                  /* where we currently are */
                   2326:           ptr - start_subject,                  /* start offset */
                   2327:           local_offsets,                        /* offset vector */
                   2328:           sizeof(local_offsets)/sizeof(int),    /* size of same */
                   2329:           local_workspace,                      /* workspace vector */
                   2330:           sizeof(local_workspace)/sizeof(int),  /* size of same */
                   2331:           ims,                                  /* the current ims flags */
                   2332:           rlevel,                               /* function recursion level */
                   2333:           recursing);                           /* pass on regex recursion */
                   2334: 
                   2335:         if (rc >= 0)
                   2336:           {
                   2337:           const uschar *end_subpattern = code;
                   2338:           int charcount = local_offsets[1] - local_offsets[0];
                   2339:           int next_state_offset, repeat_state_offset;
                   2340: 
                   2341:           do { end_subpattern += GET(end_subpattern, 1); }
                   2342:             while (*end_subpattern == OP_ALT);
                   2343:           next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
                   2344: 
                   2345:           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
                   2346:           arrange for the repeat state also to be added to the relevant list.
                   2347:           Calculate the offset, or set -1 for no repeat. */
                   2348: 
                   2349:           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
                   2350:                                  *end_subpattern == OP_KETRMIN)?
                   2351:             end_subpattern - start_code - GET(end_subpattern, 1) : -1;
                   2352: 
                   2353:           /* If we have matched an empty string, add the next state at the
                   2354:           current character pointer. This is important so that the duplicate
                   2355:           checking kicks in, which is what breaks infinite loops that match an
                   2356:           empty string. */
                   2357: 
                   2358:           if (charcount == 0)
                   2359:             {
                   2360:             ADD_ACTIVE(next_state_offset, 0);
                   2361:             }
                   2362: 
                   2363:           /* Optimization: if there are no more active states, and there
                   2364:           are no new states yet set up, then skip over the subject string
                   2365:           right here, to save looping. Otherwise, set up the new state to swing
                   2366:           into action when the end of the substring is reached. */
                   2367: 
                   2368:           else if (i + 1 >= active_count && new_count == 0)
                   2369:             {
                   2370:             ptr += charcount;
                   2371:             clen = 0;
                   2372:             ADD_NEW(next_state_offset, 0);
                   2373: 
                   2374:             /* If we are adding a repeat state at the new character position,
                   2375:             we must fudge things so that it is the only current state.
                   2376:             Otherwise, it might be a duplicate of one we processed before, and
                   2377:             that would cause it to be skipped. */
                   2378: 
                   2379:             if (repeat_state_offset >= 0)
                   2380:               {
                   2381:               next_active_state = active_states;
                   2382:               active_count = 0;
                   2383:               i = -1;
                   2384:               ADD_ACTIVE(repeat_state_offset, 0);
                   2385:               }
                   2386:             }
                   2387:           else
                   2388:             {
                   2389:             const uschar *p = start_subject + local_offsets[0];
                   2390:             const uschar *pp = start_subject + local_offsets[1];
                   2391:             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
                   2392:             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
                   2393:             if (repeat_state_offset >= 0)
                   2394:               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
                   2395:             }
                   2396: 
                   2397:           }
                   2398:         else if (rc != PCRE_ERROR_NOMATCH) return rc;
                   2399:         }
                   2400:       break;
                   2401: 
                   2402: 
                   2403: /* ========================================================================== */
                   2404:       /* Handle callouts */
                   2405: 
                   2406:       case OP_CALLOUT:
                   2407:       if (pcre_callout != NULL)
                   2408:         {
                   2409:         int rrc;
                   2410:         pcre_callout_block cb;
                   2411:         cb.version          = 1;   /* Version 1 of the callout block */
                   2412:         cb.callout_number   = code[1];
                   2413:         cb.offset_vector    = offsets;
                   2414:         cb.subject          = (PCRE_SPTR)start_subject;
                   2415:         cb.subject_length   = end_subject - start_subject;
                   2416:         cb.start_match      = current_subject - start_subject;
                   2417:         cb.current_position = ptr - start_subject;
                   2418:         cb.pattern_position = GET(code, 2);
                   2419:         cb.next_item_length = GET(code, 2 + LINK_SIZE);
                   2420:         cb.capture_top      = 1;
                   2421:         cb.capture_last     = -1;
                   2422:         cb.callout_data     = md->callout_data;
                   2423:         if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
                   2424:         if (rrc == 0) { ADD_ACTIVE(state_offset + 2 + 2*LINK_SIZE, 0); }
                   2425:         }
                   2426:       break;
                   2427: 
                   2428: 
                   2429: /* ========================================================================== */
                   2430:       default:        /* Unsupported opcode */
                   2431:       return PCRE_ERROR_DFA_UITEM;
                   2432:       }
                   2433: 
                   2434:     NEXT_ACTIVE_STATE: continue;
                   2435: 
                   2436:     }      /* End of loop scanning active states */
                   2437: 
                   2438:   /* We have finished the processing at the current subject character. If no
                   2439:   new states have been set for the next character, we have found all the
                   2440:   matches that we are going to find. If we are at the top level and partial
                   2441:   matching has been requested, check for appropriate conditions. */
                   2442: 
                   2443:   if (new_count <= 0)
                   2444:     {
                   2445:     if (match_count < 0 &&                     /* No matches found */
                   2446:         rlevel == 1 &&                         /* Top level match function */
                   2447:         (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */
                   2448:         ptr >= end_subject &&                  /* Reached end of subject */
                   2449:         ptr > current_subject)                 /* Matched non-empty string */
                   2450:       {
                   2451:       if (offsetcount >= 2)
                   2452:         {
                   2453:         offsets[0] = current_subject - start_subject;
                   2454:         offsets[1] = end_subject - start_subject;
                   2455:         }
                   2456:       match_count = PCRE_ERROR_PARTIAL;
                   2457:       }
                   2458: 
                   2459:     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
                   2460:       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
                   2461:       rlevel*2-2, SP));
                   2462:     break;        /* In effect, "return", but see the comment below */
                   2463:     }
                   2464: 
                   2465:   /* One or more states are active for the next character. */
                   2466: 
                   2467:   ptr += clen;    /* Advance to next subject character */
                   2468:   }               /* Loop to move along the subject string */
                   2469: 
                   2470: /* Control gets here from "break" a few lines above. We do it this way because
                   2471: if we use "return" above, we have compiler trouble. Some compilers warn if
                   2472: there's nothing here because they think the function doesn't return a value. On
                   2473: the other hand, if we put a dummy statement here, some more clever compilers
                   2474: complain that it can't be reached. Sigh. */
                   2475: 
                   2476: return match_count;
                   2477: }
                   2478: 
                   2479: 
                   2480: 
                   2481: 
                   2482: /*************************************************
                   2483: *    Execute a Regular Expression - DFA engine   *
                   2484: *************************************************/
                   2485: 
                   2486: /* This external function applies a compiled regular expression to a subject
                   2487: string using a DFA engine. If the pattern is not anchored, the internal matching
                   2488: function may be called more than once, looping after each failed match.
                   2489: 
                   2490: Arguments:
                   2491:   argument_re     points to the compiled expression
                   2492:   extra_data      points to extra data or is NULL
                   2493:   subject         points to the subject string
                   2494:   length          length of subject string (may contain binary zeros)
                   2495:   start_offset    where to start in the subject string
                   2496:   options         option bits
                   2497:   offsets         vector of match offsets
                   2498:   offsetcount     size of same
                   2499:   workspace       workspace vector
                   2500:   wscount         size of same
                   2501: 
                   2502: Returns:          > 0 => number of match offset pairs placed in offsets
                   2503:                   = 0 => offsets overflowed; longest matches are present
                   2504:                    -1 => failed to match
                   2505:                  < -1 => some kind of unexpected problem
                   2506: */
                   2507: 
1.2     ! misha    2508: PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
1.1       misha    2509: pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
                   2510:   const char *subject, int length, int start_offset, int options, int *offsets,
                   2511:   int offsetcount, int *workspace, int wscount)
                   2512: {
                   2513: real_pcre *re = (real_pcre *)argument_re;
                   2514: dfa_match_data match_block;
                   2515: dfa_match_data *md = &match_block;
                   2516: BOOL utf8, anchored, startline, firstline;
                   2517: const uschar *current_subject, *end_subject, *lcc;
                   2518: 
                   2519: pcre_study_data internal_study;
                   2520: const pcre_study_data *study = NULL;
                   2521: real_pcre internal_re;
                   2522: 
                   2523: const uschar *req_byte_ptr;
                   2524: const uschar *start_bits = NULL;
                   2525: BOOL first_byte_caseless = FALSE;
                   2526: BOOL req_byte_caseless = FALSE;
                   2527: int first_byte = -1;
                   2528: int req_byte = -1;
                   2529: int req_byte2 = -1;
                   2530: int newline;
                   2531: 
                   2532: /* Plausibility checks */
                   2533: 
                   2534: if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
                   2535: if (re == NULL || subject == NULL || workspace == NULL ||
                   2536:    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
                   2537: if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
                   2538: if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
                   2539: 
                   2540: /* We need to find the pointer to any study data before we test for byte
                   2541: flipping, so we scan the extra_data block first. This may set two fields in the
                   2542: match block, so we must initialize them beforehand. However, the other fields
                   2543: in the match block must not be set until after the byte flipping. */
                   2544: 
                   2545: md->tables = re->tables;
                   2546: md->callout_data = NULL;
                   2547: 
                   2548: if (extra_data != NULL)
                   2549:   {
                   2550:   unsigned int flags = extra_data->flags;
                   2551:   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
                   2552:     study = (const pcre_study_data *)extra_data->study_data;
                   2553:   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
                   2554:   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
                   2555:     return PCRE_ERROR_DFA_UMLIMIT;
                   2556:   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
                   2557:     md->callout_data = extra_data->callout_data;
                   2558:   if ((flags & PCRE_EXTRA_TABLES) != 0)
                   2559:     md->tables = extra_data->tables;
                   2560:   }
                   2561: 
                   2562: /* Check that the first field in the block is the magic number. If it is not,
                   2563: test for a regex that was compiled on a host of opposite endianness. If this is
                   2564: the case, flipped values are put in internal_re and internal_study if there was
                   2565: study data too. */
                   2566: 
                   2567: if (re->magic_number != MAGIC_NUMBER)
                   2568:   {
                   2569:   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
                   2570:   if (re == NULL) return PCRE_ERROR_BADMAGIC;
                   2571:   if (study != NULL) study = &internal_study;
                   2572:   }
                   2573: 
                   2574: /* Set some local values */
                   2575: 
                   2576: current_subject = (const unsigned char *)subject + start_offset;
                   2577: end_subject = (const unsigned char *)subject + length;
                   2578: req_byte_ptr = current_subject - 1;
                   2579: 
                   2580: #ifdef SUPPORT_UTF8
                   2581: utf8 = (re->options & PCRE_UTF8) != 0;
                   2582: #else
                   2583: utf8 = FALSE;
                   2584: #endif
                   2585: 
                   2586: anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
                   2587:   (re->options & PCRE_ANCHORED) != 0;
                   2588: 
                   2589: /* The remaining fixed data for passing around. */
                   2590: 
                   2591: md->start_code = (const uschar *)argument_re +
                   2592:     re->name_table_offset + re->name_count * re->name_entry_size;
                   2593: md->start_subject = (const unsigned char *)subject;
                   2594: md->end_subject = end_subject;
                   2595: md->moptions = options;
                   2596: md->poptions = re->options;
                   2597: 
                   2598: /* If the BSR option is not set at match time, copy what was set
                   2599: at compile time. */
                   2600: 
                   2601: if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
                   2602:   {
                   2603:   if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
                   2604:     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
                   2605: #ifdef BSR_ANYCRLF
                   2606:   else md->moptions |= PCRE_BSR_ANYCRLF;
                   2607: #endif
                   2608:   }
                   2609: 
                   2610: /* Handle different types of newline. The three bits give eight cases. If
                   2611: nothing is set at run time, whatever was used at compile time applies. */
                   2612: 
                   2613: switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
                   2614:          PCRE_NEWLINE_BITS)
                   2615:   {
                   2616:   case 0: newline = NEWLINE; break;   /* Compile-time default */
                   2617:   case PCRE_NEWLINE_CR: newline = '\r'; break;
                   2618:   case PCRE_NEWLINE_LF: newline = '\n'; break;
                   2619:   case PCRE_NEWLINE_CR+
                   2620:        PCRE_NEWLINE_LF: newline = ('\r' << 8) | '\n'; break;
                   2621:   case PCRE_NEWLINE_ANY: newline = -1; break;
                   2622:   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
                   2623:   default: return PCRE_ERROR_BADNEWLINE;
                   2624:   }
                   2625: 
                   2626: if (newline == -2)
                   2627:   {
                   2628:   md->nltype = NLTYPE_ANYCRLF;
                   2629:   }
                   2630: else if (newline < 0)
                   2631:   {
                   2632:   md->nltype = NLTYPE_ANY;
                   2633:   }
                   2634: else
                   2635:   {
                   2636:   md->nltype = NLTYPE_FIXED;
                   2637:   if (newline > 255)
                   2638:     {
                   2639:     md->nllen = 2;
                   2640:     md->nl[0] = (newline >> 8) & 255;
                   2641:     md->nl[1] = newline & 255;
                   2642:     }
                   2643:   else
                   2644:     {
                   2645:     md->nllen = 1;
                   2646:     md->nl[0] = newline;
                   2647:     }
                   2648:   }
                   2649: 
                   2650: /* Check a UTF-8 string if required. Unfortunately there's no way of passing
                   2651: back the character offset. */
                   2652: 
                   2653: #ifdef SUPPORT_UTF8
                   2654: if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
                   2655:   {
                   2656:   if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
                   2657:     return PCRE_ERROR_BADUTF8;
                   2658:   if (start_offset > 0 && start_offset < length)
                   2659:     {
                   2660:     int tb = ((uschar *)subject)[start_offset];
                   2661:     if (tb > 127)
                   2662:       {
                   2663:       tb &= 0xc0;
                   2664:       if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
                   2665:       }
                   2666:     }
                   2667:   }
                   2668: #endif
                   2669: 
                   2670: /* If the exec call supplied NULL for tables, use the inbuilt ones. This
                   2671: is a feature that makes it possible to save compiled regex and re-use them
                   2672: in other programs later. */
                   2673: 
                   2674: if (md->tables == NULL) md->tables = _pcre_default_tables;
                   2675: 
                   2676: /* The lower casing table and the "must be at the start of a line" flag are
                   2677: used in a loop when finding where to start. */
                   2678: 
                   2679: lcc = md->tables + lcc_offset;
                   2680: startline = (re->flags & PCRE_STARTLINE) != 0;
                   2681: firstline = (re->options & PCRE_FIRSTLINE) != 0;
                   2682: 
                   2683: /* Set up the first character to match, if available. The first_byte value is
                   2684: never set for an anchored regular expression, but the anchoring may be forced
                   2685: at run time, so we have to test for anchoring. The first char may be unset for
                   2686: an unanchored pattern, of course. If there's no first char and the pattern was
                   2687: studied, there may be a bitmap of possible first characters. */
                   2688: 
                   2689: if (!anchored)
                   2690:   {
                   2691:   if ((re->flags & PCRE_FIRSTSET) != 0)
                   2692:     {
                   2693:     first_byte = re->first_byte & 255;
                   2694:     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
                   2695:       first_byte = lcc[first_byte];
                   2696:     }
                   2697:   else
                   2698:     {
                   2699:     if (startline && study != NULL &&
                   2700:          (study->options & PCRE_STUDY_MAPPED) != 0)
                   2701:       start_bits = study->start_bits;
                   2702:     }
                   2703:   }
                   2704: 
                   2705: /* For anchored or unanchored matches, there may be a "last known required
                   2706: character" set. */
                   2707: 
                   2708: if ((re->flags & PCRE_REQCHSET) != 0)
                   2709:   {
                   2710:   req_byte = re->req_byte & 255;
                   2711:   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
                   2712:   req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
                   2713:   }
                   2714: 
                   2715: /* Call the main matching function, looping for a non-anchored regex after a
                   2716: failed match. Unless restarting, optimize by moving to the first match
                   2717: character if possible, when not anchored. Then unless wanting a partial match,
                   2718: check for a required later character. */
                   2719: 
                   2720: for (;;)
                   2721:   {
                   2722:   int rc;
                   2723: 
                   2724:   if ((options & PCRE_DFA_RESTART) == 0)
                   2725:     {
                   2726:     const uschar *save_end_subject = end_subject;
                   2727: 
                   2728:     /* Advance to a unique first char if possible. If firstline is TRUE, the
                   2729:     start of the match is constrained to the first line of a multiline string.
                   2730:     Implement this by temporarily adjusting end_subject so that we stop
                   2731:     scanning at a newline. If the match fails at the newline, later code breaks
                   2732:     this loop. */
                   2733: 
                   2734:     if (firstline)
                   2735:       {
1.2     ! misha    2736:       USPTR t = current_subject;
        !          2737: #ifdef SUPPORT_UTF8
        !          2738:       if (utf8)
        !          2739:         {
        !          2740:         while (t < md->end_subject && !IS_NEWLINE(t))
        !          2741:           {
        !          2742:           t++;
        !          2743:           while (t < end_subject && (*t & 0xc0) == 0x80) t++;
        !          2744:           }
        !          2745:         }
        !          2746:       else
        !          2747: #endif
1.1       misha    2748:       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
                   2749:       end_subject = t;
                   2750:       }
                   2751: 
                   2752:     if (first_byte >= 0)
                   2753:       {
                   2754:       if (first_byte_caseless)
                   2755:         while (current_subject < end_subject &&
                   2756:                lcc[*current_subject] != first_byte)
                   2757:           current_subject++;
                   2758:       else
                   2759:         while (current_subject < end_subject && *current_subject != first_byte)
                   2760:           current_subject++;
                   2761:       }
                   2762: 
                   2763:     /* Or to just after a linebreak for a multiline match if possible */
                   2764: 
                   2765:     else if (startline)
                   2766:       {
                   2767:       if (current_subject > md->start_subject + start_offset)
                   2768:         {
1.2     ! misha    2769: #ifdef SUPPORT_UTF8
        !          2770:         if (utf8)
        !          2771:           {
        !          2772:           while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
        !          2773:             {
        !          2774:             current_subject++;
        !          2775:             while(current_subject < end_subject &&
        !          2776:                   (*current_subject & 0xc0) == 0x80)
        !          2777:               current_subject++;
        !          2778:             }
        !          2779:           }
        !          2780:         else
        !          2781: #endif
        !          2782:         while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
1.1       misha    2783:           current_subject++;
                   2784: 
                   2785:         /* If we have just passed a CR and the newline option is ANY or
                   2786:         ANYCRLF, and we are now at a LF, advance the match position by one more
                   2787:         character. */
                   2788: 
                   2789:         if (current_subject[-1] == '\r' &&
                   2790:              (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
                   2791:              current_subject < end_subject &&
                   2792:              *current_subject == '\n')
                   2793:           current_subject++;
                   2794:         }
                   2795:       }
                   2796: 
                   2797:     /* Or to a non-unique first char after study */
                   2798: 
                   2799:     else if (start_bits != NULL)
                   2800:       {
                   2801:       while (current_subject < end_subject)
                   2802:         {
                   2803:         register unsigned int c = *current_subject;
                   2804:         if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
                   2805:           else break;
                   2806:         }
                   2807:       }
                   2808: 
                   2809:     /* Restore fudged end_subject */
                   2810: 
                   2811:     end_subject = save_end_subject;
                   2812:     }
                   2813: 
                   2814:   /* If req_byte is set, we know that that character must appear in the subject
                   2815:   for the match to succeed. If the first character is set, req_byte must be
                   2816:   later in the subject; otherwise the test starts at the match point. This
                   2817:   optimization can save a huge amount of work in patterns with nested unlimited
                   2818:   repeats that aren't going to match. Writing separate code for cased/caseless
                   2819:   versions makes it go faster, as does using an autoincrement and backing off
                   2820:   on a match.
                   2821: 
                   2822:   HOWEVER: when the subject string is very, very long, searching to its end can
                   2823:   take a long time, and give bad performance on quite ordinary patterns. This
                   2824:   showed up when somebody was matching /^C/ on a 32-megabyte string... so we
                   2825:   don't do this when the string is sufficiently long.
                   2826: 
                   2827:   ALSO: this processing is disabled when partial matching is requested.
                   2828:   */
                   2829: 
                   2830:   if (req_byte >= 0 &&
                   2831:       end_subject - current_subject < REQ_BYTE_MAX &&
                   2832:       (options & PCRE_PARTIAL) == 0)
                   2833:     {
                   2834:     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
                   2835: 
                   2836:     /* We don't need to repeat the search if we haven't yet reached the
                   2837:     place we found it at last time. */
                   2838: 
                   2839:     if (p > req_byte_ptr)
                   2840:       {
                   2841:       if (req_byte_caseless)
                   2842:         {
                   2843:         while (p < end_subject)
                   2844:           {
                   2845:           register int pp = *p++;
                   2846:           if (pp == req_byte || pp == req_byte2) { p--; break; }
                   2847:           }
                   2848:         }
                   2849:       else
                   2850:         {
                   2851:         while (p < end_subject)
                   2852:           {
                   2853:           if (*p++ == req_byte) { p--; break; }
                   2854:           }
                   2855:         }
                   2856: 
                   2857:       /* If we can't find the required character, break the matching loop,
                   2858:       which will cause a return of PCRE_ERROR_NOMATCH. */
                   2859: 
                   2860:       if (p >= end_subject) break;
                   2861: 
                   2862:       /* If we have found the required character, save the point where we
                   2863:       found it, so that we don't search again next time round the loop if
                   2864:       the start hasn't passed this character yet. */
                   2865: 
                   2866:       req_byte_ptr = p;
                   2867:       }
                   2868:     }
                   2869: 
                   2870:   /* OK, now we can do the business */
                   2871: 
                   2872:   rc = internal_dfa_exec(
                   2873:     md,                                /* fixed match data */
                   2874:     md->start_code,                    /* this subexpression's code */
                   2875:     current_subject,                   /* where we currently are */
                   2876:     start_offset,                      /* start offset in subject */
                   2877:     offsets,                           /* offset vector */
                   2878:     offsetcount,                       /* size of same */
                   2879:     workspace,                         /* workspace vector */
                   2880:     wscount,                           /* size of same */
                   2881:     re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
                   2882:     0,                                 /* function recurse level */
                   2883:     0);                                /* regex recurse level */
                   2884: 
                   2885:   /* Anything other than "no match" means we are done, always; otherwise, carry
                   2886:   on only if not anchored. */
                   2887: 
                   2888:   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
                   2889: 
                   2890:   /* Advance to the next subject character unless we are at the end of a line
                   2891:   and firstline is set. */
                   2892: 
                   2893:   if (firstline && IS_NEWLINE(current_subject)) break;
                   2894:   current_subject++;
                   2895:   if (utf8)
                   2896:     {
                   2897:     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
                   2898:       current_subject++;
                   2899:     }
                   2900:   if (current_subject > end_subject) break;
                   2901: 
                   2902:   /* If we have just passed a CR and we are now at a LF, and the pattern does
                   2903:   not contain any explicit matches for \r or \n, and the newline option is CRLF
                   2904:   or ANY or ANYCRLF, advance the match position by one more character. */
                   2905: 
                   2906:   if (current_subject[-1] == '\r' &&
                   2907:       current_subject < end_subject &&
                   2908:       *current_subject == '\n' &&
                   2909:       (re->flags & PCRE_HASCRORLF) == 0 &&
                   2910:         (md->nltype == NLTYPE_ANY ||
                   2911:          md->nltype == NLTYPE_ANYCRLF ||
                   2912:          md->nllen == 2))
                   2913:     current_subject++;
                   2914: 
                   2915:   }   /* "Bumpalong" loop */
                   2916: 
                   2917: return PCRE_ERROR_NOMATCH;
                   2918: }
                   2919: 
                   2920: /* End of pcre_dfa_exec.c */

E-mail: