Annotation of parser3/src/main/pa_string.C, revision 1.139
1.45 paf 1: /** @file
1.55 paf 2: Parser: string class. @see untaint.C.
1.46 paf 3:
1.137 paf 4: Copyright (c) 2001, 2002 ArtLebedev Group (http://www.artlebedev.com)
1.138 paf 5: Author: Alexandr Petrosian <paf@design.ru> (http://paf.design.ru)
1.46 paf 6:
1.139 ! paf 7: $Id: pa_string.C,v 1.138 2002/02/08 08:30:16 paf Exp $
1.4 paf 8: */
9:
1.70 paf 10: #include "pcre.h"
11:
1.13 paf 12: #include "pa_pool.h"
1.12 paf 13: #include "pa_string.h"
1.5 paf 14: #include "pa_hash.h"
1.22 paf 15: #include "pa_exception.h"
1.53 paf 16: #include "pa_common.h"
1.60 paf 17: #include "pa_array.h"
18: #include "pa_globals.h"
1.61 paf 19: #include "pa_table.h"
1.101 parser 20: #include "pa_dictionary.h"
1.132 paf 21: #include "pa_charset.h"
1.60 paf 22:
// Debug switch: while defined, the global counter below is compiled in and
// String::real_append increments it once for every non-empty append.
1.139 ! paf 23: #define DEBUG_STRING_APPENDS_VS_EXPANDS
! 24:
! 25:
! 26: #ifdef DEBUG_STRING_APPENDS_VS_EXPANDS
// number of pieces appended so far, across all strings
! 27: ulong string_piece_appends=0;
! 28: #endif
! 29:
1.75 paf 30: String::String(Pool& apool, const char *src, size_t src_size, bool tainted) :
1.120 paf 31: Pooled(apool) {
1.28 paf 32: last_chunk=&head;
33: head.count=CR_PREALLOCATED_COUNT;
1.5 paf 34: append_here=head.rows;
1.123 paf 35: head_link=0;
1.28 paf 36: link_row=&head.rows[head.count];
1.41 paf 37:
38: if(src)
1.75 paf 39: if(tainted)
40: APPEND_TAINTED(src, src_size, 0, 0);
1.41 paf 41: else
1.75 paf 42: APPEND_CLEAN(src, src_size, 0, 0);
1.1 paf 43: }
44:
1.94 parser 45: String::String(const String& src) :
1.120 paf 46: Pooled(src.pool()) {
1.8 paf 47: head.count=CR_PREALLOCATED_COUNT;
48:
1.116 paf 49: uint src_used_rows=src.used_rows();
1.8 paf 50: if(src_used_rows<=head.count) {
1.55 paf 51: // all new rows fit size_to preallocated area
1.98 parser 52: last_chunk=&head;
1.116 paf 53: uint curr_chunk_rows=head.count;
1.8 paf 54: memcpy(head.rows, src.head.rows, sizeof(Chunk::Row)*src_used_rows);
55: append_here=&head.rows[src_used_rows];
56: link_row=&head.rows[curr_chunk_rows];
57: } else {
58: // warning:
1.10 paf 59: // heavily relies on the fact
60: // "preallocated area is the same for all strings"
1.8 paf 61: //
62: // info:
63: // allocating only enough mem to fit src string rows
64: // next append would allocate a new chunk
65: //
1.55 paf 66: // new rows don't fit size_to preallocated area: splitting size_to two chunks
1.8 paf 67: // preallocated chunk src to constructing head
68: memcpy(head.rows, src.head.rows, sizeof(Chunk::Row)*head.count);
1.55 paf 69: // remaining rows size_to new_chunk
1.116 paf 70: uint curr_chunk_rows=src_used_rows-head.count;
1.98 parser 71: last_chunk=static_cast<Chunk *>(
1.139 ! paf 72: malloc(sizeof(Chunk::count_type)+sizeof(Chunk::Row)*curr_chunk_rows+sizeof(Chunk *), 9));
1.98 parser 73: last_chunk->count=curr_chunk_rows;
1.123 paf 74: head_link=last_chunk;
1.98 parser 75: append_here=link_row=&last_chunk->rows[last_chunk->count];
1.8 paf 76:
1.123 paf 77: Chunk *old_chunk=src.head_link;
1.98 parser 78: Chunk::Row *new_rows=last_chunk->rows;
1.116 paf 79: uint rows_left_to_copy=last_chunk->count;
1.8 paf 80: while(true) {
1.139 ! paf 81: Chunk::count_type old_count=old_chunk->count;
1.8 paf 82: Chunk *next_chunk=old_chunk->rows[old_count].link;
83: if(next_chunk) {
84: // not last source chunk
85: // taking it all
86: memcpy(new_rows, old_chunk->rows, sizeof(Chunk::Row)*old_count);
87: new_rows+=old_count;
88: rows_left_to_copy-=old_count;
89:
90: old_chunk=next_chunk;
91: } else {
92: // the last source chunk
93: // taking only those rows of chunk that _left_to_copy
94: memcpy(new_rows, old_chunk->rows, sizeof(Chunk::Row)*rows_left_to_copy);
95: break;
96: }
97: }
1.5 paf 98: }
1.8 paf 99: link_row->link=0;
1.115 paf 100: src_used_rows;
1.120 paf 101: }
102:
103: size_t String::size() const {
104: size_t result=0;
1.123 paf 105: STRING_FOREACH_ROW(
1.120 paf 106: result+=row->item.size;
1.123 paf 107: );
1.120 paf 108: break2:
109: return result;
1.94 parser 110: }
111:
1.115 paf 112: /// @todo not very optimal
113: uint String::used_rows() const {
114: uint result=0;
1.123 paf 115: STRING_FOREACH_ROW(
116: result++;
117: );
1.115 paf 118: break2:
119: return result;
120: }
1.94 parser 121: void String::expand() {
1.139 ! paf 122: Chunk::count_type new_chunk_count=last_chunk->count+CR_GROW_COUNT;
! 123: if(new_chunk_count>max_integral(Chunk::count_type))
! 124: new_chunk_count=max_integral(Chunk::count_type);
1.122 paf 125:
1.94 parser 126: last_chunk=static_cast<Chunk *>(
1.139 ! paf 127: malloc(sizeof(Chunk::count_type)+sizeof(Chunk::Row)*new_chunk_count+sizeof(Chunk *), 10));
1.94 parser 128: last_chunk->count=new_chunk_count;
129: link_row->link=last_chunk;
130: append_here=last_chunk->rows;
131: link_row=&last_chunk->rows[last_chunk->count];
132: link_row->link=0;
1.5 paf 133: }
1.28 paf 134:
1.13 paf 135: String& String::real_append(STRING_APPEND_PARAMS) {
1.139 ! paf 136: if(!last_chunk) // growth stopped [we're appended as string to somebody]
! 137: throw Exception(0, 0,
! 138: this,
! 139: "string growth stopped");
! 140:
1.9 paf 141: if(!src)
142: return *this;
1.26 paf 143: if(!size)
144: size=strlen(src);
145: if(!size)
1.9 paf 146: return *this;
1.122 paf 147:
1.139 ! paf 148: #ifdef DEBUG_STRING_APPENDS_VS_EXPANDS
! 149: string_piece_appends++;
! 150: #endif
! 151:
! 152: // manually unrolled to avoid extra check
! 153: while(size>max_integral(Chunk::Row::item_size_type)) {
1.122 paf 154: if(chunk_is_full())
155: expand();
156:
157: append_here->item.ptr=src;
1.139 ! paf 158: append_here->item.size=max_integral(Chunk::Row::item_size_type);
1.122 paf 159: append_here->item.lang=lang;
160: #ifndef NO_STRING_ORIGIN
161: append_here->item.origin.file=file;
162: append_here->item.origin.line=line;
163: #endif
164: append_here++;
165:
1.139 ! paf 166: src+=max_integral(Chunk::Row::item_size_type);
! 167: size-=max_integral(Chunk::Row::item_size_type);
1.122 paf 168: }
1.9 paf 169:
1.1 paf 170: if(chunk_is_full())
171: expand();
172:
173: append_here->item.ptr=src;
1.121 paf 174: append_here->item.size=size;
1.52 paf 175: append_here->item.lang=lang;
1.13 paf 176: #ifndef NO_STRING_ORIGIN
1.14 paf 177: append_here->item.origin.file=file;
178: append_here->item.origin.line=line;
1.13 paf 179: #endif
1.115 paf 180: append_here++;
1.1 paf 181:
182: return *this;
1.97 parser 183: }
184:
185: char String::first_char() const {
1.120 paf 186: if(!used_rows())
1.112 parser 187: throw Exception(0, 0,
1.97 parser 188: this,
189: "getting first char of empty string");
190:
191: return *head.rows[0].item.ptr;
1.1 paf 192: }
193:
1.16 paf 194: uint String::hash_code() const {
1.7 paf 195: uint result=0;
1.123 paf 196: STRING_FOREACH_ROW(
1.6 paf 197: result=Hash::generic_code(result, row->item.ptr, row->item.size);
1.123 paf 198: );
1.5 paf 199: break2:
200: return result;
201: }
202:
/**
	Compares this string, starting at byte offset this_offset, with src.

	Both strings are walked row by row with two cursors (a = this, b = src),
	comparing overlapping parts of the current rows with memcmp.

	partial is set to -1 on entry and stays -1 when the function returns early
	(bytes differ, or a row of this string has lang greater than the requested one).
	Otherwise on return:
	  partial==0: both strings ended simultaneously, returns 0;
	  partial==1: this string ended before src, returns -1;
	  partial==2: src ended before this string, returns +1.
*/
1.60 paf 203: /// @todo move 'lang' skipping to pos
204: int String::cmp(int& partial, const String& src,
205: size_t this_offset, Untaint_lang lang) const {
1.59 paf 206: partial=-1;
1.125 paf 207: size_t a_size=size();
// NOTE(review): for an empty string a_size-1 wraps around; the loop below is skipped in that case
208: this_offset=min(this_offset, a_size-1);
1.55 paf 209:
// cursors: current chunk, current row, offset inside the current row
1.16 paf 210: const Chunk *a_chunk=&head;
211: const Chunk *b_chunk=&src.head;
212: const Chunk::Row *a_row=a_chunk->rows;
213: const Chunk::Row *b_row=b_chunk->rows;
1.55 paf 214: size_t a_offset=this_offset;
215: size_t b_offset=0;
1.9 paf 216: Chunk::Row *a_end=append_here;
217: Chunk::Row *b_end=src.append_here;
// rows left in the current chunk before its link must be followed
1.116 paf 218: uint a_countdown=a_chunk->count;
219: uint b_countdown=b_chunk->count;
220: int result;
// byte position of the start of a_row inside this string
1.60 paf 221: size_t pos=0;
1.33 paf 222:
// an empty side counts as already ended
1.125 paf 223: bool a_break=a_size==0;
1.91 parser 224: bool b_break=src.size()==0;
1.83 parser 225: if(!(a_break || b_break)) while(true) {
// has the current row of 'this' reached this_offset?
1.55 paf 226: if(pos+a_row->item.size > this_offset) {
1.136 paf 227: if(lang!=UL_UNSPECIFIED && a_row->item.lang>lang)
1.60 paf 228: return -1; // wrong lang -- bail out
229:
// difference between the unread remainders of the two current rows
1.55 paf 230: int size_diff=
231: (a_row->item.size-a_offset)-
232: (b_row->item.size-b_offset);
233:
234: if(size_diff==0) { // a has same size as b
1.60 paf 235: result=memcmp(a_row->item.ptr+a_offset, b_row->item.ptr+b_offset,
236: a_row->item.size-a_offset);
1.55 paf 237: if(result)
238: return result;
1.60 paf 239: pos+=a_row->item.size;
1.55 paf 240: a_row++; a_countdown--; a_offset=0;
241: b_row++; b_countdown--; b_offset=0;
242: } else if (size_diff>0) { // a longer
1.60 paf 243: result=memcmp(a_row->item.ptr+a_offset, b_row->item.ptr+b_offset,
244: b_row->item.size-b_offset);
1.55 paf 245: if(result)
246: return result;
// b row is consumed, a row only partly
247: a_offset+=b_row->item.size-b_offset;
248: b_row++; b_countdown--; b_offset=0;
249: } else { // b longer
1.60 paf 250: result=memcmp(a_row->item.ptr+a_offset, b_row->item.ptr+b_offset,
251: a_row->item.size-a_offset);
1.55 paf 252: if(result)
253: return result;
// a row is consumed, b row only partly
254: b_offset+=a_row->item.size-a_offset;
1.60 paf 255: pos+=a_row->item.size;
1.55 paf 256: a_row++; a_countdown--; a_offset=0;
257: }
// assignment inside the condition is intended: remember whether b ended
1.83 parser 258: if(b_break=b_row==b_end) {
259: a_break=a_row==a_end;
260: break;
261: }
// end of b chunk: follow the link to the next one
1.55 paf 262: if(!b_countdown) {
263: b_chunk=b_row->link;
264: b_row=b_chunk->rows;
265: b_countdown=b_chunk->count;
266: }
267: } else {
// this row lies entirely before this_offset: skip it
1.60 paf 268: a_offset-=a_row->item.size;
269: pos+=a_row->item.size;
270: a_row++; a_countdown--;
1.9 paf 271: }
272:
// assignment inside the condition is intended: remember whether a ended
1.83 parser 273: if(a_break=a_row==a_end) {
274: b_break=b_row==b_end;
275: break;
276: }
// end of a chunk: follow the link to the next one
1.11 paf 277: if(!a_countdown) {
1.9 paf 278: a_chunk=a_row->link;
279: a_row=a_chunk->rows;
1.11 paf 280: a_countdown=a_chunk->count;
1.9 paf 281: }
1.27 paf 282: }
1.55 paf 283: if(a_break==b_break) { // ended simultaneously
284: partial=0; return 0;
285: } else if(a_break) { // first bytes equal, but a ended before b
286: partial=1; return -1;
287: } else {
288: partial=2; return +1;
289: }
1.27 paf 290: }
291:
1.60 paf 292: /// @todo move 'lang' skipping to pos
1.59 paf 293: int String::cmp(int& partial, const char* b_ptr, size_t src_size,
1.60 paf 294: size_t this_offset, Untaint_lang lang) const {
1.59 paf 295: partial=-1;
1.125 paf 296: size_t a_size=size();
1.50 paf 297: size_t b_size=src_size?src_size:b_ptr?strlen(b_ptr):0;
1.125 paf 298: this_offset=min(this_offset, a_size-1);
1.27 paf 299:
300: const Chunk *a_chunk=&head;
301: const Chunk::Row *a_row=a_chunk->rows;
1.59 paf 302: size_t a_offset=this_offset;
1.55 paf 303: size_t b_offset=0;
1.27 paf 304: Chunk::Row *a_end=append_here;
1.116 paf 305: uint a_countdown=a_chunk->count;
1.60 paf 306: size_t pos=0;
1.52 paf 307:
1.125 paf 308: bool a_break=a_size==0;
1.83 parser 309: bool b_break=b_size==0;
310: if(!(a_break || b_break)) while(true) {
1.59 paf 311: if(pos+a_row->item.size > this_offset) {
1.136 paf 312: if(lang!=UL_UNSPECIFIED && a_row->item.lang>lang)
1.60 paf 313: return -1; // wrong lang -- bail out
314:
1.59 paf 315: int size_diff=
316: (a_row->item.size-a_offset)-
317: (b_size-b_offset);
318:
319: if(size_diff==0) { // a has same size as b
1.116 paf 320: if(int result=memcmp(a_row->item.ptr+a_offset, b_ptr+b_offset,
1.59 paf 321: a_row->item.size-a_offset)!=0)
322: return result;
1.60 paf 323: pos+=a_row->item.size;
1.59 paf 324: a_row++; a_countdown--; a_offset=0;
325: b_break=true;
326: } else if (size_diff>0) { // a longer
1.116 paf 327: if(int result=memcmp(a_row->item.ptr+a_offset, b_ptr+b_offset,
1.59 paf 328: b_size-b_offset)!=0)
329: return result;
330: a_offset+=b_size-b_offset;
331: b_break=true;
332: } else { // b longer
1.116 paf 333: if(int result=memcmp(a_row->item.ptr+a_offset, b_ptr+b_offset,
1.59 paf 334: a_row->item.size-a_offset)!=0)
335: return result;
336: b_offset+=a_row->item.size-a_offset;
1.60 paf 337: pos+=a_row->item.size;
1.59 paf 338: a_row++; a_countdown--; a_offset=0;
339: }
340: } else {
1.60 paf 341: a_offset-=a_row->item.size;
342: pos+=a_row->item.size;
343: a_row++; a_countdown--;
1.27 paf 344: }
345:
1.86 parser 346: a_break=a_row==a_end;
347: if(a_break || b_break)
1.83 parser 348: break;
1.27 paf 349: if(!a_countdown) {
350: a_chunk=a_row->link;
351: a_row=a_chunk->rows;
352: a_countdown=a_chunk->count;
1.9 paf 353: }
354: }
1.55 paf 355: if(a_break==b_break) { // ended simultaneously
356: partial=0; return 0;
357: } else if(a_break) { // first bytes equal, but a ended before b
358: partial=1; return -1;
359: } else {
360: partial=2; return +1;
361: }
1.5 paf 362: }
1.46 paf 363:
364: #ifndef NO_STRING_ORIGIN
365: const Origin& String::origin() const {
1.120 paf 366: if(!used_rows()) {
1.96 parser 367: static const Origin empty_origin={"empty string"};
368: return empty_origin;
369: }
1.46 paf 370:
1.49 paf 371: // determining origin by last appended piece
1.50 paf 372: // because first one frequently constant.
373: // ex: ^load[/file] "document_root" + "/file"
1.80 paf 374: // when last peice is constant,
375: // ex: parser_root_auto_path{dynamic} / auto.p{const}
376: // using first piece
377: Origin& last_origin=append_here[-1].item.origin;
378: return last_origin.file ? last_origin : head.rows[0].item.origin;
1.46 paf 379: }
380: #endif
1.53 paf 381:
1.69 paf 382: String& String::mid(size_t start, size_t finish) const {
1.107 parser 383: String& result=*NEW String(pool());
384:
1.139 ! paf 385: start=max(size_t(0), start);
1.111 parser 386: finish=min(size(), finish);
1.60 paf 387: if(start==finish)
1.107 parser 388: return result;
1.53 paf 389:
390: size_t pos=0;
1.123 paf 391: STRING_FOREACH_ROW(
392: size_t item_finish=pos+row->item.size;
393: if(item_finish > start) { // started now or already?
394: bool started=result.size()==0; // started now?
395: bool finished=finish <= item_finish; // finished now?
396: size_t offset=started?start-pos:0;
397: size_t size=finished?finish-pos:row->item.size;
398: result.APPEND(
399: row->item.ptr+offset, size-offset,
400: row->item.lang,
401: row->item.origin.file, row->item.origin.line);
402: if(finished)
1.53 paf 403: goto break2;
404: }
1.123 paf 405: pos+=row->item.size;
406: );
1.53 paf 407: break2:
1.60 paf 408: // SAPI::log(pool(), "piece of '%s' from %d to %d is '%s'",
409: //cstr(), start, finish, result.cstr());
1.53 paf 410: return result;
1.54 paf 411: }
412:
1.60 paf 413: int String::pos(const String& substr,
1.116 paf 414: int result, Untaint_lang lang) const {
1.125 paf 415: size_t self_size=size();
1.131 paf 416: for(; size_t(result)<self_size; result++) {
1.60 paf 417: int partial; cmp(partial, substr, result, lang);
1.58 paf 418: if(
419: partial==0 || // full match
420: partial==2) // 'substr' starts 'this'+'result'
421: return result;
422: }
423:
424: return -1;
425: }
426:
1.60 paf 427: int String::pos(const char *substr, size_t substr_size,
1.116 paf 428: int result, Untaint_lang lang) const {
1.125 paf 429: size_t self_size=size();
1.131 paf 430: for(; size_t(result)<self_size; result++) {
1.60 paf 431: int partial; cmp(partial, substr, substr_size, result, lang);
1.55 paf 432: if(
433: partial==0 || // full match
434: partial==2) // 'substr' starts 'this'+'result'
435: return result;
436: }
437:
438: return -1;
1.60 paf 439: }
440:
441: void String::split(Array& result,
442: size_t* pos_after_ref,
443: const char *delim, size_t delim_size,
444: Untaint_lang lang, int limit) const {
1.125 paf 445: size_t self_size=size();
1.60 paf 446: if(delim_size) {
447: size_t pos_after=pos_after_ref?*pos_after_ref:0;
448: int pos_before;
449: // while we have 'delim'...
450: for(; (pos_before=pos(delim, delim_size, pos_after, lang))>=0 && limit; limit--) {
1.69 paf 451: result+=&mid(pos_after, pos_before);
1.60 paf 452: pos_after=pos_before+delim_size;
453: }
454: // last piece
1.124 paf 455: if(pos_after<self_size && limit) {
456: result+=&mid(pos_after, self_size);
457: pos_after=self_size;
1.60 paf 458: }
459: if(pos_after_ref)
460: *pos_after_ref=pos_after;
461: } else { // empty delim
462: result+=this;
463: if(pos_after_ref)
1.124 paf 464: *pos_after_ref+=self_size;
1.60 paf 465: }
466: }
467:
468: void String::split(Array& result,
469: size_t* pos_after_ref,
470: const String& delim, Untaint_lang lang,
471: int limit) const {
472: if(delim.size()) {
473: size_t pos_after=pos_after_ref?*pos_after_ref:0;
474: int pos_before;
475: // while we have 'delim'...
476: for(; (pos_before=pos(delim, pos_after, lang))>=0 && limit; limit--) {
1.69 paf 477: result+=&mid(pos_after, pos_before);
1.60 paf 478: pos_after=pos_before+delim.size();
479: }
480: // last piece
481: if(pos_after<size() && limit) {
1.69 paf 482: result+=&mid(pos_after, size());
1.60 paf 483: pos_after=size();
484: }
485: if(pos_after_ref)
486: *pos_after_ref=pos_after;
487: } else { // empty delim
488: result+=this;
489: if(pos_after_ref)
490: *pos_after_ref+=size();
491: }
1.61 paf 492: }
493:
1.63 paf 494: static void regex_options(char *options, int *result){
495: struct Regex_option {
496: char key;
497: int clear, set;
498: int *result;
499: } regex_option[]={
500: {'i', 0, PCRE_CASELESS, result}, // a=A
1.79 paf 501: {'s', 0, PCRE_DOTALL, result}, // \n\n$ [default]
1.63 paf 502: {'x', 0, PCRE_EXTENDED, result}, // whitespace in regex ignored
503: {'m', PCRE_DOTALL, PCRE_MULTILINE, result}, // ^aaa\n$^bbb\n$
504: {'g', 0, true, result+1}, // many rows
505: {0},
506: };
507: result[0]=PCRE_EXTRA | PCRE_DOTALL;
508: result[1]=0;
509:
510: if(options)
511: for(Regex_option *o=regex_option; o->key; o++)
512: if(
513: strchr(options, o->key) ||
514: strchr(options, toupper(o->key))) {
515: *(o->result)&=~o->clear;
516: *(o->result)|=o->set;
517: }
518: }
519:
1.88 parser 520: /// @todo maybe need speedup: some option to remove pre/match/post string generation
1.132 paf 521: bool String::match(
1.77 paf 522: const String *aorigin,
1.62 paf 523: const String& regexp,
1.63 paf 524: const String *options,
1.64 paf 525: Table **table,
1.95 parser 526: Row_action row_action, void *info,
527: bool *was_global) const {
1.64 paf 528:
1.73 paf 529: if(!regexp.size())
1.112 parser 530: throw Exception(0, 0,
1.73 paf 531: aorigin,
532: "regexp is empty");
1.118 paf 533: const char *pattern=regexp.cstr();
1.62 paf 534: const char *errptr;
535: int erroffset;
1.63 paf 536: int option_bits[2]; regex_options(options?options->cstr():0, option_bits);
1.95 parser 537: if(was_global)
538: *was_global=option_bits[1]!=0;
1.63 paf 539: pcre *code=pcre_compile(pattern, option_bits[0],
1.62 paf 540: &errptr, &erroffset,
1.132 paf 541: pool().get_source_charset().pcre_tables);
1.62 paf 542:
1.67 paf 543: if(!code)
1.112 parser 544: throw Exception(0, 0,
1.69 paf 545: ®exp.mid(erroffset, regexp.size()),
1.74 paf 546: "regular expression syntax error - %s", errptr);
1.62 paf 547:
1.63 paf 548: int info_substrings=pcre_info(code, 0, 0);
549: if(info_substrings<0) {
1.100 parser 550: pcre_free(code);
1.112 parser 551: throw Exception(0, 0,
1.73 paf 552: aorigin,
1.76 paf 553: "pcre_info error (%d)",
1.73 paf 554: info_substrings);
1.63 paf 555: }
556:
557: int startoffset=0;
1.118 paf 558: const char *subject=cstr();
1.62 paf 559: int length=strlen(subject);
1.63 paf 560: int ovecsize;
561: int *ovector=(int *)malloc(sizeof(int)*
1.114 paf 562: (ovecsize=(1/*match*/+info_substrings)*3), 11);
1.62 paf 563:
1.64 paf 564: { // create table
565: Array& columns=*NEW Array(pool());
566: columns+=string_pre_match_name;
567: columns+=string_match_name;
568: columns+=string_post_match_name;
569: for(int i=1; i<=info_substrings; i++) {
570: char *column=(char *)malloc(MAX_NUMBER);
571: snprintf(column, MAX_NUMBER, "%d", i);
572: columns+=NEW String(pool(), column); // .i column name
573: }
574: *table=NEW Table(pool(), aorigin, &columns);
1.62 paf 575: }
1.63 paf 576:
1.64 paf 577: int exec_option_bits=0;
1.63 paf 578: while(true) {
579: int exec_substrings=pcre_exec(code, 0,
580: subject, length, startoffset,
1.64 paf 581: exec_option_bits, ovector, ovecsize);
1.63 paf 582:
583: if(exec_substrings==PCRE_ERROR_NOMATCH) {
1.100 parser 584: pcre_free(code);
585: row_action(**table, 0/*last time, no row*/, 0, 0, info);
1.63 paf 586: return option_bits[1]!=0; // global=true+table, not global=false
587: }
588:
589: if(exec_substrings<0) {
1.100 parser 590: pcre_free(code);
1.112 parser 591: throw Exception(0, 0,
1.63 paf 592: aorigin,
1.76 paf 593: "regular expression execute error (%d)",
1.63 paf 594: exec_substrings);
595: }
596:
597: Array& row=*NEW Array(pool());
1.81 paf 598: row+=&mid(0, ovector[0]); // .prematch column value
1.69 paf 599: row+=&mid(ovector[0], ovector[1]); // .match
1.81 paf 600: row+=&mid(ovector[1], size()); // .postmatch
1.63 paf 601:
602: for(int i=1; i<exec_substrings; i++) {
1.69 paf 603: // -1:-1 case handled peacefully by mid() itself
604: row+=&mid(ovector[i*2+0], ovector[i*2+1]); // .i column value
1.63 paf 605: }
606:
1.100 parser 607: row_action(**table, &row, startoffset, ovector[0], info);
1.63 paf 608:
1.100 parser 609: if(!option_bits[1] || startoffset==ovector[1]) { // not global | going to hang
610: pcre_free(code);
611: row_action(**table, 0/*last time, no row*/, 0, 0, info);
1.63 paf 612: return true;
613: }
1.100 parser 614: startoffset=ovector[1];
1.63 paf 615:
616: /*
617: if(option_bits[0] & PCRE_MULTILINE)
1.64 paf 618: exec_option_bits|=PCRE_NOTBOL; // start of subject+startoffset not BOL
1.63 paf 619: */
620: }
1.82 parser 621: }
622:
1.132 paf 623: String& String::change_case(Pool& pool,
1.82 parser 624: Change_case_kind kind) const {
1.132 paf 625: const unsigned char *tables=pool.get_source_charset().pcre_tables;
1.82 parser 626: String& result=*new(pool) String(pool);
627:
628: const unsigned char *a;
629: const unsigned char *b;
630: switch(kind) {
631: case CC_UPPER:
632: a=tables+lcc_offset;
633: b=tables+fcc_offset;
634: break;
635: case CC_LOWER:
636: a=tables+lcc_offset;
637: b=0;
638: break;
639: default:
1.112 parser 640: throw Exception(0, 0,
1.82 parser 641: this,
642: "unknown change case kind #%d",
643: static_cast<int>(kind)); // never
644: a=b=0; // calm, compiler
645: break; // never
646: }
647:
648: const Chunk *chunk=&head;
649: do {
650: const Chunk::Row *row=chunk->rows;
1.139 ! paf 651: for(Chunk::count_type i=0; i<chunk->count; i++, row++) {
1.82 parser 652: if(row==append_here)
653: goto break2;
654:
1.114 paf 655: char *new_cstr=(char *)pool.malloc(row->item.size, 12);
1.82 parser 656: char *dest=new_cstr;
657: const char *src=row->item.ptr;
658: for(int size=row->item.size; size--; src++) {
659: unsigned char c=a[(unsigned char)*src];
660: if(b)
661: c=b[c];
662:
663: *dest++=(char)c;
664: }
665:
666: result.APPEND(new_cstr, row->item.size,
1.123 paf 667: row->item.lang,
1.82 parser 668: row->item.origin.file, row->item.origin.line);
669: }
670: chunk=row->link;
671: } while(chunk);
672: break2:
1.89 parser 673:
1.101 parser 674: return result;
675: }
676:
/**
	Joins a run of adjacent rows that share the lang of the row at the cursor.

	ai/achunk/arow form an in/out cursor: row index inside the chunk,
	the chunk and the row itself. On return the cursor stands after the joined run;
	achunk becomes 0 when the single-row branch steps past the end of the last chunk.

	joined_lang receives the lang of the starting row,
	joined_size the summed size of the run,
	joined_ptr either the starting row's own bytes (when the summed size
	equals that row's size) or a buffer newly allocated from pool
	holding the concatenated bytes of the run.
*/
1.108 parser 677: void String::join_chain(Pool& pool,
1.116 paf 678: uint& ai, const Chunk*& achunk, const Chunk::Row*& arow,
1.123 paf 679: uchar& joined_lang, const char *& joined_ptr, size_t& joined_size) const {
680: joined_lang=arow->item.lang;
1.108 parser 681:
682: // calc size
683: joined_size=0;
684: {
// first pass: a local cursor walks the run, the caller's cursor stays untouched
1.116 paf 685: uint start_i=ai;
1.108 parser 686: const Chunk::Row *start_row=arow;
687: const Chunk *chunk=achunk;
688: do {
689: const Chunk::Row *row=start_row;
1.116 paf 690: for(uint i=start_i; i<chunk->count; i++, row++) {
// reached the end of used rows
1.108 parser 691: if(row==append_here)
692: goto break21;
693:
694: if(row->item.lang==joined_lang)
695: joined_size+=row->item.size;
696: else
// lang changed: the run ends here
1.129 paf 697: goto break21;
1.108 parser 698: }
// end of chunk: continue in the next one, if any
699: if(chunk=row->link) {
700: start_i=0;
701: start_row=chunk->rows;
702: } else
703: break;
704: } while(true);
705: break21:;
706: }
707:
708: // if one row, return simply itself
709: if(joined_size==arow->item.size) {
710: joined_ptr=arow->item.ptr;
// step the caller's cursor over that single row
711: ai++; arow++;
1.133 paf 712: if(ai==achunk->count) {
// at the link slot: move to the next chunk; achunk becomes 0 when there is none
713: if(achunk=arow->link) {
714: ai=0;
1.134 paf 715: arow=achunk->rows;
1.133 paf 716: }
717: }
1.108 parser 718: } else {
719: // join adjacent rows
1.114 paf 720: char *ptr=(char *)pool.malloc(joined_size,13);
1.108 parser 721: joined_ptr=ptr;
// second pass: same walk as above, now copying the bytes
1.116 paf 722: uint start_i=ai;
1.108 parser 723: const Chunk::Row *start_row=arow;
724: const Chunk *chunk=achunk;
1.116 paf 725: uint i;
1.108 parser 726: const Chunk::Row *row;
727: do {
728: row=start_row;
729: for(i=start_i; i<chunk->count; i++, row++) {
730: if(row==append_here)
731: goto break22;
732:
733: if(row->item.lang==joined_lang) {
734: memcpy(ptr, row->item.ptr, row->item.size);
735: ptr+=row->item.size;
736: } else
1.129 paf 737: goto break22;
1.108 parser 738: }
739: if(chunk=row->link) {
740: start_i=0;
741: start_row=chunk->rows;
742: } else
743: break;
744: } while(true);
745: break22:;
746:
747: // return joined rows
// the caller's cursor is set to where the copying walk stopped
748: ai=i;
749: arow=row;
750: achunk=chunk;
751: }
752: }
753:
754: String& String::reconstruct(Pool& pool) const {
755: //_asm int 3;
756: String& result=*new(pool) String(pool);
757: const Chunk *chunk=&head;
1.133 paf 758: const Chunk::Row *row=chunk->rows;
759: for(uint i=0; i<chunk->count; ) {
760: if(row==append_here)
761: break;
1.108 parser 762:
1.133 paf 763: uchar joined_lang;
764: const char *joined_ptr;
765: size_t joined_size;
1.130 paf 766: #ifndef NO_STRING_ORIGIN
1.133 paf 767: const char *joined_origin_file=row->item.origin.file;
768: const size_t joined_origin_line=row->item.origin.line;
1.130 paf 769: #endif
1.133 paf 770: join_chain(pool, i, chunk, row,
771: joined_lang, joined_ptr, joined_size);
772:
773: result.APPEND(joined_ptr, joined_size, joined_lang,
774: joined_origin_file, joined_origin_line);
1.108 parser 775:
1.133 paf 776: if(!chunk)
777: break;
1.130 paf 778: }
1.108 parser 779:
780: return result;
781: };
782:
783: String& String::replace_in_reconstructed(Pool& pool, Dictionary& dict) const {
1.106 parser 784: //_asm int 3;
1.101 parser 785: String& result=*new(pool) String(pool);
1.123 paf 786: STRING_FOREACH_ROW(
787: const char *src=row->item.ptr;
788: size_t src_size=row->item.size;
789: char *new_cstr=(char *)pool.malloc((size_t)ceil(src_size*dict.max_ratio()), 14);
790: char *dest=new_cstr;
791: while(src_size) {
792: // there is a row where first column starts 'src'
793: if(Table::Item *item=dict.first_that_starts(src, src_size)) {
794: // get a=>b values
795: const String& a=*static_cast<Array *>(item)->get_string(0);
796: const String& b=*static_cast<Array *>(item)->get_string(1);
797: // skip 'a' in 'src' && reduce work size
798: src+=a.size(); src_size-=a.size();
799: // write 'b' to 'dest' && skip 'b' in 'dest'
800: b.store_to(dest); dest+=b.size();
801: } else {
802: // write a char to b && reduce work size
803: *dest++=*src++; src_size--;
1.101 parser 804: }
805: }
806:
1.123 paf 807: result.APPEND(new_cstr, dest-new_cstr,
808: row->item.lang,
809: row->item.origin.file, row->item.origin.line);
810: );
1.101 parser 811: break2:
1.89 parser 812: return result;
1.108 parser 813: }
814:
815: String& String::replace(Pool& pool, Dictionary& dict) const {
816: return reconstruct(pool).replace_in_reconstructed(pool, dict);
1.89 parser 817: }
818:
1.90 parser 819: double String::as_double() const {
1.89 parser 820: double result;
1.114 paf 821: const char *cstr;
822: char buf[MAX_NUMBER];
1.115 paf 823: if(head.rows+1==append_here) {
1.114 paf 824: int size=min(head.rows[0].item.size, MAX_NUMBER-1);
825: memcpy(buf, head.rows[0].item.ptr, size);
826: buf[size]=0;
827: cstr=buf;
828: } else
829: cstr=this->cstr();
1.102 parser 830: char *error_pos;
1.89 parser 831: // 0xABC
1.99 parser 832: if(cstr[0]=='0')
833: if(cstr[1]=='x' || cstr[1]=='X')
834: result=(double)(unsigned long)strtol(cstr, &error_pos, 0);
835: else
1.102 parser 836: result=(double)strtod(cstr+1/*skip leading 0*/, &error_pos);
1.89 parser 837: else
1.99 parser 838: result=(double)strtod(cstr, &error_pos);
1.89 parser 839:
1.103 parser 840: if(*error_pos/*not EOS*/)
1.112 parser 841: throw Exception(0, 0,
1.89 parser 842: this,
843: "invalid number (double)");
844:
845: return result;
846: }
1.90 parser 847: int String::as_int() const {
1.89 parser 848: int result;
1.114 paf 849: const char *cstr;
850: char buf[MAX_NUMBER];
1.115 paf 851: if(head.rows+1==append_here) {
1.114 paf 852: int size=min(head.rows[0].item.size, MAX_NUMBER-1);
853: memcpy(buf, head.rows[0].item.ptr, size);
854: buf[size]=0;
855: cstr=buf;
856: } else
857: cstr=this->cstr();
1.102 parser 858: char *error_pos;
1.89 parser 859: // 0xABC
1.99 parser 860: if(cstr[0]=='0')
861: if(cstr[1]=='x' || cstr[1]=='X')
862: result=(int)(unsigned long)strtol(cstr, &error_pos, 0);
863: else
1.102 parser 864: result=(int)strtol(cstr+1/*skip leading 0*/, &error_pos, 0);
1.89 parser 865: else
866: result=(int)strtol(cstr, &error_pos, 0);
867:
1.103 parser 868: if(*error_pos/*not EOS*/)
1.112 parser 869: throw Exception(0, 0,
1.89 parser 870: this,
871: "invalid number (int)");
1.82 parser 872:
873: return result;
1.61 paf 874: }
1.113 parser 875:
1.128 paf 876: inline void ushort2uchars(ushort word, uchar& byte1, uchar& byte2) {
877: byte1=word&0xFF;
878: byte2=word>>8;
879: }
880: inline ushort uchars2ushort(uchar byte1, uchar byte2) {
881: return (byte2<<8) | byte1;
882: }
1.113 parser 883: /* @todo maybe network order worth spending some effort?
884: don't bothering myself with network byte order,
885: am not planning to be able to move resulting file across platforms
886: for now
887: */
888: void String::serialize(size_t prolog_size, void *& buf, size_t& buf_size) const {
889: buf_size=
890: prolog_size
1.126 paf 891: +used_rows()*(sizeof(uchar)+sizeof(ushort))
1.113 parser 892: +size();
1.114 paf 893: buf=malloc(buf_size,15);
1.113 parser 894: char *cur=(char *)buf+prolog_size;
895:
1.123 paf 896: STRING_FOREACH_ROW(
897: // lang
1.126 paf 898: memcpy(cur, &row->item.lang, sizeof(uchar));
899: cur+=sizeof(uchar);
1.123 paf 900: // size
1.128 paf 901: uchar byte1; uchar byte2;
902: ushort2uchars(row->item.size, byte1, byte2);
903: memcpy(cur, &byte1, sizeof(uchar)); cur+=sizeof(uchar);
904: memcpy(cur, &byte2, sizeof(uchar)); cur+=sizeof(uchar);
1.123 paf 905: // bytes
906: memcpy(cur, row->item.ptr, row->item.size);
907: cur+=row->item.size;
908: );
1.113 parser 909: break2:
910: ;
911: }
912: void String::deserialize(size_t prolog_size, void *buf, size_t buf_size, const char *file) {
1.135 paf 913: if(buf_size<=prolog_size)
914: return;
915:
1.126 paf 916: char *cur=(char *)buf+prolog_size;
1.113 parser 917: buf_size-=prolog_size;
918:
919: while(buf_size) {
1.128 paf 920: uchar lang=*(uchar *)(cur);
921:
922: ushort size=uchars2ushort(
923: *(uchar*)(cur+sizeof(uchar)*1),
924: *(uchar*)(cur+sizeof(uchar)*2)
925: );
926:
927: const char *ptr=(const char*)(cur+sizeof(uchar)*3);
1.126 paf 928: APPEND(ptr, size, lang, file, 0);
1.113 parser 929:
1.126 paf 930: size_t piece_size=sizeof(uchar)+sizeof(ushort)+size;
1.113 parser 931: cur+=piece_size;
932: buf_size-=piece_size;
933: }
934: }
E-mail: