Annotation of parser3/src/main/pa_string.C, revision 1.164
1.45 paf 1: /** @file
1.55 paf 2: Parser: string class. @see untaint.C.
1.46 paf 3:
1.137 paf 4: Copyright (c) 2001, 2002 ArtLebedev Group (http://www.artlebedev.com)
1.138 paf 5: Author: Alexandr Petrosian <paf@design.ru> (http://paf.design.ru)
1.164 ! paf 6: */
1.46 paf 7:
// Version-control identification string for this translation unit
// (the $Id$ keyword is filled in by the revision control system).
static const char* IDENT_STRING_C="$Id: zzz $";
1.4 paf 9:
1.70 paf 10: #include "pcre.h"
11:
1.13 paf 12: #include "pa_pool.h"
1.12 paf 13: #include "pa_string.h"
1.5 paf 14: #include "pa_hash.h"
1.22 paf 15: #include "pa_exception.h"
1.53 paf 16: #include "pa_common.h"
1.60 paf 17: #include "pa_array.h"
18: #include "pa_globals.h"
1.61 paf 19: #include "pa_table.h"
1.101 parser 20: #include "pa_dictionary.h"
1.132 paf 21: #include "pa_charset.h"
1.60 paf 22:
// When defined, String::real_append counts every piece it appends
// in the global counter below.
#define DEBUG_STRING_APPENDS_VS_EXPANDS


#ifdef DEBUG_STRING_APPENDS_VS_EXPANDS
// Number of non-empty pieces appended by String::real_append so far.
ulong string_piece_appends=0;
#endif
29:
1.160 paf 30: String& String::OnPool(Pool& apool, const char *local_src, size_t src_size, bool tainted) {
31: if(local_src && *local_src) {
32: if(src_size==0)
33: src_size=strlen(local_src);
34:
35: char *pooled_src=(char *)apool.malloc(src_size);
36: memcpy(pooled_src, local_src, src_size);
37: return *new(apool) String(apool, pooled_src, src_size, tainted);
38: } else
39: return *new(apool) String(apool);
40: }
1.75 paf 41: String::String(Pool& apool, const char *src, size_t src_size, bool tainted) :
1.120 paf 42: Pooled(apool) {
1.151 paf 43: last_chunk=&head.chunk;
44: head.chunk.count=CR_PREALLOCATED_COUNT;
45: append_here=head.chunk.rows;
1.41 paf 46:
47: if(src)
1.75 paf 48: if(tainted)
49: APPEND_TAINTED(src, src_size, 0, 0);
1.41 paf 50: else
1.75 paf 51: APPEND_CLEAN(src, src_size, 0, 0);
1.1 paf 52: }
1.140 paf 53:
54: String::String(const String& src) :
55: Pooled(src.pool()) {
1.151 paf 56: last_chunk=&head.chunk;
57: head.chunk.count=CR_PREALLOCATED_COUNT;
58: append_here=head.chunk.rows;
1.140 paf 59:
60: append(src, UL_UNSPECIFIED);
1.120 paf 61: }
62:
63: size_t String::size() const {
64: size_t result=0;
1.123 paf 65: STRING_FOREACH_ROW(
1.120 paf 66: result+=row->item.size;
1.123 paf 67: );
1.120 paf 68: return result;
1.94 parser 69: }
70:
1.115 paf 71: /// @todo not very optimal
72: uint String::used_rows() const {
73: uint result=0;
1.123 paf 74: STRING_FOREACH_ROW(
75: result++;
76: );
1.115 paf 77: return result;
78: }
1.94 parser 79: void String::expand() {
1.143 paf 80: uint new_chunk_count=last_chunk->count+CR_GROW_COUNT;
1.139 paf 81: if(new_chunk_count>max_integral(Chunk::count_type))
82: new_chunk_count=max_integral(Chunk::count_type);
1.122 paf 83:
1.152 paf 84: Chunk *new_chunk=static_cast<Chunk *>(malloc(
1.151 paf 85: sizeof(Chunk)// count+interpadding(?)+rows[CR_PREALLOCATED_COUNT]+tailpadding(??)
86: -sizeof(Chunk::rows_type) // PREALLOCATED rows
87: +sizeof(Chunk::Row)*new_chunk_count // neaded rows
88: +sizeof(Chunk *) // link size
89: , 10));
1.141 paf 90: new_chunk->rows[new_chunk->count=new_chunk_count].link=0;
91: last_chunk->rows[last_chunk->count].link=new_chunk;
92:
93: last_chunk=new_chunk;
1.94 parser 94: append_here=last_chunk->rows;
1.5 paf 95: }
1.28 paf 96:
1.13 paf 97: String& String::real_append(STRING_APPEND_PARAMS) {
1.139 paf 98: if(!last_chunk) // growth stopped [we're appended as string to somebody]
1.149 paf 99: throw Exception(0,
1.139 paf 100: this,
1.142 paf 101: "string growth stopped (append cstr)");
1.139 paf 102:
1.9 paf 103: if(!src)
104: return *this;
1.26 paf 105: if(!size)
106: size=strlen(src);
107: if(!size)
1.9 paf 108: return *this;
1.122 paf 109:
1.139 paf 110: #ifdef DEBUG_STRING_APPENDS_VS_EXPANDS
111: string_piece_appends++;
112: #endif
113:
114: // manually unrolled to avoid extra check
115: while(size>max_integral(Chunk::Row::item_size_type)) {
1.122 paf 116: if(chunk_is_full())
117: expand();
118:
119: append_here->item.ptr=src;
1.139 paf 120: append_here->item.size=max_integral(Chunk::Row::item_size_type);
1.122 paf 121: append_here->item.lang=lang;
122: #ifndef NO_STRING_ORIGIN
123: append_here->item.origin.file=file;
124: append_here->item.origin.line=line;
125: #endif
126: append_here++;
127:
1.139 paf 128: src+=max_integral(Chunk::Row::item_size_type);
129: size-=max_integral(Chunk::Row::item_size_type);
1.122 paf 130: }
1.9 paf 131:
1.1 paf 132: if(chunk_is_full())
133: expand();
134:
135: append_here->item.ptr=src;
1.121 paf 136: append_here->item.size=size;
1.52 paf 137: append_here->item.lang=lang;
1.13 paf 138: #ifndef NO_STRING_ORIGIN
1.14 paf 139: append_here->item.origin.file=file;
140: append_here->item.origin.line=line;
1.13 paf 141: #endif
1.115 paf 142: append_here++;
1.1 paf 143:
144: return *this;
1.97 parser 145: }
146:
147: char String::first_char() const {
1.140 paf 148: if(is_empty())
1.149 paf 149: throw Exception(0,
1.97 parser 150: this,
151: "getting first char of empty string");
152:
1.151 paf 153: return *head.chunk.rows[0].item.ptr;
1.1 paf 154: }
155:
1.16 paf 156: uint String::hash_code() const {
1.7 paf 157: uint result=0;
1.123 paf 158: STRING_FOREACH_ROW(
1.6 paf 159: result=Hash::generic_code(result, row->item.ptr, row->item.size);
1.123 paf 160: );
1.5 paf 161: return result;
162: }
163:
/// @todo move 'lang' skipping to pos
/**
	Compares this string, starting at byte @a this_offset, with @a src,
	walking the rows of both strings in parallel.

	@param partial  set to -1 on entry; on exit without a byte difference it is
	                0 = both ended simultaneously,
	                1 = this string ended before @a src,
	                2 = @a src ended first (i.e. @a src starts this+offset).
	                It stays -1 when a byte difference or a language mismatch is returned.
	@param src          string to compare with
	@param this_offset  byte offset in this string to start from; clamped to size()-1
	@param lang         unless UL_UNSPECIFIED, a compared row of this string whose
	                    lang is greater than @a lang makes the function return -1
	@return the memcmp() result of the first differing span, or 0 / -1 / +1
	        matching partial 0 / 1 / 2
*/
int String::cmp(int& partial, const String& src, 
				size_t this_offset, Untaint_lang lang) const {
	partial=-1;
	size_t a_size=size();
	// note: for an empty string a_size-1 wraps around, leaving this_offset as is;
	// the loop below is skipped in that case
	this_offset=min(this_offset, a_size-1);

	// 'a' is this string, 'b' is src: current chunk, current row, offset inside
	// the current row, end-of-data row and rows left in the current chunk
	const Chunk *a_chunk=&head.chunk; 
	const Chunk *b_chunk=&src.head.chunk;
	const Chunk::Row *a_row=a_chunk->rows;
	const Chunk::Row *b_row=b_chunk->rows;
	size_t a_offset=this_offset;
	size_t b_offset=0;
	Chunk::Row *a_end=append_here;
	Chunk::Row *b_end=src.append_here;
	uint a_countdown=a_chunk->count;
	uint b_countdown=b_chunk->count;
	int result;
	// number of bytes of 'a' in the rows already passed
	size_t pos=0;

	bool a_break=a_size==0;
	bool b_break=src.is_empty();
	if(!(a_break || b_break)) while(true) {
		if(pos+a_row->item.size > this_offset) {
			// this row of 'a' reaches past the start offset: compare
			if(lang!=UL_UNSPECIFIED && a_row->item.lang>lang)
				return -1; // wrong lang -- bail out

			// bytes left in the current row of 'a' minus bytes left in the current row of 'b'
			int size_diff=
				(a_row->item.size-a_offset)-
				(b_row->item.size-b_offset);

			if(size_diff==0) { // a has same size as b
				result=memcmp(a_row->item.ptr+a_offset, b_row->item.ptr+b_offset, 
					a_row->item.size-a_offset);
				if(result)
					return result;
				pos+=a_row->item.size;
				a_row++; a_countdown--; a_offset=0;
				b_row++; b_countdown--; b_offset=0;
			} else if (size_diff>0) { // a longer
				result=memcmp(a_row->item.ptr+a_offset, b_row->item.ptr+b_offset, 
					b_row->item.size-b_offset);
				if(result)
					return result;
				// stay on the same row of 'a', move to the next row of 'b'
				a_offset+=b_row->item.size-b_offset;
				b_row++; b_countdown--; b_offset=0;
			} else { // b longer
				result=memcmp(a_row->item.ptr+a_offset, b_row->item.ptr+b_offset, 
					a_row->item.size-a_offset);
				if(result)
					return result;
				// stay on the same row of 'b', move to the next row of 'a'
				b_offset+=a_row->item.size-a_offset;
				pos+=a_row->item.size;
				a_row++; a_countdown--; a_offset=0;
			}
			// intentional assignment: remember whether 'b' is exhausted
			if(b_break=b_row==b_end) {
				a_break=a_row==a_end;
				break;
			}
			// rows of the current 'b' chunk used up: follow the link to the next chunk
			if(!b_countdown) {
				b_chunk=b_row->link;
				b_row=b_chunk->rows;
				b_countdown=b_chunk->count;
			}
		} else {
			// this row of 'a' lies entirely before the start offset: skip it
			a_offset-=a_row->item.size;
			pos+=a_row->item.size;
			a_row++; a_countdown--;
		}

		// intentional assignment: remember whether 'a' is exhausted
		if(a_break=a_row==a_end) {
			b_break=b_row==b_end;
			break;
		}
		// rows of the current 'a' chunk used up: follow the link to the next chunk
		if(!a_countdown) {
			a_chunk=a_row->link;
			a_row=a_chunk->rows;
			a_countdown=a_chunk->count;
		}
	}
	if(a_break==b_break) { // ended simultaneously
		partial=0; return 0;
	} else if(a_break) { // first bytes equal, but a ended before b
		partial=1; return -1;
	} else {
		partial=2; return +1;
	}
}
252:
1.60 paf 253: /// @todo move 'lang' skipping to pos
1.59 paf 254: int String::cmp(int& partial, const char* b_ptr, size_t src_size,
1.60 paf 255: size_t this_offset, Untaint_lang lang) const {
1.59 paf 256: partial=-1;
1.125 paf 257: size_t a_size=size();
1.50 paf 258: size_t b_size=src_size?src_size:b_ptr?strlen(b_ptr):0;
1.125 paf 259: this_offset=min(this_offset, a_size-1);
1.27 paf 260:
1.151 paf 261: const Chunk *a_chunk=&head.chunk;
1.27 paf 262: const Chunk::Row *a_row=a_chunk->rows;
1.59 paf 263: size_t a_offset=this_offset;
1.55 paf 264: size_t b_offset=0;
1.27 paf 265: Chunk::Row *a_end=append_here;
1.116 paf 266: uint a_countdown=a_chunk->count;
1.60 paf 267: size_t pos=0;
1.52 paf 268:
1.125 paf 269: bool a_break=a_size==0;
1.83 parser 270: bool b_break=b_size==0;
271: if(!(a_break || b_break)) while(true) {
1.59 paf 272: if(pos+a_row->item.size > this_offset) {
1.136 paf 273: if(lang!=UL_UNSPECIFIED && a_row->item.lang>lang)
1.60 paf 274: return -1; // wrong lang -- bail out
275:
1.59 paf 276: int size_diff=
277: (a_row->item.size-a_offset)-
278: (b_size-b_offset);
279:
280: if(size_diff==0) { // a has same size as b
1.116 paf 281: if(int result=memcmp(a_row->item.ptr+a_offset, b_ptr+b_offset,
1.59 paf 282: a_row->item.size-a_offset)!=0)
283: return result;
1.60 paf 284: pos+=a_row->item.size;
1.59 paf 285: a_row++; a_countdown--; a_offset=0;
286: b_break=true;
287: } else if (size_diff>0) { // a longer
1.116 paf 288: if(int result=memcmp(a_row->item.ptr+a_offset, b_ptr+b_offset,
1.59 paf 289: b_size-b_offset)!=0)
290: return result;
291: a_offset+=b_size-b_offset;
292: b_break=true;
293: } else { // b longer
1.116 paf 294: if(int result=memcmp(a_row->item.ptr+a_offset, b_ptr+b_offset,
1.59 paf 295: a_row->item.size-a_offset)!=0)
296: return result;
297: b_offset+=a_row->item.size-a_offset;
1.60 paf 298: pos+=a_row->item.size;
1.59 paf 299: a_row++; a_countdown--; a_offset=0;
300: }
301: } else {
1.60 paf 302: a_offset-=a_row->item.size;
303: pos+=a_row->item.size;
304: a_row++; a_countdown--;
1.27 paf 305: }
306:
1.86 parser 307: a_break=a_row==a_end;
308: if(a_break || b_break)
1.83 parser 309: break;
1.27 paf 310: if(!a_countdown) {
311: a_chunk=a_row->link;
312: a_row=a_chunk->rows;
313: a_countdown=a_chunk->count;
1.9 paf 314: }
315: }
1.55 paf 316: if(a_break==b_break) { // ended simultaneously
317: partial=0; return 0;
318: } else if(a_break) { // first bytes equal, but a ended before b
319: partial=1; return -1;
320: } else {
321: partial=2; return +1;
322: }
1.5 paf 323: }
1.46 paf 324:
325: #ifndef NO_STRING_ORIGIN
326: const Origin& String::origin() const {
1.140 paf 327: if(is_empty()) {
1.96 parser 328: static const Origin empty_origin={"empty string"};
329: return empty_origin;
330: }
1.46 paf 331:
1.147 paf 332: // determining origin by first piece or last appended piece
333: // because any of them can be constant=without origin:
1.50 paf 334: // ex: ^load[/file] "document_root" + "/file"
1.80 paf 335: // when last peice is constant,
336: // ex: parser_root_auto_path{dynamic} / auto.p{const}
337: // using first piece
1.151 paf 338: Origin& first_origin=head.chunk.rows[0].item.origin;
1.147 paf 339: return first_origin.file ? first_origin : append_here[-1].item.origin;
1.46 paf 340: }
341: #endif
1.53 paf 342:
1.69 paf 343: String& String::mid(size_t start, size_t finish) const {
1.107 parser 344: String& result=*NEW String(pool());
345:
1.139 paf 346: start=max(size_t(0), start);
1.111 parser 347: finish=min(size(), finish);
1.60 paf 348: if(start==finish)
1.107 parser 349: return result;
1.53 paf 350:
351: size_t pos=0;
1.123 paf 352: STRING_FOREACH_ROW(
353: size_t item_finish=pos+row->item.size;
354: if(item_finish > start) { // started now or already?
1.140 paf 355: bool started=result.is_empty(); // started now?
1.123 paf 356: bool finished=finish <= item_finish; // finished now?
357: size_t offset=started?start-pos:0;
358: size_t size=finished?finish-pos:row->item.size;
359: result.APPEND(
360: row->item.ptr+offset, size-offset,
361: row->item.lang,
362: row->item.origin.file, row->item.origin.line);
363: if(finished)
1.53 paf 364: goto break2;
365: }
1.123 paf 366: pos+=row->item.size;
367: );
1.53 paf 368: break2:
1.60 paf 369: // SAPI::log(pool(), "piece of '%s' from %d to %d is '%s'",
370: //cstr(), start, finish, result.cstr());
1.53 paf 371: return result;
1.54 paf 372: }
373:
1.60 paf 374: int String::pos(const String& substr,
1.116 paf 375: int result, Untaint_lang lang) const {
1.125 paf 376: size_t self_size=size();
1.131 paf 377: for(; size_t(result)<self_size; result++) {
1.60 paf 378: int partial; cmp(partial, substr, result, lang);
1.58 paf 379: if(
380: partial==0 || // full match
381: partial==2) // 'substr' starts 'this'+'result'
382: return result;
383: }
384:
385: return -1;
386: }
387:
1.60 paf 388: int String::pos(const char *substr, size_t substr_size,
1.116 paf 389: int result, Untaint_lang lang) const {
1.125 paf 390: size_t self_size=size();
1.131 paf 391: for(; size_t(result)<self_size; result++) {
1.60 paf 392: int partial; cmp(partial, substr, substr_size, result, lang);
1.55 paf 393: if(
394: partial==0 || // full match
395: partial==2) // 'substr' starts 'this'+'result'
396: return result;
397: }
398:
399: return -1;
1.60 paf 400: }
401:
402: void String::split(Array& result,
403: size_t* pos_after_ref,
404: const char *delim, size_t delim_size,
405: Untaint_lang lang, int limit) const {
1.125 paf 406: size_t self_size=size();
1.60 paf 407: if(delim_size) {
408: size_t pos_after=pos_after_ref?*pos_after_ref:0;
409: int pos_before;
410: // while we have 'delim'...
411: for(; (pos_before=pos(delim, delim_size, pos_after, lang))>=0 && limit; limit--) {
1.69 paf 412: result+=&mid(pos_after, pos_before);
1.60 paf 413: pos_after=pos_before+delim_size;
414: }
415: // last piece
1.124 paf 416: if(pos_after<self_size && limit) {
417: result+=&mid(pos_after, self_size);
418: pos_after=self_size;
1.60 paf 419: }
420: if(pos_after_ref)
421: *pos_after_ref=pos_after;
422: } else { // empty delim
423: result+=this;
424: if(pos_after_ref)
1.124 paf 425: *pos_after_ref+=self_size;
1.60 paf 426: }
427: }
428:
429: void String::split(Array& result,
430: size_t* pos_after_ref,
431: const String& delim, Untaint_lang lang,
432: int limit) const {
1.140 paf 433: if(!delim.is_empty()) {
1.60 paf 434: size_t pos_after=pos_after_ref?*pos_after_ref:0;
435: int pos_before;
436: // while we have 'delim'...
437: for(; (pos_before=pos(delim, pos_after, lang))>=0 && limit; limit--) {
1.69 paf 438: result+=&mid(pos_after, pos_before);
1.60 paf 439: pos_after=pos_before+delim.size();
440: }
441: // last piece
442: if(pos_after<size() && limit) {
1.69 paf 443: result+=&mid(pos_after, size());
1.60 paf 444: pos_after=size();
445: }
446: if(pos_after_ref)
447: *pos_after_ref=pos_after;
448: } else { // empty delim
449: result+=this;
450: if(pos_after_ref)
451: *pos_after_ref+=size();
452: }
1.61 paf 453: }
454:
1.154 paf 455: static void regex_options(const String *options, int *result, bool& need_pre_post_match){
1.63 paf 456: struct Regex_option {
1.153 paf 457: const char *keyL;
458: const char *keyU;
1.63 paf 459: int clear, set;
460: int *result;
1.154 paf 461: bool *flag;
1.63 paf 462: } regex_option[]={
1.153 paf 463: {"i", "I", 0, PCRE_CASELESS, result}, // a=A
464: {"s", "S", 0, PCRE_DOTALL, result}, // \n\n$ [default]
465: {"x", "U", 0, PCRE_EXTENDED, result}, // whitespace in regex ignored
466: {"m", "M", PCRE_DOTALL, PCRE_MULTILINE, result}, // ^aaa\n$^bbb\n$
467: {"g", "G", 0, true, result+1}, // many rows
1.154 paf 468: {"'", 0, 0, 0, 0, &need_pre_post_match},
469: {0}
1.63 paf 470: };
471: result[0]=PCRE_EXTRA | PCRE_DOTALL;
472: result[1]=0;
473:
474: if(options)
1.153 paf 475: for(Regex_option *o=regex_option; o->keyL; o++)
1.154 paf 476: if(options->pos(o->keyL)>=0
477: || (o->keyU && options->pos(o->keyU)>=0)) {
478: if(o->flag)
479: *o->flag=true;
480: else { // result
481: *o->result &= ~o->clear;
482: *o->result |= o->set;
483: }
1.63 paf 484: }
485: }
486:
1.155 paf 487: /// @todo make replacement Table stacked
1.158 paf 488: bool String::match(
1.77 paf 489: const String *aorigin,
1.62 paf 490: const String& regexp,
1.63 paf 491: const String *options,
1.64 paf 492: Table **table,
1.95 parser 493: Row_action row_action, void *info,
494: bool *was_global) const {
1.64 paf 495:
1.140 paf 496: if(regexp.is_empty())
1.149 paf 497: throw Exception(0,
1.73 paf 498: aorigin,
499: "regexp is empty");
1.154 paf 500:
1.118 paf 501: const char *pattern=regexp.cstr();
1.62 paf 502: const char *errptr;
503: int erroffset;
1.154 paf 504: bool need_pre_post_match=false;
505: int option_bits[2]; regex_options(options, option_bits, need_pre_post_match);
1.95 parser 506: if(was_global)
507: *was_global=option_bits[1]!=0;
1.63 paf 508: pcre *code=pcre_compile(pattern, option_bits[0],
1.62 paf 509: &errptr, &erroffset,
1.132 paf 510: pool().get_source_charset().pcre_tables);
1.62 paf 511:
1.67 paf 512: if(!code)
1.149 paf 513: throw Exception(0,
1.69 paf 514: ®exp.mid(erroffset, regexp.size()),
1.74 paf 515: "regular expression syntax error - %s", errptr);
1.62 paf 516:
1.63 paf 517: int info_substrings=pcre_info(code, 0, 0);
518: if(info_substrings<0) {
1.100 parser 519: pcre_free(code);
1.149 paf 520: throw Exception(0,
1.73 paf 521: aorigin,
1.76 paf 522: "pcre_info error (%d)",
1.73 paf 523: info_substrings);
1.63 paf 524: }
525:
1.158 paf 526: const char *subject=cstr();
1.62 paf 527: int length=strlen(subject);
1.155 paf 528: const int ovecsize=(1/*match*/+MAX_STRING_MATCH_TABLE_COLUMNS)*3;
529: int ovector[ovecsize];
530:
531: // create table
1.157 paf 532: *table=NEW Table(pool(), *string_match_table_template);
1.63 paf 533:
1.64 paf 534: int exec_option_bits=0;
1.154 paf 535: int prestart=0;
536: int poststart=0;
537: int postfinish=size();
1.63 paf 538: while(true) {
539: int exec_substrings=pcre_exec(code, 0,
1.154 paf 540: subject, length, prestart,
1.64 paf 541: exec_option_bits, ovector, ovecsize);
1.63 paf 542:
543: if(exec_substrings==PCRE_ERROR_NOMATCH) {
1.100 parser 544: pcre_free(code);
1.154 paf 545: row_action(**table, 0/*last time, no row*/, 0, 0, poststart, postfinish, info);
1.63 paf 546: return option_bits[1]!=0; // global=true+table, not global=false
547: }
548:
549: if(exec_substrings<0) {
1.100 parser 550: pcre_free(code);
1.149 paf 551: throw Exception(0,
1.63 paf 552: aorigin,
1.76 paf 553: "regular expression execute error (%d)",
1.63 paf 554: exec_substrings);
555: }
556:
1.154 paf 557: int prefinish=ovector[0];
558: poststart=ovector[1];
1.63 paf 559: Array& row=*NEW Array(pool());
1.154 paf 560: row+=need_pre_post_match?&mid(0, prefinish):0; // .prematch column value
561: row+=need_pre_post_match?&mid(prefinish, poststart):0; // .match
562: row+=need_pre_post_match?&mid(poststart, postfinish):0; // .postmatch
1.63 paf 563:
564: for(int i=1; i<exec_substrings; i++) {
1.69 paf 565: // -1:-1 case handled peacefully by mid() itself
566: row+=&mid(ovector[i*2+0], ovector[i*2+1]); // .i column value
1.63 paf 567: }
568:
1.154 paf 569: row_action(**table, &row, prestart, prefinish, poststart, postfinish, info);
1.63 paf 570:
1.154 paf 571: if(!option_bits[1] || prestart==poststart) { // not global | going to hang
1.100 parser 572: pcre_free(code);
1.154 paf 573: row_action(**table, 0/*last time, no row*/, 0, 0, poststart, postfinish, info);
1.63 paf 574: return true;
575: }
1.154 paf 576: prestart=poststart;
1.63 paf 577:
578: /*
579: if(option_bits[0] & PCRE_MULTILINE)
1.64 paf 580: exec_option_bits|=PCRE_NOTBOL; // start of subject+startoffset not BOL
1.63 paf 581: */
582: }
1.82 parser 583: }
584:
1.132 paf 585: String& String::change_case(Pool& pool,
1.82 parser 586: Change_case_kind kind) const {
1.132 paf 587: const unsigned char *tables=pool.get_source_charset().pcre_tables;
1.82 parser 588: String& result=*new(pool) String(pool);
589:
590: const unsigned char *a;
591: const unsigned char *b;
592: switch(kind) {
593: case CC_UPPER:
594: a=tables+lcc_offset;
595: b=tables+fcc_offset;
596: break;
597: case CC_LOWER:
598: a=tables+lcc_offset;
599: b=0;
600: break;
601: default:
1.149 paf 602: throw Exception(0,
1.82 parser 603: this,
604: "unknown change case kind #%d",
605: static_cast<int>(kind)); // never
606: a=b=0; // calm, compiler
607: break; // never
608: }
609:
1.143 paf 610: STRING_FOREACH_ROW(
611: char *new_cstr=(char *)pool.malloc(row->item.size, 12);
612: char *dest=new_cstr;
613: const char *src=row->item.ptr;
614: for(int size=row->item.size; size--; src++) {
615: unsigned char c=a[(unsigned char)*src];
616: if(b)
617: c=b[c];
1.82 parser 618:
1.143 paf 619: *dest++=(char)c;
1.82 parser 620: }
1.143 paf 621:
622: result.APPEND(new_cstr, row->item.size,
623: row->item.lang,
624: row->item.origin.file, row->item.origin.line);
625: );
1.89 parser 626:
1.101 parser 627: return result;
628: }
629:
1.150 paf 630: /// @test if in some piece were found no dict words, append it, not it's duplicate
631: String& String::replace(Pool& pool, Dictionary& dict) const {
632: // return reconstruct(pool).replace_in_reconstructed(pool, dict);
1.108 parser 633: String& result=*new(pool) String(pool);
1.150 paf 634:
1.143 paf 635: STRING_FOREACH_ROW(
1.156 paf 636: const char *src=row->item.ptr;
637: size_t src_size=row->item.size;
1.123 paf 638: char *new_cstr=(char *)pool.malloc((size_t)ceil(src_size*dict.max_ratio()), 14);
639: char *dest=new_cstr;
640: while(src_size) {
641: // there is a row where first column starts 'src'
642: if(Table::Item *item=dict.first_that_starts(src, src_size)) {
643: // get a=>b values
644: const String& a=*static_cast<Array *>(item)->get_string(0);
645: const String& b=*static_cast<Array *>(item)->get_string(1);
646: // skip 'a' in 'src' && reduce work size
647: src+=a.size(); src_size-=a.size();
648: // write 'b' to 'dest' && skip 'b' in 'dest'
649: b.store_to(dest); dest+=b.size();
650: } else {
651: // write a char to b && reduce work size
652: *dest++=*src++; src_size--;
1.101 parser 653: }
654: }
655:
1.156 paf 656: result.APPEND(new_cstr, dest-new_cstr, row->item.lang,
657: row->item.origin.file, row->item.origin.line);
658: );
659: return result;
660: }
661:
/**
	Builds a new string on @a pool in which every run of consecutive rows
	sharing the same language is merged into a single piece.

	The merged pieces point into the buffer returned by cstr(); their sizes are
	the sums of the sizes of the rows they replace.
	// NOTE(review): this presumes cstr() lays the rows out back to back in order -- confirm

	@param pool   pool for the resulting String
	@param acstr  when non-null, receives the buffer returned by cstr()
	@return the new string; each piece carries the language and origin of the
	        first row of its run
*/
String& String::join_chains(Pool& pool, char** acstr) const {
	char *lcstr=cstr();
	const char *current=lcstr; // start of the next run inside lcstr

	String& result=*new(pool) String(pool);
	STRING_FOREACH_ROW(
		// the first row of a run supplies origin and language for the whole run
		IFNDEF_NO_STRING_ORIGIN(
			const char *joined_origin_file=row->item.origin.file;
			const size_t joined_origin_line=row->item.origin.line;
		);
		uchar joined_lang=row->item.lang;
		const char *joined_ptr=current;
		// calc size
		size_t joined_size=0;
		// continues iterating from the current row, using the outer loop's state
		STRING_PREPARED_FOREACH_ROW(*this, 
			if(row->item.lang==joined_lang)
				joined_size+=row->item.size;
			else
				break; // before non-ours
		);
		current+=joined_size;

		// pointers are after joined piece
		// & one step back, see STRING_FOREACH_ROW
		--row; ++countdown;

		result.APPEND(joined_ptr, joined_size, joined_lang, 
			joined_origin_file, joined_origin_line);
	);
	
	if(acstr)
		*acstr=lcstr;
	return result;
}
696:
1.90 parser 697: double String::as_double() const {
1.89 parser 698: double result;
1.114 paf 699: const char *cstr;
700: char buf[MAX_NUMBER];
1.151 paf 701: if(head.chunk.rows+1==append_here) {
702: int size=min(head.chunk.rows[0].item.size, MAX_NUMBER-1);
703: memcpy(buf, head.chunk.rows[0].item.ptr, size);
1.114 paf 704: buf[size]=0;
705: cstr=buf;
706: } else
707: cstr=this->cstr();
1.161 paf 708: while(*cstr && isspace(*cstr))
709: cstr++;
710: if(!*cstr)
1.162 paf 711: return 0;
1.161 paf 712:
1.102 parser 713: char *error_pos;
1.89 parser 714: // 0xABC
1.99 parser 715: if(cstr[0]=='0')
716: if(cstr[1]=='x' || cstr[1]=='X')
717: result=(double)(unsigned long)strtol(cstr, &error_pos, 0);
718: else
1.102 parser 719: result=(double)strtod(cstr+1/*skip leading 0*/, &error_pos);
1.89 parser 720: else
1.99 parser 721: result=(double)strtod(cstr, &error_pos);
1.89 parser 722:
1.159 paf 723: while(char c=*error_pos++)
724: if(!isspace(c))
725: throw Exception("number.format",
726: this,
727: "invalid number (double)");
1.89 parser 728:
729: return result;
730: }
1.90 parser 731: int String::as_int() const {
1.89 parser 732: int result;
1.114 paf 733: const char *cstr;
734: char buf[MAX_NUMBER];
1.151 paf 735: if(head.chunk.rows+1==append_here) {
1.163 paf 736: size_t size=min(head.chunk.rows[0].item.size, MAX_NUMBER-1);
1.151 paf 737: memcpy(buf, head.chunk.rows[0].item.ptr, size);
1.114 paf 738: buf[size]=0;
739: cstr=buf;
740: } else
741: cstr=this->cstr();
1.161 paf 742: while(*cstr && isspace(*cstr))
743: cstr++;
744: if(!*cstr)
1.162 paf 745: return 0;
1.161 paf 746:
1.102 parser 747: char *error_pos;
1.89 parser 748: // 0xABC
1.99 parser 749: if(cstr[0]=='0')
750: if(cstr[1]=='x' || cstr[1]=='X')
751: result=(int)(unsigned long)strtol(cstr, &error_pos, 0);
752: else
1.102 parser 753: result=(int)strtol(cstr+1/*skip leading 0*/, &error_pos, 0);
1.89 parser 754: else
755: result=(int)strtol(cstr, &error_pos, 0);
756:
1.159 paf 757: while(char c=*error_pos++)
758: if(!isspace(c))
759: throw Exception("number.format",
760: this,
761: "invalid number (int)");
1.82 parser 762:
763: return result;
1.61 paf 764: }
1.113 parser 765:
1.128 paf 766: inline void ushort2uchars(ushort word, uchar& byte1, uchar& byte2) {
767: byte1=word&0xFF;
768: byte2=word>>8;
769: }
770: inline ushort uchars2ushort(uchar byte1, uchar byte2) {
771: return (byte2<<8) | byte1;
772: }
1.113 parser 773: /* @todo maybe network order worth spending some effort?
774: don't bothering myself with network byte order,
775: am not planning to be able to move resulting file across platforms
776: for now
777: */
778: void String::serialize(size_t prolog_size, void *& buf, size_t& buf_size) const {
779: buf_size=
780: prolog_size
1.126 paf 781: +used_rows()*(sizeof(uchar)+sizeof(ushort))
1.113 parser 782: +size();
1.114 paf 783: buf=malloc(buf_size,15);
1.113 parser 784: char *cur=(char *)buf+prolog_size;
785:
1.123 paf 786: STRING_FOREACH_ROW(
787: // lang
1.126 paf 788: memcpy(cur, &row->item.lang, sizeof(uchar));
789: cur+=sizeof(uchar);
1.123 paf 790: // size
1.128 paf 791: uchar byte1; uchar byte2;
792: ushort2uchars(row->item.size, byte1, byte2);
793: memcpy(cur, &byte1, sizeof(uchar)); cur+=sizeof(uchar);
794: memcpy(cur, &byte2, sizeof(uchar)); cur+=sizeof(uchar);
1.123 paf 795: // bytes
796: memcpy(cur, row->item.ptr, row->item.size);
797: cur+=row->item.size;
798: );
1.113 parser 799: }
1.148 paf 800: bool String::deserialize(size_t prolog_size, void *buf, size_t buf_size, const char *file) {
1.135 paf 801: if(buf_size<=prolog_size)
1.148 paf 802: return false;
1.135 paf 803:
1.126 paf 804: char *cur=(char *)buf+prolog_size;
1.113 parser 805: buf_size-=prolog_size;
806:
807: while(buf_size) {
1.148 paf 808: if(sizeof(uchar)+sizeof(ushort)>buf_size) // lang+size
809: return false;
810:
811: uchar lang=*(uchar *)(cur);
1.128 paf 812: ushort size=uchars2ushort(
813: *(uchar*)(cur+sizeof(uchar)*1),
814: *(uchar*)(cur+sizeof(uchar)*2)
815: );
816:
1.148 paf 817: size_t piece_size=sizeof(uchar)+sizeof(ushort)+size;
818: if(piece_size>buf_size) // buffer overrun, can be on incomplete cache files
819: return false;
820:
1.128 paf 821: const char *ptr=(const char*)(cur+sizeof(uchar)*3);
1.126 paf 822: APPEND(ptr, size, lang, file, 0);
1.113 parser 823:
824: cur+=piece_size;
825: buf_size-=piece_size;
826: }
1.148 paf 827: return true;
1.113 parser 828: }
E-mail: