Annotation of parser3/src/main/pa_string.C, revision 1.170
1.45 paf 1: /** @file
1.55 paf 2: Parser: string class. @see untaint.C.
1.46 paf 3:
1.137 paf 4: Copyright (c) 2001, 2002 ArtLebedev Group (http://www.artlebedev.com)
1.138 paf 5: Author: Alexandr Petrosian <paf@design.ru> (http://paf.design.ru)
1.164 paf 6: */
1.46 paf 7:
/// Revision timestamp of this file; the $Date$ keyword is filled in by the version control system.
static const char *IDENT_STRING_C = "$Date: 2002/08/21 10:52:49 $";
1.4 paf 9:
1.70 paf 10: #include "pcre.h"
11:
1.13 paf 12: #include "pa_pool.h"
1.12 paf 13: #include "pa_string.h"
1.5 paf 14: #include "pa_hash.h"
1.22 paf 15: #include "pa_exception.h"
1.53 paf 16: #include "pa_common.h"
1.60 paf 17: #include "pa_array.h"
18: #include "pa_globals.h"
1.61 paf 19: #include "pa_table.h"
1.101 parser 20: #include "pa_dictionary.h"
1.132 paf 21: #include "pa_charset.h"
1.60 paf 22:
1.139 paf 23: #define DEBUG_STRING_APPENDS_VS_EXPANDS
24:
25:
26: #ifdef DEBUG_STRING_APPENDS_VS_EXPANDS
27: ulong string_piece_appends=0;
28: #endif
29:
1.160 paf 30: String& String::OnPool(Pool& apool, const char *local_src, size_t src_size, bool tainted) {
31: if(local_src && *local_src) {
32: if(src_size==0)
33: src_size=strlen(local_src);
34:
35: char *pooled_src=(char *)apool.malloc(src_size);
36: memcpy(pooled_src, local_src, src_size);
37: return *new(apool) String(apool, pooled_src, src_size, tainted);
38: } else
39: return *new(apool) String(apool);
40: }
1.75 paf 41: String::String(Pool& apool, const char *src, size_t src_size, bool tainted) :
1.120 paf 42: Pooled(apool) {
1.151 paf 43: last_chunk=&head.chunk;
44: head.chunk.count=CR_PREALLOCATED_COUNT;
45: append_here=head.chunk.rows;
1.41 paf 46:
47: if(src)
1.75 paf 48: if(tainted)
49: APPEND_TAINTED(src, src_size, 0, 0);
1.41 paf 50: else
1.75 paf 51: APPEND_CLEAN(src, src_size, 0, 0);
1.1 paf 52: }
1.140 paf 53:
54: String::String(const String& src) :
55: Pooled(src.pool()) {
1.151 paf 56: last_chunk=&head.chunk;
57: head.chunk.count=CR_PREALLOCATED_COUNT;
58: append_here=head.chunk.rows;
1.140 paf 59:
1.169 paf 60: append(src, UL_PASS_APPENDED);
1.120 paf 61: }
62:
63: size_t String::size() const {
64: size_t result=0;
1.123 paf 65: STRING_FOREACH_ROW(
1.120 paf 66: result+=row->item.size;
1.123 paf 67: );
1.120 paf 68: return result;
1.94 parser 69: }
70:
1.115 paf 71: /// @todo not very optimal
72: uint String::used_rows() const {
73: uint result=0;
1.123 paf 74: STRING_FOREACH_ROW(
75: result++;
76: );
1.115 paf 77: return result;
78: }
1.94 parser 79: void String::expand() {
1.143 paf 80: uint new_chunk_count=last_chunk->count+CR_GROW_COUNT;
1.139 paf 81: if(new_chunk_count>max_integral(Chunk::count_type))
82: new_chunk_count=max_integral(Chunk::count_type);
1.122 paf 83:
1.152 paf 84: Chunk *new_chunk=static_cast<Chunk *>(malloc(
1.151 paf 85: sizeof(Chunk)// count+interpadding(?)+rows[CR_PREALLOCATED_COUNT]+tailpadding(??)
86: -sizeof(Chunk::rows_type) // PREALLOCATED rows
87: +sizeof(Chunk::Row)*new_chunk_count // neaded rows
88: +sizeof(Chunk *) // link size
89: , 10));
1.141 paf 90: new_chunk->rows[new_chunk->count=new_chunk_count].link=0;
91: last_chunk->rows[last_chunk->count].link=new_chunk;
92:
93: last_chunk=new_chunk;
1.94 parser 94: append_here=last_chunk->rows;
1.5 paf 95: }
1.28 paf 96:
1.13 paf 97: String& String::real_append(STRING_APPEND_PARAMS) {
1.139 paf 98: if(!last_chunk) // growth stopped [we're appended as string to somebody]
1.149 paf 99: throw Exception(0,
1.139 paf 100: this,
1.142 paf 101: "string growth stopped (append cstr)");
1.139 paf 102:
1.9 paf 103: if(!src)
104: return *this;
1.26 paf 105: if(!size)
106: size=strlen(src);
107: if(!size)
1.9 paf 108: return *this;
1.122 paf 109:
1.139 paf 110: #ifdef DEBUG_STRING_APPENDS_VS_EXPANDS
111: string_piece_appends++;
112: #endif
113:
114: // manually unrolled to avoid extra check
115: while(size>max_integral(Chunk::Row::item_size_type)) {
1.122 paf 116: if(chunk_is_full())
117: expand();
118:
119: append_here->item.ptr=src;
1.139 paf 120: append_here->item.size=max_integral(Chunk::Row::item_size_type);
1.122 paf 121: append_here->item.lang=lang;
122: #ifndef NO_STRING_ORIGIN
123: append_here->item.origin.file=file;
124: append_here->item.origin.line=line;
125: #endif
126: append_here++;
127:
1.139 paf 128: src+=max_integral(Chunk::Row::item_size_type);
129: size-=max_integral(Chunk::Row::item_size_type);
1.122 paf 130: }
1.9 paf 131:
1.1 paf 132: if(chunk_is_full())
133: expand();
134:
135: append_here->item.ptr=src;
1.121 paf 136: append_here->item.size=size;
1.52 paf 137: append_here->item.lang=lang;
1.13 paf 138: #ifndef NO_STRING_ORIGIN
1.14 paf 139: append_here->item.origin.file=file;
140: append_here->item.origin.line=line;
1.13 paf 141: #endif
1.115 paf 142: append_here++;
1.1 paf 143:
144: return *this;
145: }
146:
1.16 paf 147: uint String::hash_code() const {
1.7 paf 148: uint result=0;
1.123 paf 149: STRING_FOREACH_ROW(
1.6 paf 150: result=Hash::generic_code(result, row->item.ptr, row->item.size);
1.123 paf 151: );
1.5 paf 152: return result;
153: }
154:
1.60 paf 155: /// @todo move 'lang' skipping to pos
156: int String::cmp(int& partial, const String& src,
157: size_t this_offset, Untaint_lang lang) const {
1.59 paf 158: partial=-1;
1.125 paf 159: size_t a_size=size();
160: this_offset=min(this_offset, a_size-1);
1.55 paf 161:
1.151 paf 162: const Chunk *a_chunk=&head.chunk;
163: const Chunk *b_chunk=&src.head.chunk;
1.16 paf 164: const Chunk::Row *a_row=a_chunk->rows;
165: const Chunk::Row *b_row=b_chunk->rows;
1.55 paf 166: size_t a_offset=this_offset;
167: size_t b_offset=0;
1.9 paf 168: Chunk::Row *a_end=append_here;
169: Chunk::Row *b_end=src.append_here;
1.116 paf 170: uint a_countdown=a_chunk->count;
171: uint b_countdown=b_chunk->count;
172: int result;
1.60 paf 173: size_t pos=0;
1.33 paf 174:
1.125 paf 175: bool a_break=a_size==0;
1.140 paf 176: bool b_break=src.is_empty();
1.83 parser 177: if(!(a_break || b_break)) while(true) {
1.55 paf 178: if(pos+a_row->item.size > this_offset) {
1.136 paf 179: if(lang!=UL_UNSPECIFIED && a_row->item.lang>lang)
1.60 paf 180: return -1; // wrong lang -- bail out
181:
1.55 paf 182: int size_diff=
183: (a_row->item.size-a_offset)-
184: (b_row->item.size-b_offset);
185:
186: if(size_diff==0) { // a has same size as b
1.60 paf 187: result=memcmp(a_row->item.ptr+a_offset, b_row->item.ptr+b_offset,
188: a_row->item.size-a_offset);
1.55 paf 189: if(result)
190: return result;
1.60 paf 191: pos+=a_row->item.size;
1.55 paf 192: a_row++; a_countdown--; a_offset=0;
193: b_row++; b_countdown--; b_offset=0;
194: } else if (size_diff>0) { // a longer
1.60 paf 195: result=memcmp(a_row->item.ptr+a_offset, b_row->item.ptr+b_offset,
196: b_row->item.size-b_offset);
1.55 paf 197: if(result)
198: return result;
199: a_offset+=b_row->item.size-b_offset;
200: b_row++; b_countdown--; b_offset=0;
201: } else { // b longer
1.60 paf 202: result=memcmp(a_row->item.ptr+a_offset, b_row->item.ptr+b_offset,
203: a_row->item.size-a_offset);
1.55 paf 204: if(result)
205: return result;
206: b_offset+=a_row->item.size-a_offset;
1.60 paf 207: pos+=a_row->item.size;
1.55 paf 208: a_row++; a_countdown--; a_offset=0;
209: }
1.83 parser 210: if(b_break=b_row==b_end) {
211: a_break=a_row==a_end;
212: break;
213: }
1.55 paf 214: if(!b_countdown) {
215: b_chunk=b_row->link;
216: b_row=b_chunk->rows;
217: b_countdown=b_chunk->count;
218: }
219: } else {
1.60 paf 220: a_offset-=a_row->item.size;
221: pos+=a_row->item.size;
222: a_row++; a_countdown--;
1.9 paf 223: }
224:
1.83 parser 225: if(a_break=a_row==a_end) {
226: b_break=b_row==b_end;
227: break;
228: }
1.11 paf 229: if(!a_countdown) {
1.9 paf 230: a_chunk=a_row->link;
231: a_row=a_chunk->rows;
1.11 paf 232: a_countdown=a_chunk->count;
1.9 paf 233: }
1.27 paf 234: }
1.55 paf 235: if(a_break==b_break) { // ended simultaneously
236: partial=0; return 0;
237: } else if(a_break) { // first bytes equal, but a ended before b
238: partial=1; return -1;
239: } else {
240: partial=2; return +1;
241: }
1.27 paf 242: }
243:
1.60 paf 244: /// @todo move 'lang' skipping to pos
1.59 paf 245: int String::cmp(int& partial, const char* b_ptr, size_t src_size,
1.60 paf 246: size_t this_offset, Untaint_lang lang) const {
1.59 paf 247: partial=-1;
1.125 paf 248: size_t a_size=size();
1.50 paf 249: size_t b_size=src_size?src_size:b_ptr?strlen(b_ptr):0;
1.125 paf 250: this_offset=min(this_offset, a_size-1);
1.27 paf 251:
1.151 paf 252: const Chunk *a_chunk=&head.chunk;
1.27 paf 253: const Chunk::Row *a_row=a_chunk->rows;
1.59 paf 254: size_t a_offset=this_offset;
1.55 paf 255: size_t b_offset=0;
1.27 paf 256: Chunk::Row *a_end=append_here;
1.116 paf 257: uint a_countdown=a_chunk->count;
1.60 paf 258: size_t pos=0;
1.52 paf 259:
1.125 paf 260: bool a_break=a_size==0;
1.83 parser 261: bool b_break=b_size==0;
262: if(!(a_break || b_break)) while(true) {
1.59 paf 263: if(pos+a_row->item.size > this_offset) {
1.136 paf 264: if(lang!=UL_UNSPECIFIED && a_row->item.lang>lang)
1.60 paf 265: return -1; // wrong lang -- bail out
266:
1.59 paf 267: int size_diff=
268: (a_row->item.size-a_offset)-
269: (b_size-b_offset);
270:
271: if(size_diff==0) { // a has same size as b
1.116 paf 272: if(int result=memcmp(a_row->item.ptr+a_offset, b_ptr+b_offset,
1.59 paf 273: a_row->item.size-a_offset)!=0)
274: return result;
1.60 paf 275: pos+=a_row->item.size;
1.59 paf 276: a_row++; a_countdown--; a_offset=0;
277: b_break=true;
278: } else if (size_diff>0) { // a longer
1.116 paf 279: if(int result=memcmp(a_row->item.ptr+a_offset, b_ptr+b_offset,
1.59 paf 280: b_size-b_offset)!=0)
281: return result;
282: a_offset+=b_size-b_offset;
283: b_break=true;
284: } else { // b longer
1.116 paf 285: if(int result=memcmp(a_row->item.ptr+a_offset, b_ptr+b_offset,
1.59 paf 286: a_row->item.size-a_offset)!=0)
287: return result;
288: b_offset+=a_row->item.size-a_offset;
1.60 paf 289: pos+=a_row->item.size;
1.59 paf 290: a_row++; a_countdown--; a_offset=0;
291: }
292: } else {
1.60 paf 293: a_offset-=a_row->item.size;
294: pos+=a_row->item.size;
295: a_row++; a_countdown--;
1.27 paf 296: }
297:
1.86 parser 298: a_break=a_row==a_end;
299: if(a_break || b_break)
1.83 parser 300: break;
1.27 paf 301: if(!a_countdown) {
302: a_chunk=a_row->link;
303: a_row=a_chunk->rows;
304: a_countdown=a_chunk->count;
1.9 paf 305: }
306: }
1.55 paf 307: if(a_break==b_break) { // ended simultaneously
308: partial=0; return 0;
309: } else if(a_break) { // first bytes equal, but a ended before b
310: partial=1; return -1;
311: } else {
312: partial=2; return +1;
313: }
1.5 paf 314: }
1.46 paf 315:
316: #ifndef NO_STRING_ORIGIN
317: const Origin& String::origin() const {
1.140 paf 318: if(is_empty()) {
1.96 parser 319: static const Origin empty_origin={"empty string"};
320: return empty_origin;
321: }
1.46 paf 322:
1.147 paf 323: // determining origin by first piece or last appended piece
324: // because any of them can be constant=without origin:
1.50 paf 325: // ex: ^load[/file] "document_root" + "/file"
1.80 paf 326: // when last peice is constant,
327: // ex: parser_root_auto_path{dynamic} / auto.p{const}
328: // using first piece
1.151 paf 329: Origin& first_origin=head.chunk.rows[0].item.origin;
1.147 paf 330: return first_origin.file ? first_origin : append_here[-1].item.origin;
1.46 paf 331: }
332: #endif
1.53 paf 333:
1.69 paf 334: String& String::mid(size_t start, size_t finish) const {
1.107 parser 335: String& result=*NEW String(pool());
336:
1.166 paf 337: start=min(start, size());
1.167 paf 338: finish=max(start, finish);
1.60 paf 339: if(start==finish)
1.107 parser 340: return result;
1.53 paf 341:
342: size_t pos=0;
1.123 paf 343: STRING_FOREACH_ROW(
344: size_t item_finish=pos+row->item.size;
345: if(item_finish > start) { // started now or already?
1.140 paf 346: bool started=result.is_empty(); // started now?
1.123 paf 347: bool finished=finish <= item_finish; // finished now?
348: size_t offset=started?start-pos:0;
349: size_t size=finished?finish-pos:row->item.size;
350: result.APPEND(
351: row->item.ptr+offset, size-offset,
352: row->item.lang,
353: row->item.origin.file, row->item.origin.line);
354: if(finished)
1.53 paf 355: goto break2;
356: }
1.123 paf 357: pos+=row->item.size;
358: );
1.53 paf 359: break2:
1.60 paf 360: // SAPI::log(pool(), "piece of '%s' from %d to %d is '%s'",
361: //cstr(), start, finish, result.cstr());
1.53 paf 362: return result;
1.54 paf 363: }
364:
1.60 paf 365: int String::pos(const String& substr,
1.116 paf 366: int result, Untaint_lang lang) const {
1.125 paf 367: size_t self_size=size();
1.131 paf 368: for(; size_t(result)<self_size; result++) {
1.60 paf 369: int partial; cmp(partial, substr, result, lang);
1.58 paf 370: if(
371: partial==0 || // full match
372: partial==2) // 'substr' starts 'this'+'result'
373: return result;
374: }
375:
376: return -1;
377: }
378:
1.60 paf 379: int String::pos(const char *substr, size_t substr_size,
1.116 paf 380: int result, Untaint_lang lang) const {
1.125 paf 381: size_t self_size=size();
1.131 paf 382: for(; size_t(result)<self_size; result++) {
1.60 paf 383: int partial; cmp(partial, substr, substr_size, result, lang);
1.55 paf 384: if(
385: partial==0 || // full match
386: partial==2) // 'substr' starts 'this'+'result'
387: return result;
388: }
389:
390: return -1;
1.60 paf 391: }
392:
393: void String::split(Array& result,
394: size_t* pos_after_ref,
395: const char *delim, size_t delim_size,
396: Untaint_lang lang, int limit) const {
1.125 paf 397: size_t self_size=size();
1.60 paf 398: if(delim_size) {
399: size_t pos_after=pos_after_ref?*pos_after_ref:0;
400: int pos_before;
401: // while we have 'delim'...
402: for(; (pos_before=pos(delim, delim_size, pos_after, lang))>=0 && limit; limit--) {
1.69 paf 403: result+=&mid(pos_after, pos_before);
1.60 paf 404: pos_after=pos_before+delim_size;
405: }
406: // last piece
1.124 paf 407: if(pos_after<self_size && limit) {
408: result+=&mid(pos_after, self_size);
409: pos_after=self_size;
1.60 paf 410: }
411: if(pos_after_ref)
412: *pos_after_ref=pos_after;
413: } else { // empty delim
414: result+=this;
415: if(pos_after_ref)
1.124 paf 416: *pos_after_ref+=self_size;
1.60 paf 417: }
418: }
419:
420: void String::split(Array& result,
421: size_t* pos_after_ref,
422: const String& delim, Untaint_lang lang,
423: int limit) const {
1.140 paf 424: if(!delim.is_empty()) {
1.60 paf 425: size_t pos_after=pos_after_ref?*pos_after_ref:0;
426: int pos_before;
427: // while we have 'delim'...
428: for(; (pos_before=pos(delim, pos_after, lang))>=0 && limit; limit--) {
1.69 paf 429: result+=&mid(pos_after, pos_before);
1.60 paf 430: pos_after=pos_before+delim.size();
431: }
432: // last piece
433: if(pos_after<size() && limit) {
1.69 paf 434: result+=&mid(pos_after, size());
1.60 paf 435: pos_after=size();
436: }
437: if(pos_after_ref)
438: *pos_after_ref=pos_after;
439: } else { // empty delim
440: result+=this;
441: if(pos_after_ref)
442: *pos_after_ref+=size();
443: }
1.61 paf 444: }
445:
1.154 paf 446: static void regex_options(const String *options, int *result, bool& need_pre_post_match){
1.63 paf 447: struct Regex_option {
1.153 paf 448: const char *keyL;
449: const char *keyU;
1.63 paf 450: int clear, set;
451: int *result;
1.154 paf 452: bool *flag;
1.63 paf 453: } regex_option[]={
1.153 paf 454: {"i", "I", 0, PCRE_CASELESS, result}, // a=A
455: {"s", "S", 0, PCRE_DOTALL, result}, // \n\n$ [default]
456: {"x", "U", 0, PCRE_EXTENDED, result}, // whitespace in regex ignored
457: {"m", "M", PCRE_DOTALL, PCRE_MULTILINE, result}, // ^aaa\n$^bbb\n$
458: {"g", "G", 0, true, result+1}, // many rows
1.154 paf 459: {"'", 0, 0, 0, 0, &need_pre_post_match},
460: {0}
1.63 paf 461: };
462: result[0]=PCRE_EXTRA | PCRE_DOTALL;
463: result[1]=0;
464:
465: if(options)
1.153 paf 466: for(Regex_option *o=regex_option; o->keyL; o++)
1.154 paf 467: if(options->pos(o->keyL)>=0
468: || (o->keyU && options->pos(o->keyU)>=0)) {
469: if(o->flag)
470: *o->flag=true;
471: else { // result
472: *o->result &= ~o->clear;
473: *o->result |= o->set;
474: }
1.63 paf 475: }
476: }
477:
1.155 paf 478: /// @todo make replacement Table stacked
1.158 paf 479: bool String::match(
1.77 paf 480: const String *aorigin,
1.62 paf 481: const String& regexp,
1.63 paf 482: const String *options,
1.64 paf 483: Table **table,
1.95 parser 484: Row_action row_action, void *info,
485: bool *was_global) const {
1.64 paf 486:
1.140 paf 487: if(regexp.is_empty())
1.149 paf 488: throw Exception(0,
1.73 paf 489: aorigin,
490: "regexp is empty");
1.154 paf 491:
1.118 paf 492: const char *pattern=regexp.cstr();
1.62 paf 493: const char *errptr;
494: int erroffset;
1.154 paf 495: bool need_pre_post_match=false;
496: int option_bits[2]; regex_options(options, option_bits, need_pre_post_match);
1.95 parser 497: if(was_global)
498: *was_global=option_bits[1]!=0;
1.63 paf 499: pcre *code=pcre_compile(pattern, option_bits[0],
1.62 paf 500: &errptr, &erroffset,
1.132 paf 501: pool().get_source_charset().pcre_tables);
1.62 paf 502:
1.67 paf 503: if(!code)
1.149 paf 504: throw Exception(0,
1.69 paf 505: ®exp.mid(erroffset, regexp.size()),
1.74 paf 506: "regular expression syntax error - %s", errptr);
1.62 paf 507:
1.63 paf 508: int info_substrings=pcre_info(code, 0, 0);
509: if(info_substrings<0) {
1.100 parser 510: pcre_free(code);
1.149 paf 511: throw Exception(0,
1.73 paf 512: aorigin,
1.76 paf 513: "pcre_info error (%d)",
1.73 paf 514: info_substrings);
1.63 paf 515: }
516:
1.158 paf 517: const char *subject=cstr();
1.62 paf 518: int length=strlen(subject);
1.155 paf 519: const int ovecsize=(1/*match*/+MAX_STRING_MATCH_TABLE_COLUMNS)*3;
520: int ovector[ovecsize];
521:
522: // create table
1.157 paf 523: *table=NEW Table(pool(), *string_match_table_template);
1.63 paf 524:
1.64 paf 525: int exec_option_bits=0;
1.154 paf 526: int prestart=0;
527: int poststart=0;
528: int postfinish=size();
1.63 paf 529: while(true) {
530: int exec_substrings=pcre_exec(code, 0,
1.154 paf 531: subject, length, prestart,
1.64 paf 532: exec_option_bits, ovector, ovecsize);
1.63 paf 533:
534: if(exec_substrings==PCRE_ERROR_NOMATCH) {
1.100 parser 535: pcre_free(code);
1.154 paf 536: row_action(**table, 0/*last time, no row*/, 0, 0, poststart, postfinish, info);
1.63 paf 537: return option_bits[1]!=0; // global=true+table, not global=false
538: }
539:
540: if(exec_substrings<0) {
1.100 parser 541: pcre_free(code);
1.149 paf 542: throw Exception(0,
1.63 paf 543: aorigin,
1.76 paf 544: "regular expression execute error (%d)",
1.63 paf 545: exec_substrings);
546: }
547:
1.154 paf 548: int prefinish=ovector[0];
549: poststart=ovector[1];
1.63 paf 550: Array& row=*NEW Array(pool());
1.154 paf 551: row+=need_pre_post_match?&mid(0, prefinish):0; // .prematch column value
552: row+=need_pre_post_match?&mid(prefinish, poststart):0; // .match
553: row+=need_pre_post_match?&mid(poststart, postfinish):0; // .postmatch
1.63 paf 554:
555: for(int i=1; i<exec_substrings; i++) {
1.69 paf 556: // -1:-1 case handled peacefully by mid() itself
557: row+=&mid(ovector[i*2+0], ovector[i*2+1]); // .i column value
1.63 paf 558: }
559:
1.154 paf 560: row_action(**table, &row, prestart, prefinish, poststart, postfinish, info);
1.63 paf 561:
1.154 paf 562: if(!option_bits[1] || prestart==poststart) { // not global | going to hang
1.100 parser 563: pcre_free(code);
1.154 paf 564: row_action(**table, 0/*last time, no row*/, 0, 0, poststart, postfinish, info);
1.63 paf 565: return true;
566: }
1.154 paf 567: prestart=poststart;
1.63 paf 568:
569: /*
570: if(option_bits[0] & PCRE_MULTILINE)
1.64 paf 571: exec_option_bits|=PCRE_NOTBOL; // start of subject+startoffset not BOL
1.63 paf 572: */
573: }
1.82 parser 574: }
575:
1.132 paf 576: String& String::change_case(Pool& pool,
1.82 parser 577: Change_case_kind kind) const {
1.132 paf 578: const unsigned char *tables=pool.get_source_charset().pcre_tables;
1.82 parser 579: String& result=*new(pool) String(pool);
580:
581: const unsigned char *a;
582: const unsigned char *b;
583: switch(kind) {
584: case CC_UPPER:
585: a=tables+lcc_offset;
586: b=tables+fcc_offset;
587: break;
588: case CC_LOWER:
589: a=tables+lcc_offset;
590: b=0;
591: break;
592: default:
1.149 paf 593: throw Exception(0,
1.82 parser 594: this,
595: "unknown change case kind #%d",
596: static_cast<int>(kind)); // never
597: a=b=0; // calm, compiler
598: break; // never
599: }
600:
1.143 paf 601: STRING_FOREACH_ROW(
602: char *new_cstr=(char *)pool.malloc(row->item.size, 12);
603: char *dest=new_cstr;
604: const char *src=row->item.ptr;
605: for(int size=row->item.size; size--; src++) {
606: unsigned char c=a[(unsigned char)*src];
607: if(b)
608: c=b[c];
1.82 parser 609:
1.143 paf 610: *dest++=(char)c;
1.82 parser 611: }
1.143 paf 612:
613: result.APPEND(new_cstr, row->item.size,
614: row->item.lang,
615: row->item.origin.file, row->item.origin.line);
616: );
1.89 parser 617:
1.101 parser 618: return result;
619: }
620:
1.150 paf 621: /// @test if in some piece were found no dict words, append it, not it's duplicate
622: String& String::replace(Pool& pool, Dictionary& dict) const {
1.170 ! paf 623: char *lcstr=cstr();
! 624: const char *current=lcstr;
! 625:
1.108 parser 626: String& result=*new(pool) String(pool);
1.143 paf 627: STRING_FOREACH_ROW(
1.170 ! paf 628: IFNDEF_NO_STRING_ORIGIN(
! 629: const char *joined_origin_file=row->item.origin.file;
! 630: const size_t joined_origin_line=row->item.origin.line;
! 631: );
! 632: uchar joined_lang=row->item.lang;
! 633: const char *joined_ptr=current;
! 634: // calc size
! 635: size_t joined_size=0;
! 636: STRING_PREPARED_FOREACH_ROW(*this,
! 637: if(row->item.lang==joined_lang)
! 638: joined_size+=row->item.size;
! 639: else
! 640: break; // before non-ours
! 641: );
! 642: current+=joined_size;
! 643:
! 644: // pointers are after joined piece
! 645: // & one step back, see STRING_FOREACH_ROW
! 646: --row; ++countdown;
! 647:
! 648: char *new_cstr=(char *)pool.malloc((size_t)ceil(joined_size*dict.max_ratio()), 14);
1.123 paf 649: char *dest=new_cstr;
1.170 ! paf 650: while(joined_size) {
! 651: // there is a row where first column starts 'joined_ptr'
! 652: if(Table::Item *item=dict.first_that_starts(joined_ptr, joined_size)) {
1.123 paf 653: // get a=>b values
654: const String& a=*static_cast<Array *>(item)->get_string(0);
655: const String& b=*static_cast<Array *>(item)->get_string(1);
1.170 ! paf 656: // skip 'a' in 'joined_ptr' && reduce work size
! 657: joined_ptr+=a.size(); joined_size-=a.size();
1.123 paf 658: // write 'b' to 'dest' && skip 'b' in 'dest'
659: b.store_to(dest); dest+=b.size();
660: } else {
661: // write a char to b && reduce work size
1.170 ! paf 662: *dest++=*joined_ptr++; joined_size--;
1.101 parser 663: }
664: }
665:
1.170 ! paf 666: result.APPEND(new_cstr, dest-new_cstr, joined_lang,
! 667: joined_origin_file, joined_origin_line);
1.156 paf 668: );
1.170 ! paf 669:
1.156 paf 670: return result;
671: }
672:
673: String& String::join_chains(Pool& pool, char** acstr) const {
674: char *lcstr=cstr();
675: const char *current=lcstr;
676:
677: String& result=*new(pool) String(pool);
678: STRING_FOREACH_ROW(
679: IFNDEF_NO_STRING_ORIGIN(
680: const char *joined_origin_file=row->item.origin.file;
681: const size_t joined_origin_line=row->item.origin.line;
682: );
683: uchar joined_lang=row->item.lang;
684: const char *joined_ptr=current;
685: // calc size
686: size_t joined_size=0;
687: STRING_PREPARED_FOREACH_ROW(*this,
688: if(row->item.lang==joined_lang)
689: joined_size+=row->item.size;
690: else
691: break; // before non-ours
692: );
693: current+=joined_size;
694:
695: // pointers are after joined piece
696: // & one step back, see STRING_FOREACH_ROW
697: --row; ++countdown;
698:
699: result.APPEND(joined_ptr, joined_size, joined_lang,
1.150 paf 700: joined_origin_file, joined_origin_line);
1.123 paf 701: );
1.156 paf 702:
703: if(acstr)
704: *acstr=lcstr;
1.89 parser 705: return result;
706: }
707:
1.90 parser 708: double String::as_double() const {
1.89 parser 709: double result;
1.114 paf 710: const char *cstr;
711: char buf[MAX_NUMBER];
1.151 paf 712: if(head.chunk.rows+1==append_here) {
713: int size=min(head.chunk.rows[0].item.size, MAX_NUMBER-1);
714: memcpy(buf, head.chunk.rows[0].item.ptr, size);
1.114 paf 715: buf[size]=0;
716: cstr=buf;
717: } else
718: cstr=this->cstr();
1.161 paf 719: while(*cstr && isspace(*cstr))
720: cstr++;
721: if(!*cstr)
1.162 paf 722: return 0;
1.161 paf 723:
1.102 parser 724: char *error_pos;
1.89 parser 725: // 0xABC
1.99 parser 726: if(cstr[0]=='0')
727: if(cstr[1]=='x' || cstr[1]=='X')
728: result=(double)(unsigned long)strtol(cstr, &error_pos, 0);
729: else
1.102 parser 730: result=(double)strtod(cstr+1/*skip leading 0*/, &error_pos);
1.89 parser 731: else
1.99 parser 732: result=(double)strtod(cstr, &error_pos);
1.89 parser 733:
1.159 paf 734: while(char c=*error_pos++)
735: if(!isspace(c))
736: throw Exception("number.format",
737: this,
738: "invalid number (double)");
1.89 parser 739:
740: return result;
741: }
1.90 parser 742: int String::as_int() const {
1.89 parser 743: int result;
1.114 paf 744: const char *cstr;
745: char buf[MAX_NUMBER];
1.151 paf 746: if(head.chunk.rows+1==append_here) {
1.163 paf 747: size_t size=min(head.chunk.rows[0].item.size, MAX_NUMBER-1);
1.151 paf 748: memcpy(buf, head.chunk.rows[0].item.ptr, size);
1.114 paf 749: buf[size]=0;
750: cstr=buf;
751: } else
752: cstr=this->cstr();
1.161 paf 753: while(*cstr && isspace(*cstr))
754: cstr++;
755: if(!*cstr)
1.162 paf 756: return 0;
1.161 paf 757:
1.102 parser 758: char *error_pos;
1.89 parser 759: // 0xABC
1.99 parser 760: if(cstr[0]=='0')
761: if(cstr[1]=='x' || cstr[1]=='X')
762: result=(int)(unsigned long)strtol(cstr, &error_pos, 0);
763: else
1.102 parser 764: result=(int)strtol(cstr+1/*skip leading 0*/, &error_pos, 0);
1.89 parser 765: else
766: result=(int)strtol(cstr, &error_pos, 0);
767:
1.159 paf 768: while(char c=*error_pos++)
769: if(!isspace(c))
770: throw Exception("number.format",
771: this,
772: "invalid number (int)");
1.82 parser 773:
774: return result;
1.61 paf 775: }
1.113 parser 776:
1.128 paf 777: inline void ushort2uchars(ushort word, uchar& byte1, uchar& byte2) {
778: byte1=word&0xFF;
779: byte2=word>>8;
780: }
781: inline ushort uchars2ushort(uchar byte1, uchar byte2) {
782: return (byte2<<8) | byte1;
783: }
1.113 parser 784: /* @todo maybe network order worth spending some effort?
785: don't bothering myself with network byte order,
786: am not planning to be able to move resulting file across platforms
787: for now
788: */
789: void String::serialize(size_t prolog_size, void *& buf, size_t& buf_size) const {
790: buf_size=
791: prolog_size
1.126 paf 792: +used_rows()*(sizeof(uchar)+sizeof(ushort))
1.113 parser 793: +size();
1.114 paf 794: buf=malloc(buf_size,15);
1.113 parser 795: char *cur=(char *)buf+prolog_size;
796:
1.123 paf 797: STRING_FOREACH_ROW(
798: // lang
1.126 paf 799: memcpy(cur, &row->item.lang, sizeof(uchar));
800: cur+=sizeof(uchar);
1.123 paf 801: // size
1.128 paf 802: uchar byte1; uchar byte2;
803: ushort2uchars(row->item.size, byte1, byte2);
804: memcpy(cur, &byte1, sizeof(uchar)); cur+=sizeof(uchar);
805: memcpy(cur, &byte2, sizeof(uchar)); cur+=sizeof(uchar);
1.123 paf 806: // bytes
807: memcpy(cur, row->item.ptr, row->item.size);
808: cur+=row->item.size;
809: );
1.113 parser 810: }
1.148 paf 811: bool String::deserialize(size_t prolog_size, void *buf, size_t buf_size, const char *file) {
1.135 paf 812: if(buf_size<=prolog_size)
1.148 paf 813: return false;
1.135 paf 814:
1.126 paf 815: char *cur=(char *)buf+prolog_size;
1.113 parser 816: buf_size-=prolog_size;
817:
818: while(buf_size) {
1.148 paf 819: if(sizeof(uchar)+sizeof(ushort)>buf_size) // lang+size
820: return false;
821:
822: uchar lang=*(uchar *)(cur);
1.128 paf 823: ushort size=uchars2ushort(
824: *(uchar*)(cur+sizeof(uchar)*1),
825: *(uchar*)(cur+sizeof(uchar)*2)
826: );
827:
1.148 paf 828: size_t piece_size=sizeof(uchar)+sizeof(ushort)+size;
829: if(piece_size>buf_size) // buffer overrun, can be on incomplete cache files
830: return false;
831:
1.128 paf 832: const char *ptr=(const char*)(cur+sizeof(uchar)*3);
1.126 paf 833: APPEND(ptr, size, lang, file, 0);
1.113 parser 834:
835: cur+=piece_size;
836: buf_size-=piece_size;
837: }
1.148 paf 838: return true;
1.113 parser 839: }
E-mail: