Annotation of parser3/src/main/pa_string.C, revision 1.172.2.18
1.45 paf 1: /** @file
1.55 paf 2: Parser: string class. @see untaint.C.
1.46 paf 3:
1.172.2.11 paf 4: Copyright (c) 2001-2003 ArtLebedev Group (http://www.artlebedev.com)
1.138 paf 5: Author: Alexandr Petrosian <paf@design.ru> (http://paf.design.ru)
1.164 paf 6: */
1.46 paf 7:
1.172.2.18! paf 8: static const char* IDENT_STRING_C="$Date: 2003/02/20 14:53:35 $";
1.4 paf 9:
1.70 paf 10: #include "pcre.h"
11:
1.12 paf 12: #include "pa_string.h"
1.22 paf 13: #include "pa_exception.h"
1.61 paf 14: #include "pa_table.h"
1.101 parser 15: #include "pa_dictionary.h"
1.132 paf 16: #include "pa_charset.h"
1.60 paf 17:
1.172.2.2 paf 18: // helpers
1.139 paf 19:
1.172.2.2 paf 20: /// String::match uses this as replace & global search table columns
1.139 paf 21:
/// Number of numbered capture-group columns ("1".."100") created for the match
/// table template; String::match also sizes its pcre output vector from it.
1.172.2.4 paf 22: const int MAX_MATCH_GROUPS=100;
23:
1.172.2.14 paf 24: class String_match_table_template_columns: public ArrayString {
1.172.2.4 paf 25: Pool pool;
1.172.2.2 paf 26: public:
1.172.2.4 paf 27: String_match_table_template_columns() {
1.172.2.10 paf 28: *this+=StringPtr(new String("prematch"));
29: *this+=StringPtr(new String("match"));
30: *this+=StringPtr(new String("postmatch"));
1.172.2.4 paf 31: for(int i=0; i<MAX_MATCH_GROUPS; i++) {
32: char *cname=new(pool) char[3/*strlen("100")*/+1/*terminating 0*/];
1.172.2.10 paf 33: StringPtr sname(new String(cname, sprintf(cname, "%d", 1+i)));
1.172.2.4 paf 34: *this+=sname;
1.172.2.2 paf 35: }
36: }
1.172.2.4 paf 37: };
38:
/// Prototype table that String::match copies for every result: constructed with
/// an undefined origin and a freshly allocated String_match_table_template_columns.
39: Table string_match_table_template(
1.172.2.13 paf 40: Exception::undefined_source,
1.172.2.4 paf 41: Table::columns_type(new String_match_table_template_columns));
1.172.2.2 paf 42:
43: // methods
44:
1.172.2.11 paf 45: String::String(const char* src, size_t src_size, bool tainted): Array<String_fragment>(1), fsize(0) {
1.41 paf 46: if(src)
1.75 paf 47: if(tainted)
48: APPEND_TAINTED(src, src_size, 0, 0);
1.41 paf 49: else
1.75 paf 50: APPEND_CLEAN(src, src_size, 0, 0);
1.1 paf 51: }
1.140 paf 52:
1.172.2.4 paf 53: String::String(const String& src): Array<String_fragment>(src.count()) {
1.169 paf 54: append(src, UL_PASS_APPENDED);
1.120 paf 55: }
1.28 paf 56:
1.13 paf 57: String& String::real_append(STRING_APPEND_PARAMS) {
1.9 paf 58: if(!src)
59: return *this;
1.26 paf 60: if(!size)
61: size=strlen(src);
62: if(!size)
1.9 paf 63: return *this;
1.122 paf 64:
1.172.2.2 paf 65: if(is_full())
66: expand(fdelta);
1.1 paf 67:
1.172.2.4 paf 68: String_fragment *fragment=&felements[fused++];
69: fragment->ptr=src;
70: fragment->size=size;
71: fragment->lang=lang;
1.13 paf 72: #ifndef NO_STRING_ORIGIN
1.172.2.4 paf 73: fragment->origin.file=file;
74: fragment->origin.line=line;
1.13 paf 75: #endif
1.172.2.16 paf 76:
77: fsize+=size;
1.1 paf 78:
79: return *this;
80: }
81:
1.16 paf 82: uint String::hash_code() const {
1.7 paf 83: uint result=0;
1.172.2.4 paf 84: STRING_FOREACH_FRAGMENT(
85: result=generic_hash_code(result, fragment->ptr, fragment->size);
1.123 paf 86: );
1.5 paf 87: return result;
88: }
89:
/// Compares this string, starting at byte @a this_offset, with @a src,
/// walking both fragment arrays in parallel.
/// @param partial out: -1 when a byte mismatch or a wrong-language fragment ended the
///   comparison early; 0 when both ran out together; 1 when this string ran out first;
///   2 when @a src ran out first (i.e. @a src is a prefix of this+this_offset)
/// @param lang when not UL_UNSPECIFIED, a fragment of this string whose lang is
///   greater than it makes the function return -1 immediately
/// @returns memcmp result of the first differing run, or 0 / -1 / +1 matching partial 0 / 1 / 2
1.60 paf 90: /// @todo move 'lang' skipping to pos
91: int String::cmp(int& partial, const String& src,
92: size_t this_offset, Untaint_lang lang) const {
1.59 paf 93: partial=-1;
1.125 paf 94: size_t a_size=size();
// NOTE(review): for an empty string a_size-1 wraps around, leaving this_offset as passed
95: this_offset=min(this_offset, a_size-1);
1.55 paf 96:
1.172.2.4 paf 97: const String_fragment *a_current=felements;
98: const String_fragment *b_current=src.felements;
// a_offset: bytes still to skip / already consumed inside the current 'a' fragment
1.55 paf 99: size_t a_offset=this_offset;
100: size_t b_offset=0;
1.172.2.4 paf 101: String_fragment *a_end=felements+fused;
102: String_fragment *b_end=src.felements+src.fused;
1.116 paf 103: int result;
// pos: absolute offset in this string of the start of a_current
1.60 paf 104: size_t pos=0;
1.33 paf 105:
1.172.2.4 paf 106: bool a_break;
107: bool b_break;
1.172.2.17 paf 108: while(true) {
109: a_break=(a_current==a_end);
110: b_break=(b_current==b_end);
111: if(a_break || b_break)
112: break;
113:
// fragment reaches past the requested offset: compare; otherwise skip it whole (else branch)
1.172.2.4 paf 114: if(pos+a_current->size > this_offset) {
1.172.2.6 paf 115: if(lang!=UL_UNSPECIFIED && a_current->lang>(String_UL)lang)
1.60 paf 116: return -1; // wrong lang -- bail out
117:
// remaining bytes in the current 'a' fragment minus remaining bytes in the current 'b' fragment
1.55 paf 118: int size_diff=
1.172.2.4 paf 119: (a_current->size-a_offset)-
120: (b_current->size-b_offset);
1.55 paf 121:
122: if(size_diff==0) { // a has same size as b
1.172.2.4 paf 123: result=memcmp(a_current->ptr+a_offset, b_current->ptr+b_offset,
124: a_current->size-a_offset);
1.55 paf 125: if(result)
126: return result;
1.172.2.4 paf 127: pos+=a_current->size;
128: a_current++; a_offset=0;
129: b_current++; b_offset=0;
130: } else if(size_diff>0) { // a longer
131: result=memcmp(a_current->ptr+a_offset, b_current->ptr+b_offset,
132: b_current->size-b_offset);
1.55 paf 133: if(result)
134: return result;
1.172.2.4 paf 135: a_offset+=b_current->size-b_offset;
136: b_current++; b_offset=0;
1.55 paf 137: } else { // b longer
1.172.2.4 paf 138: result=memcmp(a_current->ptr+a_offset, b_current->ptr+b_offset,
139: a_current->size-a_offset);
1.55 paf 140: if(result)
141: return result;
1.172.2.4 paf 142: b_offset+=a_current->size-a_offset;
143: pos+=a_current->size;
144: a_current++; a_offset=0;
1.55 paf 145: }
146: } else {
1.172.2.4 paf 147: a_offset-=a_current->size;
148: pos+=a_current->size;
149: a_current++;
1.9 paf 150: }
1.27 paf 151: }
1.55 paf 152: if(a_break==b_break) { // ended simultaneously
153: partial=0; return 0;
154: } else if(a_break) { // first bytes equal, but a ended before b
155: partial=1; return -1;
156: } else {
157: partial=2; return +1;
158: }
1.27 paf 159: }
160:
1.60 paf 161: /// @todo move 'lang' skipping to pos
1.59 paf 162: int String::cmp(int& partial, const char* b_ptr, size_t src_size,
1.60 paf 163: size_t this_offset, Untaint_lang lang) const {
1.59 paf 164: partial=-1;
1.125 paf 165: size_t a_size=size();
1.50 paf 166: size_t b_size=src_size?src_size:b_ptr?strlen(b_ptr):0;
1.125 paf 167: this_offset=min(this_offset, a_size-1);
1.27 paf 168:
1.172.2.4 paf 169: const String_fragment *a_current=felements;
1.59 paf 170: size_t a_offset=this_offset;
1.55 paf 171: size_t b_offset=0;
1.172.2.4 paf 172: String_fragment *a_end=felements+fused;
1.60 paf 173: size_t pos=0;
1.52 paf 174:
1.172.2.4 paf 175: bool a_break;
1.83 parser 176: bool b_break=b_size==0;
1.172.2.17 paf 177: while(true) {
178: a_break=(a_current==a_end);
179: if(a_break || b_break)
180: break;
1.172.2.4 paf 181: if(pos+a_current->size > this_offset) {
1.172.2.6 paf 182: if(lang!=UL_UNSPECIFIED && a_current->lang>(String_UL)lang)
1.60 paf 183: return -1; // wrong lang -- bail out
184:
1.59 paf 185: int size_diff=
1.172.2.4 paf 186: (a_current->size-a_offset)-
1.59 paf 187: (b_size-b_offset);
188:
189: if(size_diff==0) { // a has same size as b
1.172.2.4 paf 190: if(int result=memcmp(a_current->ptr+a_offset, b_ptr+b_offset,
191: a_current->size-a_offset)!=0)
1.59 paf 192: return result;
1.172.2.4 paf 193: pos+=a_current->size;
194: a_current++; a_offset=0;
1.59 paf 195: b_break=true;
196: } else if (size_diff>0) { // a longer
1.172.2.4 paf 197: if(int result=memcmp(a_current->ptr+a_offset, b_ptr+b_offset,
1.59 paf 198: b_size-b_offset)!=0)
199: return result;
200: a_offset+=b_size-b_offset;
201: b_break=true;
202: } else { // b longer
1.172.2.4 paf 203: if(int result=memcmp(a_current->ptr+a_offset, b_ptr+b_offset,
204: a_current->size-a_offset)!=0)
1.59 paf 205: return result;
1.172.2.4 paf 206: b_offset+=a_current->size-a_offset;
207: pos+=a_current->size;
208: a_current++; a_offset=0;
1.59 paf 209: }
210: } else {
1.172.2.4 paf 211: a_offset-=a_current->size;
212: pos+=a_current->size;
213: a_current++;
1.9 paf 214: }
215: }
1.55 paf 216: if(a_break==b_break) { // ended simultaneously
217: partial=0; return 0;
218: } else if(a_break) { // first bytes equal, but a ended before b
219: partial=1; return -1;
220: } else {
221: partial=2; return +1;
222: }
1.5 paf 223: }
1.46 paf 224:
225: #ifndef NO_STRING_ORIGIN
1.172.2.4 paf 226: const String_fragment::Origin& String::origin() const {
1.140 paf 227: if(is_empty()) {
1.172.2.4 paf 228: static const String_fragment::Origin empty_origin={"empty string"};
1.96 parser 229: return empty_origin;
230: }
1.46 paf 231:
1.147 paf 232: // determining origin by first piece or last appended piece
233: // because any of them can be constant=without origin:
1.50 paf 234: // ex: ^load[/file] "document_root" + "/file"
1.80 paf 235: // when last peice is constant,
236: // ex: parser_root_auto_path{dynamic} / auto.p{const}
237: // using first piece
1.172.2.4 paf 238: String_fragment::Origin& first_origin=felements[0].origin;
239: return first_origin.file ? first_origin : felements[fused-1].origin;
1.46 paf 240: }
241: #endif
1.53 paf 242:
1.172.2.3 paf 243: StringPtr String::mid(size_t start, size_t finish) const {
244: StringPtr result(new String());
1.107 parser 245:
1.166 paf 246: start=min(start, size());
1.167 paf 247: finish=max(start, finish);
1.60 paf 248: if(start==finish)
1.107 parser 249: return result;
1.53 paf 250:
251: size_t pos=0;
1.172.2.4 paf 252: STRING_FOREACH_FRAGMENT(
253: size_t item_finish=pos+fragment->size;
1.123 paf 254: if(item_finish > start) { // started now or already?
1.172.2.4 paf 255: bool started=result->is_empty(); // started now?
1.123 paf 256: bool finished=finish <= item_finish; // finished now?
257: size_t offset=started?start-pos:0;
1.172.2.4 paf 258: size_t size=finished?finish-pos:fragment->size;
259: result->APPEND(
260: fragment->ptr+offset, size-offset,
261: fragment->lang,
262: fragment->origin.file, fragment->origin.line);
1.123 paf 263: if(finished)
1.53 paf 264: goto break2;
265: }
1.172.2.4 paf 266: pos+=fragment->size;
1.123 paf 267: );
1.53 paf 268: break2:
1.60 paf 269: // SAPI::log(pool(), "piece of '%s' from %d to %d is '%s'",
270: //cstr(), start, finish, result.cstr());
1.53 paf 271: return result;
1.54 paf 272: }
273:
1.60 paf 274: int String::pos(const String& substr,
1.116 paf 275: int result, Untaint_lang lang) const {
1.125 paf 276: size_t self_size=size();
1.131 paf 277: for(; size_t(result)<self_size; result++) {
1.60 paf 278: int partial; cmp(partial, substr, result, lang);
1.58 paf 279: if(
280: partial==0 || // full match
281: partial==2) // 'substr' starts 'this'+'result'
282: return result;
283: }
284:
285: return -1;
286: }
287:
1.172.2.11 paf 288: int String::pos(const char* substr, size_t substr_size,
1.116 paf 289: int result, Untaint_lang lang) const {
1.125 paf 290: size_t self_size=size();
1.131 paf 291: for(; size_t(result)<self_size; result++) {
1.60 paf 292: int partial; cmp(partial, substr, substr_size, result, lang);
1.55 paf 293: if(
294: partial==0 || // full match
295: partial==2) // 'substr' starts 'this'+'result'
296: return result;
297: }
298:
299: return -1;
1.60 paf 300: }
301:
1.172.2.14 paf 302: void String::split(ArrayString& result,
1.60 paf 303: size_t* pos_after_ref,
1.172.2.11 paf 304: const char* delim, size_t delim_size,
1.172.2.10 paf 305: Untaint_lang lang, int limit) {
1.125 paf 306: size_t self_size=size();
1.60 paf 307: if(delim_size) {
308: size_t pos_after=pos_after_ref?*pos_after_ref:0;
309: int pos_before;
310: // while we have 'delim'...
311: for(; (pos_before=pos(delim, delim_size, pos_after, lang))>=0 && limit; limit--) {
1.172.2.4 paf 312: result+=mid(pos_after, pos_before);
1.60 paf 313: pos_after=pos_before+delim_size;
314: }
315: // last piece
1.124 paf 316: if(pos_after<self_size && limit) {
1.172.2.4 paf 317: result+=mid(pos_after, self_size);
1.124 paf 318: pos_after=self_size;
1.60 paf 319: }
320: if(pos_after_ref)
321: *pos_after_ref=pos_after;
322: } else { // empty delim
1.172.2.10 paf 323: result+=StringPtr(this);
1.60 paf 324: if(pos_after_ref)
1.124 paf 325: *pos_after_ref+=self_size;
1.60 paf 326: }
327: }
328:
1.172.2.14 paf 329: void String::split(ArrayString& result,
1.60 paf 330: size_t* pos_after_ref,
331: const String& delim, Untaint_lang lang,
1.172.2.10 paf 332: int limit) {
1.140 paf 333: if(!delim.is_empty()) {
1.60 paf 334: size_t pos_after=pos_after_ref?*pos_after_ref:0;
335: int pos_before;
336: // while we have 'delim'...
337: for(; (pos_before=pos(delim, pos_after, lang))>=0 && limit; limit--) {
1.172.2.4 paf 338: result+=mid(pos_after, pos_before);
1.60 paf 339: pos_after=pos_before+delim.size();
340: }
341: // last piece
342: if(pos_after<size() && limit) {
1.172.2.4 paf 343: result+=mid(pos_after, size());
1.60 paf 344: pos_after=size();
345: }
346: if(pos_after_ref)
347: *pos_after_ref=pos_after;
348: } else { // empty delim
1.172.2.10 paf 349: result+=StringPtr(this);
1.60 paf 350: if(pos_after_ref)
351: *pos_after_ref+=size();
352: }
1.61 paf 353: }
354:
1.172.2.10 paf 355: static void regex_options(StringPtr options, int *result, bool& need_pre_post_match){
1.63 paf 356: struct Regex_option {
1.172.2.11 paf 357: const char* keyL;
358: const char* keyU;
1.63 paf 359: int clear, set;
360: int *result;
1.154 paf 361: bool *flag;
1.63 paf 362: } regex_option[]={
1.153 paf 363: {"i", "I", 0, PCRE_CASELESS, result}, // a=A
364: {"s", "S", 0, PCRE_DOTALL, result}, // \n\n$ [default]
365: {"x", "U", 0, PCRE_EXTENDED, result}, // whitespace in regex ignored
366: {"m", "M", PCRE_DOTALL, PCRE_MULTILINE, result}, // ^aaa\n$^bbb\n$
367: {"g", "G", 0, true, result+1}, // many rows
1.154 paf 368: {"'", 0, 0, 0, 0, &need_pre_post_match},
369: {0}
1.63 paf 370: };
1.171 paf 371: result[0]=PCRE_EXTRA | PCRE_DOTALL | PCRE_DOLLAR_ENDONLY;
1.63 paf 372: result[1]=0;
373:
374: if(options)
1.153 paf 375: for(Regex_option *o=regex_option; o->keyL; o++)
1.154 paf 376: if(options->pos(o->keyL)>=0
377: || (o->keyU && options->pos(o->keyU)>=0)) {
378: if(o->flag)
379: *o->flag=true;
380: else { // result
381: *o->result &= ~o->clear;
382: *o->result |= o->set;
383: }
1.63 paf 384: }
385: }
386:
/// Matches this string against @a regexp using PCRE and reports every match through @a row_action.
/// @param source_charset supplies the pcre character tables used for compilation
/// @param aorigin string used as the source of most exceptions thrown here
/// @param regexp pattern; must not be empty
/// @param options option letters, parsed by regex_options
/// @param row_action called once per match with the row of column values, and one
///   final time with a null row after the last match / when nothing (more) matched
/// @param info opaque pointer handed to @a row_action
/// @param was_global optional out: whether the global option was given
/// @returns the result table; a null TablePtr when nothing matched and the search was not global
/// @throws Exception on empty regexp, pattern syntax error, pcre_info error or pcre_exec error
/// NOTE(review): 'code' is released on the visible exit paths only; looks like it would
/// leak if row_action or mid() throws -- confirm.
1.172.2.4 paf 387: TablePtr String::match(Charset& source_charset,
1.172.2.10 paf 388: StringPtr aorigin,
1.172.2.4 paf 389: const String& regexp,
1.172.2.10 paf 390: StringPtr options,
1.172.2.4 paf 391: Row_action row_action, void *info,
392: bool *was_global) const {
1.64 paf 393:
1.140 paf 394: if(regexp.is_empty())
1.149 paf 395: throw Exception(0,
1.73 paf 396: aorigin,
397: "regexp is empty");
1.154 paf 398:
1.172.2.4 paf 399: CharPtr pattern=regexp.cstr();
1.172.2.11 paf 400: const char* errptr;
1.62 paf 401: int erroffset;
1.154 paf 402: bool need_pre_post_match=false;
// option_bits[0]: pcre_compile bits, option_bits[1]: non-zero for a global search
403: int option_bits[2]; regex_options(options, option_bits, need_pre_post_match);
1.95 parser 404: if(was_global)
405: *was_global=option_bits[1]!=0;
1.172.2.4 paf 406: pcre *code=pcre_compile(pattern.get(), option_bits[0],
1.62 paf 407: &errptr, &erroffset,
1.172.2.1 paf 408: source_charset.pcre_tables);
1.62 paf 409:
// compile failed: blame the part of the pattern starting at the error offset
1.67 paf 410: if(!code)
1.149 paf 411: throw Exception(0,
1.172.2.4 paf 412: regexp.mid(erroffset, regexp.size()),
1.74 paf 413: "regular expression syntax error - %s", errptr);
1.62 paf 414:
1.63 paf 415: int info_substrings=pcre_info(code, 0, 0);
416: if(info_substrings<0) {
1.100 parser 417: pcre_free(code);
1.149 paf 418: throw Exception(0,
1.73 paf 419: aorigin,
1.76 paf 420: "pcre_info error (%d)",
1.73 paf 421: info_substrings);
1.63 paf 422: }
423:
1.172.2.4 paf 424: CharPtr subject=cstr();
425: int length=size();
426: const int ovecsize=(1/*match*/+MAX_MATCH_GROUPS)*3;
1.155 paf 427: int ovector[ovecsize];
428:
429: // create table
1.172.2.4 paf 430: TablePtr result(new Table(string_match_table_template));
1.63 paf 431:
1.64 paf 432: int exec_option_bits=0;
// prestart: where the next search begins; poststart: end of the latest match
1.154 paf 433: int prestart=0;
434: int poststart=0;
435: int postfinish=size();
1.63 paf 436: while(true) {
437: int exec_substrings=pcre_exec(code, 0,
1.172.2.4 paf 438: subject.get(), length, prestart,
1.64 paf 439: exec_option_bits, ovector, ovecsize);
1.63 paf 440:
441: if(exec_substrings==PCRE_ERROR_NOMATCH) {
1.100 parser 442: pcre_free(code);
1.172.2.15 paf 443: row_action(result, ArrayStringPtr(0)/*last time, no raw*/, 0, 0, poststart, postfinish, info);
1.172.2.4 paf 444: if(option_bits[1])
445: return result; // global=true+result
446: else
447: return TablePtr(0);// not global=no result
1.63 paf 448: }
449:
450: if(exec_substrings<0) {
1.100 parser 451: pcre_free(code);
1.149 paf 452: throw Exception(0,
1.63 paf 453: aorigin,
1.76 paf 454: "regular expression execute error (%d)",
1.63 paf 455: exec_substrings);
456: }
457:
// ovector[0]..ovector[1] is the span of the whole match
1.154 paf 458: int prefinish=ovector[0];
459: poststart=ovector[1];
1.172.2.14 paf 460: object_ptr<ArrayString> row(new ArrayString);
// the first three columns are filled only when the "'" option asked for them
1.172.2.4 paf 461: if(need_pre_post_match) {
462: *row+=mid(0, prefinish); // .prematch column value
463: *row+=mid(prefinish, poststart); // .match
464: *row+=mid(poststart, postfinish); // .postmatch
465: } else {
1.172.2.10 paf 466: *row+=StringPtr(0); // .prematch column value
467: *row+=StringPtr(0); // .match
468: *row+=StringPtr(0); // .postmatch
1.172.2.4 paf 469: }
1.63 paf 470:
471: for(int i=1; i<exec_substrings; i++) {
1.69 paf 472: // -1:-1 case handled peacefully by mid() itself
1.172.2.4 paf 473: *row+=mid(ovector[i*2+0], ovector[i*2+1]); // .i column value
1.63 paf 474: }
475:
1.172.2.15 paf 476: row_action(result, row, prestart, prefinish, poststart, postfinish, info);
1.63 paf 477:
// stop after one match when not global, or when the match end did not move past the search start
1.154 paf 478: if(!option_bits[1] || prestart==poststart) { // not global | going to hang
1.100 parser 479: pcre_free(code);
1.172.2.15 paf 480: row_action(result, ArrayStringPtr(0)/*last time, no row*/, 0, 0, poststart, postfinish, info);
1.172.2.4 paf 481: return result;
1.63 paf 482: }
1.154 paf 483: prestart=poststart;
1.63 paf 484:
485: /*
486: if(option_bits[0] & PCRE_MULTILINE)
1.64 paf 487: exec_option_bits|=PCRE_NOTBOL; // start of subject+startoffset not BOL
1.63 paf 488: */
489: }
1.82 parser 490: }
491:
1.172.2.10 paf 492: StringPtr String::change_case(Pool& pool, Charset& source_charset, Change_case_kind kind) {
1.172.2.4 paf 493: StringPtr result(new String());
494:
1.172.2.1 paf 495: const unsigned char *tables=source_charset.pcre_tables;
1.82 parser 496:
497: const unsigned char *a;
498: const unsigned char *b;
499: switch(kind) {
500: case CC_UPPER:
501: a=tables+lcc_offset;
502: b=tables+fcc_offset;
503: break;
504: case CC_LOWER:
505: a=tables+lcc_offset;
506: b=0;
507: break;
508: default:
1.149 paf 509: throw Exception(0,
1.172.2.10 paf 510: StringPtr(this),
1.82 parser 511: "unknown change case kind #%d",
512: static_cast<int>(kind)); // never
513: a=b=0; // calm, compiler
514: break; // never
515: }
516:
1.172.2.4 paf 517: STRING_FOREACH_FRAGMENT(
518: char *new_cstr=new(pool) char[fragment->size];
1.143 paf 519: char *dest=new_cstr;
1.172.2.11 paf 520: const char* end=fragment->ptr+fragment->size;
521: for(const char* current=fragment->ptr; current<end; current++) {
1.172.2.4 paf 522: unsigned char c=a[(unsigned char)*current];
1.143 paf 523: if(b)
524: c=b[c];
1.82 parser 525:
1.143 paf 526: *dest++=(char)c;
1.82 parser 527: }
1.143 paf 528:
1.172.2.4 paf 529: result->APPEND(new_cstr, fragment->size,
530: fragment->lang,
531: fragment->origin.file, fragment->origin.line);
1.143 paf 532: );
1.89 parser 533:
1.101 parser 534: return result;
535: }
536:
/// Returns a new string in which every occurrence of a first-column value of @a dict
/// is replaced with the matching second-column value.
/// Consecutive fragments of the same language are processed as one run, so the result
/// has one fragment per run, carrying the language and origin of the run's first fragment.
/// @param pool allocator for the rewritten buffers
/// NOTE(review): the output buffer is sized as ceil(run size * dict.max_ratio());
/// presumably max_ratio() is never below 1, otherwise unchanged bytes could overflow it -- confirm.
1.150 paf 537: /// @test if in some piece were found no dict words, append it, not it's duplicate
1.172.2.4 paf 538: StringPtr String::replace(Pool& pool, Dictionary& dict) const {
539: StringPtr result(new String());
// flat copy of the whole string; 'current' walks it in step with the fragments
540: CharPtr lcstr(cstr());
1.172.2.11 paf 541: const char* current=lcstr.get();
1.170 paf 542:
1.172.2.4 paf 543: STRING_FOREACH_FRAGMENT(
1.170 paf 544: IFNDEF_NO_STRING_ORIGIN(
1.172.2.11 paf 545: const char* joined_origin_file=fragment->origin.file;
1.172.2.4 paf 546: const size_t joined_origin_line=fragment->origin.line;
1.170 paf 547: );
1.172.2.18! paf 548: String_UL joined_lang=fragment->lang;
1.172.2.11 paf 549: const char* joined_ptr=current;
1.170 paf 550: // calc size
551: size_t joined_size=0;
// inner loop continues from the current fragment, summing sizes while the language stays the same
1.172.2.4 paf 552: STRING_PREPARED_FOREACH_FRAGMENT(
553: if(fragment->lang==joined_lang)
554: joined_size+=fragment->size;
1.170 paf 555: else
556: break; // before non-ours
557: );
558: current+=joined_size;
559:
560: // pointers are after joined piece
1.172.2.4 paf 561: // & one step back, see STRING_PREPARED_FOREACH_FRAGMENT
562: --fragment;
1.170 paf 563:
1.172.2.4 paf 564: char *new_cstr=new(pool) char[(size_t)ceil(joined_size*dict.max_ratio())];
1.123 paf 565: char *dest=new_cstr;
1.170 paf 566: while(joined_size) {
1.172.2.4 paf 567: // there is a fragment where first column starts 'joined_ptr'
568: if(Table::element_type row=dict.first_that_starts(joined_ptr, joined_size)) {
1.123 paf 569: // get a=>b values
1.172.2.10 paf 570: StringPtr a=row->get(0);
571: StringPtr b=row->get(1);
1.170 paf 572: // skip 'a' in 'joined_ptr' && reduce work size
1.172.2.4 paf 573: joined_ptr+=a->size(); joined_size-=a->size();
1.123 paf 574: // write 'b' to 'dest' && skip 'b' in 'dest'
1.172.2.5 paf 575: b->store_to(dest, String::UL_AS_IS); dest+=b->size();
1.123 paf 576: } else {
577: // write a char to b && reduce work size
1.170 paf 578: *dest++=*joined_ptr++; joined_size--;
1.101 parser 579: }
580: }
581:
// append what was actually written, not the allocated capacity
1.172.2.4 paf 582: result->APPEND(new_cstr, dest-new_cstr, joined_lang,
1.170 paf 583: joined_origin_file, joined_origin_line);
1.156 paf 584: );
1.170 paf 585:
1.156 paf 586: return result;
587: }
588:
/// Returns a new string with the same bytes, in which every run of consecutive
/// fragments sharing one language is merged into a single fragment.
/// The bytes are copied once into a buffer allocated from @a pool; each merged
/// fragment points into that buffer and takes the origin of the run's first fragment.
1.172.2.4 paf 589: StringPtr String::join_chains(Pool& pool) const {
590: StringPtr result(new String());
591:
// flat copy of the whole string; 'current' walks it in step with the fragments
592: char *pooled_buf=new(pool) char[cstr_bufsize()];
1.172.2.5 paf 593: store_to(pooled_buf, String::UL_AS_IS);
1.172.2.11 paf 594: const char* current=pooled_buf;
1.156 paf 595:
1.172.2.4 paf 596: STRING_FOREACH_FRAGMENT(
1.156 paf 597: IFNDEF_NO_STRING_ORIGIN(
1.172.2.11 paf 598: const char* joined_origin_file=fragment->origin.file;
1.172.2.4 paf 599: const size_t joined_origin_line=fragment->origin.line;
1.156 paf 600: );
1.172.2.18! paf 601: String_UL joined_lang=fragment->lang;
1.172.2.11 paf 602: const char* joined_ptr=current;
1.156 paf 603: // calc size
604: size_t joined_size=0;
// inner loop continues from the current fragment, summing sizes while the language stays the same
1.172.2.4 paf 605: STRING_PREPARED_FOREACH_FRAGMENT(
606: if(fragment->lang==joined_lang)
607: joined_size+=fragment->size;
1.156 paf 608: else
609: break; // before non-ours
610: );
611: current+=joined_size;
612:
613: // pointers are after joined piece
1.172.2.4 paf 614: // & one step back, see STRING_PREPARED_FOREACH_FRAGMENT
615: --fragment;
1.156 paf 616:
1.172.2.4 paf 617: result->APPEND(joined_ptr, joined_size, joined_lang,
1.150 paf 618: joined_origin_file, joined_origin_line);
1.123 paf 619: );
1.156 paf 620:
1.89 parser 621: return result;
622: }
623:
1.172.2.10 paf 624: double String::as_double() {
1.89 parser 625: double result;
1.172.2.4 paf 626: char buf[MAX_STRING];
627: if(size()>MAX_STRING-1)
628: throw Exception("number.format",
1.172.2.10 paf 629: StringPtr(this),
1.172.2.4 paf 630: "invalid number too long a string (%u>%u)", size(), MAX_STRING-1);
1.172.2.5 paf 631: char *eol=store_to(buf, String::UL_AS_IS); *eol=0;
1.172.2.11 paf 632: const char* cstr=buf;
1.172.2.4 paf 633:
1.161 paf 634: while(*cstr && isspace(*cstr))
635: cstr++;
636: if(!*cstr)
1.162 paf 637: return 0;
1.161 paf 638:
1.102 parser 639: char *error_pos;
1.89 parser 640: // 0xABC
1.99 parser 641: if(cstr[0]=='0')
642: if(cstr[1]=='x' || cstr[1]=='X')
643: result=(double)(unsigned long)strtol(cstr, &error_pos, 0);
644: else
1.102 parser 645: result=(double)strtod(cstr+1/*skip leading 0*/, &error_pos);
1.89 parser 646: else
1.99 parser 647: result=(double)strtod(cstr, &error_pos);
1.89 parser 648:
1.159 paf 649: while(char c=*error_pos++)
650: if(!isspace(c))
651: throw Exception("number.format",
1.172.2.10 paf 652: StringPtr(this),
1.159 paf 653: "invalid number (double)");
1.89 parser 654:
655: return result;
656: }
1.172.2.10 paf 657: int String::as_int() {
1.89 parser 658: int result;
1.172.2.4 paf 659: char buf[MAX_STRING];
660: if(size()>MAX_STRING-1)
661: throw Exception("number.format",
1.172.2.10 paf 662: StringPtr(this),
1.172.2.4 paf 663: "invalid number too long a string (%u>%u)", size(), MAX_STRING-1);
1.172.2.5 paf 664: char *eol=store_to(buf, String::UL_AS_IS); *eol=0;
1.172.2.11 paf 665: const char* cstr=buf;
1.172.2.4 paf 666:
1.161 paf 667: while(*cstr && isspace(*cstr))
668: cstr++;
669: if(!*cstr)
1.162 paf 670: return 0;
1.161 paf 671:
1.102 parser 672: char *error_pos;
1.89 parser 673: // 0xABC
1.99 parser 674: if(cstr[0]=='0')
675: if(cstr[1]=='x' || cstr[1]=='X')
676: result=(int)(unsigned long)strtol(cstr, &error_pos, 0);
677: else
1.102 parser 678: result=(int)strtol(cstr+1/*skip leading 0*/, &error_pos, 0);
1.89 parser 679: else
680: result=(int)strtol(cstr, &error_pos, 0);
681:
1.159 paf 682: while(char c=*error_pos++)
683: if(!isspace(c))
684: throw Exception("number.format",
1.172.2.10 paf 685: StringPtr(this),
1.159 paf 686: "invalid number (int)");
1.82 parser 687:
688: return result;
1.61 paf 689: }
1.113 parser 690:
1.172.2.4 paf 691: inline void uint2uchars(uint word, uchar *bytes) {
692: bytes[0]=word&0xFF;
693: bytes[1]=(word>>8)&0xFF;
694: bytes[2]=(word>>16)&0xFF;
695: bytes[3]=(word>>24)&0xFF;
696: }
697: inline uint uchars2uint(uchar *bytes) {
698: return bytes[3]<<24
699: | bytes[2]<<16
700: | bytes[1]<<8
701: | bytes[0];
702: }
703:
704: void String::serialize(Pool& pool, size_t prolog_size, char *& buf, size_t& buf_size) const {
1.113 parser 705: buf_size=
706: prolog_size
1.172.2.18! paf 707: +fused*(sizeof(String_UL)+sizeof(size_t))
1.113 parser 708: +size();
1.172.2.4 paf 709: buf=new(pool) char[buf_size];
710: char *cur=buf+prolog_size;
1.113 parser 711:
1.172.2.4 paf 712: STRING_FOREACH_FRAGMENT(
1.123 paf 713: // lang
1.172.2.4 paf 714: memcpy(cur, &fragment->lang, sizeof(fragment->lang));
715: cur+=sizeof(fragment->lang);
1.123 paf 716: // size
1.172.2.4 paf 717: // bug on some sparc platform [you can't work with integers on odd pointers]
718: // forces us to use byte array instead
719: uchar bytes[4];
720: uint2uchars(fragment->size, bytes);
721: memcpy(cur, &bytes, sizeof(bytes)); cur+=sizeof(bytes);
1.123 paf 722: // bytes
1.172.2.4 paf 723: memcpy(cur, fragment->ptr, fragment->size);
724: cur+=fragment->size;
1.123 paf 725: );
1.113 parser 726: }
1.172.2.11 paf 727: bool String::deserialize(size_t prolog_size, void *buf, size_t buf_size, const char* file) {
1.135 paf 728: if(buf_size<=prolog_size)
1.148 paf 729: return false;
1.135 paf 730:
1.126 paf 731: char *cur=(char *)buf+prolog_size;
1.113 parser 732: buf_size-=prolog_size;
733:
734: while(buf_size) {
1.172.2.18! paf 735: if(sizeof(String_UL)+sizeof(size_t)>buf_size) // lang+size
1.148 paf 736: return false;
737:
1.172.2.6 paf 738: String_UL lang=*(String_UL *)(cur);
1.172.2.4 paf 739: size_t size=uchars2uint((uchar *)cur);
1.128 paf 740:
1.172.2.18! paf 741: size_t piece_size=sizeof(String_UL)+sizeof(size_t)+size;
1.148 paf 742: if(piece_size>buf_size) // buffer overrun, can be on incomplete cache files
743: return false;
744:
1.172.2.18! paf 745: const char* ptr=(const char*)(cur+sizeof(String_UL)+sizeof(size_t));
1.126 paf 746: APPEND(ptr, size, lang, file, 0);
1.113 parser 747:
748: cur+=piece_size;
749: buf_size-=piece_size;
750: }
1.148 paf 751: return true;
1.113 parser 752: }
E-mail: