Annotation of parser3/src/main/pa_string.C, revision 1.172.2.21
1.45 paf 1: /** @file
1.55 paf 2: Parser: string class. @see untaint.C.
1.46 paf 3:
1.172.2.11 paf 4: Copyright (c) 2001-2003 ArtLebedev Group (http://www.artlebedev.com)
1.138 paf 5: Author: Alexandr Petrosian <paf@design.ru> (http://paf.design.ru)
1.164 paf 6: */
1.46 paf 7:
// Revision timestamp of this file, kept in the binary as a string (the literal is a "$Date$" keyword expansion).
1.172.2.21! paf 8: static const char* IDENT_STRING_C="$Date: 2003/03/05 16:58:24 $";
1.4 paf 9:
1.70 paf 10: #include "pcre.h"
11:
1.12 paf 12: #include "pa_string.h"
1.22 paf 13: #include "pa_exception.h"
1.61 paf 14: #include "pa_table.h"
1.101 parser 15: #include "pa_dictionary.h"
1.132 paf 16: #include "pa_charset.h"
1.60 paf 17:
1.172.2.2 paf 18: // helpers
1.139 paf 19:
1.172.2.2 paf 20: /// String::match uses this as replace & global search table columns
1.139 paf 21:
1.172.2.4 paf 22: const int MAX_MATCH_GROUPS=100;
23:
1.172.2.14 paf 24: class String_match_table_template_columns: public ArrayString {
1.172.2.4 paf 25: Pool pool;
1.172.2.2 paf 26: public:
1.172.2.4 paf 27: String_match_table_template_columns() {
1.172.2.10 paf 28: *this+=StringPtr(new String("prematch"));
29: *this+=StringPtr(new String("match"));
30: *this+=StringPtr(new String("postmatch"));
1.172.2.4 paf 31: for(int i=0; i<MAX_MATCH_GROUPS; i++) {
32: char *cname=new(pool) char[3/*strlen("100")*/+1/*terminating 0*/];
1.172.2.10 paf 33: StringPtr sname(new String(cname, sprintf(cname, "%d", 1+i)));
1.172.2.4 paf 34: *this+=sname;
1.172.2.2 paf 35: }
1.172.2.21! paf 36: }
! 37:
! 38: override ~String_match_table_template_columns() {
1.172.2.2 paf 39: }
1.172.2.4 paf 40: };
41:
// Prototype table for String::match: constructed with Exception::undefined_source and the
// column set from String_match_table_template_columns. String::match copy-constructs
// each result table from this object.
42: Table string_match_table_template(
1.172.2.13 paf 43: Exception::undefined_source,
1.172.2.4 paf 44: Table::columns_type(new String_match_table_template_columns));
1.172.2.2 paf 45:
46: // methods
47:
1.172.2.11 paf 48: String::String(const char* src, size_t src_size, bool tainted): Array<String_fragment>(1), fsize(0) {
1.41 paf 49: if(src)
1.75 paf 50: if(tainted)
51: APPEND_TAINTED(src, src_size, 0, 0);
1.41 paf 52: else
1.75 paf 53: APPEND_CLEAN(src, src_size, 0, 0);
1.1 paf 54: }
1.140 paf 55:
1.172.2.4 paf 56: String::String(const String& src): Array<String_fragment>(src.count()) {
1.169 paf 57: append(src, UL_PASS_APPENDED);
1.120 paf 58: }
1.28 paf 59:
1.13 paf 60: String& String::real_append(STRING_APPEND_PARAMS) {
1.9 paf 61: if(!src)
62: return *this;
1.26 paf 63: if(!size)
64: size=strlen(src);
65: if(!size)
1.9 paf 66: return *this;
1.122 paf 67:
1.172.2.2 paf 68: if(is_full())
69: expand(fdelta);
1.1 paf 70:
1.172.2.4 paf 71: String_fragment *fragment=&felements[fused++];
72: fragment->ptr=src;
73: fragment->size=size;
74: fragment->lang=lang;
1.13 paf 75: #ifndef NO_STRING_ORIGIN
1.172.2.4 paf 76: fragment->origin.file=file;
77: fragment->origin.line=line;
1.13 paf 78: #endif
1.172.2.16 paf 79:
80: fsize+=size;
1.1 paf 81:
82: return *this;
83: }
84:
// Hash of the string's bytes: starts from 0 and feeds each fragment's bytes (ptr, size)
// through generic_hash_code, chaining the previous result into the next call.
// NOTE(review): STRING_FOREACH_FRAGMENT is defined outside this file; it presumably visits
// every used fragment in order, exposing it as 'fragment' -- confirm against its definition.
1.16 paf 85: uint String::hash_code() const {
1.7 paf 86: uint result=0;
1.172.2.4 paf 87: STRING_FOREACH_FRAGMENT(
88: result=generic_hash_code(result, fragment->ptr, fragment->size);
1.123 paf 89: );
1.5 paf 90: return result;
91: }
92:
1.60 paf 93: /// @todo move 'lang' skipping to pos
// Compares this string, starting at byte this_offset, with src, walking both fragment lists in step.
//
// partial (out):
//   -1  a byte mismatch was found, or a fragment of this string had the wrong lang
//    0  both strings ended at the same time
//    1  this string ended before src
//    2  src ended before this string, i.e. src starts this string at this_offset
// Returns the memcmp result on a mismatch, -1 on a wrong lang, otherwise 0 / -1 / +1
// matching the partial values 0 / 1 / 2.
//
// lang: unless UL_UNSPECIFIED, any compared fragment of this string whose lang is greater
// than lang makes the function bail out with -1.
94: int String::cmp(int& partial, const String& src,
95: size_t this_offset, Untaint_lang lang) const {
1.59 paf 96: partial=-1;
1.125 paf 97: size_t a_size=size();
// NOTE(review): with an empty string a_size-1 wraps around (size_t), leaving this_offset unclamped
98: this_offset=min(this_offset, a_size-1);
1.55 paf 99:
1.172.2.4 paf 100: const String_fragment *a_current=felements;
101: const String_fragment *b_current=src.felements;
// a_offset/b_offset: bytes of the current a/b fragment already consumed (or still to be skipped, for a)
1.55 paf 102: size_t a_offset=this_offset;
103: size_t b_offset=0;
1.172.2.4 paf 104: String_fragment *a_end=felements+fused;
105: String_fragment *b_end=src.felements+src.fused;
1.116 paf 106: int result;
// pos: byte position in this string where the current a fragment begins
1.60 paf 107: size_t pos=0;
1.33 paf 108:
1.172.2.4 paf 109: bool a_break;
110: bool b_break;
1.172.2.17 paf 111: while(true) {
112: a_break=(a_current==a_end);
113: b_break=(b_current==b_end);
114: if(a_break || b_break)
115: break;
116:
// does the current a fragment reach past this_offset? if not, it is skipped whole (else branch below)
1.172.2.4 paf 117: if(pos+a_current->size > this_offset) {
1.172.2.6 paf 118: if(lang!=UL_UNSPECIFIED && a_current->lang>(String_UL)lang)
1.60 paf 119: return -1; // wrong lang -- bail out
120:
// difference between the unconsumed remainders of the two current fragments
1.55 paf 121: int size_diff=
1.172.2.4 paf 122: (a_current->size-a_offset)-
123: (b_current->size-b_offset);
1.55 paf 124:
125: if(size_diff==0) { // a has same size as b
1.172.2.4 paf 126: result=memcmp(a_current->ptr+a_offset, b_current->ptr+b_offset,
127: a_current->size-a_offset);
1.55 paf 128: if(result)
129: return result;
1.172.2.4 paf 130: pos+=a_current->size;
131: a_current++; a_offset=0;
132: b_current++; b_offset=0;
133: } else if(size_diff>0) { // a longer
134: result=memcmp(a_current->ptr+a_offset, b_current->ptr+b_offset,
135: b_current->size-b_offset);
1.55 paf 136: if(result)
137: return result;
// stay on the same a fragment, advance inside it; move to the next b fragment
1.172.2.4 paf 138: a_offset+=b_current->size-b_offset;
139: b_current++; b_offset=0;
1.55 paf 140: } else { // b longer
1.172.2.4 paf 141: result=memcmp(a_current->ptr+a_offset, b_current->ptr+b_offset,
142: a_current->size-a_offset);
1.55 paf 143: if(result)
144: return result;
// stay on the same b fragment, advance inside it; move to the next a fragment
1.172.2.4 paf 145: b_offset+=a_current->size-a_offset;
146: pos+=a_current->size;
147: a_current++; a_offset=0;
1.55 paf 148: }
149: } else {
// fragment lies entirely before this_offset: skip it
1.172.2.4 paf 150: a_offset-=a_current->size;
151: pos+=a_current->size;
152: a_current++;
1.9 paf 153: }
1.27 paf 154: }
1.55 paf 155: if(a_break==b_break) { // ended simultaneously
156: partial=0; return 0;
157: } else if(a_break) { // first bytes equal, but a ended before b
158: partial=1; return -1;
159: } else {
160: partial=2; return +1;
161: }
1.27 paf 162: }
163:
1.60 paf 164: /// @todo move 'lang' skipping to pos
1.59 paf 165: int String::cmp(int& partial, const char* b_ptr, size_t src_size,
1.60 paf 166: size_t this_offset, Untaint_lang lang) const {
1.59 paf 167: partial=-1;
1.125 paf 168: size_t a_size=size();
1.50 paf 169: size_t b_size=src_size?src_size:b_ptr?strlen(b_ptr):0;
1.125 paf 170: this_offset=min(this_offset, a_size-1);
1.27 paf 171:
1.172.2.4 paf 172: const String_fragment *a_current=felements;
1.59 paf 173: size_t a_offset=this_offset;
1.55 paf 174: size_t b_offset=0;
1.172.2.4 paf 175: String_fragment *a_end=felements+fused;
1.60 paf 176: size_t pos=0;
1.52 paf 177:
1.172.2.4 paf 178: bool a_break;
1.83 parser 179: bool b_break=b_size==0;
1.172.2.17 paf 180: while(true) {
181: a_break=(a_current==a_end);
182: if(a_break || b_break)
183: break;
1.172.2.4 paf 184: if(pos+a_current->size > this_offset) {
1.172.2.6 paf 185: if(lang!=UL_UNSPECIFIED && a_current->lang>(String_UL)lang)
1.60 paf 186: return -1; // wrong lang -- bail out
187:
1.59 paf 188: int size_diff=
1.172.2.4 paf 189: (a_current->size-a_offset)-
1.59 paf 190: (b_size-b_offset);
191:
192: if(size_diff==0) { // a has same size as b
1.172.2.4 paf 193: if(int result=memcmp(a_current->ptr+a_offset, b_ptr+b_offset,
194: a_current->size-a_offset)!=0)
1.59 paf 195: return result;
1.172.2.4 paf 196: pos+=a_current->size;
197: a_current++; a_offset=0;
1.59 paf 198: b_break=true;
199: } else if (size_diff>0) { // a longer
1.172.2.4 paf 200: if(int result=memcmp(a_current->ptr+a_offset, b_ptr+b_offset,
1.59 paf 201: b_size-b_offset)!=0)
202: return result;
203: a_offset+=b_size-b_offset;
204: b_break=true;
205: } else { // b longer
1.172.2.4 paf 206: if(int result=memcmp(a_current->ptr+a_offset, b_ptr+b_offset,
207: a_current->size-a_offset)!=0)
1.59 paf 208: return result;
1.172.2.4 paf 209: b_offset+=a_current->size-a_offset;
210: pos+=a_current->size;
211: a_current++; a_offset=0;
1.59 paf 212: }
213: } else {
1.172.2.4 paf 214: a_offset-=a_current->size;
215: pos+=a_current->size;
216: a_current++;
1.9 paf 217: }
218: }
1.55 paf 219: if(a_break==b_break) { // ended simultaneously
220: partial=0; return 0;
221: } else if(a_break) { // first bytes equal, but a ended before b
222: partial=1; return -1;
223: } else {
224: partial=2; return +1;
225: }
1.5 paf 226: }
1.46 paf 227:
228: #ifndef NO_STRING_ORIGIN
1.172.2.4 paf 229: const String_fragment::Origin& String::origin() const {
1.140 paf 230: if(is_empty()) {
1.172.2.4 paf 231: static const String_fragment::Origin empty_origin={"empty string"};
1.96 parser 232: return empty_origin;
233: }
1.46 paf 234:
1.147 paf 235: // determining origin by first piece or last appended piece
236: // because any of them can be constant=without origin:
1.50 paf 237: // ex: ^load[/file] "document_root" + "/file"
1.80 paf 238: // when last peice is constant,
239: // ex: parser_root_auto_path{dynamic} / auto.p{const}
240: // using first piece
1.172.2.4 paf 241: String_fragment::Origin& first_origin=felements[0].origin;
242: return first_origin.file ? first_origin : felements[fused-1].origin;
1.46 paf 243: }
244: #endif
1.53 paf 245:
// Returns a new string made of the bytes [start, finish) of this string.
// Each source fragment touched contributes one appended fragment that points into the same
// bytes and carries the same lang and origin.
// start is clamped to size(); finish is raised to at least start; an empty range yields an empty string.
1.172.2.3 paf 246: StringPtr String::mid(size_t start, size_t finish) const {
247: StringPtr result(new String());
1.107 parser 248:
1.166 paf 249: start=min(start, size());
1.167 paf 250: finish=max(start, finish);
1.60 paf 251: if(start==finish)
1.107 parser 252: return result;
1.53 paf 253:
// pos: byte position in this string where the current fragment begins
254: size_t pos=0;
1.172.2.4 paf 255: STRING_FOREACH_FRAGMENT(
256: size_t item_finish=pos+fragment->size;
1.123 paf 257: if(item_finish > start) { // started now or already?
// nothing appended yet means this is the first fragment of the range, so it may start mid-fragment
1.172.2.4 paf 258: bool started=result->is_empty(); // started now?
1.123 paf 259: bool finished=finish <= item_finish; // finished now?
260: size_t offset=started?start-pos:0;
// 'size' is the end position inside this fragment; the appended length is size-offset
1.172.2.4 paf 261: size_t size=finished?finish-pos:fragment->size;
262: result->APPEND(
263: fragment->ptr+offset, size-offset,
264: fragment->lang,
265: fragment->origin.file, fragment->origin.line);
1.123 paf 266: if(finished)
1.53 paf 267: goto break2;
268: }
1.172.2.4 paf 269: pos+=fragment->size;
1.123 paf 270: );
1.53 paf 271: break2:
1.60 paf 272: // SAPI::log(pool(), "piece of '%s' from %d to %d is '%s'",
273: //cstr(), start, finish, result.cstr());
1.53 paf 274: return result;
1.54 paf 275: }
276:
1.60 paf 277: int String::pos(const String& substr,
1.116 paf 278: int result, Untaint_lang lang) const {
1.125 paf 279: size_t self_size=size();
1.131 paf 280: for(; size_t(result)<self_size; result++) {
1.60 paf 281: int partial; cmp(partial, substr, result, lang);
1.58 paf 282: if(
283: partial==0 || // full match
284: partial==2) // 'substr' starts 'this'+'result'
285: return result;
286: }
287:
288: return -1;
289: }
290:
1.172.2.11 paf 291: int String::pos(const char* substr, size_t substr_size,
1.116 paf 292: int result, Untaint_lang lang) const {
1.125 paf 293: size_t self_size=size();
1.131 paf 294: for(; size_t(result)<self_size; result++) {
1.60 paf 295: int partial; cmp(partial, substr, substr_size, result, lang);
1.55 paf 296: if(
297: partial==0 || // full match
298: partial==2) // 'substr' starts 'this'+'result'
299: return result;
300: }
301:
302: return -1;
1.60 paf 303: }
304:
1.172.2.14 paf 305: void String::split(ArrayString& result,
1.60 paf 306: size_t* pos_after_ref,
1.172.2.11 paf 307: const char* delim, size_t delim_size,
1.172.2.10 paf 308: Untaint_lang lang, int limit) {
1.125 paf 309: size_t self_size=size();
1.60 paf 310: if(delim_size) {
311: size_t pos_after=pos_after_ref?*pos_after_ref:0;
312: int pos_before;
313: // while we have 'delim'...
314: for(; (pos_before=pos(delim, delim_size, pos_after, lang))>=0 && limit; limit--) {
1.172.2.4 paf 315: result+=mid(pos_after, pos_before);
1.60 paf 316: pos_after=pos_before+delim_size;
317: }
318: // last piece
1.124 paf 319: if(pos_after<self_size && limit) {
1.172.2.4 paf 320: result+=mid(pos_after, self_size);
1.124 paf 321: pos_after=self_size;
1.60 paf 322: }
323: if(pos_after_ref)
324: *pos_after_ref=pos_after;
325: } else { // empty delim
1.172.2.10 paf 326: result+=StringPtr(this);
1.60 paf 327: if(pos_after_ref)
1.124 paf 328: *pos_after_ref+=self_size;
1.60 paf 329: }
330: }
331:
1.172.2.14 paf 332: void String::split(ArrayString& result,
1.60 paf 333: size_t* pos_after_ref,
334: const String& delim, Untaint_lang lang,
1.172.2.10 paf 335: int limit) {
1.140 paf 336: if(!delim.is_empty()) {
1.60 paf 337: size_t pos_after=pos_after_ref?*pos_after_ref:0;
338: int pos_before;
339: // while we have 'delim'...
340: for(; (pos_before=pos(delim, pos_after, lang))>=0 && limit; limit--) {
1.172.2.4 paf 341: result+=mid(pos_after, pos_before);
1.60 paf 342: pos_after=pos_before+delim.size();
343: }
344: // last piece
345: if(pos_after<size() && limit) {
1.172.2.4 paf 346: result+=mid(pos_after, size());
1.60 paf 347: pos_after=size();
348: }
349: if(pos_after_ref)
350: *pos_after_ref=pos_after;
351: } else { // empty delim
1.172.2.10 paf 352: result+=StringPtr(this);
1.60 paf 353: if(pos_after_ref)
354: *pos_after_ref+=size();
355: }
1.61 paf 356: }
357:
1.172.2.10 paf 358: static void regex_options(StringPtr options, int *result, bool& need_pre_post_match){
1.63 paf 359: struct Regex_option {
1.172.2.11 paf 360: const char* keyL;
361: const char* keyU;
1.63 paf 362: int clear, set;
363: int *result;
1.154 paf 364: bool *flag;
1.63 paf 365: } regex_option[]={
1.153 paf 366: {"i", "I", 0, PCRE_CASELESS, result}, // a=A
367: {"s", "S", 0, PCRE_DOTALL, result}, // \n\n$ [default]
368: {"x", "U", 0, PCRE_EXTENDED, result}, // whitespace in regex ignored
369: {"m", "M", PCRE_DOTALL, PCRE_MULTILINE, result}, // ^aaa\n$^bbb\n$
370: {"g", "G", 0, true, result+1}, // many rows
1.154 paf 371: {"'", 0, 0, 0, 0, &need_pre_post_match},
372: {0}
1.63 paf 373: };
1.171 paf 374: result[0]=PCRE_EXTRA | PCRE_DOTALL | PCRE_DOLLAR_ENDONLY;
1.63 paf 375: result[1]=0;
376:
377: if(options)
1.153 paf 378: for(Regex_option *o=regex_option; o->keyL; o++)
1.154 paf 379: if(options->pos(o->keyL)>=0
380: || (o->keyU && options->pos(o->keyU)>=0)) {
381: if(o->flag)
382: *o->flag=true;
383: else { // result
384: *o->result &= ~o->clear;
385: *o->result |= o->set;
386: }
1.63 paf 387: }
388: }
389:
// Matches this string against the regular expression 'regexp' using PCRE.
//
// source_charset  supplies the character tables given to pcre_compile
// aorigin         reported as the problem source in most exceptions thrown here
// regexp          the pattern; an empty one is rejected with an exception
// options         option letters, decoded by regex_options()
// row_action      called once per match with the row built for it, and once more at the end
//                 with a null row
// info            opaque pointer handed back to row_action
// just_matched    set to false when a non-global pattern without subpatterns did not match;
//                 this function never sets it to true
//
// Returns the table passed to row_action, or a null TablePtr in the just_matched=false case.
// Throws Exception on an empty regexp, a pattern syntax error, and pcre_info / pcre_exec errors.
//
// NOTE(review): 'code' is released with pcre_free on every explicit exit path below, but not
// if mid() or row_action throws -- confirm whether that can happen.
1.172.2.4 paf 390: TablePtr String::match(Charset& source_charset,
1.172.2.10 paf 391: StringPtr aorigin,
1.172.2.4 paf 392: const String& regexp,
1.172.2.10 paf 393: StringPtr options,
1.172.2.4 paf 394: Row_action row_action, void *info,
1.172.2.20 paf 395: bool& just_matched) const {
1.140 paf 396: if(regexp.is_empty())
1.149 paf 397: throw Exception(0,
1.73 paf 398: aorigin,
399: "regexp is empty");
1.154 paf 400:
1.172.2.4 paf 401: CharPtr pattern=regexp.cstr();
1.172.2.11 paf 402: const char* errptr;
1.62 paf 403: int erroffset;
1.154 paf 404: bool need_pre_post_match=false;
// option_bits[0]: pcre_compile options; option_bits[1]: non-zero for a global search
405: int option_bits[2]; regex_options(options, option_bits, need_pre_post_match);
1.172.2.20 paf 406: bool global=option_bits[1]!=0;
1.172.2.4 paf 407: pcre *code=pcre_compile(pattern.get(), option_bits[0],
1.62 paf 408: &errptr, &erroffset,
1.172.2.1 paf 409: source_charset.pcre_tables);
1.62 paf 410:
// compile failed: report the part of the pattern starting at the error offset
1.67 paf 411: if(!code)
1.149 paf 412: throw Exception(0,
1.172.2.4 paf 413: regexp.mid(erroffset, regexp.size()),
1.74 paf 414: "regular expression syntax error - %s", errptr);
1.62 paf 415:
// pcre_info's return value is used as the subpattern count; negative means error
1.172.2.20 paf 416: int subpatterns=pcre_info(code, 0, 0);
417: if(subpatterns<0) {
1.100 parser 418: pcre_free(code);
1.149 paf 419: throw Exception(0,
1.73 paf 420: aorigin,
1.76 paf 421: "pcre_info error (%d)",
1.172.2.20 paf 422: subpatterns);
1.63 paf 423: }
424:
1.172.2.4 paf 425: CharPtr subject=cstr();
426: int length=size();
// room for the whole match plus MAX_MATCH_GROUPS groups, three ints each
427: const int ovecsize=(1/*match*/+MAX_MATCH_GROUPS)*3;
1.155 paf 428: int ovector[ovecsize];
429:
430: // create table
1.172.2.20 paf 431: TablePtr table(new Table(string_match_table_template));
1.63 paf 432:
1.64 paf 433: int exec_option_bits=0;
// prestart: where the next search starts; poststart: end of the last match; postfinish: end of subject
1.154 paf 434: int prestart=0;
435: int poststart=0;
436: int postfinish=size();
1.63 paf 437: while(true) {
438: int exec_substrings=pcre_exec(code, 0,
1.172.2.4 paf 439: subject.get(), length, prestart,
1.64 paf 440: exec_option_bits, ovector, ovecsize);
1.63 paf 441:
// no (more) matches: give row_action its final null-row call and finish
442: if(exec_substrings==PCRE_ERROR_NOMATCH) {
1.100 parser 443: pcre_free(code);
1.172.2.20 paf 444: row_action(table, ArrayStringPtr(0)/*last time, no raw*/, 0, 0, poststart, postfinish, info);
445: if(global || subpatterns)
446: return table; // global or with subpatterns=true+result
447: else {
448: just_matched=false; return TablePtr(0); // not global=no result
449: }
1.63 paf 450: }
451:
452: if(exec_substrings<0) {
1.100 parser 453: pcre_free(code);
1.149 paf 454: throw Exception(0,
1.63 paf 455: aorigin,
1.76 paf 456: "regular expression execute error (%d)",
1.63 paf 457: exec_substrings);
458: }
459:
// ovector[0]..ovector[1] is the whole match
1.154 paf 460: int prefinish=ovector[0];
461: poststart=ovector[1];
1.172.2.14 paf 462: object_ptr<ArrayString> row(new ArrayString);
// the first three columns are only filled when the "'" option asked for them
1.172.2.4 paf 463: if(need_pre_post_match) {
464: *row+=mid(0, prefinish); // .prematch column value
465: *row+=mid(prefinish, poststart); // .match
466: *row+=mid(poststart, postfinish); // .postmatch
467: } else {
1.172.2.10 paf 468: *row+=StringPtr(0); // .prematch column value
469: *row+=StringPtr(0); // .match
470: *row+=StringPtr(0); // .postmatch
1.172.2.4 paf 471: }
1.63 paf 472:
// one column per captured group
473: for(int i=1; i<exec_substrings; i++) {
1.69 paf 474: // -1:-1 case handled peacefully by mid() itself
1.172.2.4 paf 475: *row+=mid(ovector[i*2+0], ovector[i*2+1]); // .i column value
1.63 paf 476: }
477:
1.172.2.20 paf 478: row_action(table, row, prestart, prefinish, poststart, postfinish, info);
1.63 paf 479:
// stop after one match when not global, or when the match end did not move past the search start
1.172.2.20 paf 480: if(!global || prestart==poststart) { // not global | going to hang
1.100 parser 481: pcre_free(code);
1.172.2.20 paf 482: row_action(table, ArrayStringPtr(0)/*last time, no row*/, 0, 0, poststart, postfinish, info);
483: return table;
1.63 paf 484: }
// continue searching right after this match
1.154 paf 485: prestart=poststart;
1.63 paf 486:
487: /*
488: if(option_bits[0] & PCRE_MULTILINE)
1.64 paf 489: exec_option_bits|=PCRE_NOTBOL; // start of subject+startoffset not BOL
1.63 paf 490: */
491: }
1.82 parser 492: }
493:
// Returns a copy of this string with every byte mapped through the case tables of source_charset.
// The copy keeps the fragment structure: one new pool-allocated buffer per fragment,
// with the fragment's lang and origin preserved.
// CC_UPPER maps each byte through table 'a' and then table 'b'; CC_LOWER through 'a' only.
// Any other kind throws an Exception.
// NOTE(review): lcc_offset / fcc_offset are presumably PCRE's lower-case and flip-case
// table offsets -- confirm against the PCRE internals in use.
1.172.2.10 paf 494: StringPtr String::change_case(Pool& pool, Charset& source_charset, Change_case_kind kind) {
1.172.2.4 paf 495: StringPtr result(new String());
496:
1.172.2.1 paf 497: const unsigned char *tables=source_charset.pcre_tables;
1.82 parser 498:
// a: first translation table, always applied; b: optional second table (0 = none)
499: const unsigned char *a;
500: const unsigned char *b;
501: switch(kind) {
502: case CC_UPPER:
503: a=tables+lcc_offset;
504: b=tables+fcc_offset;
505: break;
506: case CC_LOWER:
507: a=tables+lcc_offset;
508: b=0;
509: break;
510: default:
1.149 paf 511: throw Exception(0,
1.172.2.10 paf 512: StringPtr(this),
1.82 parser 513: "unknown change case kind #%d",
514: static_cast<int>(kind)); // never
515: a=b=0; // calm, compiler
516: break; // never
517: }
518:
1.172.2.4 paf 519: STRING_FOREACH_FRAGMENT(
// translated bytes go to a fresh buffer of the same size; no terminating 0 is written
520: char *new_cstr=new(pool) char[fragment->size];
1.143 paf 521: char *dest=new_cstr;
1.172.2.11 paf 522: const char* end=fragment->ptr+fragment->size;
523: for(const char* current=fragment->ptr; current<end; current++) {
1.172.2.4 paf 524: unsigned char c=a[(unsigned char)*current];
1.143 paf 525: if(b)
526: c=b[c];
1.82 parser 527:
1.143 paf 528: *dest++=(char)c;
1.82 parser 529: }
1.143 paf 530:
1.172.2.4 paf 531: result->APPEND(new_cstr, fragment->size,
532: fragment->lang,
533: fragment->origin.file, fragment->origin.line);
1.143 paf 534: );
1.89 parser 535:
1.101 parser 536: return result;
537: }
538:
1.150 paf 539: /// @test if in some piece were found no dict words, append it, not it's duplicate
// Returns a new string in which dictionary entries are replaced: wherever the text starts
// with the first column of a dict row, the row's second column is written instead.
// Consecutive fragments with the same lang are treated as one joined piece, so replacements
// may span fragment borders of equal lang; each joined piece becomes one fragment of the
// result, carrying that lang and the origin of the piece's first fragment.
// NOTE(review): the outer macro presumably advances 'fragment' itself, which is why the
// inner loop steps one fragment back when done -- confirm against STRING_PREPARED_FOREACH_FRAGMENT.
1.172.2.19 paf 540: StringPtr String::replace(Pool& pool, const Dictionary& dict) const {
1.172.2.4 paf 541: StringPtr result(new String());
// flat copy of the whole string; 'current' walks it in step with the fragments
542: CharPtr lcstr(cstr());
1.172.2.11 paf 543: const char* current=lcstr.get();
1.170 paf 544:
1.172.2.4 paf 545: STRING_FOREACH_FRAGMENT(
1.170 paf 546: IFNDEF_NO_STRING_ORIGIN(
1.172.2.11 paf 547: const char* joined_origin_file=fragment->origin.file;
1.172.2.4 paf 548: const size_t joined_origin_line=fragment->origin.line;
1.170 paf 549: );
1.172.2.18 paf 550: String_UL joined_lang=fragment->lang;
1.172.2.11 paf 551: const char* joined_ptr=current;
1.170 paf 552: // calc size
553: size_t joined_size=0;
1.172.2.4 paf 554: STRING_PREPARED_FOREACH_FRAGMENT(
555: if(fragment->lang==joined_lang)
556: joined_size+=fragment->size;
1.170 paf 557: else
558: break; // before non-ours
559: );
560: current+=joined_size;
561:
562: // pointers are after joined piece
1.172.2.4 paf 563: // & one step back, see STRING_PREPARED_FOREACH_FRAGMENT
564: --fragment;
1.170 paf 565:
// output buffer sized by the dictionary's max_ratio() times the piece size
1.172.2.4 paf 566: char *new_cstr=new(pool) char[(size_t)ceil(joined_size*dict.max_ratio())];
1.123 paf 567: char *dest=new_cstr;
1.170 paf 568: while(joined_size) {
1.172.2.4 paf 569: // there is a fragment where first column starts 'joined_ptr'
570: if(Table::element_type row=dict.first_that_starts(joined_ptr, joined_size)) {
1.123 paf 571: // get a=>b values
1.172.2.10 paf 572: StringPtr a=row->get(0);
573: StringPtr b=row->get(1);
1.170 paf 574: // skip 'a' in 'joined_ptr' && reduce work size
1.172.2.4 paf 575: joined_ptr+=a->size(); joined_size-=a->size();
1.123 paf 576: // write 'b' to 'dest' && skip 'b' in 'dest'
1.172.2.5 paf 577: b->store_to(dest, String::UL_AS_IS); dest+=b->size();
1.123 paf 578: } else {
579: // write a char to b && reduce work size
1.170 paf 580: *dest++=*joined_ptr++; joined_size--;
1.101 parser 581: }
582: }
583:
// only the bytes actually written (dest-new_cstr) become the new fragment
1.172.2.4 paf 584: result->APPEND(new_cstr, dest-new_cstr, joined_lang,
1.170 paf 585: joined_origin_file, joined_origin_line);
1.156 paf 586: );
1.170 paf 587:
1.156 paf 588: return result;
589: }
590:
// Returns a new string with the same bytes, in which every run of consecutive fragments
// sharing one lang is merged into a single fragment.
// The bytes are first stored into one pool-allocated buffer (store_to with UL_AS_IS);
// the result's fragments point into that buffer and carry the run's lang and the origin
// of the run's first fragment.
// NOTE(review): the outer macro presumably advances 'fragment' itself, which is why the
// inner loop steps one fragment back when done -- confirm against STRING_PREPARED_FOREACH_FRAGMENT.
1.172.2.4 paf 591: StringPtr String::join_chains(Pool& pool) const {
592: StringPtr result(new String());
593:
594: char *pooled_buf=new(pool) char[cstr_bufsize()];
1.172.2.5 paf 595: store_to(pooled_buf, String::UL_AS_IS);
// 'current' walks the flat buffer in step with the fragments
1.172.2.11 paf 596: const char* current=pooled_buf;
1.156 paf 597:
1.172.2.4 paf 598: STRING_FOREACH_FRAGMENT(
1.156 paf 599: IFNDEF_NO_STRING_ORIGIN(
1.172.2.11 paf 600: const char* joined_origin_file=fragment->origin.file;
1.172.2.4 paf 601: const size_t joined_origin_line=fragment->origin.line;
1.156 paf 602: );
1.172.2.18 paf 603: String_UL joined_lang=fragment->lang;
1.172.2.11 paf 604: const char* joined_ptr=current;
1.156 paf 605: // calc size
606: size_t joined_size=0;
1.172.2.4 paf 607: STRING_PREPARED_FOREACH_FRAGMENT(
608: if(fragment->lang==joined_lang)
609: joined_size+=fragment->size;
1.156 paf 610: else
611: break; // before non-ours
612: );
613: current+=joined_size;
614:
615: // pointers are after joined piece
1.172.2.4 paf 616: // & one step back, see STRING_PREPARED_FOREACH_FRAGMENT
617: --fragment;
1.156 paf 618:
1.172.2.4 paf 619: result->APPEND(joined_ptr, joined_size, joined_lang,
1.150 paf 620: joined_origin_file, joined_origin_line);
1.123 paf 621: );
1.156 paf 622:
1.89 parser 623: return result;
624: }
625:
1.172.2.10 paf 626: double String::as_double() {
1.89 parser 627: double result;
1.172.2.4 paf 628: char buf[MAX_STRING];
629: if(size()>MAX_STRING-1)
630: throw Exception("number.format",
1.172.2.10 paf 631: StringPtr(this),
1.172.2.4 paf 632: "invalid number too long a string (%u>%u)", size(), MAX_STRING-1);
1.172.2.5 paf 633: char *eol=store_to(buf, String::UL_AS_IS); *eol=0;
1.172.2.11 paf 634: const char* cstr=buf;
1.172.2.4 paf 635:
1.161 paf 636: while(*cstr && isspace(*cstr))
637: cstr++;
638: if(!*cstr)
1.162 paf 639: return 0;
1.161 paf 640:
1.102 parser 641: char *error_pos;
1.89 parser 642: // 0xABC
1.99 parser 643: if(cstr[0]=='0')
644: if(cstr[1]=='x' || cstr[1]=='X')
645: result=(double)(unsigned long)strtol(cstr, &error_pos, 0);
646: else
1.102 parser 647: result=(double)strtod(cstr+1/*skip leading 0*/, &error_pos);
1.89 parser 648: else
1.99 parser 649: result=(double)strtod(cstr, &error_pos);
1.89 parser 650:
1.159 paf 651: while(char c=*error_pos++)
652: if(!isspace(c))
653: throw Exception("number.format",
1.172.2.10 paf 654: StringPtr(this),
1.159 paf 655: "invalid number (double)");
1.89 parser 656:
657: return result;
658: }
1.172.2.10 paf 659: int String::as_int() {
1.89 parser 660: int result;
1.172.2.4 paf 661: char buf[MAX_STRING];
662: if(size()>MAX_STRING-1)
663: throw Exception("number.format",
1.172.2.10 paf 664: StringPtr(this),
1.172.2.4 paf 665: "invalid number too long a string (%u>%u)", size(), MAX_STRING-1);
1.172.2.5 paf 666: char *eol=store_to(buf, String::UL_AS_IS); *eol=0;
1.172.2.11 paf 667: const char* cstr=buf;
1.172.2.4 paf 668:
1.161 paf 669: while(*cstr && isspace(*cstr))
670: cstr++;
671: if(!*cstr)
1.162 paf 672: return 0;
1.161 paf 673:
1.102 parser 674: char *error_pos;
1.89 parser 675: // 0xABC
1.99 parser 676: if(cstr[0]=='0')
677: if(cstr[1]=='x' || cstr[1]=='X')
678: result=(int)(unsigned long)strtol(cstr, &error_pos, 0);
679: else
1.102 parser 680: result=(int)strtol(cstr+1/*skip leading 0*/, &error_pos, 0);
1.89 parser 681: else
682: result=(int)strtol(cstr, &error_pos, 0);
683:
1.159 paf 684: while(char c=*error_pos++)
685: if(!isspace(c))
686: throw Exception("number.format",
1.172.2.10 paf 687: StringPtr(this),
1.159 paf 688: "invalid number (int)");
1.82 parser 689:
690: return result;
1.61 paf 691: }
1.113 parser 692:
1.172.2.4 paf 693: inline void uint2uchars(uint word, uchar *bytes) {
694: bytes[0]=word&0xFF;
695: bytes[1]=(word>>8)&0xFF;
696: bytes[2]=(word>>16)&0xFF;
697: bytes[3]=(word>>24)&0xFF;
698: }
699: inline uint uchars2uint(uchar *bytes) {
700: return bytes[3]<<24
701: | bytes[2]<<16
702: | bytes[1]<<8
703: | bytes[0];
704: }
705:
706: void String::serialize(Pool& pool, size_t prolog_size, char *& buf, size_t& buf_size) const {
1.113 parser 707: buf_size=
708: prolog_size
1.172.2.18 paf 709: +fused*(sizeof(String_UL)+sizeof(size_t))
1.113 parser 710: +size();
1.172.2.4 paf 711: buf=new(pool) char[buf_size];
712: char *cur=buf+prolog_size;
1.113 parser 713:
1.172.2.4 paf 714: STRING_FOREACH_FRAGMENT(
1.123 paf 715: // lang
1.172.2.4 paf 716: memcpy(cur, &fragment->lang, sizeof(fragment->lang));
717: cur+=sizeof(fragment->lang);
1.123 paf 718: // size
1.172.2.4 paf 719: // bug on some sparc platform [you can't work with integers on odd pointers]
720: // forces us to use byte array instead
721: uchar bytes[4];
722: uint2uchars(fragment->size, bytes);
723: memcpy(cur, &bytes, sizeof(bytes)); cur+=sizeof(bytes);
1.123 paf 724: // bytes
1.172.2.4 paf 725: memcpy(cur, fragment->ptr, fragment->size);
726: cur+=fragment->size;
1.123 paf 727: );
1.113 parser 728: }
1.172.2.11 paf 729: bool String::deserialize(size_t prolog_size, void *buf, size_t buf_size, const char* file) {
1.135 paf 730: if(buf_size<=prolog_size)
1.148 paf 731: return false;
1.135 paf 732:
1.126 paf 733: char *cur=(char *)buf+prolog_size;
1.113 parser 734: buf_size-=prolog_size;
735:
736: while(buf_size) {
1.172.2.18 paf 737: if(sizeof(String_UL)+sizeof(size_t)>buf_size) // lang+size
1.148 paf 738: return false;
739:
1.172.2.6 paf 740: String_UL lang=*(String_UL *)(cur);
1.172.2.4 paf 741: size_t size=uchars2uint((uchar *)cur);
1.128 paf 742:
1.172.2.18 paf 743: size_t piece_size=sizeof(String_UL)+sizeof(size_t)+size;
1.148 paf 744: if(piece_size>buf_size) // buffer overrun, can be on incomplete cache files
745: return false;
746:
1.172.2.18 paf 747: const char* ptr=(const char*)(cur+sizeof(String_UL)+sizeof(size_t));
1.126 paf 748: APPEND(ptr, size, lang, file, 0);
1.113 parser 749:
750: cur+=piece_size;
751: buf_size-=piece_size;
752: }
1.148 paf 753: return true;
1.113 parser 754: }
E-mail: