Annotation of parser3/src/main/pa_string.C, revision 1.214
1.45 paf 1: /** @file
1.174 paf 2: Parser: string class. @see untaint.C.
1.46 paf 3:
1.203 paf 4: Copyright (c) 2001-2005 ArtLebedev Group (http://www.artlebedev.com)
1.138 paf 5: Author: Alexandr Petrosian <paf@design.ru> (http://paf.design.ru)
1.164 paf 6: */
1.46 paf 7:
/// Revision stamp of this file: a CVS "$Date$" keyword kept in the object code.
static const char* const IDENT_STRING_C=
	"$Date: 2008-07-21 07:37:11 $";
1.70 paf 9:
1.12 paf 10: #include "pa_string.h"
1.22 paf 11: #include "pa_exception.h"
1.61 paf 12: #include "pa_table.h"
1.101 parser 13: #include "pa_dictionary.h"
1.132 paf 14: #include "pa_charset.h"
1.60 paf 15:
1.185 paf 16: const String String::Empty;
17:
1.193 paf 18: int pa_atoi(const char* str, const String* problem_source) {
19: if(!str)
20: return 0;
21:
1.199 paf 22: while(*str && isspace((unsigned char)*str))
1.193 paf 23: str++;
24: if(!*str)
25: return 0;
26:
27: int result;
28: char *error_pos;
1.200 paf 29: bool negative=false;
30: if(str[0]=='-') {
31: negative=true;
32: str++;
33: } else if(str[0]=='+') {
34: str++;
35: }
1.193 paf 36: // 0xABC
37: if(str[0]=='0')
38: if(str[1]=='x' || str[1]=='X')
39: result=(int)(unsigned long)strtol(str, &error_pos, 0);
1.197 paf 40: else {
41: // skip leading 0000, to disable octal interpretation
42: do str++; while(*str=='0');
43: result=(int)strtol(str, &error_pos, 0);
44: }
1.193 paf 45: else
46: result=(int)strtol(str, &error_pos, 0);
1.200 paf 47: if(negative)
48: result=-result;
1.193 paf 49:
50: while(char c=*error_pos++)
1.199 paf 51: if(!isspace((unsigned char)c))
1.193 paf 52: throw Exception("number.format",
53: problem_source,
54: problem_source?"invalid number (int)": "'%s' is invalid number (int)", str);
55:
56: return result;
57: }
58:
59: double pa_atod(const char* str, const String* problem_source) {
60: if(!str)
61: return 0;
62:
1.199 paf 63: while(*str && isspace((unsigned char)*str))
1.193 paf 64: str++;
65: if(!*str)
66: return 0;
67:
68: double result;
69: char *error_pos;
1.200 paf 70: bool negative=false;
71: if(str[0]=='-') {
72: negative=true;
73: str++;
74: } else if(str[0]=='+') {
75: str++;
76: }
1.193 paf 77: // 0xABC
78: if(str[0]=='0')
79: if(str[1]=='x' || str[1]=='X')
80: result=(double)(unsigned long)strtol(str, &error_pos, 0);
1.200 paf 81: else {
82: // skip leading 0000, to disable octal interpretation
83: do str++; while(*str=='0');
84: result=(double)strtod(str, &error_pos);
85: }
1.193 paf 86: else
87: result=(double)strtod(str, &error_pos);
1.200 paf 88: if(negative)
89: result=-result;
1.193 paf 90:
91: while(char c=*error_pos++)
1.199 paf 92: if(!isspace((unsigned char)c))
1.193 paf 93: throw Exception("number.format",
94: problem_source,
95: problem_source?"invalid number (double)": "'%s' is invalid number (double)", str);
96:
97: return result;
98: }
99:
1.176 paf 100: // cord lib extension
101:
#ifndef DOXYGEN
/// state passed from CORD_range_contains_chr_greater_then to its block callback
typedef struct {
	ssize_t countdown; /* characters of the inspected range still to be visited */
	int target; /* Character we're looking for */
} chr_data;
#endif
/**
	Block callback of CORD_range_contains_chr_greater_then.

	@param c character the current block consists of
	@param size length of the current block
	@param client_data points to chr_data
	@returns 2 when the range has been exhausted before this block,
		1 when 'c' is greater than the target, 0 to continue with the next block
*/
static int CORD_range_contains_chr_greater_then_proc(char c, size_t size, void* client_data)
{
	// no 'register' here: that storage class is not allowed since C++17
	chr_data* d=(chr_data*)client_data;

	if(d->countdown<=0)
		return 2;
	d->countdown-=size;
	// NOTE(review): 'c' is plain char, so where char is signed, bytes >=0x80 compare as negative — confirm intended
	return c > d->target ? 1 : 0;
}
117: int CORD_range_contains_chr_greater_then(CORD x, size_t i, size_t n, int c)
118: {
119: chr_data d;
120:
121: d.countdown = n;
122: d.target = c;
123: return(CORD_block_iter(x, i, CORD_range_contains_chr_greater_then_proc, &d) == 1/*alternatives: 0 normally ended, 2=struck 'n'*/);
124: }
125:
/**
	Block callback of CORD_block_count: counts one more block.

	@param client_data points to the size_t counter owned by CORD_block_count
	@returns 0, meaning "continue iteration"
*/
static int CORD_block_count_proc(char /*c*/, size_t /*size*/, void* client_data)
{
	// the counter is a size_t: it must be accessed as such, not through int*
	size_t* result=(size_t*)client_data;
	(*result)++;
	return(0); // 0=continue
}
132: size_t CORD_block_count(CORD x)
133: {
134: size_t result=0;
135: CORD_block_iter(x, 0, CORD_block_count_proc, &result);
136: return result;
137: }
138:
1.174 paf 139: // helpers
1.139 paf 140:
1.174 paf 141: /// String::match uses this as replace & global search table columns
1.139 paf 142:
1.174 paf 143: const int MAX_MATCH_GROUPS=100;
1.139 paf 144:
1.174 paf 145: class String_match_table_template_columns: public ArrayString {
146: public:
147: String_match_table_template_columns() {
148: *this+=new String("prematch");
149: *this+=new String("match");
150: *this+=new String("postmatch");
151: for(int i=0; i<MAX_MATCH_GROUPS; i++) {
1.176 paf 152: *this+=new String(String::Body::Format(1+i), String::L_CLEAN);
1.174 paf 153: }
154: }
155: };
156:
157: Table string_match_table_template(new String_match_table_template_columns);
158:
1.176 paf 159: // String::Body methods
1.140 paf 160:
1.176 paf 161: String::Body String::Body::Format(int value) {
1.174 paf 162: char local[MAX_NUMBER];
163: size_t length=snprintf(local, MAX_NUMBER, "%d", value);
1.176 paf 164: return String::Body(pa_strdup(local, length), length);
1.120 paf 165: }
166:
1.195 paf 167: String::Body String::Body::trim(String::Trim_kind kind, const char* chars,
168: size_t* out_start, size_t* out_length) const {
169: size_t our_length=length();
170: if(!our_length)
171: return *this;
172: if(!chars)
173: chars=" \t\n"; // white space
174:
175: size_t start=0;
176: size_t end=our_length;
1.196 paf 177: // from left...
178: if(kind!=TRIM_END) {
1.195 paf 179: CORD_pos pos; set_pos(pos, 0);
180: while(true) {
181: char c=CORD_pos_fetch(pos);
182: if(strchr(chars, c)) {
183: if(++start==our_length)
184: return 0; // all chars are empty, just return empty string
185: } else
186: break;
187:
188: CORD_next(pos);
189: }
190: }
1.196 paf 191: // from right..
192: if(kind!=TRIM_START) {
193: CORD_pos pos; set_pos(pos, end-1);
194: while(true) {
195: char c=CORD_pos_fetch(pos);
196: if(strchr(chars, c)) {
197: if(--end==0) // optimization: NO need to check for 'end>=start', that's(<) impossible
198: return 0; // all chars are empty, just return empty string
199: } else
200: break;
201:
202: CORD_prev(pos);
203: }
204: }
1.195 paf 205:
206: if(start==0 && end==our_length) // nobody moved a thing
207: return *this;
208:
209: if(out_start)
210: *out_start=start;
211: size_t new_length=end-start;
212: if(out_length)
213: *out_length=new_length;
214:
215: return mid(start, new_length);
216: }
217:
1.174 paf 218: static int CORD_batched_iter_fn_generic_hash_code(char c, void * client_data) {
219: uint& result=*static_cast<uint*>(client_data);
220: generic_hash_code(result, c);
221: return 0;
222: }
223: static int CORD_batched_iter_fn_generic_hash_code(const char* s, void * client_data) {
224: uint& result=*static_cast<uint*>(client_data);
225: generic_hash_code(result, s);
226: return 0;
227: };
1.176 paf 228: uint String::Body::hash_code() const {
1.174 paf 229: uint result=0;
230: CORD_iter5(body, 0,
231: CORD_batched_iter_fn_generic_hash_code,
232: CORD_batched_iter_fn_generic_hash_code, &result);
1.120 paf 233: return result;
1.94 parser 234: }
235:
1.174 paf 236: // String methods
237:
238: String::String(const char* cstr, size_t helper_length, bool tainted): body(CORD_EMPTY) {
239: append_help_length(cstr, helper_length, tainted?L_TAINTED:L_CLEAN);
1.115 paf 240: }
1.174 paf 241: String::String(const String::C cstr, bool tainted): body(CORD_EMPTY) {
242: append_know_length(cstr.str, cstr.length, tainted?L_TAINTED:L_CLEAN);
1.5 paf 243: }
1.28 paf 244:
1.174 paf 245: String& String::append_know_length(const char* str, size_t known_length, Language lang) {
246: if(!known_length)
1.9 paf 247: return *this;
1.122 paf 248:
1.176 paf 249: // first: langs
250: langs.append(body, lang, known_length);
251: // next: letters themselves
1.174 paf 252: body.append_know_length(str, known_length);
1.1 paf 253:
1.174 paf 254: ASSERT_STRING_INVARIANT(*this);
1.1 paf 255: return *this;
256: }
1.174 paf 257: String& String::append_help_length(const char* str, size_t helper_length, Language lang) {
258: if(!str)
259: return *this;
260: size_t known_length=helper_length?helper_length:strlen(str);
261: if(!known_length)
262: return *this;
1.1 paf 263:
1.174 paf 264: return append_know_length(str, known_length, lang);
1.5 paf 265: }
1.174 paf 266: String& String::append_strdup(const char* str, size_t helper_length, Language lang) {
267: size_t known_length=helper_length?helper_length:strlen(str);
268: if(!known_length)
269: return *this;
1.5 paf 270:
1.176 paf 271: // first: langs
272: langs.append(body, lang, known_length);
273: // next: letters themselves
1.174 paf 274: body.append_strdup_know_length(str, known_length);
1.33 paf 275:
1.174 paf 276: ASSERT_STRING_INVARIANT(*this);
277: return *this;
1.5 paf 278: }
1.46 paf 279:
1.210 misha 280: size_t String::length(Charset& charset) const {
281: if(charset.isUTF8()){
282: const XMLByte* srcPtr=(const XMLByte*)cstrm();
283: return lengthUTF8(srcPtr, srcPtr+body.length());
284: } else
285: return body.length();
286: }
287:
1.174 paf 288: /// @todo check in doc: whether it documents NOW bad situation "abc".mid(-1, 3) =were?="ab"
289: String& String::mid(size_t substr_begin, size_t substr_end) const {
290: String& result=*new String;
291:
292: size_t self_length=length();
293: substr_begin=min(substr_begin, self_length);
294: substr_end=min(max(substr_end, substr_begin), self_length);
1.176 paf 295: size_t substr_length=substr_end-substr_begin;
296: if(!substr_length)
1.107 parser 297: return result;
1.53 paf 298:
1.176 paf 299: // first: their langs
300: result.langs.append(result.body, langs, substr_begin, substr_length);
301: // next: letters themselves
302: result.body=body.mid(substr_begin, substr_length);
1.174 paf 303:
304: ASSERT_STRING_INVARIANT(result);
1.53 paf 305: return result;
1.54 paf 306: }
307:
1.211 misha 308: // from, to and helper_length in characters, not in bytes (it's important for utf-8)
309: String& String::mid(Charset& charset, size_t from, size_t to, size_t helper_length) const {
1.210 misha 310: String& result=*new String;
311:
1.211 misha 312: size_t self_length=(helper_length)?helper_length:length(charset);
313:
314: if(!self_length)
315: return result;
316:
317: from=min(min(to, from), self_length);
1.210 misha 318: to=min(max(to, from), self_length);
1.211 misha 319:
1.210 misha 320: size_t substr_length=to-from;
1.211 misha 321:
1.210 misha 322: if(!substr_length)
323: return result;
324:
325: if(charset.isUTF8()){
326: const XMLByte* srcPtr=(const XMLByte*)cstrm();
1.211 misha 327: const XMLByte* srcEnd=srcPtr+body.length();
1.210 misha 328:
1.212 misha 329: // convert 'from' and 'substr_length' from 'characters' to 'bytes'
1.210 misha 330: from=getUTF8BytePos(srcPtr, srcEnd, from);
331: substr_length=getUTF8BytePos(srcPtr+from, srcEnd, substr_length);
332: if(!substr_length)
333: return result;
334: }
335:
336: // first: their langs
337: result.langs.append(result.body, langs, from, substr_length);
338: // next: letters themselves
339: result.body=body.mid(from, substr_length);
340:
341: ASSERT_STRING_INVARIANT(result);
342: return result;
343: }
344:
1.176 paf 345: size_t String::pos(const String::Body substr, size_t this_offset, Language lang) const {
1.183 paf 346: size_t substr_length=substr.length();
347: while(true) {
348: size_t substr_begin=body.pos(substr, this_offset);
349:
350: if(substr_begin==CORD_NOT_FOUND)
351: return STRING_NOT_FOUND;
1.174 paf 352:
1.183 paf 353: if(langs.check_lang(lang, substr_begin, substr_length))
354: return substr_begin;
355:
356: this_offset=substr_begin+substr_length;
357: }
1.58 paf 358: }
359:
1.174 paf 360: size_t String::pos(const String& substr,
361: size_t this_offset, Language lang) const {
362: return pos(substr.body, this_offset, lang);
1.60 paf 363: }
364:
1.210 misha 365: size_t String::pos(Charset& charset, const String& substr,
366: size_t this_offset, Language lang) const {
367:
1.212 misha 368: size_t result;
1.210 misha 369: if(charset.isUTF8()){
370: const XMLByte* srcPtr=(const XMLByte*)cstrm();
1.212 misha 371: const XMLByte* srcEnd=srcPtr+body.length();
372:
373: // convert 'this_offset' from 'characters' to 'bytes'
374: this_offset=getUTF8BytePos(srcPtr, srcEnd, this_offset);
375:
376: result=pos(substr.body, this_offset, lang);
377: if(result==CORD_NOT_FOUND)
378: return STRING_NOT_FOUND;
379:
380: // convert 'result' from 'bytes' to 'characters'
381: result=getUTF8CharPos(srcPtr, srcEnd, result);
382: } else {
383: result=pos(substr.body, this_offset, lang);
384: if(result==CORD_NOT_FOUND)
385: return STRING_NOT_FOUND;
1.210 misha 386: }
387:
388: return result;
389: }
390:
1.174 paf 391: void String::split(ArrayString& result,
392: size_t& pos_after,
393: const char* delim,
394: Language lang, int limit) const {
395: size_t self_length=length();
396: if(size_t delim_length=strlen(delim)) {
1.186 paf 397: size_t pos_before;
1.60 paf 398: // while we have 'delim'...
1.174 paf 399: for(; (pos_before=pos(delim, pos_after, lang))!=STRING_NOT_FOUND && limit; limit--) {
1.69 paf 400: result+=&mid(pos_after, pos_before);
1.174 paf 401: pos_after=pos_before+delim_length;
1.60 paf 402: }
403: // last piece
1.174 paf 404: if(pos_after<self_length && limit) {
405: result+=&mid(pos_after, self_length);
406: pos_after=self_length;
1.60 paf 407: }
408: } else { // empty delim
409: result+=this;
1.174 paf 410: pos_after+=self_length;
1.60 paf 411: }
412: }
413:
1.174 paf 414: void String::split(ArrayString& result,
415: size_t& pos_after,
416: const String& delim, Language lang,
417: int limit) const {
1.140 paf 418: if(!delim.is_empty()) {
1.186 paf 419: size_t pos_before;
1.60 paf 420: // while we have 'delim'...
1.174 paf 421: for(; (pos_before=pos(delim, pos_after, lang))!=STRING_NOT_FOUND && limit; limit--) {
1.69 paf 422: result+=&mid(pos_after, pos_before);
1.174 paf 423: pos_after=pos_before+delim.length();
1.60 paf 424: }
425: // last piece
1.174 paf 426: if(pos_after<length() && limit) {
427: result+=&mid(pos_after, length());
428: pos_after=length();
1.60 paf 429: }
430: } else { // empty delim
431: result+=this;
1.174 paf 432: pos_after+=length();
1.60 paf 433: }
1.61 paf 434: }
435:
/// Bit flags for the String::match options that are not PCRE option bits
enum Match_feature {
	MF_NEED_PRE_POST_MATCH = 1 << 0, ///< fill the prematch/match/postmatch columns
	MF_JUST_COUNT_MATCHES = 1 << 1 ///< count the matches, return no table
};
440:
441: static void regex_options(const String* options, int* result, int* match_features){
1.63 paf 442: struct Regex_option {
1.174 paf 443: const char* keyL;
444: const char* keyU;
1.209 misha 445: int clear;
446: int set;
1.63 paf 447: int *result;
1.209 misha 448: int flag;
1.63 paf 449: } regex_option[]={
1.189 paf 450: {"i", "I", 0, PCRE_CASELESS, result, 0}, // a=A
451: {"s", "S", 0, PCRE_DOTALL, result, 0}, // \n\n$ [default]
452: {"x", "U", 0, PCRE_EXTENDED, result, 0}, // whitespace in regex ignored
453: {"m", "M", PCRE_DOTALL, PCRE_MULTILINE, result, 0}, // ^aaa\n$^bbb\n$
454: {"g", "G", 0, 1, result+1, 0}, // many rows
1.209 misha 455: {"'", 0, 0, 0, 0, MF_NEED_PRE_POST_MATCH},
456: {"n", "N", 0, 0, 0, MF_JUST_COUNT_MATCHES},
1.189 paf 457: {0, 0, 0, 0, 0, 0}
1.63 paf 458: };
1.171 paf 459: result[0]=PCRE_EXTRA | PCRE_DOTALL | PCRE_DOLLAR_ENDONLY;
1.63 paf 460: result[1]=0;
461:
1.174 paf 462: if(options && !options->is_empty())
1.153 paf 463: for(Regex_option *o=regex_option; o->keyL; o++)
1.209 misha 464: if(
465: options->pos(o->keyL)!=STRING_NOT_FOUND
466: || (o->keyU && options->pos(o->keyU)!=STRING_NOT_FOUND)
467: ){
468: if(o->flag){
469: (*match_features) |= o->flag;
470: } else {
1.154 paf 471: *o->result &= ~o->clear;
472: *o->result |= o->set;
473: }
1.63 paf 474: }
475: }
476:
1.174 paf 477: Table* String::match(Charset& source_charset,
478: const String& regexp,
479: const String* options,
480: Row_action row_action, void *info,
1.209 misha 481: int& matches_count) const {
1.140 paf 482: if(regexp.is_empty())
1.149 paf 483: throw Exception(0,
1.174 paf 484: 0,
1.73 paf 485: "regexp is empty");
1.154 paf 486:
1.205 paf 487: const char* pattern=regexp.cstr(String::L_UNSPECIFIED); // fix any tainted with L_REGEX
1.174 paf 488: const char* errptr;
1.62 paf 489: int erroffset;
1.209 misha 490: int option_bits[2]={0};
491: int match_features=0;
492: regex_options(options, option_bits, &match_features);
493: bool need_pre_post_match=(match_features & MF_NEED_PRE_POST_MATCH) != 0;
494: bool just_count_matches=(match_features & MF_JUST_COUNT_MATCHES) != 0;
1.174 paf 495: bool global=option_bits[1]!=0;
1.214 ! misha 496: PCRE::pcre *code=PCRE::pcre_compile(pattern, option_bits[0],
1.62 paf 497: &errptr, &erroffset,
1.174 paf 498: source_charset.pcre_tables);
1.62 paf 499:
1.67 paf 500: if(!code)
1.149 paf 501: throw Exception(0,
1.174 paf 502: ®exp.mid(erroffset, regexp.length()),
1.74 paf 503: "regular expression syntax error - %s", errptr);
1.62 paf 504:
1.214 ! misha 505: int subpatterns=PCRE::pcre_info(code, 0, 0);
1.174 paf 506: if(subpatterns<0) {
1.214 ! misha 507: PCRE::pcre_free(code);
1.149 paf 508: throw Exception(0,
1.174 paf 509: ®exp,
1.76 paf 510: "pcre_info error (%d)",
1.174 paf 511: subpatterns);
1.63 paf 512: }
513:
1.174 paf 514: const char* subject=cstr();
515: size_t subject_length=strlen(subject);
516: const int oveclength=(1/*match*/+MAX_MATCH_GROUPS)*3;
517: int ovector[oveclength];
1.155 paf 518:
519: // create table
1.173 paf 520: Table::Action_options table_options;
1.174 paf 521: Table& table=*new Table(string_match_table_template, table_options);
1.63 paf 522:
1.64 paf 523: int exec_option_bits=0;
1.154 paf 524: int prestart=0;
525: int poststart=0;
1.174 paf 526: int postfinish=length();
1.63 paf 527: while(true) {
1.214 ! misha 528: int exec_substrings=PCRE::pcre_exec(code, 0,
1.174 paf 529: subject, subject_length, prestart,
530: exec_option_bits, ovector, oveclength);
1.63 paf 531:
532: if(exec_substrings==PCRE_ERROR_NOMATCH) {
1.214 ! misha 533: PCRE::pcre_free(code);
1.174 paf 534: row_action(table, 0/*last time, no raw*/, 0, 0, poststart, postfinish, info);
1.208 misha 535: // if(global || subpatterns)
536: // return &table; // global or with subpatterns=true+result
537: // else {
538: // just_matched=false; return 0; // not global=no result
539: // }
1.209 misha 540: return just_count_matches ? 0 : &table;
1.63 paf 541: }
542:
543: if(exec_substrings<0) {
1.214 ! misha 544: PCRE::pcre_free(code);
1.149 paf 545: throw Exception(0,
1.174 paf 546: ®exp,
1.76 paf 547: "regular expression execute error (%d)",
1.63 paf 548: exec_substrings);
549: }
550:
1.154 paf 551: int prefinish=ovector[0];
552: poststart=ovector[1];
1.174 paf 553: ArrayString* row=new ArrayString;
554: if(need_pre_post_match) {
555: *row+=&mid(0, prefinish); // .prematch column value
556: *row+=&mid(prefinish, poststart); // .match
557: *row+=&mid(poststart, postfinish); // .postmatch
558: } else {
1.185 paf 559: *row+=&Empty; // .prematch column value
560: *row+=&Empty; // .match
561: *row+=&Empty; // .postmatch
1.174 paf 562: }
1.63 paf 563:
564: for(int i=1; i<exec_substrings; i++) {
1.69 paf 565: // -1:-1 case handled peacefully by mid() itself
1.174 paf 566: *row+=&mid(ovector[i*2+0], ovector[i*2+1]); // .i column value
1.63 paf 567: }
568:
1.209 misha 569: matches_count++;
1.174 paf 570: row_action(table, row, prestart, prefinish, poststart, postfinish, info);
1.63 paf 571:
1.174 paf 572: if(!global || prestart==poststart) { // not global | going to hang
1.214 ! misha 573: PCRE::pcre_free(code);
1.174 paf 574: row_action(table, 0/*last time, no row*/, 0, 0, poststart, postfinish, info);
1.209 misha 575: return just_count_matches ? 0 : &table;
576: // return &table;
1.63 paf 577: }
1.154 paf 578: prestart=poststart;
1.63 paf 579:
580: /*
581: if(option_bits[0] & PCRE_MULTILINE)
1.64 paf 582: exec_option_bits|=PCRE_NOTBOL; // start of subject+startoffset not BOL
1.63 paf 583: */
584: }
1.82 parser 585: }
586:
1.174 paf 587: String& String::change_case(Charset& source_charset, Change_case_kind kind) const {
588: String& result=*new String();
589: if(is_empty())
590: return result;
591:
592: char* new_cstr=cstrm();
1.192 paf 593: size_t new_cstr_len=length();
1.181 paf 594: if(source_charset.isUTF8()) {
595: switch(kind) {
596: case CC_UPPER:
1.192 paf 597: change_case_UTF8((const XMLByte*)new_cstr, new_cstr_len, (XMLByte*)new_cstr, new_cstr_len, UTF8CaseToUpper);
1.181 paf 598: break;
599: case CC_LOWER:
1.192 paf 600: change_case_UTF8((const XMLByte*)new_cstr, new_cstr_len, (XMLByte*)new_cstr, new_cstr_len, UTF8CaseToLower);
1.181 paf 601: break;
602: default:
603: assert(!"unknown change case kind");
604: break; // never
605: }
606:
607: } else {
608: const unsigned char *tables=source_charset.pcre_tables;
1.82 parser 609:
1.181 paf 610: const unsigned char *a;
611: const unsigned char *b;
612: switch(kind) {
613: case CC_UPPER:
614: a=tables+lcc_offset;
615: b=tables+fcc_offset;
616: break;
617: case CC_LOWER:
618: a=tables+lcc_offset;
619: b=0;
620: break;
621: default:
622: assert(!"unknown change case kind");
623: a=b=0; // calm, compiler
624: break; // never
625: }
626:
1.192 paf 627: char *dest=new_cstr;
1.181 paf 628: unsigned char index;
1.190 paf 629: for(const char* current=new_cstr; (index=(unsigned char)*current); current++) {
1.181 paf 630: unsigned char c=a[index];
631: if(b)
632: c=b[c];
633:
634: *dest++=(char)c;
635: }
1.174 paf 636: }
1.176 paf 637: result.langs=langs;
1.174 paf 638: result.body=new_cstr;
1.89 parser 639:
1.101 parser 640: return result;
641: }
642:
1.213 misha 643: const String& String::escape(Charset& source_charset) const {
644: if(is_empty())
645: return *this;
646:
647: return Charset::escape(*this, source_charset);
648: }
649:
1.174 paf 650: const String& String::replace(const Dictionary& dict) const {
651: String& result=*new String();
652: const char* old_cstr=cstr();
653: const char* prematch_begin=old_cstr;
654:
655: const char* current=old_cstr;
656: while(*current) {
1.184 paf 657: if(Dictionary::Subst subst=dict.first_that_begins(current)) {
1.174 paf 658: // prematch
659: if(size_t prematch_length=current-prematch_begin) {
1.179 paf 660: result.langs.append(result.body, langs, prematch_begin-old_cstr, prematch_length);
1.174 paf 661: result.body.append_strdup_know_length(prematch_begin, prematch_length);
1.101 parser 662: }
663:
1.174 paf 664: // match
665: // skip 'a' in 'current'; move prematch_begin
1.184 paf 666: current+=subst.from_length; prematch_begin=current;
1.174 paf 667:
1.184 paf 668: if(const String* b=subst.to) // are there any b?
1.174 paf 669: result<<*b;
670: } else // simply advance
671: current++;
672: }
1.156 paf 673:
1.174 paf 674: // postmatch
675: if(size_t postmatch_length=current-prematch_begin) {
1.179 paf 676: result.langs.append(result.body, langs, prematch_begin-old_cstr, postmatch_length);
1.174 paf 677: result.body.append_strdup_know_length(prematch_begin, postmatch_length);
678: }
1.156 paf 679:
1.174 paf 680: ASSERT_STRING_INVARIANT(result);
1.82 parser 681: return result;
1.61 paf 682: }
1.113 parser 683:
/// String::serialize callback: stores character 'c' at the write cursor '*cur' and advances the cursor
static int serialize_body_char(char c, char** cur) {
	**cur=c;
	++*cur;
	return 0; // 0=continue
}
/// String::serialize callback: copies C-string 's' (without its terminator) to the write cursor '*cur' and advances the cursor
static int serialize_body_piece(const char* s, char** cur) {
	const size_t piece_length=strlen(s);
	memcpy(*cur, s, piece_length);
	*cur+=piece_length;
	return 0; // 0=continue
}
/// String::serialize callback: writes one language fragment, its lang byte followed by its length, at the write cursor '*cur'
static int serialize_lang_piece(char alang, size_t asize, char** cur) {
	char* out=*cur;

	// lang
	*out++=alang;
	// length [WARNING: not cast, addresses must be %4=0 on sparc]
	memcpy(out, &asize, sizeof(asize));
	out+=sizeof(asize);

	*cur=out;
	return 0; // 0=continue
}
1.174 paf 701: String::Cm String::serialize(size_t prolog_length) const {
1.178 paf 702: size_t fragments_count=langs.count();
1.202 paf 703: size_t body_length=body.length();
1.174 paf 704: size_t buf_length=
1.178 paf 705: prolog_length //1
706: +sizeof(size_t) //2
1.202 paf 707: +body_length //3
708: +1 // 4 for zero terminator used in deserialize
709: +sizeof(size_t) //5
710: +fragments_count*(sizeof(char)+sizeof(size_t)); //6
711:
1.174 paf 712: String::Cm result(new(PointerFreeGC) char[buf_length], buf_length);
713:
714: // 1: prolog
715: char *cur=result.str+prolog_length;
1.202 paf 716: // 2: chars.count [WARNING: not cast, addresses must be %4=0 on sparc]
717: memcpy(cur, &body_length, sizeof(body_length)); cur+=sizeof(body_length);
718: // 3: letters
719: body.for_each(serialize_body_char, serialize_body_piece, &cur);
720: // 4: zero terminator
721: *cur++=0;
722: // 5: langs.count [WARNING: not cast, addresses must be %4=0 on sparc]
1.174 paf 723: memcpy(cur, &fragments_count, sizeof(fragments_count)); cur+=sizeof(fragments_count);
1.202 paf 724: // 6: lang info
1.178 paf 725: langs.for_each(body, serialize_lang_piece, &cur);
1.113 parser 726:
1.174 paf 727: return result;
1.113 parser 728: }
1.202 paf 729: bool String::deserialize(size_t prolog_size, void *buf, size_t buf_size) {
730: size_t in_buf=buf_size;
731: if(in_buf<=prolog_size)
1.148 paf 732: return false;
1.202 paf 733: in_buf-=prolog_size;
1.135 paf 734:
1.174 paf 735: // 1: prolog
1.202 paf 736: const char* cur=(const char* )buf+prolog_size;
1.113 parser 737:
1.207 paf 738: // 2: chars.count
1.202 paf 739: size_t body_length;
740: if(in_buf<sizeof(body_length)) // body.length don't fit?
741: return false;
742: // [WARNING: not cast, addresses must be %4=0 on sparc]
743: memcpy(&body_length, cur, sizeof(body_length)); cur+=sizeof(body_length);
744: in_buf-=sizeof(body_length);
745:
746: if(in_buf<body_length+1) // letters+terminator don't fit?
747: return false;
748: // 4: zero terminator
749: if(cur[body_length] != 0) // in place?
750: return false;
751: // 3: letters
752: body=String::Body(cur, body_length);
753: cur+=body_length+1;
754: in_buf-=body_length+1;
755:
756: // 5: langs.count
1.191 paf 757: size_t fragments_count;
1.202 paf 758: if(in_buf<sizeof(fragments_count)) // langs.count don't fit?
1.174 paf 759: return false;
1.191 paf 760: // [WARNING: not cast, addresses must be %4=0 on sparc]
761: memcpy(&fragments_count, cur, sizeof(fragments_count)); cur+=sizeof(fragments_count);
1.202 paf 762: in_buf-=sizeof(fragments_count);
1.174 paf 763:
764: if(fragments_count) {
1.202 paf 765: // 6: lang info
1.174 paf 766: size_t total_length=0;
767: for(size_t f=0; f<fragments_count; f++) {
1.191 paf 768: char lang;
769: size_t fragment_length;
770: size_t piece_length=sizeof(lang)+sizeof(fragment_length);
1.202 paf 771: if(in_buf<piece_length) // lang+length
1.174 paf 772: return false;
773:
1.191 paf 774: // lang
775: lang=*cur++;
776: // length [WARNING: not cast, addresses must be %4=0 on sparc]
777: memcpy(&fragment_length, cur, sizeof(fragment_length)); cur+=sizeof(fragment_length);
778:
1.206 paf 779: size_t combined_length=total_length+fragment_length;
780: if(combined_length>body_length)
781: return false; // file curruption
1.191 paf 782: // uchar needed to prevent propagating 0x80 bit to upper bytes
783: langs.append(total_length, (String::Language)(uchar)lang, fragment_length);
1.206 paf 784: total_length=combined_length;
1.202 paf 785: in_buf-=piece_length;
1.174 paf 786: }
1.128 paf 787:
1.202 paf 788: if(total_length!=body_length) // length(all language fragments) vs length(letters)
1.148 paf 789: return false;
1.174 paf 790: }
1.202 paf 791: if(in_buf!=0) // some strange extra bytes
792: return false;
1.113 parser 793:
1.174 paf 794: ASSERT_STRING_INVARIANT(*this);
1.148 paf 795: return true;
1.176 paf 796: }
797:
798: const char* String::Body::v() const {
799: return CORD_to_const_char_star(body);
800: }
1.201 paf 801: void String::Body::dump() const {
802: CORD_dump(body);
803: }
804:
1.176 paf 805: const char* String::Languages::v() const {
1.177 paf 806: if(opt.is_not_just_lang)
1.176 paf 807: return CORD_to_const_char_star(langs);
808: else
809: return (const char*)&langs;
810: }
1.201 paf 811: void String::Languages::dump() const {
812: if(opt.is_not_just_lang)
813: CORD_dump(langs);
814: else
815: puts((const char*)&langs);
816: }
1.176 paf 817: const char* String::v() const {
1.198 paf 818: const uint LIMIT_VIEW=20;
1.176 paf 819: char* buf=(char*)malloc(MAX_STRING);
820: const char*body_view=body.v();
821: const char*langs_view=langs.v();
822: snprintf(buf, MAX_STRING,
1.178 paf 823: "%d:%.*s%s} "
1.176 paf 824: "{%d:%s",
1.178 paf 825: langs.count(), LIMIT_VIEW, langs_view, strlen(langs_view)>LIMIT_VIEW?"...":"",
1.176 paf 826: strlen(body_view), body_view
827: );
828:
829: return buf;
1.113 parser 830: }
1.201 paf 831: void String::dump() const {
832: body.dump();
833: langs.dump();
834: }
1.195 paf 835: const String& String::trim(String::Trim_kind kind, const char* chars) const {
836: if(!length())
837: return *this;
838:
839: size_t substr_begin, substr_length;
840: Body new_body=body.trim(kind, chars, &substr_begin, &substr_length);
841: if(new_body==body) // we received unchanged pointer, do likewise
842: return *this;
843: // new_body differs from body, adjust langs along
844:
845: String& result=*new String;
846: if(!new_body) // body.trim produced empty result
847: return result;
848: // body.trim produced nonempty result
849:
850: // first: their langs
851: result.langs.append(result.body, langs, substr_begin, substr_length);
852: // next: letters themselves
853: result.body=new_body;
854:
855: ASSERT_STRING_INVARIANT(result);
856: return result;
1.198 paf 857: }
E-mail: