--- parser3/src/main/pa_string.C 2003/10/21 05:11:00 1.185 +++ parser3/src/main/pa_string.C 2012/06/20 21:01:20 1.243 @@ -1,28 +1,161 @@ /** @file Parser: string class. @see untalength_t.C. - Copyright (c) 2001-2003 ArtLebedev Group (http://www.artlebedev.com) + Copyright (c) 2001-2012 Art. Lebedev Studio (http://www.artlebedev.com) Author: Alexandr Petrosian (http://paf.design.ru) */ -static const char* IDENT_STRING_C="$Date: 2003/10/21 05:11:00 $"; - -#include "pcre.h" - #include "pa_string.h" #include "pa_exception.h" #include "pa_table.h" #include "pa_dictionary.h" #include "pa_charset.h" +#include "pa_vregex.h" + +volatile const char * IDENT_PA_STRING_C="$Id: pa_string.C,v 1.243 2012/06/20 21:01:20 moko Exp $" IDENT_PA_STRING_H; const String String::Empty; + +// pa_atoui is based on Manuel Novoa III _strto_l for uClibc + +unsigned int pa_atoui(const char *str, int base, const String* problem_source){ + unsigned int result = 0; + const char *pos = str; + + while (isspace(*pos)) /* skip leading whitespace */ + ++pos; + + if (base == 16 && *pos == '0') { /* handle option prefix */ + ++pos; + if (*pos == 'x' || *pos == 'X') { + ++pos; + } + } + + if (base == 0) { /* dynamic base */ + base = 10; /* default is 10 */ + if (*pos == '0') { + ++pos; + if (*pos == 'x' || *pos == 'X') + ++pos; + base=16; + } + } + + if (base < 2 || base > 16) { /* illegal base */ + throw Exception(PARSER_RUNTIME, 0, "base to must be an integer from 2 to 16"); + } + + unsigned int cutoff = UINT_MAX / base; + int cutoff_digit = UINT_MAX - cutoff * base; + + while(true) { + int digit; + + if ((*pos >= '0') && (*pos <= '9')) { + digit = (*pos - '0'); + } else if (*pos >= 'a') { + digit = (*pos - 'a' + 10); + } else if (*pos >= 'A') { + digit = (*pos - 'A' + 10); + } else break; + + if (digit >= base) { + break; + } + + ++pos; + + /* adjust number, with overflow check */ + if ((result > cutoff) || ((result == cutoff) && (digit > cutoff_digit))) { + throw Exception("number.format", problem_source, problem_source ? "out of range (int)" : "'%s' is out of range (int)", str); + } else { + result = result * base + digit; + } + } + + while(char c=*pos++) + if(!isspace(c)) + throw Exception("number.format", problem_source, problem_source ? "invalid number (int)" : "'%s' is invalid number (int)", str); + + return result; +} + +int pa_atoi(const char* str, const String* problem_source) { + if(!str) + return 0; + + while(isspace(*str)) + str++; + + if(!*str) + return 0; + + bool negative=false; + if(str[0]=='-') { + negative=true; + str++; + } else if(str[0]=='+') { + str++; + } + + unsigned int result=pa_atoui(str, 0, problem_source); + + if(negative && result <= ((unsigned int)(-(1+INT_MIN)))+1) + return -(int)result; + + if(result<=INT_MAX) + return (int)result; + + throw Exception("number.format", problem_source, problem_source ? "out of range (int)" : "'%s' is out of range (int)", str); +} + +double pa_atod(const char* str, const String* problem_source) { + if(!str) + return 0; + + while(isspace(*str)) + str++; + + if(!*str) + return 0; + + bool negative=false; + if(str[0]=='-') { + negative=true; + str++; + } else if(str[0]=='+') { + str++; + } + + double result; + if(str[0]=='0') + if(str[1]=='x' || str[1]=='X'){ + // 0xABC + result=(double)pa_atoui(str, 0, problem_source); + return negative ? -result : result; + } else { + // skip leading 0000, to disable octal interpretation + do str++; while(*str=='0'); + } + + char *error_pos; + result=strtod(str, &error_pos); + + while(char c=*error_pos++) + if(!isspace((unsigned char)c)) + throw Exception("number.format", problem_source, problem_source ? "invalid number (double)" : "'%s' is invalid number (double)", str); + + return negative ? -result : result; +} + // cord lib extension #ifndef DOXYGEN typedef struct { ssize_t countdown; - char target; /* Character we're looking for */ + int target; /* Character we're looking for */ } chr_data; #endif static int CORD_range_contains_chr_greater_then_proc(char c, size_t size, void* client_data) @@ -43,7 +176,7 @@ int CORD_range_contains_chr_greater_then return(CORD_block_iter(x, i, CORD_range_contains_chr_greater_then_proc, &d) == 1/*alternatives: 0 normally ended, 2=struck 'n'*/); } -static int CORD_block_count_proc(char c, size_t size, void* client_data) +static int CORD_block_count_proc(char /*c*/, size_t /*size*/, void* client_data) { int* result=(int*)client_data; (*result)++; @@ -81,7 +214,125 @@ Table string_match_table_template(new St String::Body String::Body::Format(int value) { char local[MAX_NUMBER]; size_t length=snprintf(local, MAX_NUMBER, "%d", value); - return String::Body(pa_strdup(local, length), length); + return String::Body(pa_strdup(local, length)); +} + +String::Body String::Body::trim(String::Trim_kind kind, const char* chars, + size_t* out_start, size_t* out_length, Charset* source_charset) const { + size_t our_length=length(); + if(!our_length) + return *this; + + // check if any UTF-8 in chars + bool fast=true; + if(chars && source_charset && source_charset->isUTF8()){ + const char* pos=chars; + while(unsigned char c=*pos++) + if(c>127){ + fast=false; + break; + } + } + + size_t start=0; + size_t end=our_length; + if(!chars) + chars=" \t\n"; // white space + + if(fast){ + // from left... + if(kind!=TRIM_END) { + CORD_pos pos; set_pos(pos, 0); + while(true) { + char c=CORD_pos_fetch(pos); + if(strchr(chars, c)) { + if(++start==our_length) + return 0; // all chars are empty, just return empty string + } else + break; + + CORD_next(pos); + } + } + + // from right.. + if(kind!=TRIM_START) { + CORD_pos pos; set_pos(pos, end-1); + while(true) { + char c=CORD_pos_fetch(pos); + if(strchr(chars, c)) { + if(--end==0) // optimization: NO need to check for 'end>=start', that's(<) impossible + return 0; // all chars are empty, just return empty string + } else + break; + + CORD_prev(pos); + } + } + } else { + const XMLByte* src_begin=(const XMLByte*)cstr(); + const XMLByte* src_end=src_begin+our_length; + + // from left... + if(kind!=TRIM_END) { + while(src_begin127 && *src_begin<0xC0)) + char_length++; + + bool found=false; + for(const char* chars_byte=chars; chars_byte=strchr(chars_byte, *ptr); chars_byte++) + if(strncmp(chars_byte, (const char*)ptr, char_length)==0){ + found=true; + break; + } + + if(found){ + start+=char_length; + if(start==our_length) + return 0; // all chars are empty, just return empty string + } else + break; + } + } + + // from right.. + if(kind!=TRIM_START) { + while(src_begin127 && *src_end<0xC0)) + char_length++; + + bool found=false; + for(const char* chars_byte=chars; chars_byte=strchr(chars_byte, *src_end); chars_byte++) + if(strncmp(chars_byte, (const char*)src_end, char_length)==0){ + found=true; + break; + } + + if(found){ + end-=char_length; + if(end==0) + return 0; // all chars are empty, just return empty string + } else + break; + } + } + } + + if(start==0 && end==our_length) // nobody moved a thing + return *this; + + if(out_start) + *out_start=start; + size_t new_length=end-start; + if(out_length) + *out_length=new_length; + + return mid(start, new_length); } static int CORD_batched_iter_fn_generic_hash_code(char c, void * client_data) { @@ -94,23 +345,76 @@ static int CORD_batched_iter_fn_generic_ generic_hash_code(result, s); return 0; }; -uint String::Body::hash_code() const { - uint result=0; - CORD_iter5(body, 0, - CORD_batched_iter_fn_generic_hash_code, - CORD_batched_iter_fn_generic_hash_code, &result); - return result; +uint String::Body::get_hash_code() const { +#ifdef HASH_CODE_CACHING + if(hash_code) + return hash_code; +#else + uint hash_code=0; +#endif + if (body && CORD_IS_STRING(body)){ + generic_hash_code(hash_code, body); + } else { + CORD_iter5(body, 0, + CORD_batched_iter_fn_generic_hash_code, + CORD_batched_iter_fn_generic_hash_code, &hash_code); + } + return hash_code; } -// String methods +struct CORD_pos_info { + const char* chars; + size_t left; + size_t pos; +}; + +// can be called only for IS_FUNCTION(CORD) which is used in String::Body::strrpbrk +static int CORD_iter_fn_rpos(char c, CORD_pos_info* info) { + if(info->pos < info->left){ + info->pos=STRING_NOT_FOUND; + return 1; + } + if(strchr(info->chars, c)) + return 1; + --(info->pos); + return 0; +} + +size_t String::Body::strrpbrk(const char* chars, size_t left, size_t right) const { + if(is_empty() || !chars || !strlen(chars)) + return STRING_NOT_FOUND; + CORD_pos_info info={chars, left, right}; + if(CORD_riter4(body, right, (CORD_iter_fn)CORD_iter_fn_rpos, &info)) + return info.pos; + else + return STRING_NOT_FOUND; +} -String::String(const char* cstr, size_t helper_length, bool tainted): body(CORD_EMPTY) { - append_help_length(cstr, helper_length, tainted?L_TAINTED:L_CLEAN); + +// can be called only for IS_FUNCTION(CORD) which is used in String::Body::rskipchars +static int CORD_iter_fn_rskip(char c, CORD_pos_info* info) { + if(info->pos < info->left) { + info->pos=STRING_NOT_FOUND; + return 1; + } + if(!strchr(info->chars, c)) + return 1; + --(info->pos); + return 0; } -String::String(const String::C cstr, bool tainted): body(CORD_EMPTY) { - append_know_length(cstr.str, cstr.length, tainted?L_TAINTED:L_CLEAN); + +size_t String::Body::rskipchars(const char* chars, size_t left, size_t right) const { + if(is_empty() || !chars || !strlen(chars)) + return STRING_NOT_FOUND; + CORD_pos_info info={chars, left, right}; + if(CORD_riter4(body, right, (CORD_iter_fn)CORD_iter_fn_rskip, &info)) + return info.pos; + else + return STRING_NOT_FOUND; } +// String methods + String& String::append_know_length(const char* str, size_t known_length, Language lang) { if(!known_length) return *this; @@ -132,6 +436,10 @@ String& String::append_help_length(const return append_know_length(str, known_length, lang); } +String::String(int value, char *format) : langs(L_CLEAN){ + char buf[MAX_NUMBER]; + body.append_strdup_know_length(buf, snprintf(buf, MAX_NUMBER, format, value)); +} String& String::append_strdup(const char* str, size_t helper_length, Language lang) { size_t known_length=helper_length?helper_length:strlen(str); if(!known_length) @@ -146,6 +454,35 @@ String& String::append_strdup(const char return *this; } +struct CORD_length_info { + size_t len; + size_t skip; +}; + +int CORD_batched_len(const char* s, CORD_length_info* info){ + info->len += lengthUTF8( (const XMLByte *)s, (const XMLByte *)s+strlen(s)); return 0; +} + +// can be called only for IS_FUNCTION(CORD) which are used in large String::Body::mid +int CORD_batched_len(const char c, CORD_length_info* info){ + if (info->skip==0){ + info->len++; + info->skip = lengthUTF8Char(c)-1; + } else { + info->skip--; + } + return 0; +} + +size_t String::length(Charset& charset) const { + if(charset.isUTF8()){ + CORD_length_info info = {0, 0}; + body.for_each(CORD_batched_len, CORD_batched_len, &info); + return info.len; + } else + return body.length(); +} + /// @todo check in doc: whether it documents NOW bad situation "abc".mid(-1, 3) =were?="ab" String& String::mid(size_t substr_begin, size_t substr_end) const { String& result=*new String; @@ -162,8 +499,43 @@ String& String::mid(size_t substr_begin, // next: letters themselves result.body=body.mid(substr_begin, substr_length); -// SAPI::log("piece of '%s' from %d to %d is '%s'", - //cstr(), substr_begin, substr_end, result.cstr()); + ASSERT_STRING_INVARIANT(result); + return result; +} + +// from, to and helper_length in characters, not in bytes (it's important for utf-8) +String& String::mid(Charset& charset, size_t from, size_t to, size_t helper_length) const { + String& result=*new String; + + size_t self_length=(helper_length)?helper_length:length(charset); + + if(!self_length) + return result; + + from=min(min(to, from), self_length); + to=min(max(to, from), self_length); + + size_t substr_length=to-from; + + if(!substr_length) + return result; + + if(charset.isUTF8()){ + const XMLByte* src_begin=(const XMLByte*)cstr(); + const XMLByte* src_end=src_begin+body.length(); + + // convert 'from' and 'substr_length' from 'characters' to 'bytes' + from=getUTF8BytePos(src_begin, src_end, from); + substr_length=getUTF8BytePos(src_begin+from, src_end, substr_length); + if(!substr_length) + return result; + } + + // first: their langs + result.langs.append(result.body, langs, from, substr_length); + // next: letters themselves + result.body=body.mid(from, substr_length); + ASSERT_STRING_INVARIANT(result); return result; } @@ -188,13 +560,37 @@ size_t String::pos(const String& substr, return pos(substr.body, this_offset, lang); } +size_t String::pos(Charset& charset, const String& substr, + size_t this_offset, Language lang) const { + + if(charset.isUTF8()){ + const XMLByte* srcPtr=(const XMLByte*)cstr(); + const XMLByte* srcEnd=srcPtr+body.length(); + + // convert 'this_offset' from 'characters' to 'bytes' + this_offset=getUTF8BytePos(srcPtr, srcEnd, this_offset); + + size_t result=pos(substr.body, this_offset, lang); + return (result==CORD_NOT_FOUND) + ? STRING_NOT_FOUND + : getUTF8CharPos(srcPtr, srcEnd, result); // convert 'result' from 'bytes' to 'characters' + } else { + size_t result=pos(substr.body, this_offset, lang); + return (result==CORD_NOT_FOUND) + ? STRING_NOT_FOUND + : result; + } +} + void String::split(ArrayString& result, size_t& pos_after, const char* delim, Language lang, int limit) const { + if(is_empty()) + return; size_t self_length=length(); if(size_t delim_length=strlen(delim)) { - int pos_before; + size_t pos_before; // while we have 'delim'... for(; (pos_before=pos(delim, pos_after, lang))!=STRING_NOT_FOUND && limit; limit--) { result+=&mid(pos_after, pos_before); @@ -215,8 +611,10 @@ void String::split(ArrayString& result, size_t& pos_after, const String& delim, Language lang, int limit) const { + if(is_empty()) + return; if(!delim.is_empty()) { - int pos_before; + size_t pos_before; // while we have 'delim'... for(; (pos_before=pos(delim, pos_after, lang))!=STRING_NOT_FOUND && limit; limit--) { result+=&mid(pos_after, pos_before); @@ -233,111 +631,41 @@ void String::split(ArrayString& result, } } -static void regex_options(const String* options, int *result, bool& need_pre_post_match){ - struct Regex_option { - const char* keyL; - const char* keyU; - int clear, set; - int *result; - bool *flag; - } regex_option[]={ - {"i", "I", 0, PCRE_CASELESS, result}, // a=A - {"s", "S", 0, PCRE_DOTALL, result}, // \n\n$ [default] - {"x", "U", 0, PCRE_EXTENDED, result}, // whitespace in regex ignored - {"m", "M", PCRE_DOTALL, PCRE_MULTILINE, result}, // ^aaa\n$^bbb\n$ - {"g", "G", 0, true, result+1}, // many rows - {"'", 0, 0, 0, 0, &need_pre_post_match}, - {0} - }; - result[0]=PCRE_EXTRA | PCRE_DOTALL | PCRE_DOLLAR_ENDONLY; - result[1]=0; - - if(options && !options->is_empty()) - for(Regex_option *o=regex_option; o->keyL; o++) - if(options->pos(o->keyL)!=STRING_NOT_FOUND - || (o->keyU && options->pos(o->keyU)!=STRING_NOT_FOUND)) { - if(o->flag) - *o->flag=true; - else { // result - *o->result &= ~o->clear; - *o->result |= o->set; - } - } -} +Table* String::match(VRegex* vregex, + Row_action row_action, void *info, + int& matches_count) const { -Table* String::match(Charset& source_charset, - const String& regexp, - const String* options, - Row_action row_action, void *info, - bool& just_matched) const { - if(regexp.is_empty()) - throw Exception(0, - 0, - "regexp is empty"); - - const char* pattern=regexp.cstr(); - const char* errptr; - int erroffset; - bool need_pre_post_match=false; - int option_bits[2]={0}; regex_options(options, option_bits, need_pre_post_match); - bool global=option_bits[1]!=0; - pcre *code=pcre_compile(pattern, option_bits[0], - &errptr, &erroffset, - source_charset.pcre_tables); - - if(!code) - throw Exception(0, - ®exp.mid(erroffset, regexp.length()), - "regular expression syntax error - %s", errptr); - - int subpatterns=pcre_info(code, 0, 0); - if(subpatterns<0) { - pcre_free(code); - throw Exception(0, - ®exp, - "pcre_info error (%d)", - subpatterns); - } + // vregex->info(); // I have no idea what does it for? + + bool need_pre_post_match=vregex->is_pre_post_match_needed(); + bool global=vregex->is_global_search(); const char* subject=cstr(); - size_t subject_length=strlen(subject); - const int oveclength=(1/*match*/+MAX_MATCH_GROUPS)*3; - int ovector[oveclength]; + size_t subject_length=length(); + const int ovector_size=(1/*match*/+MAX_MATCH_GROUPS)*3; + int ovector[ovector_size]; - // create table Table::Action_options table_options; Table& table=*new Table(string_match_table_template, table_options); - int exec_option_bits=0; int prestart=0; int poststart=0; int postfinish=length(); while(true) { - int exec_substrings=pcre_exec(code, 0, - subject, subject_length, prestart, - exec_option_bits, ovector, oveclength); - - if(exec_substrings==PCRE_ERROR_NOMATCH) { - pcre_free(code); - row_action(table, 0/*last time, no raw*/, 0, 0, poststart, postfinish, info); - if(global || subpatterns) - return &table; // global or with subpatterns=true+result - else { - just_matched=false; return 0; // not global=no result - } - } + int exec_result=vregex->exec(subject, subject_length, ovector, ovector_size, prestart); - if(exec_substrings<0) { - pcre_free(code); - throw Exception(0, - ®exp, - "regular expression execute error (%d)", - exec_substrings); - } + if(exec_result<0) // only PCRE_ERROR_NOMATCH might be here, other negative results cause an exception + break; int prefinish=ovector[0]; poststart=ovector[1]; - ArrayString* row=new ArrayString; + + if (prestart==poststart && subject[poststart]=='\n'){ + prestart++; + continue; + } + + ArrayString* row=new ArrayString(3); if(need_pre_post_match) { *row+=&mid(0, prefinish); // .prematch column value *row+=&mid(prefinish, poststart); // .match @@ -348,25 +676,22 @@ Table* String::match(Charset& source_cha *row+=&Empty; // .postmatch } - for(int i=1; i=0 && ovector[i*2+1]>0)?&mid(ovector[i*2+0], ovector[i*2+1]):new String; // .i column value } + matches_count++; row_action(table, row, prestart, prefinish, poststart, postfinish, info); - if(!global || prestart==poststart) { // not global | going to hang - pcre_free(code); - row_action(table, 0/*last time, no row*/, 0, 0, poststart, postfinish, info); - return &table; - } - prestart=poststart; + if(!global || prestart==poststart) // last step + break; -/* - if(option_bits[0] & PCRE_MULTILINE) - exec_option_bits|=PCRE_NOTBOL; // start of subject+startoffset not BOL -*/ + prestart=poststart; } + + row_action(table, 0/*last time, no raw*/, 0, 0, poststart, postfinish, info); + return vregex->is_just_count() ? 0 : &table; } String& String::change_case(Charset& source_charset, Change_case_kind kind) const { @@ -375,14 +700,15 @@ String& String::change_case(Charset& sou return result; char* new_cstr=cstrm(); - char *dest=new_cstr; + if(source_charset.isUTF8()) { + size_t new_cstr_len=length(); switch(kind) { case CC_UPPER: - change_case_UTF8((const XMLByte*)new_cstr, (XMLByte*)new_cstr, UTF8CaseToUpper); + change_case_UTF8((const XMLByte*)new_cstr, new_cstr_len, (XMLByte*)new_cstr, new_cstr_len, UTF8CaseToUpper); break; case CC_LOWER: - change_case_UTF8((const XMLByte*)new_cstr, (XMLByte*)new_cstr, UTF8CaseToLower); + change_case_UTF8((const XMLByte*)new_cstr, new_cstr_len, (XMLByte*)new_cstr, new_cstr_len, UTF8CaseToLower); break; default: assert(!"unknown change case kind"); @@ -409,8 +735,9 @@ String& String::change_case(Charset& sou break; // never } + char *dest=new_cstr; unsigned char index; - for(const char* current=new_cstr; index=(unsigned char)*current; current++) { + for(const char* current=new_cstr; (index=(unsigned char)*current); current++) { unsigned char c=a[index]; if(b) c=b[c]; @@ -424,92 +751,73 @@ String& String::change_case(Charset& sou return result; } +const String& String::escape(Charset& source_charset) const { + if(is_empty()) + return *this; + + return Charset::escape(*this, source_charset); +} + +#define STRING_APPEND(result, from_cstr, langs, langs_offset, length) \ + result.langs.append(result.body, langs, langs_offset, length); \ + result.body.append_strdup_know_length(from_cstr, length); + const String& String::replace(const Dictionary& dict) const { + if(!dict.count() || is_empty()) + return *this; + String& result=*new String(); const char* old_cstr=cstr(); const char* prematch_begin=old_cstr; - const char* current=old_cstr; - while(*current) { - if(Dictionary::Subst subst=dict.first_that_begins(current)) { + if(dict.count()==1) { + // optimized simple case + + Dictionary::Subst subst=dict.get(0); + while(const char* p=strstr(prematch_begin, subst.from)) { // prematch - if(size_t prematch_length=current-prematch_begin) { - result.langs.append(result.body, langs, prematch_begin-old_cstr, prematch_length); - result.body.append_strdup_know_length(prematch_begin, prematch_length); + if(size_t prematch_length=p-prematch_begin) { + STRING_APPEND(result, prematch_begin, langs, prematch_begin-old_cstr, prematch_length) } // match - // skip 'a' in 'current'; move prematch_begin - current+=subst.from_length; prematch_begin=current; + prematch_begin=p+subst.from_length; if(const String* b=subst.to) // are there any b? result<<*b; - } else // simply advance - current++; - } - - // postmatch - if(size_t postmatch_length=current-prematch_begin) { - result.langs.append(result.body, langs, prematch_begin-old_cstr, postmatch_length); - result.body.append_strdup_know_length(prematch_begin, postmatch_length); - } - - ASSERT_STRING_INVARIANT(result); - return result; -} - -double String::as_double() const { - double result; - const char *str=cstr(); - - while(*str && isspace(*str)) - str++; - if(!*str) - return 0; + } - char *error_pos; - // 0xABC - if(str[0]=='0') - if(str[1]=='x' || str[1]=='X') - result=(double)(unsigned long)strtol(str, &error_pos, 0); - else - result=(double)strtod(str+1/*skip leading 0*/, &error_pos); - else - result=(double)strtod(str, &error_pos); + } else { - while(char c=*error_pos++) - if(!isspace(c)) - throw Exception("number.format", - this, - "invalid number (double)"); + const char* current=old_cstr; + while(*current) { + if(Dictionary::Subst subst=dict.first_that_begins(current)) { + // prematch + if(size_t prematch_length=current-prematch_begin) { + STRING_APPEND(result, prematch_begin, langs, prematch_begin-old_cstr, prematch_length) + } - return result; -} -int String::as_int() const { - int result; - const char *str=cstr(); + // match + // skip 'a' in 'current'; move prematch_begin + current+=subst.from_length; prematch_begin=current; + + if(const String* b=subst.to) // are there any b? + result<<*b; + } else // simply advance + current++; + } - while(*str && isspace(*str)) - str++; - if(!*str) - return 0; + } - char *error_pos; - // 0xABC - if(str[0]=='0') - if(str[1]=='x' || str[1]=='X') - result=(int)(unsigned long)strtol(str, &error_pos, 0); - else - result=(int)strtol(str+1/*skip leading 0*/, &error_pos, 0); - else - result=(int)strtol(str, &error_pos, 0); + if(prematch_begin==old_cstr) // not modified + return *this; - while(char c=*error_pos++) - if(!isspace(c)) - throw Exception("number.format", - this, - "invalid number (int)"); + // postmatch + if(size_t postmatch_length=old_cstr+length()-prematch_begin) { + STRING_APPEND(result, prematch_begin, langs, prematch_begin-old_cstr, postmatch_length) + } + ASSERT_STRING_INVARIANT(result); return result; } @@ -524,89 +832,133 @@ static int serialize_body_piece(const ch }; static int serialize_lang_piece(char alang, size_t asize, char** cur) { // lang - memcpy(*cur, &alang, sizeof(alang)); *cur+=sizeof(alang); - // length + **cur=alang; (*cur)++; + // length [WARNING: not cast, addresses must be %4=0 on sparc] memcpy(*cur, &asize, sizeof(asize)); *cur+=sizeof(asize); return 0; // 0=continue } String::Cm String::serialize(size_t prolog_length) const { size_t fragments_count=langs.count(); + size_t body_length=body.length(); size_t buf_length= prolog_length //1 +sizeof(size_t) //2 - +fragments_count*(sizeof(char)+sizeof(size_t)) //3 - +body.length() //4 - +1; // for zero terminator used in deserialize + +body_length //3 + +1 // 4 for zero terminator used in deserialize + +sizeof(size_t) //5 + +fragments_count*(sizeof(char)+sizeof(size_t)); //6 + String::Cm result(new(PointerFreeGC) char[buf_length], buf_length); // 1: prolog char *cur=result.str+prolog_length; - // 2: langs.count + // 2: chars.count [WARNING: not cast, addresses must be %4=0 on sparc] + memcpy(cur, &body_length, sizeof(body_length)); cur+=sizeof(body_length); + // 3: letters + body.for_each(serialize_body_char, serialize_body_piece, &cur); + // 4: zero terminator + *cur++=0; + // 5: langs.count [WARNING: not cast, addresses must be %4=0 on sparc] memcpy(cur, &fragments_count, sizeof(fragments_count)); cur+=sizeof(fragments_count); - // 3: lang info + // 6: lang info langs.for_each(body, serialize_lang_piece, &cur); - // 4: letters - body.for_each(serialize_body_char, serialize_body_piece, &cur); - // 5: zero terminator - *cur=0; return result; } -bool String::deserialize(size_t prolog_length, void *buf, size_t buf_length) { - if(buf_length<=prolog_length) +bool String::deserialize(size_t prolog_size, void *buf, size_t buf_size) { + size_t in_buf=buf_size; + if(in_buf<=prolog_size) return false; - buf_length-=prolog_length; - buf_length-=1; // 5: zero terminator + in_buf-=prolog_size; // 1: prolog - const char* cur=(const char* )buf+prolog_length; + const char* cur=(const char* )buf+prolog_size; - // 2: langs.count - if(buf_length(cur); cur+=sizeof(size_t); - buf_length-=sizeof(size_t); + // [WARNING: not cast, addresses must be %4=0 on sparc] + memcpy(&body_length, cur, sizeof(body_length)); cur+=sizeof(body_length); + in_buf-=sizeof(body_length); + + if(in_buf(cur); cur+=sizeof(char); - size_t fragment_length=*reinterpret_cast(cur); cur+=sizeof(size_t); - langs.append(total_length, lang, fragment_length); - total_length+=fragment_length; - - buf_length-=piece_length; + // lang + lang=*cur++; + // length [WARNING: not cast, addresses must be %4=0 on sparc] + memcpy(&fragment_length, cur, sizeof(fragment_length)); cur+=sizeof(fragment_length); + + size_t combined_length=total_length+fragment_length; + if(combined_length>body_length) + return false; // file curruption + // uchar needed to prevent propagating 0x80 bit to upper bytes + langs.append(total_length, (String::Language)(uchar)lang, fragment_length); + total_length=combined_length; + in_buf-=piece_length; } - // 4: letters - if(buf_length!=total_length) + if(total_length!=body_length) // length(all language fragments) vs length(letters) return false; - - // serialize wrote extra zero byte there, we can rely on that - body=String::Body(cur, buf_length); } + if(in_buf!=0) // some strange extra bytes + return false; ASSERT_STRING_INVARIANT(*this); return true; } const char* String::Body::v() const { - return CORD_to_const_char_star(body); + return CORD_to_const_char_star(body, length()); +} +void String::Body::dump() const { + CORD_dump(body); } + const char* String::Languages::v() const { if(opt.is_not_just_lang) - return CORD_to_const_char_star(langs); + return CORD_to_const_char_star(langs, 0); else return (const char*)&langs; } +void String::Languages::dump() const { + if(opt.is_not_just_lang) + CORD_dump(langs); + else + puts((const char*)&langs); +} const char* String::v() const { -#define LIMIT_VIEW 20 + const uint LIMIT_VIEW=20; char* buf=(char*)malloc(MAX_STRING); const char*body_view=body.v(); const char*langs_view=langs.v(); @@ -618,5 +970,33 @@ const char* String::v() const { ); return buf; -#undef LIMIT_VIEW +} + +void String::dump() const { + body.dump(); + langs.dump(); +} + +const String& String::trim(String::Trim_kind kind, const char* chars, Charset* source_charset) const { + if(is_empty()) + return *this; + + size_t substr_begin, substr_length; + Body new_body=body.trim(kind, chars, &substr_begin, &substr_length, source_charset); + if(new_body==body) // we received unchanged pointer, do likewise + return *this; + // new_body differs from body, adjust langs along + + String& result=*new String; + if(!new_body) // body.trim produced empty result + return result; + // body.trim produced nonempty result + + // first: their langs + result.langs.append(result.body, langs, substr_begin, substr_length); + // next: letters themselves + result.body=new_body; + + ASSERT_STRING_INVARIANT(result); + return result; }