--- parser3/src/main/pa_string.C 2003/10/21 05:11:00 1.185 +++ parser3/src/main/pa_string.C 2006/03/04 18:02:40 1.206 @@ -1,11 +1,11 @@ /** @file Parser: string class. @see untalength_t.C. - Copyright (c) 2001-2003 ArtLebedev Group (http://www.artlebedev.com) + Copyright (c) 2001-2005 ArtLebedev Group (http://www.artlebedev.com) Author: Alexandr Petrosian (http://paf.design.ru) */ -static const char* IDENT_STRING_C="$Date: 2003/10/21 05:11:00 $"; +static const char * const IDENT_STRING_C="$Date: 2006/03/04 18:02:40 $"; #include "pcre.h" @@ -17,12 +17,94 @@ static const char* IDENT_STRING_C="$Date const String String::Empty; +int pa_atoi(const char* str, const String* problem_source) { + if(!str) + return 0; + + while(*str && isspace((unsigned char)*str)) + str++; + if(!*str) + return 0; + + int result; + char *error_pos; + bool negative=false; + if(str[0]=='-') { + negative=true; + str++; + } else if(str[0]=='+') { + str++; + } + // 0xABC + if(str[0]=='0') + if(str[1]=='x' || str[1]=='X') + result=(int)(unsigned long)strtol(str, &error_pos, 0); + else { + // skip leading 0000, to disable octal interpretation + do str++; while(*str=='0'); + result=(int)strtol(str, &error_pos, 0); + } + else + result=(int)strtol(str, &error_pos, 0); + if(negative) + result=-result; + + while(char c=*error_pos++) + if(!isspace((unsigned char)c)) + throw Exception("number.format", + problem_source, + problem_source?"invalid number (int)": "'%s' is invalid number (int)", str); + + return result; +} + +double pa_atod(const char* str, const String* problem_source) { + if(!str) + return 0; + + while(*str && isspace((unsigned char)*str)) + str++; + if(!*str) + return 0; + + double result; + char *error_pos; + bool negative=false; + if(str[0]=='-') { + negative=true; + str++; + } else if(str[0]=='+') { + str++; + } + // 0xABC + if(str[0]=='0') + if(str[1]=='x' || str[1]=='X') + result=(double)(unsigned long)strtol(str, &error_pos, 0); + else { + // skip leading 0000, to disable octal interpretation + do str++; while(*str=='0'); + result=(double)strtod(str, &error_pos); + } + else + result=(double)strtod(str, &error_pos); + if(negative) + result=-result; + + while(char c=*error_pos++) + if(!isspace((unsigned char)c)) + throw Exception("number.format", + problem_source, + problem_source?"invalid number (double)": "'%s' is invalid number (double)", str); + + return result; +} + // cord lib extension #ifndef DOXYGEN typedef struct { ssize_t countdown; - char target; /* Character we're looking for */ + int target; /* Character we're looking for */ } chr_data; #endif static int CORD_range_contains_chr_greater_then_proc(char c, size_t size, void* client_data) @@ -43,7 +125,7 @@ int CORD_range_contains_chr_greater_then return(CORD_block_iter(x, i, CORD_range_contains_chr_greater_then_proc, &d) == 1/*alternatives: 0 normally ended, 2=struck 'n'*/); } -static int CORD_block_count_proc(char c, size_t size, void* client_data) +static int CORD_block_count_proc(char /*c*/, size_t /*size*/, void* client_data) { int* result=(int*)client_data; (*result)++; @@ -84,6 +166,57 @@ String::Body String::Body::Format(int va return String::Body(pa_strdup(local, length), length); } +String::Body String::Body::trim(String::Trim_kind kind, const char* chars, + size_t* out_start, size_t* out_length) const { + size_t our_length=length(); + if(!our_length) + return *this; + if(!chars) + chars=" \t\n"; // white space + + size_t start=0; + size_t end=our_length; + // from left... + if(kind!=TRIM_END) { + CORD_pos pos; set_pos(pos, 0); + while(true) { + char c=CORD_pos_fetch(pos); + if(strchr(chars, c)) { + if(++start==our_length) + return 0; // all chars are empty, just return empty string + } else + break; + + CORD_next(pos); + } + } + // from right.. + if(kind!=TRIM_START) { + CORD_pos pos; set_pos(pos, end-1); + while(true) { + char c=CORD_pos_fetch(pos); + if(strchr(chars, c)) { + if(--end==0) // optimization: NO need to check for 'end>=start', that's(<) impossible + return 0; // all chars are empty, just return empty string + } else + break; + + CORD_prev(pos); + } + } + + if(start==0 && end==our_length) // nobody moved a thing + return *this; + + if(out_start) + *out_start=start; + size_t new_length=end-start; + if(out_length) + *out_length=new_length; + + return mid(start, new_length); +} + static int CORD_batched_iter_fn_generic_hash_code(char c, void * client_data) { uint& result=*static_cast(client_data); generic_hash_code(result, c); @@ -162,8 +295,6 @@ String& String::mid(size_t substr_begin, // next: letters themselves result.body=body.mid(substr_begin, substr_length); -// SAPI::log("piece of '%s' from %d to %d is '%s'", - //cstr(), substr_begin, substr_end, result.cstr()); ASSERT_STRING_INVARIANT(result); return result; } @@ -194,7 +325,7 @@ void String::split(ArrayString& result, Language lang, int limit) const { size_t self_length=length(); if(size_t delim_length=strlen(delim)) { - int pos_before; + size_t pos_before; // while we have 'delim'... for(; (pos_before=pos(delim, pos_after, lang))!=STRING_NOT_FOUND && limit; limit--) { result+=&mid(pos_after, pos_before); @@ -216,7 +347,7 @@ void String::split(ArrayString& result, const String& delim, Language lang, int limit) const { if(!delim.is_empty()) { - int pos_before; + size_t pos_before; // while we have 'delim'... for(; (pos_before=pos(delim, pos_after, lang))!=STRING_NOT_FOUND && limit; limit--) { result+=&mid(pos_after, pos_before); @@ -241,13 +372,13 @@ static void regex_options(const String* int *result; bool *flag; } regex_option[]={ - {"i", "I", 0, PCRE_CASELESS, result}, // a=A - {"s", "S", 0, PCRE_DOTALL, result}, // \n\n$ [default] - {"x", "U", 0, PCRE_EXTENDED, result}, // whitespace in regex ignored - {"m", "M", PCRE_DOTALL, PCRE_MULTILINE, result}, // ^aaa\n$^bbb\n$ - {"g", "G", 0, true, result+1}, // many rows + {"i", "I", 0, PCRE_CASELESS, result, 0}, // a=A + {"s", "S", 0, PCRE_DOTALL, result, 0}, // \n\n$ [default] + {"x", "U", 0, PCRE_EXTENDED, result, 0}, // whitespace in regex ignored + {"m", "M", PCRE_DOTALL, PCRE_MULTILINE, result, 0}, // ^aaa\n$^bbb\n$ + {"g", "G", 0, 1, result+1, 0}, // many rows {"'", 0, 0, 0, 0, &need_pre_post_match}, - {0} + {0, 0, 0, 0, 0, 0} }; result[0]=PCRE_EXTRA | PCRE_DOTALL | PCRE_DOLLAR_ENDONLY; result[1]=0; @@ -275,7 +406,7 @@ Table* String::match(Charset& source_cha 0, "regexp is empty"); - const char* pattern=regexp.cstr(); + const char* pattern=regexp.cstr(String::L_UNSPECIFIED); // fix any tainted with L_REGEX const char* errptr; int erroffset; bool need_pre_post_match=false; @@ -375,14 +506,14 @@ String& String::change_case(Charset& sou return result; char* new_cstr=cstrm(); - char *dest=new_cstr; + size_t new_cstr_len=length(); if(source_charset.isUTF8()) { switch(kind) { case CC_UPPER: - change_case_UTF8((const XMLByte*)new_cstr, (XMLByte*)new_cstr, UTF8CaseToUpper); + change_case_UTF8((const XMLByte*)new_cstr, new_cstr_len, (XMLByte*)new_cstr, new_cstr_len, UTF8CaseToUpper); break; case CC_LOWER: - change_case_UTF8((const XMLByte*)new_cstr, (XMLByte*)new_cstr, UTF8CaseToLower); + change_case_UTF8((const XMLByte*)new_cstr, new_cstr_len, (XMLByte*)new_cstr, new_cstr_len, UTF8CaseToLower); break; default: assert(!"unknown change case kind"); @@ -409,8 +540,9 @@ String& String::change_case(Charset& sou break; // never } + char *dest=new_cstr; unsigned char index; - for(const char* current=new_cstr; index=(unsigned char)*current; current++) { + for(const char* current=new_cstr; (index=(unsigned char)*current); current++) { unsigned char c=a[index]; if(b) c=b[c]; @@ -458,61 +590,6 @@ const String& String::replace(const Dict return result; } -double String::as_double() const { - double result; - const char *str=cstr(); - - while(*str && isspace(*str)) - str++; - if(!*str) - return 0; - - char *error_pos; - // 0xABC - if(str[0]=='0') - if(str[1]=='x' || str[1]=='X') - result=(double)(unsigned long)strtol(str, &error_pos, 0); - else - result=(double)strtod(str+1/*skip leading 0*/, &error_pos); - else - result=(double)strtod(str, &error_pos); - - while(char c=*error_pos++) - if(!isspace(c)) - throw Exception("number.format", - this, - "invalid number (double)"); - - return result; -} -int String::as_int() const { - int result; - const char *str=cstr(); - - while(*str && isspace(*str)) - str++; - if(!*str) - return 0; - - char *error_pos; - // 0xABC - if(str[0]=='0') - if(str[1]=='x' || str[1]=='X') - result=(int)(unsigned long)strtol(str, &error_pos, 0); - else - result=(int)strtol(str+1/*skip leading 0*/, &error_pos, 0); - else - result=(int)strtol(str, &error_pos, 0); - - while(char c=*error_pos++) - if(!isspace(c)) - throw Exception("number.format", - this, - "invalid number (int)"); - - return result; -} - static int serialize_body_char(char c, char** cur) { *((*cur)++)=c; return 0; // 0=continue @@ -524,73 +601,104 @@ static int serialize_body_piece(const ch }; static int serialize_lang_piece(char alang, size_t asize, char** cur) { // lang - memcpy(*cur, &alang, sizeof(alang)); *cur+=sizeof(alang); - // length + **cur=alang; (*cur)++; + // length [WARNING: not cast, addresses must be %4=0 on sparc] memcpy(*cur, &asize, sizeof(asize)); *cur+=sizeof(asize); return 0; // 0=continue } String::Cm String::serialize(size_t prolog_length) const { size_t fragments_count=langs.count(); + size_t body_length=body.length(); size_t buf_length= prolog_length //1 +sizeof(size_t) //2 - +fragments_count*(sizeof(char)+sizeof(size_t)) //3 - +body.length() //4 - +1; // for zero terminator used in deserialize + +body_length //3 + +1 // 4 for zero terminator used in deserialize + +sizeof(size_t) //5 + +fragments_count*(sizeof(char)+sizeof(size_t)); //6 + String::Cm result(new(PointerFreeGC) char[buf_length], buf_length); // 1: prolog char *cur=result.str+prolog_length; - // 2: langs.count + // 2: chars.count [WARNING: not cast, addresses must be %4=0 on sparc] + memcpy(cur, &body_length, sizeof(body_length)); cur+=sizeof(body_length); + // 3: letters + body.for_each(serialize_body_char, serialize_body_piece, &cur); + // 4: zero terminator + *cur++=0; + // 5: langs.count [WARNING: not cast, addresses must be %4=0 on sparc] memcpy(cur, &fragments_count, sizeof(fragments_count)); cur+=sizeof(fragments_count); - // 3: lang info + // 6: lang info langs.for_each(body, serialize_lang_piece, &cur); - // 4: letters - body.for_each(serialize_body_char, serialize_body_piece, &cur); - // 5: zero terminator - *cur=0; return result; } -bool String::deserialize(size_t prolog_length, void *buf, size_t buf_length) { - if(buf_length<=prolog_length) +bool String::deserialize(size_t prolog_size, void *buf, size_t buf_size) { + size_t in_buf=buf_size; + if(in_buf<=prolog_size) return false; - buf_length-=prolog_length; - buf_length-=1; // 5: zero terminator + in_buf-=prolog_size; // 1: prolog - const char* cur=(const char* )buf+prolog_length; + const char* cur=(const char* )buf+prolog_size; // 2: langs.count - if(buf_length(cur); cur+=sizeof(size_t); - buf_length-=sizeof(size_t); + // 4: zero terminator + if(cur[body_length] != 0) // in place? + return false; + // 3: letters + body=String::Body(cur, body_length); + cur+=body_length+1; + in_buf-=body_length+1; + + // 5: langs.count + size_t fragments_count; + if(in_buf(cur); cur+=sizeof(char); - size_t fragment_length=*reinterpret_cast(cur); cur+=sizeof(size_t); - langs.append(total_length, lang, fragment_length); - total_length+=fragment_length; - - buf_length-=piece_length; + // lang + lang=*cur++; + // length [WARNING: not cast, addresses must be %4=0 on sparc] + memcpy(&fragment_length, cur, sizeof(fragment_length)); cur+=sizeof(fragment_length); + + size_t combined_length=total_length+fragment_length; + if(combined_length>body_length) + return false; // file curruption + // uchar needed to prevent propagating 0x80 bit to upper bytes + langs.append(total_length, (String::Language)(uchar)lang, fragment_length); + total_length=combined_length; + in_buf-=piece_length; } - // 4: letters - if(buf_length!=total_length) + if(total_length!=body_length) // length(all language fragments) vs length(letters) return false; - - // serialize wrote extra zero byte there, we can rely on that - body=String::Body(cur, buf_length); } + if(in_buf!=0) // some strange extra bytes + return false; ASSERT_STRING_INVARIANT(*this); return true; @@ -599,14 +707,24 @@ bool String::deserialize(size_t prolog_l const char* String::Body::v() const { return CORD_to_const_char_star(body); } +void String::Body::dump() const { + CORD_dump(body); +} + const char* String::Languages::v() const { if(opt.is_not_just_lang) return CORD_to_const_char_star(langs); else return (const char*)&langs; } +void String::Languages::dump() const { + if(opt.is_not_just_lang) + CORD_dump(langs); + else + puts((const char*)&langs); +} const char* String::v() const { -#define LIMIT_VIEW 20 + const uint LIMIT_VIEW=20; char* buf=(char*)malloc(MAX_STRING); const char*body_view=body.v(); const char*langs_view=langs.v(); @@ -618,5 +736,31 @@ const char* String::v() const { ); return buf; -#undef LIMIT_VIEW +} +void String::dump() const { + body.dump(); + langs.dump(); +} +const String& String::trim(String::Trim_kind kind, const char* chars) const { + if(!length()) + return *this; + + size_t substr_begin, substr_length; + Body new_body=body.trim(kind, chars, &substr_begin, &substr_length); + if(new_body==body) // we received unchanged pointer, do likewise + return *this; + // new_body differs from body, adjust langs along + + String& result=*new String; + if(!new_body) // body.trim produced empty result + return result; + // body.trim produced nonempty result + + // first: their langs + result.langs.append(result.body, langs, substr_begin, substr_length); + // next: letters themselves + result.body=new_body; + + ASSERT_STRING_INVARIANT(result); + return result; }