--- parser3/src/main/pa_string.C 2003/09/25 09:15:03 1.176 +++ parser3/src/main/pa_string.C 2003/10/21 05:11:00 1.185 @@ -5,7 +5,7 @@ Author: Alexandr Petrosian (http://paf.design.ru) */ -static const char* IDENT_STRING_C="$Date: 2003/09/25 09:15:03 $"; +static const char* IDENT_STRING_C="$Date: 2003/10/21 05:11:00 $"; #include "pcre.h" @@ -15,6 +15,8 @@ static const char* IDENT_STRING_C="$Date #include "pa_dictionary.h" #include "pa_charset.h" +const String String::Empty; + // cord lib extension #ifndef DOXYGEN @@ -41,6 +43,19 @@ int CORD_range_contains_chr_greater_then return(CORD_block_iter(x, i, CORD_range_contains_chr_greater_then_proc, &d) == 1/*alternatives: 0 normally ended, 2=struck 'n'*/); } +static int CORD_block_count_proc(char c, size_t size, void* client_data) +{ + int* result=(int*)client_data; + (*result)++; + return(0); // 0=continue +} +size_t CORD_block_count(CORD x) +{ + size_t result=0; + CORD_block_iter(x, 0, CORD_block_count_proc, &result); + return result; +} + // helpers /// String::match uses this as replace & global search table columns @@ -154,11 +169,18 @@ String& String::mid(size_t substr_begin, } size_t String::pos(const String::Body substr, size_t this_offset, Language lang) const { - size_t substr_begin=body.pos(substr, this_offset); - if(substr_begin==CORD_NOT_FOUND || !langs.check_lang(lang, substr_begin, substr.length())) - return STRING_NOT_FOUND; + size_t substr_length=substr.length(); + while(true) { + size_t substr_begin=body.pos(substr, this_offset); + + if(substr_begin==CORD_NOT_FOUND) + return STRING_NOT_FOUND; - return substr_begin; + if(langs.check_lang(lang, substr_begin, substr_length)) + return substr_begin; + + this_offset=substr_begin+substr_length; + } } size_t String::pos(const String& substr, @@ -321,9 +343,9 @@ Table* String::match(Charset& source_cha *row+=&mid(prefinish, poststart); // .match *row+=&mid(poststart, postfinish); // .postmatch } else { - *row+=0; // .prematch column value - *row+=0; // .match - *row+=0; // .postmatch + *row+=&Empty; // .prematch column value + *row+=&Empty; // .match + *row+=&Empty; // .postmatch } for(int i=1; i(kind)); // never - a=b=0; // calm, compiler - break; // never - } - char* new_cstr=cstrm(); char *dest=new_cstr; - unsigned char index; - for(const char* current=new_cstr; index=(unsigned char)*current; current++) { - unsigned char c=a[index]; - if(b) - c=b[c]; + if(source_charset.isUTF8()) { + switch(kind) { + case CC_UPPER: + change_case_UTF8((const XMLByte*)new_cstr, (XMLByte*)new_cstr, UTF8CaseToUpper); + break; + case CC_LOWER: + change_case_UTF8((const XMLByte*)new_cstr, (XMLByte*)new_cstr, UTF8CaseToLower); + break; + default: + assert(!"unknown change case kind"); + break; // never + } + + } else { + const unsigned char *tables=source_charset.pcre_tables; + + const unsigned char *a; + const unsigned char *b; + switch(kind) { + case CC_UPPER: + a=tables+lcc_offset; + b=tables+fcc_offset; + break; + case CC_LOWER: + a=tables+lcc_offset; + b=0; + break; + default: + assert(!"unknown change case kind"); + a=b=0; // calm, compiler + break; // never + } + + unsigned char index; + for(const char* current=new_cstr; index=(unsigned char)*current; current++) { + unsigned char c=a[index]; + if(b) + c=b[c]; - *dest++=(char)c; + *dest++=(char)c; + } } result.langs=langs; result.body=new_cstr; @@ -397,30 +431,26 @@ const String& String::replace(const Dict const char* current=old_cstr; while(*current) { - if(Table::element_type row=dict.first_that_begins(current)) { + if(Dictionary::Subst subst=dict.first_that_begins(current)) { // prematch if(size_t prematch_length=current-prematch_begin) { - result.langs.append(result.body, langs, prematch_begin-old_cstr, current-old_cstr); + result.langs.append(result.body, langs, prematch_begin-old_cstr, prematch_length); result.body.append_strdup_know_length(prematch_begin, prematch_length); } // match - - const String* a=row->get(0); // skip 'a' in 'current'; move prematch_begin - current+=a->length(); prematch_begin=current; + current+=subst.from_length; prematch_begin=current; - if(row->count()>1) { // are there any b? - const String* b=row->get(1); + if(const String* b=subst.to) // are there any b? result<<*b; - } } else // simply advance current++; } // postmatch if(size_t postmatch_length=current-prematch_begin) { - result.langs.append(result.body, langs, prematch_begin-old_cstr, current-old_cstr); + result.langs.append(result.body, langs, prematch_begin-old_cstr, postmatch_length); result.body.append_strdup_know_length(prematch_begin, postmatch_length); } @@ -483,63 +513,51 @@ int String::as_int() const { return result; } -inline void uint2uchars(uint word, uchar *bytes) { - bytes[0]=word&0xFF; - bytes[1]=(word>>8)&0xFF; - bytes[2]=(word>>16)&0xFF; - bytes[3]=(word>>24)&0xFF; -} -inline uint uchars2uint(uchar *bytes) { - return bytes[3]<<24 - | bytes[2]<<16 - | bytes[1]<<8 - | bytes[0]; -} - +static int serialize_body_char(char c, char** cur) { + *((*cur)++)=c; + return 0; // 0=continue +}; static int serialize_body_piece(const char* s, char** cur) { size_t length=strlen(s); memcpy(*cur, s, length); *cur+=length; - return 0; + return 0; // 0=continue }; +static int serialize_lang_piece(char alang, size_t asize, char** cur) { + // lang + memcpy(*cur, &alang, sizeof(alang)); *cur+=sizeof(alang); + // length + memcpy(*cur, &asize, sizeof(asize)); *cur+=sizeof(asize); + + return 0; // 0=continue +} String::Cm String::serialize(size_t prolog_length) const { -#if TODO - //_asm int 3; + size_t fragments_count=langs.count(); size_t buf_length= - prolog_length - +sizeof(size_t) - +langs.count()*(sizeof(Language)+sizeof(size_t)) - +length(); + prolog_length //1 + +sizeof(size_t) //2 + +fragments_count*(sizeof(char)+sizeof(size_t)) //3 + +body.length() //4 + +1; // for zero terminator used in deserialize String::Cm result(new(PointerFreeGC) char[buf_length], buf_length); // 1: prolog char *cur=result.str+prolog_length; - - // 2: langs.count - size_t fragments_count=langs.count(); memcpy(cur, &fragments_count, sizeof(fragments_count)); cur+=sizeof(fragments_count); - // 3: lang info - for(Array_iterator i(langs); i.has_next(); ) { - const Fragment fragment=i.next(); - // lang - memcpy(cur, &fragment.lang, sizeof(fragment.lang)); cur+=sizeof(fragment.lang); - // length - memcpy(cur, &fragment.length, sizeof(fragment.length)); cur+=sizeof(fragment.length); - } - + langs.for_each(body, serialize_lang_piece, &cur); // 4: letters - body.for_each(serialize_body_piece, &cur); + body.for_each(serialize_body_char, serialize_body_piece, &cur); + // 5: zero terminator + *cur=0; return result; -#endif - return String::Cm(0, 0); } bool String::deserialize(size_t prolog_length, void *buf, size_t buf_length) { -#if TODO if(buf_length<=prolog_length) return false; buf_length-=prolog_length; + buf_length-=1; // 5: zero terminator // 1: prolog const char* cur=(const char* )buf+prolog_length; @@ -554,13 +572,13 @@ bool String::deserialize(size_t prolog_l // 3: lang info size_t total_length=0; for(size_t f=0; f(cur); cur+=sizeof(Language); + Language lang=*reinterpret_cast(cur); cur+=sizeof(char); size_t fragment_length=*reinterpret_cast(cur); cur+=sizeof(size_t); - langs+=Fragment(lang, fragment_length); + langs.append(total_length, lang, fragment_length); total_length+=fragment_length; buf_length-=piece_length; @@ -570,20 +588,19 @@ bool String::deserialize(size_t prolog_l if(buf_length!=total_length) return false; + // serialize wrote extra zero byte there, we can rely on that body=String::Body(cur, buf_length); } ASSERT_STRING_INVARIANT(*this); return true; -#endif - return false; } const char* String::Body::v() const { return CORD_to_const_char_star(body); } const char* String::Languages::v() const { - if(is_not_just_lang) + if(opt.is_not_just_lang) return CORD_to_const_char_star(langs); else return (const char*)&langs; @@ -594,9 +611,9 @@ const char* String::v() const { const char*body_view=body.v(); const char*langs_view=langs.v(); snprintf(buf, MAX_STRING, - "%.*s%s} " + "%d:%.*s%s} " "{%d:%s", - LIMIT_VIEW, langs_view, strlen(langs_view)>LIMIT_VIEW?"...":"", + langs.count(), LIMIT_VIEW, langs_view, strlen(langs_view)>LIMIT_VIEW?"...":"", strlen(body_view), body_view );