--- parser3/src/main/pa_string.C 2003/07/24 11:31:24 1.174 +++ parser3/src/main/pa_string.C 2003/09/26 06:53:27 1.178 @@ -5,7 +5,7 @@ Author: Alexandr Petrosian (http://paf.design.ru) */ -static const char* IDENT_STRING_C="$Date: 2003/07/24 11:31:24 $"; +static const char* IDENT_STRING_C="$Date: 2003/09/26 06:53:27 $"; #include "pcre.h" @@ -15,6 +15,45 @@ static const char* IDENT_STRING_C="$Date #include "pa_dictionary.h" #include "pa_charset.h" +// cord lib extension + +#ifndef DOXYGEN +typedef struct { + ssize_t countdown; + char target; /* Character we're looking for */ +} chr_data; +#endif +static int CORD_range_contains_chr_greater_then_proc(char c, size_t size, void* client_data) +{ + register chr_data * d = (chr_data *)client_data; + + if (d -> countdown<=0) return(2); + d -> countdown -= size; + if (c > d -> target) return(1); + return(0); +} +int CORD_range_contains_chr_greater_then(CORD x, size_t i, size_t n, int c) +{ + chr_data d; + + d.countdown = n; + d.target = c; + return(CORD_block_iter(x, i, CORD_range_contains_chr_greater_then_proc, &d) == 1/*alternatives: 0 normally ended, 2=struck 'n'*/); +} + +static int CORD_block_count_proc(char c, size_t size, void* client_data) +{ + int* result=(int*)client_data; + (*result)++; + return(0); // 0=continue +} +size_t CORD_block_count(CORD x) +{ + size_t result=0; + CORD_block_iter(x, 0, CORD_block_count_proc, &result); + return result; +} + // helpers /// String::match uses this as replace & global search table columns @@ -28,64 +67,19 @@ public: *this+=new String("match"); *this+=new String("postmatch"); for(int i=0; i i(src); ; fragment_begin=fragment_end) { - const Fragment fragment=i.next(); - fragment_end=fragment_begin+fragment.length; - //fprintf(err, "1end=%u\n", fragment_end);fflush(err); - - // not reached fragments which may include 'substr'? - if(!(substr_begin>=fragment_begin && substr_beginfragment_end) // are there still more? - append(Fragment(fragment.lang, fragment.length)); // appending whole fragment - else { // no, it was last - append(Fragment(fragment.lang, substr_end-fragment_begin)); - //fclose(err); - return; - } - } - } - - break; - } - //fclose(err); -} +// String::Body methods -// StringBody methods - -StringBody StringBody::Format(int value) { +String::Body String::Body::Format(int value) { char local[MAX_NUMBER]; size_t length=snprintf(local, MAX_NUMBER, "%d", value); - return StringBody(pa_strdup(local, length), length); + return String::Body(pa_strdup(local, length), length); } static int CORD_batched_iter_fn_generic_hash_code(char c, void * client_data) { @@ -98,7 +92,7 @@ static int CORD_batched_iter_fn_generic_ generic_hash_code(result, s); return 0; }; -uint StringBody::hash_code() const { +uint String::Body::hash_code() const { uint result=0; CORD_iter5(body, 0, CORD_batched_iter_fn_generic_hash_code, @@ -115,17 +109,14 @@ String::String(const String::C cstr, boo append_know_length(cstr.str, cstr.length, tainted?L_TAINTED:L_CLEAN); } -String::String(const String& src): body(src.body) { - fragments.append(src.fragments); - ASSERT_STRING_INVARIANT(*this); -} - String& String::append_know_length(const char* str, size_t known_length, Language lang) { if(!known_length) return *this; + // first: langs + langs.append(body, lang, known_length); + // next: letters themselves body.append_know_length(str, known_length); - fragments+=Fragment(lang, known_length); ASSERT_STRING_INVARIANT(*this); return *this; @@ -144,8 +135,10 @@ String& String::append_strdup(const char if(!known_length) return *this; + // first: langs + langs.append(body, lang, known_length); + // next: letters themselves body.append_strdup_know_length(str, known_length); - fragments+=Fragment(lang, known_length); ASSERT_STRING_INVARIANT(*this); return *this; @@ -158,14 +151,14 @@ String& String::mid(size_t substr_begin, size_t self_length=length(); substr_begin=min(substr_begin, self_length); substr_end=min(max(substr_end, substr_begin), self_length); - if(substr_begin==substr_end) + size_t substr_length=substr_end-substr_begin; + if(!substr_length) return result; - // first: letters themselves - result.body=body.mid(substr_begin, substr_end-substr_begin); - - // next: their langs - result.fragments.append_positions(fragments, substr_begin, substr_end); + // first: their langs + result.langs.append(result.body, langs, substr_begin, substr_length); + // next: letters themselves + result.body=body.mid(substr_begin, substr_length); // SAPI::log("piece of '%s' from %d to %d is '%s'", //cstr(), substr_begin, substr_end, result.cstr()); @@ -173,51 +166,12 @@ String& String::mid(size_t substr_begin, return result; } -size_t String::pos(const StringBody substr, - size_t this_offset, Language lang) const { - // first: letters themselves +size_t String::pos(const String::Body substr, size_t this_offset, Language lang) const { size_t substr_begin=body.pos(substr, this_offset); - if(substr_begin==CORD_NOT_FOUND) + if(substr_begin==CORD_NOT_FOUND || !langs.check_lang(lang, substr_begin, substr.length())) return STRING_NOT_FOUND; - // next: check the lang when specified - - if(lang==L_UNSPECIFIED) // ignore lang? - return substr_begin; - - // substr must be in one fragment, and fragments' lang must = lang - size_t substr_end=substr_begin+substr.length(); - size_t fragment_begin=0; - size_t fragment_end; - for(Array_iterator i(fragments); i.has_next(); fragment_begin=fragment_end) { - const Fragment fragment=i.next(); - fragment_end=fragment_begin+fragment.length; - - if(substr_begin=fragment_end) // begin of substr OUT of current fragment? - continue; - - if(substr_end>fragment_end) // end of substr OUT of current fragment? - throw Exception(0, // (*) see below - this, - "searching for '%s' starting from %ud problem: found begin in one fragment, but end in another", - substr.cstr(), this_offset); - - if(fragment.lang<=lang) - return substr_begin; - else { // bad lang... - /// WARNING: this possibly skips assert (*), but it's fast - substr_begin=body.pos(substr, fragment_end/*...search AFTER for more*/); - if(substr_begin==CORD_NOT_FOUND) - return STRING_NOT_FOUND; - - size_t substr_end=substr_begin+substr.length(); - // and continuing with next fragment - } - } - - return STRING_NOT_FOUND; + return substr_begin; } size_t String::pos(const String& substr, @@ -443,8 +397,8 @@ String& String::change_case(Charset& sou *dest++=(char)c; } + result.langs=langs; result.body=new_cstr; - result.fragments.append(fragments); return result; } @@ -459,8 +413,8 @@ const String& String::replace(const Dict if(Table::element_type row=dict.first_that_begins(current)) { // prematch if(size_t prematch_length=current-prematch_begin) { + result.langs.append(result.body, langs, prematch_begin-old_cstr, current-old_cstr); result.body.append_strdup_know_length(prematch_begin, prematch_length); - result.fragments.append_positions(fragments, prematch_begin-old_cstr, current-old_cstr); } // match @@ -479,8 +433,8 @@ const String& String::replace(const Dict // postmatch if(size_t postmatch_length=current-prematch_begin) { + result.langs.append(result.body, langs, prematch_begin-old_cstr, current-old_cstr); result.body.append_strdup_know_length(prematch_begin, postmatch_length); - result.fragments.append_positions(fragments, prematch_begin-old_cstr, current-old_cstr); } ASSERT_STRING_INVARIANT(result); @@ -542,50 +496,38 @@ int String::as_int() const { return result; } -inline void uint2uchars(uint word, uchar *bytes) { - bytes[0]=word&0xFF; - bytes[1]=(word>>8)&0xFF; - bytes[2]=(word>>16)&0xFF; - bytes[3]=(word>>24)&0xFF; -} -inline uint uchars2uint(uchar *bytes) { - return bytes[3]<<24 - | bytes[2]<<16 - | bytes[1]<<8 - | bytes[0]; -} - static int serialize_body_piece(const char* s, char** cur) { size_t length=strlen(s); memcpy(*cur, s, length); *cur+=length; - return 0; + return 0; // 0=continue }; +static int serialize_lang_piece(char alang, size_t asize, char** cur) { + // lang + memcpy(*cur, &alang, sizeof(alang)); *cur+=sizeof(alang); + // length + memcpy(*cur, &asize, sizeof(asize)); *cur+=sizeof(asize); + + return 0; // 0=continue +} String::Cm String::serialize(size_t prolog_length) const { + size_t fragments_count=langs.count(); size_t buf_length= - prolog_length - +fragments.count()*(sizeof(Language)+sizeof(size_t)) - +length(); + prolog_length //1 + +sizeof(size_t) //2 + +fragments_count*(sizeof(char)+sizeof(size_t)) //3 + +body.length() //4 + +1; // for zero terminator used in deserialize String::Cm result(new(PointerFreeGC) char[buf_length], buf_length); // 1: prolog char *cur=result.str+prolog_length; - - - // 2: fragments.count - size_t fragments_count=fragments.count(); + // 2: langs.count memcpy(cur, &fragments_count, sizeof(fragments_count)); cur+=sizeof(fragments_count); - // 3: lang info - for(Array_iterator i(fragments); i.has_next(); ) { - const Fragment fragment=i.next(); - // lang - memcpy(cur, &fragment.lang, sizeof(fragment.lang)); cur+=sizeof(fragment.lang); - // length - memcpy(cur, &fragment.length, sizeof(fragment.length)); cur+=sizeof(fragment.length); - } - + langs.for_each(body, serialize_lang_piece, &cur); // 4: letters body.for_each(serialize_body_piece, &cur); + // 5: zero terminator already there put by new(PointerFreeGC) return result; } @@ -593,12 +535,13 @@ bool String::deserialize(size_t prolog_l if(buf_length<=prolog_length) return false; buf_length-=prolog_length; + buf_length-=1; // 5: zero terminator // 1: prolog const char* cur=(const char* )buf+prolog_length; - // 2: fragments.count - if(buf_length(cur); cur+=sizeof(size_t); buf_length-=sizeof(size_t); @@ -607,13 +550,13 @@ bool String::deserialize(size_t prolog_l // 3: lang info size_t total_length=0; for(size_t f=0; f(cur); cur+=sizeof(Language); + Language lang=*reinterpret_cast(cur); cur+=sizeof(char); size_t fragment_length=*reinterpret_cast(cur); cur+=sizeof(size_t); - fragments+=Fragment(lang, fragment_length); + langs.append(total_length, lang, fragment_length); total_length+=fragment_length; buf_length-=piece_length; @@ -623,9 +566,35 @@ bool String::deserialize(size_t prolog_l if(buf_length!=total_length) return false; - body=StringBody(cur, buf_length); + // serialize wrote extra zero byte there, we can rely on that + body=String::Body(cur, buf_length); } ASSERT_STRING_INVARIANT(*this); return true; } + +const char* String::Body::v() const { + return CORD_to_const_char_star(body); +} +const char* String::Languages::v() const { + if(opt.is_not_just_lang) + return CORD_to_const_char_star(langs); + else + return (const char*)&langs; +} +const char* String::v() const { +#define LIMIT_VIEW 20 + char* buf=(char*)malloc(MAX_STRING); + const char*body_view=body.v(); + const char*langs_view=langs.v(); + snprintf(buf, MAX_STRING, + "%d:%.*s%s} " + "{%d:%s", + langs.count(), LIMIT_VIEW, langs_view, strlen(langs_view)>LIMIT_VIEW?"...":"", + strlen(body_view), body_view + ); + + return buf; +#undef LIMIT_VIEW +}