--- parser3/src/main/pa_string.C 2009/07/14 11:15:29 1.232 +++ parser3/src/main/pa_string.C 2026/04/25 13:38:46 1.285 @@ -1,12 +1,10 @@ /** @file Parser: string class. @see untalength_t.C. - Copyright (c) 2001-2009 ArtLebedev Group (http://www.artlebedev.com) - Author: Alexandr Petrosian (http://paf.design.ru) + Copyright (c) 2001-2026 Art. Lebedev Studio (https://www.artlebedev.com) + Authors: Konstantin Morshnev , Alexandr Petrosian */ -static const char * const IDENT_STRING_C="$Date: 2009/07/14 11:15:29 $"; - #include "pa_string.h" #include "pa_exception.h" #include "pa_table.h" @@ -14,127 +12,53 @@ static const char * const IDENT_STRING_C #include "pa_charset.h" #include "pa_vregex.h" -const String String::Empty; - -int pa_atoi(const char* str, const String* problem_source) { - if(!str) - return 0; - - while(*str && isspace((unsigned char)*str)) - str++; - if(!*str) - return 0; - - int result; - char *error_pos; - bool negative=false; - if(str[0]=='-') { - negative=true; - str++; - } else if(str[0]=='+') { - str++; - } - // 0xABC - if(str[0]=='0') - if(str[1]=='x' || str[1]=='X') - result=(int)(unsigned long)strtol(str, &error_pos, 0); - else { - // skip leading 0000, to disable octal interpretation - do str++; while(*str=='0'); - result=(int)strtol(str, &error_pos, 0); - } - else - result=(int)strtol(str, &error_pos, 0); - if(negative) - result=-result; - - while(char c=*error_pos++) - if(!isspace((unsigned char)c)) - throw Exception("number.format", - problem_source, - problem_source?"invalid number (int)": "'%s' is invalid number (int)", str); - - return result; -} +volatile const char * IDENT_PA_STRING_C="$Id: pa_string.C,v 1.285 2026/04/25 13:38:46 moko Exp $" IDENT_PA_STRING_H; -double pa_atod(const char* str, const String* problem_source) { - if(!str) - return 0; - - while(*str && isspace((unsigned char)*str)) - str++; - if(!*str) - return 0; - - double result; - char *error_pos; - bool negative=false; - if(str[0]=='-') { - negative=true; - str++; - } else if(str[0]=='+') { - str++; - } - // 0xABC - if(str[0]=='0') - if(str[1]=='x' || str[1]=='X') - result=(double)(unsigned long)strtol(str, &error_pos, 0); - else { - // skip leading 0000, to disable octal interpretation - do str++; while(*str=='0'); - result=(double)strtod(str, &error_pos); - } - else - result=(double)strtod(str, &error_pos); - if(negative) - result=-result; - - while(char c=*error_pos++) - if(!isspace((unsigned char)c)) - throw Exception("number.format", - problem_source, - problem_source?"invalid number (double)": "'%s' is invalid number (double)", str); +const String String::Empty; - return result; -} +#define COMPILE_ASSERT(x) extern int assert_checker[(x) ? 1 : -1] +COMPILE_ASSERT(sizeof(String::Languages) == sizeof(CORD)); // cord lib extension #ifndef DOXYGEN typedef struct { - ssize_t countdown; - int target; /* Character we're looking for */ + ssize_t countdown; + int target; /* Character we're looking for */ } chr_data; #endif + static int CORD_range_contains_chr_greater_then_proc(char c, size_t size, void* client_data) { - register chr_data * d = (chr_data *)client_data; - - if (d -> countdown<=0) return(2); - d -> countdown -= size; - if (c > d -> target) return(1); - return(0); + chr_data * d = (chr_data *)client_data; + + if (d -> countdown<=0) return(2); + d -> countdown -= size; + if (c > d -> target) return(1); + return(0); } + int CORD_range_contains_chr_greater_then(CORD x, size_t i, size_t n, int c) { - chr_data d; + chr_data d; - d.countdown = n; - d.target = c; - return(CORD_block_iter(x, i, CORD_range_contains_chr_greater_then_proc, &d) == 1/*alternatives: 0 normally ended, 2=struck 'n'*/); + d.countdown = n; + d.target = c; + return(CORD_block_iter(x, i, CORD_range_contains_chr_greater_then_proc, &d) == 1/*alternatives: 0 normally ended, 2=struck 'n'*/); } static int CORD_block_count_proc(char /*c*/, size_t /*size*/, void* client_data) { - int* result=(int*)client_data; - (*result)++; - return(0); // 0=continue + int* result=(int*)client_data; + (*result)++; + return(0); // 0=continue } + size_t CORD_block_count(CORD x) { size_t result=0; CORD_block_iter(x, 0, CORD_block_count_proc, &result); - return result; + return result; } // helpers @@ -150,23 +74,30 @@ public: *this+=new String("match"); *this+=new String("postmatch"); for(int i=0; i=start', that's(<) impossible - return 0; // all chars are empty, just return empty string + return String::Body(); // all chars are empty, just return empty string } else - break; + break; CORD_prev(pos); } } } else { - const XMLByte* src_begin=(const XMLByte*)cstrm(); + const XMLByte* src_begin=(const XMLByte*)cstr(); const XMLByte* src_end=src_begin+our_length; // from left... @@ -240,7 +171,7 @@ String::Body String::Body::trim(String:: if(found){ start+=char_length; if(start==our_length) - return 0; // all chars are empty, just return empty string + return String::Body(); // all chars are empty, just return empty string } else break; } @@ -264,7 +195,7 @@ String::Body String::Body::trim(String:: if(found){ end-=char_length; if(end==0) - return 0; // all chars are empty, just return empty string + return String::Body(); // all chars are empty, just return empty string } else break; } @@ -288,11 +219,13 @@ static int CORD_batched_iter_fn_generic_ generic_hash_code(result, c); return 0; } + static int CORD_batched_iter_fn_generic_hash_code(const char* s, void * client_data) { uint& result=*static_cast(client_data); generic_hash_code(result, s); return 0; -}; +} + uint String::Body::get_hash_code() const { #ifdef HASH_CODE_CACHING if(hash_code) @@ -301,7 +234,7 @@ uint String::Body::get_hash_code() const uint hash_code=0; #endif if (body && CORD_IS_STRING(body)){ - generic_hash_code(hash_code, body); + generic_hash_code(hash_code, (const char *)body); } else { CORD_iter5(body, 0, CORD_batched_iter_fn_generic_hash_code, @@ -310,6 +243,57 @@ uint String::Body::get_hash_code() const return hash_code; } +struct CORD_pos_info { + const char* chars; + size_t left; + size_t pos; +}; + +// can be called only for IS_FUNCTION(CORD) which is used in String::Body::strrpbrk +static int CORD_iter_fn_rpos(char c, CORD_pos_info* info) { + if(info->pos < info->left){ + info->pos=STRING_NOT_FOUND; + return 1; + } + if(strchr(info->chars, c)) + return 1; + --(info->pos); + return 0; +} + +size_t String::Body::strrpbrk(const char* chars, size_t left, size_t right) const { + if(is_empty() || !chars || !strlen(chars)) + return STRING_NOT_FOUND; + CORD_pos_info info={chars, left, right}; + if(CORD_riter4(body, right, (CORD_iter_fn)CORD_iter_fn_rpos, &info)) + return info.pos; + else + return STRING_NOT_FOUND; +} + + +// can be called only for IS_FUNCTION(CORD) which is used in String::Body::rskipchars +static int CORD_iter_fn_rskip(char c, CORD_pos_info* info) { + if(info->pos < info->left) { + info->pos=STRING_NOT_FOUND; + return 1; + } + if(!strchr(info->chars, c)) + return 1; + --(info->pos); + return 0; +} + +size_t String::Body::rskipchars(const char* chars, size_t left, size_t right) const { + if(is_empty() || !chars || !strlen(chars)) + return STRING_NOT_FOUND; + CORD_pos_info info={chars, left, right}; + if(CORD_riter4(body, right, (CORD_iter_fn)CORD_iter_fn_rskip, &info)) + return info.pos; + else + return STRING_NOT_FOUND; +} + // String methods String& String::append_know_length(const char* str, size_t known_length, Language lang) { @@ -324,6 +308,7 @@ String& String::append_know_length(const ASSERT_STRING_INVARIANT(*this); return *this; } + String& String::append_help_length(const char* str, size_t helper_length, Language lang) { if(!str) return *this; @@ -333,10 +318,12 @@ String& String::append_help_length(const return append_know_length(str, known_length, lang); } -String::String(int value, char *format) : langs(L_CLEAN){ + +String::String(int value, const char *format) : langs(L_CLEAN){ char buf[MAX_NUMBER]; body.append_strdup_know_length(buf, snprintf(buf, MAX_NUMBER, format, value)); } + String& String::append_strdup(const char* str, size_t helper_length, Language lang) { size_t known_length=helper_length?helper_length:strlen(str); if(!known_length) @@ -351,22 +338,32 @@ String& String::append_strdup(const char return *this; } -int CORD_batched_len(const char* s, size_t* len){ - (*len) += lengthUTF8( (const XMLByte *)s, (const XMLByte *)s+strlen(s)); +struct CORD_length_info { + size_t len; + size_t skip; +}; + +int CORD_batched_len(const char* s, CORD_length_info* info){ + info->len += lengthUTF8( (const XMLByte *)s, (const XMLByte *)s+strlen(s)); return 0; } -// can be called only for IS_FUNCTION(CORD) which are used only in Lang -int CORD_batched_len(const char, size_t *len){ - (*len)++; +// can be called only for IS_FUNCTION(CORD) which are used in large String::Body::mid +int CORD_batched_len(const char c, CORD_length_info* info){ + if (info->skip==0){ + info->len++; + info->skip = lengthUTF8Char(c)-1; + } else { + info->skip--; + } return 0; } size_t String::length(Charset& charset) const { if(charset.isUTF8()){ - size_t len=0; - body.for_each(CORD_batched_len, CORD_batched_len, &len); - return len; + CORD_length_info info = {0, 0}; + body.for_each(CORD_batched_len, CORD_batched_len, &info); + return info.len; } else return body.length(); } @@ -395,7 +392,7 @@ String& String::mid(size_t substr_begin, String& String::mid(Charset& charset, size_t from, size_t to, size_t helper_length) const { String& result=*new String; - size_t self_length=(helper_length)?helper_length:length(charset); + size_t self_length=helper_length ? helper_length : length(charset); if(!self_length) return result; @@ -409,7 +406,7 @@ String& String::mid(Charset& charset, si return result; if(charset.isUTF8()){ - const XMLByte* src_begin=(const XMLByte*)cstrm(); + const XMLByte* src_begin=(const XMLByte*)cstr(); const XMLByte* src_end=src_begin+body.length(); // convert 'from' and 'substr_length' from 'characters' to 'bytes' @@ -443,16 +440,14 @@ size_t String::pos(const String::Body su } } -size_t String::pos(const String& substr, - size_t this_offset, Language lang) const { +size_t String::pos(const String& substr, size_t this_offset, Language lang) const { return pos(substr.body, this_offset, lang); } -size_t String::pos(Charset& charset, const String& substr, - size_t this_offset, Language lang) const { +size_t String::pos(Charset& charset, const String& substr, size_t this_offset, Language lang) const { if(charset.isUTF8()){ - const XMLByte* srcPtr=(const XMLByte*)cstrm(); + const XMLByte* srcPtr=(const XMLByte*)cstr(); const XMLByte* srcEnd=srcPtr+body.length(); // convert 'this_offset' from 'characters' to 'bytes' @@ -470,54 +465,44 @@ size_t String::pos(Charset& charset, con } } -void String::split(ArrayString& result, - size_t& pos_after, - const char* delim, - Language lang, int limit) const { +void String::split(ArrayString& result, size_t pos_after, const char* delim, Language lang) const { + if(is_empty()) + return; size_t self_length=length(); if(size_t delim_length=strlen(delim)) { size_t pos_before; // while we have 'delim'... - for(; (pos_before=pos(delim, pos_after, lang))!=STRING_NOT_FOUND && limit; limit--) { + while((pos_before=pos(String::Body(delim), pos_after, lang)) != STRING_NOT_FOUND) { result+=&mid(pos_after, pos_before); pos_after=pos_before+delim_length; } // last piece - if(pos_afterinfo(); // I have no idea what does it for? @@ -525,16 +510,17 @@ Table* String::match(VRegex* vregex, bool global=vregex->is_global_search(); const char* subject=cstr(); - size_t subject_length=strlen(subject); - const int ovector_size=(1/*match*/+MAX_MATCH_GROUPS)*3; + size_t subject_length=length(); + const int ovector_size=(1/*match*/+MAX_MATCH_GROUPS)*3; /* 1/3 is used as workspace by pcre_exec() */ int ovector[ovector_size]; Table::Action_options table_options; - Table& table=*new Table(string_match_table_template, table_options); + Table& table=*new Table(string_match_table_template(), table_options); int prestart=0; int poststart=0; int postfinish=length(); + int action_was_executed=-1; while(true) { int exec_result=vregex->exec(subject, subject_length, ovector, ovector_size, prestart); @@ -543,6 +529,13 @@ Table* String::match(VRegex* vregex, int prefinish=ovector[0]; poststart=ovector[1]; + + if (prestart==poststart && action_was_executed==1){ + prestart++; + action_was_executed=0; + continue; + } + ArrayString* row=new ArrayString(3); if(need_pre_post_match) { *row+=&mid(0, prefinish); // .prematch column value @@ -560,12 +553,13 @@ Table* String::match(VRegex* vregex, } matches_count++; - row_action(table, row, prestart, prefinish, poststart, postfinish, info); + row_action(table, row, prestart - !action_was_executed, prefinish, poststart, postfinish, info); - if(!global || prestart==poststart) // last step + if(!global || (size_t)poststart>=subject_length) // last step, avoid prestart++ after last char break; prestart=poststart; + action_was_executed=1; } row_action(table, 0/*last time, no raw*/, 0, 0, poststart, postfinish, info); @@ -624,7 +618,7 @@ String& String::change_case(Charset& sou } } result.langs=langs; - result.body=new_cstr; + result.body=String::Body(new_cstr); return result; } @@ -636,34 +630,63 @@ const String& String::escape(Charset& so return Charset::escape(*this, source_charset); } +#define STRING_APPEND(result, from_cstr, langs, langs_offset, length) \ + result.langs.append(result.body, langs, langs_offset, length); \ + result.body.append_strdup_know_length(from_cstr, length); + const String& String::replace(const Dictionary& dict) const { + if(!dict.count() || is_empty()) + return *this; + String& result=*new String(); const char* old_cstr=cstr(); const char* prematch_begin=old_cstr; - const char* current=old_cstr; - while(*current) { - if(Dictionary::Subst subst=dict.first_that_begins(current)) { + if(dict.count()==1) { + // optimized simple case + + Dictionary::Subst subst=dict.get(0); + while(const char* p=strstr(prematch_begin, subst.from)) { // prematch - if(size_t prematch_length=current-prematch_begin) { - result.langs.append(result.body, langs, prematch_begin-old_cstr, prematch_length); - result.body.append_strdup_know_length(prematch_begin, prematch_length); + if(size_t prematch_length=p-prematch_begin) { + STRING_APPEND(result, prematch_begin, langs, prematch_begin-old_cstr, prematch_length) } // match - // skip 'a' in 'current'; move prematch_begin - current+=subst.from_length; prematch_begin=current; + prematch_begin=p+subst.from_length; if(const String* b=subst.to) // are there any b? result<<*b; - } else // simply advance - current++; + } + + } else { + + const char* current=old_cstr; + while(*current) { + if(Dictionary::Subst subst=dict.first_that_begins(current)) { + // prematch + if(size_t prematch_length=current-prematch_begin) { + STRING_APPEND(result, prematch_begin, langs, prematch_begin-old_cstr, prematch_length) + } + + // match + // skip 'a' in 'current'; move prematch_begin + current+=subst.from_length; prematch_begin=current; + + if(const String* b=subst.to) // are there any b? + result<<*b; + } else // simply advance + current++; + } + } + if(prematch_begin==old_cstr) // not modified + return *this; + // postmatch - if(size_t postmatch_length=current-prematch_begin) { - result.langs.append(result.body, langs, prematch_begin-old_cstr, postmatch_length); - result.body.append_strdup_know_length(prematch_begin, postmatch_length); + if(size_t postmatch_length=old_cstr+length()-prematch_begin) { + STRING_APPEND(result, prematch_begin, langs, prematch_begin-old_cstr, postmatch_length) } ASSERT_STRING_INVARIANT(result); @@ -673,12 +696,14 @@ const String& String::replace(const Dict static int serialize_body_char(char c, char** cur) { *((*cur)++)=c; return 0; // 0=continue -}; +} + static int serialize_body_piece(const char* s, char** cur) { size_t length=strlen(s); memcpy(*cur, s, length); *cur+=length; return 0; // 0=continue -}; +} + static int serialize_lang_piece(char alang, size_t asize, char** cur) { // lang **cur=alang; (*cur)++; @@ -687,6 +712,7 @@ static int serialize_lang_piece(char ala return 0; // 0=continue } + String::Cm String::serialize(size_t prolog_length) const { size_t fragments_count=langs.count(); size_t body_length=body.length(); @@ -715,6 +741,7 @@ String::Cm String::serialize(size_t prol return result; } + bool String::deserialize(size_t prolog_size, void *buf, size_t buf_size) { size_t in_buf=buf_size; if(in_buf<=prolog_size) @@ -738,10 +765,7 @@ bool String::deserialize(size_t prolog_s if(cur[body_length] != 0) // in place? return false; // 3: letters - body=String::Body(*cur?cur:0); -#ifdef STRING_LENGTH_CACHING - body.set_length(body_length); -#endif + body=String::Body(String::C(cur, body_length)); cur+=body_length+1; in_buf-=body_length+1; @@ -787,45 +811,40 @@ bool String::deserialize(size_t prolog_s return true; } -const char* String::Body::v() const { - return CORD_to_const_char_star(body); -} void String::Body::dump() const { CORD_dump(body); } -const char* String::Languages::v() const { +const char* String::Languages::visualize() const { if(opt.is_not_just_lang) - return CORD_to_const_char_star(langs); + return CORD_to_const_char_star(langs, 0); else - return (const char*)&langs; + return 0; } + void String::Languages::dump() const { if(opt.is_not_just_lang) CORD_dump(langs); else puts((const char*)&langs); } -const char* String::v() const { - const uint LIMIT_VIEW=20; - char* buf=(char*)malloc(MAX_STRING); - const char*body_view=body.v(); - const char*langs_view=langs.v(); - snprintf(buf, MAX_STRING, - "%d:%.*s%s} " - "{%d:%s", - langs.count(), LIMIT_VIEW, langs_view, strlen(langs_view)>LIMIT_VIEW?"...":"", - strlen(body_view), body_view - ); - - return buf; -} void String::dump() const { body.dump(); langs.dump(); } +static char *n_chars(char c, size_t length){ + char *result=(char *)pa_malloc_atomic(length+1); + memset(result, c, length); + result[length] = '\0'; + return result; +} + +char* String::visualize_langs() const { + return is_not_just_lang() ? pa_strdup(langs.visualize()) : n_chars((char)just_lang(), length()); +} + const String& String::trim(String::Trim_kind kind, const char* chars, Charset* source_charset) const { if(is_empty()) return *this;