--- parser3/src/include/pa_string.h 2009/11/09 00:37:44 1.197 +++ parser3/src/include/pa_string.h 2024/03/14 03:17:01 1.232 @@ -1,14 +1,14 @@ /** @file Parser: string class decl. - Copyright (c) 2001-2009 ArtLebedev Group (http://www.artlebedev.com) - Author: Alexandr Petrosian (http://paf.design.ru) + Copyright (c) 2001-2023 Art. Lebedev Studio (http://www.artlebedev.com) + Authors: Konstantin Morshnev , Alexandr Petrosian */ #ifndef PA_STRING_H #define PA_STRING_H -static const char * const IDENT_STRING_H="$Date: 2009/11/09 00:37:44 $"; +#define IDENT_PA_STRING_H "$Id: pa_string.h,v 1.232 2024/03/14 03:17:01 moko Exp $" // includes #include "pa_types.h" @@ -16,13 +16,13 @@ static const char * const IDENT_STRING_H extern "C" { // cord's author forgot to do that #define CORD_NO_IO -#include "cord.h" +#include "../lib/cord/include/cord.h" #ifdef CORD_CAT_OPTIMIZATION #define CORD_cat(x, y) CORD_cat_optimized(x, y) #define CORD_cat_char_star(x, y, leny) CORD_cat_char_star_optimized(x, y, leny) #endif -}; +} // defines @@ -47,17 +47,41 @@ class SQL_Connection; class Dictionary; class Request_charsets; class String; -typedef Array ArrayString; class VRegex; +#ifdef NDEBUG +typedef Array ArrayString; +#else +class ArrayString : public Array { +public: + inline ArrayString(size_t initial=0) : Array(initial){ + } + inline Array& operator+=(element_type src) { + assert(src != NULL); + return Array::operator+=(src); + } + + inline element_type get(size_t index) const { + element_type result=Array::get(index); + assert(result != NULL); + return result; + } +}; +#endif + // generally useful -int pa_atoi(const char* str, const String* problem_source=0); -double pa_atod(const char* str, const String* problem_source=0); +double pa_atod(const char* str, const String* problem_source); +int pa_atoi(const char* str, int base=10, const String* problem_source=0); +unsigned int pa_atoui(const char *str, int base=10, const String* problem_source=0); +uint64_t pa_atoul(const char *str, int base=10, const String* problem_source=0); /// this is result of pos functions which mean that substr were not found #define STRING_NOT_FOUND ((size_t)-1) +/// CORD can't be empty string, thus checking it in assigment +#define AS_CORD(v) ((v) && *(v) ? (CORD)(v):0) + /** String which knows the lang of all it's langs. @@ -68,6 +92,7 @@ double pa_atod(const char* str, const St class String: public PA_Object { + friend class StringSplitHelper; public: /** piece is tainted or not. the lang to use when detaint @@ -84,30 +109,31 @@ public: WARNING WARNING WARNING WARNING WARNING WARNING */ + +#if _MSC_VER >= 1900 + /// required for VS2015+ to make sizeof(Languages::opt) == sizeof(CORD), will be 16 byte under x64 without it + enum Language : size_t { +#else enum Language { - L_UNSPECIFIED=0, ///< no real string has parts of this lange: it's just convinient to check when string's empty +#endif + L_UNSPECIFIED=0, ///< no real string has parts of this lange: it's just convinient to check when string's empty // these two must go before others, there are checks for >L_AS_IS - L_CLEAN='0', ///< clean WARNING: read above warning before changing - L_AS_IS='A', ///< leave all characters intact WARNING: read above warning before changing + L_CLEAN='0', ///< clean WARNING: read above warning before changing + L_AS_IS='A', ///< leave all characters intact WARNING: read above warning before changing - L_PASS_APPENDED='P', - /**< - leave lang built into string being appended. - just a flag, that value not stored - */ - L_TAINTED='T', ///< tainted, untaint lang as assigned later + L_TAINTED='T', ///< tainted, untaint lang as assigned later // untaint langs. assigned by ^untaint[lang]{...} L_FILE_SPEC='F', ///< file specification L_HTTP_HEADER='h', ///< text in HTTP response header L_MAIL_HEADER='m', ///< text in mail header - L_URI='U', ///< text in uri - L_SQL='Q', ///< ^table:sql body - L_JS='J', ///< JavaScript code - L_XML='X', ///< ^dom:set xml - L_HTML='H', ///< HTML code - L_REGEX='R', ///< RegEx expression + L_URI='U', ///< text in uri + L_SQL='Q', ///< ^table:sql body + L_JS='J', ///< JavaScript code + L_XML='X', ///< ^xdoc:create xml + L_HTML='H', ///< HTML code + L_REGEX='R', ///< RegExp + L_JSON='S', ///< JSON code L_HTTP_COOKIE='C', ///< cookies encoded as %uXXXX for compartibility with js functions encode/decode - L_FILE_POST='f', ///< temporary escaping zero-char L_PARSER_CODE='p', ///< ^process body // READ WARNING ABOVE BEFORE ADDING ANYTHING L_OPTIMIZE_BIT = 0x80 ///< flag, requiring cstr whitespace optimization @@ -170,7 +196,7 @@ public: public: - const char* v() const; + const char* visualize() const; void dump() const; Languages(): langs(0) {} @@ -184,7 +210,7 @@ public: assert(alang); assert(length); - if(!opt.is_not_just_lang) + if(!opt.is_not_just_lang) { if(opt.lang) { if(opt.lang==alang) // same language? ignoring return; @@ -192,6 +218,7 @@ public: opt.lang=alang; // to uninitialized return; } + } append(current, CORD_chars((char)alang, length)); } @@ -200,7 +227,7 @@ public: assert(alang); assert(length); - if(!opt.is_not_just_lang) + if(!opt.is_not_just_lang) { if(opt.lang) { if(opt.lang==alang) // same language? ignoring return; @@ -208,6 +235,7 @@ public: opt.lang=alang; // to uninitialized return; } + } append(current, CORD_chars((char)alang, length)); } @@ -215,7 +243,7 @@ public: void appendHelper(const Body& current, Language alang, const Body &length_helper) { assert(alang); - if(!opt.is_not_just_lang) + if(!opt.is_not_just_lang) { if(opt.lang) { if(opt.lang==alang) // same language? ignoring return; @@ -223,6 +251,7 @@ public: opt.lang=alang; // to uninitialized return; } + } append(current, CORD_chars((char)alang, length_helper.length())); } @@ -303,6 +332,22 @@ public: } }; + struct C { + const char *str; + size_t length; + C(): str(0), length(0) {} + C(const char *astr, size_t asize): str(astr), length(asize) {} + explicit C(Body abody): str(abody.cstr()), length(abody.length()) {} + }; + + struct Cm { + char *str; + size_t length; + Cm(): str(0), length(0) {} + Cm(char *astr, size_t asize): str(astr), length(asize) {} + explicit Cm(Body abody): str(abody.cstrm()), length(abody.length()) {} + }; + class Body { CORD body; @@ -311,31 +356,32 @@ public: // cached hash code is not reseted on write operations as test shows // that string body does not change after it is stored as a hash key mutable uint hash_code; +#define INIT_HASH_CODE(c) ,hash_code(c) +#define ZERO_HASH_CODE hash_code=0; +#else +#define INIT_HASH_CODE(c) +#define ZERO_HASH_CODE #endif #ifdef STRING_LENGTH_CACHING // cached length is reseted on modification, used only for char*, not CORD mutable size_t string_length; -#define INIT_LENGTH ,string_length(0) +#define INIT_LENGTH(l) ,string_length(l) #define ZERO_LENGTH string_length=0; #else -#define INIT_LENGTH +#define INIT_LENGTH(l) #define ZERO_LENGTH #endif public: - const char* v() const; void dump() const; -#ifdef HASH_CODE_CACHING - Body(): body(CORD_EMPTY), hash_code(0) INIT_LENGTH {} - Body(CORD abody, uint ahash_code): body(abody), hash_code(ahash_code) INIT_LENGTH {} - Body(CORD abody): body(abody), hash_code(0) INIT_LENGTH { -#else - Body(): body(CORD_EMPTY) INIT_LENGTH {} - Body(CORD abody): body(abody) INIT_LENGTH { -#endif + Body(): body(CORD_EMPTY) INIT_HASH_CODE(0) INIT_LENGTH(0) {} + Body(const char *abody): body(AS_CORD(abody)) INIT_HASH_CODE(0) INIT_LENGTH(0) {} + Body(CORD abody, uint ahash_code): body(abody) INIT_HASH_CODE(ahash_code) INIT_LENGTH(0) {} + explicit Body(C ac): body(AS_CORD(ac.str)) INIT_HASH_CODE(0) INIT_LENGTH(ac.length) {} + explicit Body(CORD abody): body(abody) INIT_HASH_CODE(0) INIT_LENGTH(0) { #ifdef CORD_CAT_OPTIMIZATION assert(!body // no body || *body // ordinary string @@ -354,34 +400,40 @@ public: #endif } + static Body Format(int value); - void clear() { ZERO_LENGTH body=CORD_EMPTY; } + void clear() { ZERO_LENGTH ZERO_HASH_CODE body=CORD_EMPTY; } bool operator! () const { return is_empty(); } - CORD get_cord() const { return body; } + inline CORD get_cord() const { return body; } uint get_hash_code() const; + // never null const char* cstr() const { #ifdef STRING_LENGTH_CACHING string_length = length(); - if(string_length) - return const_cast(this)->body=CORD_to_const_char_star(body, string_length); + if(string_length){ + const char *result=CORD_to_const_char_star(body, string_length); + const_cast(this)->body=(CORD)result; + return result; + } #endif return CORD_to_const_char_star(body, length()); } + // never null char* cstrm() const { return CORD_to_char_star(body, length()); } #ifdef STRING_LENGTH_CACHING void set_length(size_t alength){ string_length = alength; } - size_t length() const { return body ? CORD_IS_STRING(body) ? string_length ? string_length : (string_length=strlen(body)) : CORD_len(body) : 0; } + size_t length() const { return body ? CORD_IS_STRING(body) ? string_length ? string_length : (string_length=strlen((const char *)body)) : CORD_len(body) : 0; } #else size_t length() const { return CORD_len(body); } #endif - bool is_empty() const { return body==CORD_EMPTY; } + inline bool is_empty() const { return body==CORD_EMPTY; } void append_know_length(const char *str, size_t known_length) { if(known_length){ @@ -389,7 +441,7 @@ public: body = CORD_cat_char_star(body, str, known_length); ZERO_LENGTH } else { - body=str; + body=(CORD)str; #ifdef STRING_LENGTH_CACHING string_length=known_length; #endif @@ -405,21 +457,24 @@ public: Body& operator << (const char* str) { append_know_length(str, strlen(str)); return *this; } - // could not figure out why this operator is needed [should do this chain: string->simple->==] bool operator < (const Body src) const { return CORD_cmp(body, src.body)<0; } bool operator > (const Body src) const { return CORD_cmp(body, src.body)>0; } bool operator <= (const Body src) const { return CORD_cmp(body, src.body)<=0; } bool operator >= (const Body src) const { return CORD_cmp(body, src.body)>=0; } + bool operator != (const Body src) const { return CORD_cmp(body, src.body)!=0; } bool operator == (const Body src) const { return CORD_cmp(body, src.body)==0; } + bool operator != (const char *src) const { return CORD_cmp(body, AS_CORD(src))!=0; } + bool operator == (const char *src) const { return CORD_cmp(body, AS_CORD(src))==0; } + int ncmp(size_t x_begin, const Body y, size_t y_begin, size_t size) const { return CORD_ncmp(body, x_begin, y.body, y_begin, size); } char fetch(size_t index) const { return CORD_fetch(body, index); } - Body mid(size_t aindex, size_t alength) const { return CORD_substr(body, aindex, alength, length()); } - size_t pos(const char* substr, size_t offset=0) const { return CORD_str(body, offset, substr, length()); } + Body mid(size_t aindex, size_t alength) const { return Body(CORD_substr(body, aindex, alength, length())); } + size_t pos(const char* substr, size_t offset=0) const { return CORD_str(body, offset, AS_CORD(substr), length()); } size_t pos(const Body substr, size_t offset=0) const { if(substr.is_empty()) return STRING_NOT_FOUND; // in this case CORD_str returns 0 [parser users got used to -1] @@ -427,14 +482,17 @@ public: // CORD_str checks for bad offset [CORD_chr does not] return CORD_str(body, offset, substr.body, length()); } - size_t pos(char c, - size_t offset=0) const { + size_t pos(char c, size_t offset=0) const { if(offset>=length()) // CORD_chr does not check that [and ABORT's in that case] return STRING_NOT_FOUND; return CORD_chr(body, offset, c); } + size_t strrpbrk(const char* chars, size_t left, size_t right) const; + + size_t rskipchars(const char* chars, size_t left, size_t right) const; + template int for_each(int (*f)(char c, I), I info) const { return CORD_iter(body, (CORD_iter_fn)f, (void*)info); @@ -447,37 +505,16 @@ public: void set_pos(CORD_pos& pos, size_t index) const { CORD_set_pos(pos, body, index); } - /*Body normalize() const { - return Body(CORD_balance(body)); - }*/ - /// @returns this or 0 or mid. if returns this or 0 out_* are not filled Body trim(Trim_kind kind=TRIM_BOTH, const char* chars=0, size_t* out_start=0, size_t* out_length=0, Charset* source_charset=0) const; }; - struct C { - const char *str; - size_t length; - operator const char *() { return str; } - C(): str(0), length(0) {} - C(const char *astr, size_t asize): str(astr), length(asize) {} - }; - - struct Cm { - char *str; - size_t length; - //operator char *() { return str; } - Cm(): str(0), length(0) {} - Cm(char *astr, size_t asize): str(astr), length(asize) {} - }; - private: Body body; ///< all characters of string Languages langs; ///< string characters lang - const char* v() const; void dump() const; #define ASSERT_STRING_INVARIANT(string) \ assert((string).langs.invariant((string).body.length())) @@ -487,29 +524,23 @@ public: static const String Empty; explicit String(){}; - explicit String(const char* cstr, Language alang=L_CLEAN){ - if(cstr && *cstr){ - body=cstr; + explicit String(const char* cstr, Language alang=L_CLEAN) : body(cstr){ + if(body.get_cord()){ langs=alang; } } - explicit String(const char* cstr, Language alang, size_t alength){ - if(cstr && *cstr){ - body=cstr; -#ifdef STRING_LENGTH_CACHING - body.set_length(alength); -#endif + explicit String(C ac, Language alang=L_CLEAN) : body(ac){ + if(body.get_cord()){ langs=alang; } } - - String(int value, char *format); String(Body abody, Language alang): body(abody), langs(alang) { ASSERT_STRING_INVARIANT(*this); } String(const String& src): body(src.body), langs(src.langs) { ASSERT_STRING_INVARIANT(*this); } + String(int value, const char *format); /// for convinient hash lookup #ifdef HASH_CODE_CACHING @@ -527,14 +558,9 @@ public: /// convert to CORD with tainting dirty to lang Body cstr_to_string_body_untaint(Language lang, SQL_Connection* connection=0, const Request_charsets *charsets=0) const; - /// - const char* cstr() const { - return body.cstr(); - } - /// - char* cstrm() const { - return body.cstrm(); - } + /// from body + const char* cstr() const { return body.cstr(); } + char* cstrm() const { return body.cstrm(); } /// convert to constant C string forcing lang tainting const char* taint_cstr(Language lang, SQL_Connection* connection=0, const Request_charsets *charsets=0) const { @@ -554,6 +580,16 @@ public: const char* untaint_and_transcode_cstr(Language lang, const Request_charsets *charsets) const; + bool is_not_just_lang() const { + return langs.opt.is_not_just_lang !=0; + } + + Language just_lang() const { + return langs.opt.lang; + } + + char* visualize_langs() const; + /// puts pieces to buf Cm serialize(size_t prolog_size) const; /// appends pieces from buf to self @@ -576,11 +612,20 @@ public: return Body(x).ncmp(0/*x_begin*/, body, 0/*y_begin*/, length())==0; } + String& append_to(String& dest) const; String& append_to(String& dest, Language lang, bool forced=false) const; String& append(const String& src, Language lang, bool forced=false) { return src.append_to(*this, lang, forced); } - String& operator << (const String& src) { return append(src, L_PASS_APPENDED); } + String& append_quoted(const String* src, Language lang=L_JSON){ + *this << "\""; + if(src) + this->append(*src, lang, true/*forced lang*/); + *this << "\""; + return *this; + } + + String& operator << (const String& src) { return src.append_to(*this); } String& operator << (const char* src) { return append_help_length(src, 0, L_AS_IS); } String& operator << (const Body& src){ langs.appendHelper(body, L_AS_IS, src); @@ -588,11 +633,14 @@ public: return *this; } - /// extracts first char of a string, if any char first_char() const { return is_empty()?0:body.fetch(0); } + char last_char() const { + return is_empty()?0:body.fetch(body.length()-1); + } + bool operator < (const String& src) const { return body (const String& src) const { return body>src.body; } bool operator <= (const String& src) const { return body<=src.body; } @@ -609,32 +657,33 @@ public: but when specified: look for substring that lies in ONE fragment in THAT lang @return position of substr in string, -1 means "not found" [const char* version] */ - size_t pos(const Body substr, - size_t this_offset=0, Language lang=L_UNSPECIFIED) const; + size_t pos(const Body substr, size_t this_offset=0, Language lang=L_UNSPECIFIED) const; /// String version of @see pos(const char*, int, Language) - size_t pos(const String& substr, - size_t this_offset=0, Language lang=L_UNSPECIFIED) const; - size_t pos(char c, - size_t this_offset=0) const { + size_t pos(const String& substr, size_t this_offset=0, Language lang=L_UNSPECIFIED) const; + size_t pos(char c, size_t this_offset=0) const { return body.pos(c, this_offset); } - size_t pos(Charset& charset, - const String& substr, - size_t this_offset=0, Language lang=L_UNSPECIFIED) const; - - void split(ArrayString& result, - size_t& pos_after, - const char* delim, - Language lang=L_UNSPECIFIED, int limit=-1) const; - void split(ArrayString& result, - size_t& pos_after, - const String& delim, - Language lang=L_UNSPECIFIED, int limit=-1) const; - - typedef void (*Row_action)(Table& table, ArrayString* row, - int prestart, int prefinish, - int poststart, int postfinish, - void *info); + size_t pos(Charset& charset, const String& substr, size_t this_offset=0, Language lang=L_UNSPECIFIED) const; + + size_t strrpbrk(const char* chars, size_t left=0) const { + return (length()) ? body.strrpbrk(chars, left, length()-1) : STRING_NOT_FOUND; + } + size_t strrpbrk(const char* chars, size_t left, size_t right) const { + return body.strrpbrk(chars, left, right); + } + + size_t rskipchars(const char* chars, size_t left=0) const { + return (length()) ? body.rskipchars(chars, left, length()-1) : STRING_NOT_FOUND; + } + size_t rskipchars(const char* chars, size_t left, size_t right) const { + return body.rskipchars(chars, left, right); + } + + void split(ArrayString& result, size_t pos_after, const char* delim, Language lang=L_UNSPECIFIED) const; + void split(ArrayString& result, size_t pos_after, const String& delim, Language lang=L_UNSPECIFIED) const; + + typedef void (*Row_action)(Table& table, ArrayString* row, int prestart, int prefinish, int poststart, int postfinish, void *info); + /** @return table of found items, if any. table format is defined and fixed[can be used by others]: @@ -642,19 +691,18 @@ public: prematch/match/postmatch/1/2/3/... @endverbatim */ - Table* match(VRegex* vregex, - Row_action row_action, void *info, - int& matches_count) const; + Table* match(VRegex* vregex, Row_action row_action, void *info, int& matches_count) const; + enum Change_case_kind { CC_UPPER, CC_LOWER }; - String& change_case(Charset& source_charset, - Change_case_kind kind) const; + String& change_case(Charset& source_charset, Change_case_kind kind) const; + const String& replace(const Dictionary& dict) const; const String& trim(Trim_kind kind=TRIM_BOTH, const char* chars=0, Charset* source_charset=0) const; double as_double() const { return pa_atod(cstr(), this); } - int as_int() const { return pa_atoi(cstr(), this); } + int as_int() const { return pa_atoi(cstr(), 0, this); } bool as_bool() const { return as_int()!=0; } const String& escape(Charset& source_charset) const;