--- parser3/src/include/pa_string.h 2016/09/07 14:40:07 1.217 +++ parser3/src/include/pa_string.h 2026/01/06 13:27:59 1.239 @@ -1,22 +1,23 @@ /** @file Parser: string class decl. - Copyright (c) 2001-2015 Art. Lebedev Studio (http://www.artlebedev.com) - Author: Alexandr Petrosian (http://paf.design.ru) + Copyright (c) 2001-2024 Art. Lebedev Studio (http://www.artlebedev.com) + Authors: Konstantin Morshnev , Alexandr Petrosian */ #ifndef PA_STRING_H #define PA_STRING_H -#define IDENT_PA_STRING_H "$Id: pa_string.h,v 1.217 2016/09/07 14:40:07 moko Exp $" +#define IDENT_PA_STRING_H "$Id: pa_string.h,v 1.239 2026/01/06 13:27:59 moko Exp $" // includes #include "pa_types.h" +#include "pa_int.h" #include "pa_array.h" extern "C" { // cord's author forgot to do that #define CORD_NO_IO -#include "cord.h" +#include "../lib/cord/include/cord.h" #ifdef CORD_CAT_OPTIMIZATION #define CORD_cat(x, y) CORD_cat_optimized(x, y) @@ -47,15 +48,29 @@ class SQL_Connection; class Dictionary; class Request_charsets; class String; -typedef Array ArrayString; class VRegex; -// generally useful +#ifdef NDEBUG +typedef Array ArrayString; +#else +class ArrayString : public Array { +public: + inline ArrayString(size_t initial=0) : Array(initial){ + } + inline Array& operator+=(element_type src) { + assert(src != NULL); + return Array::operator+=(src); + } + + inline element_type get(size_t index) const { + element_type result=Array::get(index); + assert(result != NULL); + return result; + } +}; +#endif -int pa_atoi(const char* str, const String* problem_source=0); -double pa_atod(const char* str, const String* problem_source=0); -unsigned int pa_atoui(const char *str, int base, const String* problem_source=0); -unsigned long long int pa_atoul(const char *str, int base, const String* problem_source=0); +// generally useful /// this is result of pos functions which mean that substr were not found #define STRING_NOT_FOUND ((size_t)-1) @@ -73,6 +88,7 @@ unsigned long long int pa_atoul(const ch class String: public PA_Object { + friend class StringSplitHelper; public: /** piece is tainted or not. the lang to use when detaint @@ -89,29 +105,30 @@ public: WARNING WARNING WARNING WARNING WARNING WARNING */ + +#if _MSC_VER >= 1900 + /// required for VS2015+ to make sizeof(Languages::opt) == sizeof(CORD), will be 16 byte under x64 without it + enum Language : size_t { +#else enum Language { - L_UNSPECIFIED=0, ///< no real string has parts of this lange: it's just convinient to check when string's empty +#endif + L_UNSPECIFIED=0, ///< no real string has parts of this lange: it's just convinient to check when string's empty // these two must go before others, there are checks for >L_AS_IS - L_CLEAN='0', ///< clean WARNING: read above warning before changing - L_AS_IS='A', ///< leave all characters intact WARNING: read above warning before changing + L_CLEAN='0', ///< clean WARNING: read above warning before changing + L_AS_IS='A', ///< leave all characters intact WARNING: read above warning before changing - L_PASS_APPENDED='P', - /**< - leave lang built into string being appended. - just a flag, that value not stored - */ - L_TAINTED='T', ///< tainted, untaint lang as assigned later + L_TAINTED='T', ///< tainted, untaint lang as assigned later // untaint langs. assigned by ^untaint[lang]{...} L_FILE_SPEC='F', ///< file specification L_HTTP_HEADER='h', ///< text in HTTP response header L_MAIL_HEADER='m', ///< text in mail header - L_URI='U', ///< text in uri - L_SQL='Q', ///< ^table:sql body - L_JS='J', ///< JavaScript code - L_XML='X', ///< ^xdoc:create xml - L_HTML='H', ///< HTML code + L_URI='U', ///< text in uri + L_SQL='Q', ///< ^table:sql body + L_JS='J', ///< JavaScript code + L_XML='X', ///< ^xdoc:create xml + L_HTML='H', ///< HTML code L_REGEX='R', ///< RegExp - L_JSON='S', ///< JSON code + L_JSON='S', ///< JSON code L_HTTP_COOKIE='C', ///< cookies encoded as %uXXXX for compartibility with js functions encode/decode L_PARSER_CODE='p', ///< ^process body // READ WARNING ABOVE BEFORE ADDING ANYTHING @@ -175,7 +192,7 @@ public: public: - const char* v() const; + const char* visualize() const; void dump() const; Languages(): langs(0) {} @@ -316,6 +333,7 @@ public: size_t length; C(): str(0), length(0) {} C(const char *astr, size_t asize): str(astr), length(asize) {} + explicit C(Body abody): str(abody.cstr()), length(abody.length()) {} }; struct Cm { @@ -323,6 +341,7 @@ public: size_t length; Cm(): str(0), length(0) {} Cm(char *astr, size_t asize): str(astr), length(asize) {} + explicit Cm(Body abody): str(abody.cstrm()), length(abody.length()) {} }; class Body { @@ -352,12 +371,11 @@ public: public: - const char* v() const; void dump() const; Body(): body(CORD_EMPTY) INIT_HASH_CODE(0) INIT_LENGTH(0) {} - Body(const char *abody): body(AS_CORD(abody)) INIT_HASH_CODE(0) INIT_LENGTH(0) {} - Body(CORD abody, uint ahash_code): body(abody) INIT_HASH_CODE(ahash_code) INIT_LENGTH(0) {} + explicit Body(const char *abody): body(AS_CORD(abody)) INIT_HASH_CODE(0) INIT_LENGTH(0) {} + explicit Body(CORD abody, uint ahash_code): body(abody) INIT_HASH_CODE(ahash_code) INIT_LENGTH(0) {} explicit Body(C ac): body(AS_CORD(ac.str)) INIT_HASH_CODE(0) INIT_LENGTH(ac.length) {} explicit Body(CORD abody): body(abody) INIT_HASH_CODE(0) INIT_LENGTH(0) { #ifdef CORD_CAT_OPTIMIZATION @@ -378,8 +396,7 @@ public: #endif } - - static Body Format(int value); + static Body uitoa(size_t aindex); void clear() { ZERO_LENGTH ZERO_HASH_CODE body=CORD_EMPTY; } @@ -388,6 +405,7 @@ public: inline CORD get_cord() const { return body; } uint get_hash_code() const; + // never null const char* cstr() const { #ifdef STRING_LENGTH_CACHING string_length = length(); @@ -400,6 +418,7 @@ public: return CORD_to_const_char_star(body, length()); } + // never null char* cstrm() const { return CORD_to_char_star(body, length()); } #ifdef STRING_LENGTH_CACHING @@ -486,12 +505,11 @@ public: size_t* out_start=0, size_t* out_length=0, Charset* source_charset=0) const; }; -protected: +private: Body body; ///< all characters of string Languages langs; ///< string characters lang - const char* v() const; void dump() const; #define ASSERT_STRING_INVARIANT(string) \ assert((string).langs.invariant((string).body.length())) @@ -565,6 +583,8 @@ public: return langs.opt.lang; } + char* visualize_langs() const; + /// puts pieces to buf Cm serialize(size_t prolog_size) const; /// appends pieces from buf to self @@ -587,6 +607,7 @@ public: return Body(x).ncmp(0/*x_begin*/, body, 0/*y_begin*/, length())==0; } + String& append_to(String& dest) const; String& append_to(String& dest, Language lang, bool forced=false) const; String& append(const String& src, Language lang, bool forced=false) { return src.append_to(*this, lang, forced); @@ -599,7 +620,7 @@ public: return *this; } - String& operator << (const String& src) { return append(src, L_PASS_APPENDED); } + String& operator << (const String& src) { return src.append_to(*this); } String& operator << (const char* src) { return append_help_length(src, 0, L_AS_IS); } String& operator << (const Body& src){ langs.appendHelper(body, L_AS_IS, src); @@ -626,17 +647,14 @@ public: String& mid(size_t substr_begin, size_t substr_end) const; String& mid(Charset& charset, size_t from, size_t to, size_t helper_length=0) const; - /** - ignore lang if it's L_UNSPECIFIED - but when specified: look for substring that lies in ONE fragment in THAT lang - @return position of substr in string, -1 means "not found" [const char* version] - */ - size_t pos(const Body substr, size_t this_offset=0, Language lang=L_UNSPECIFIED) const; - /// String version of @see pos(const char*, int, Language) - size_t pos(const String& substr, size_t this_offset=0, Language lang=L_UNSPECIFIED) const; - size_t pos(char c, size_t this_offset=0) const { - return body.pos(c, this_offset); - } + /// return position of substr in string, -1 means "not found" [const char* version] + size_t pos(const char* substr, size_t this_offset=0) const { return body.pos(substr, this_offset); } + size_t pos(const Body substr, size_t this_offset=0) const { return body.pos(substr, this_offset); } + size_t pos(const String& substr, size_t this_offset=0) const { return body.pos(substr.body, this_offset); } + size_t pos(char c, size_t this_offset=0) const { return body.pos(c, this_offset); } + /// ignore lang if it's L_UNSPECIFIED, otherwise look for substring that lies in ONE fragment in THAT lang + size_t pos(const Body substr, size_t this_offset, Language lang) const; + size_t pos(const String& substr, size_t this_offset, Language lang) const; size_t pos(Charset& charset, const String& substr, size_t this_offset=0, Language lang=L_UNSPECIFIED) const; size_t strrpbrk(const char* chars, size_t left=0) const { @@ -653,8 +671,8 @@ public: return body.rskipchars(chars, left, right); } - void split(ArrayString& result, size_t& pos_after, const char* delim, Language lang=L_UNSPECIFIED, int limit=-1) const; - void split(ArrayString& result, size_t& pos_after, const String& delim, Language lang=L_UNSPECIFIED, int limit=-1) const; + void split(ArrayString& result, size_t pos_after, const char* delim, Language lang=L_UNSPECIFIED) const; + void split(ArrayString& result, size_t pos_after, const String& delim, Language lang=L_UNSPECIFIED) const; typedef void (*Row_action)(Table& table, ArrayString* row, int prestart, int prefinish, int poststart, int postfinish, void *info); @@ -676,7 +694,7 @@ public: const String& replace(const Dictionary& dict) const; const String& trim(Trim_kind kind=TRIM_BOTH, const char* chars=0, Charset* source_charset=0) const; double as_double() const { return pa_atod(cstr(), this); } - int as_int() const { return pa_atoi(cstr(), this); } + int as_int() const { return pa_atoi(cstr(), 0, this); } bool as_bool() const { return as_int()!=0; } const String& escape(Charset& source_charset) const;