--- parser3/src/include/pa_string.h 2008/07/17 09:44:29 1.173 +++ parser3/src/include/pa_string.h 2009/05/15 06:57:43 1.185 @@ -1,14 +1,14 @@ /** @file Parser: string class decl. - Copyright (c) 2001-2005 ArtLebedev Group (http://www.artlebedev.com) + Copyright (c) 2001-2009 ArtLebedev Group (http://www.artlebedev.com) Author: Alexandr Petrosian (http://paf.design.ru) */ #ifndef PA_STRING_H #define PA_STRING_H -static const char * const IDENT_STRING_H="$Date: 2008/07/17 09:44:29 $"; +static const char * const IDENT_STRING_H="$Date: 2009/05/15 06:57:43 $"; // includes #include "pa_types.h" @@ -21,6 +21,9 @@ extern "C" { // cord's author forgot to // defines +// cache hash code in String::Body for faster hash access +#define HASH_CODE_CACHING + // cord extension /* Returns true if x does contain */ /* char not_c at positions i..i+n. Value i,i+n must be < CORD_len(x). */ @@ -36,6 +39,7 @@ class Dictionary; class Request_charsets; class String; typedef Array ArrayString; +class VRegex; // generally useful @@ -57,6 +61,8 @@ inline size_t get_length(T current) { - whether they are tainted or not, and the lang which should be used to detaint them */ + + class String: public PA_Object { public: @@ -97,6 +103,7 @@ public: L_HTML='H', ///< HTML code L_REGEX='R', ///< RegEx expression L_HTTP_COOKIE='C', ///< cookies encoded as %uXXXX for compartibility with js functions encode/decode + L_FILE_POST='f', ///temporary escaping zero-char // READ WARNING ABOVE BEFORE ADDING ANYTHING L_OPTIMIZE_BIT = 0x80 ///< flag, requiring cstr whitespace optimization }; @@ -107,14 +114,15 @@ public: TRIM_END }; + union Languages { struct { #ifdef PA_LITTLE_ENDIAN Language lang:8; - int is_not_just_lang:sizeof(CORD)*8-8; + size_t is_not_just_lang:sizeof(CORD)*8-8; #elif defined(PA_BIG_ENDIAN) - int is_not_just_lang:sizeof(CORD)*8-8; + size_t is_not_just_lang:sizeof(CORD)*8-8; Language lang:8; #else # error word endianness not determined for some obscure reason @@ -171,7 +179,7 @@ public: if(!opt.is_not_just_lang) if(opt.lang) { - if(opt.lang==alang) // same length? ignoring + if(opt.lang==alang) // same language? ignoring return; } else { opt.lang=alang; // to uninitialized @@ -181,20 +189,34 @@ public: append(current, CORD_chars((char)alang, asize)); } - /// MUST be called exactly prior to modification of current [uses it's length] template - void append(C current, size_t appending_length, - const Languages src) { - assert(appending_length); + void appendHelper(C current, Language alang, C asize_helper) { + assert(alang); + + if(!opt.is_not_just_lang) + if(opt.lang) { + if(opt.lang==alang) // same language? ignoring + return; + } else { + opt.lang=alang; // to uninitialized + return; + } + + append(current, CORD_chars((char)alang, asize_helper.length())); + } + + template + void appendHelper(C current, C length_helper, const Languages src) { if(!langs) langs=src.langs; // to uninitialized else if(!src.opt.is_not_just_lang) - append(current, src.opt.lang, appending_length); // simplifying when simple source + appendHelper(current, src.opt.lang, length_helper); // simplifying when simple source else - append(current, src.make_langs(appending_length)); + append(current, src.make_langs(length_helper)); } - + + /// MUST be called exactly prior to modification of current [uses it's length] template void append(C current, @@ -257,13 +279,25 @@ public: CORD body; +#ifdef HASH_CODE_CACHING + // cached hash code is not reseted on write operations as test shows + // that string body does not change after it is stored as a hash key + mutable uint hash_code; +#endif + public: const char* v() const; void dump() const; +#ifdef HASH_CODE_CACHING + Body(): body(CORD_EMPTY), hash_code(0) {} + Body(CORD abody, uint ahash_code): body(abody), hash_code(ahash_code) {} + Body(CORD abody): body(abody), hash_code(0) { +#else Body(): body(CORD_EMPTY) {} Body(CORD abody): body(abody) { +#endif assert(!body // no body || *body // ordinary string || body[1]==1 // CONCAT_HDR @@ -271,17 +305,15 @@ public: || body[1]==6 // SUBSTR_HDR ); } - /// WARNING: length is only HELPER length, str in ANY case should be zero-terminated - Body(const char* str, size_t helper_length): body(CORD_EMPTY) { - append_know_length(str, helper_length?helper_length:strlen(str)); - } + static Body Format(int value); void clear() { body=CORD_EMPTY; } bool operator! () const { return is_empty(); } - uint hash_code() const; + CORD get_cord() const { return body; } + uint get_hash_code() const; const char* cstr() const { return CORD_to_const_char_star(body); } char* cstrm() const { return CORD_to_char_star(body); } @@ -318,7 +350,7 @@ public: Body mid(size_t index, size_t length) const { return CORD_substr(body, index, length); } size_t pos(const char* substr, size_t offset=0) const { return CORD_str(body, offset, substr); } size_t pos(const Body substr, size_t offset=0) const { - if(!substr.length()) + if(substr.is_empty()) return STRING_NOT_FOUND; // in this case CORD_str returns 0 [parser users got used to -1] // CORD_str checks for bad offset [CORD_chr does not] @@ -386,8 +418,20 @@ public: static const String Empty; - explicit String(const char* cstr=0, size_t helper_length=0, bool tainted=false); - explicit String(const C cstr, bool tainted=false); + explicit String(){}; + explicit String(const char* cstr, Language alang=L_CLEAN){ + if(cstr && *cstr){ + body=cstr; + langs=alang; + } + } + explicit String(const String::C cstr, Language alang=L_CLEAN){ + if(cstr.length){ + body=cstr.str; + langs=alang; + } + } + String(int value, char *format); String(Body abody, Language alang): body(abody), langs(alang) { ASSERT_STRING_INVARIANT(*this); } @@ -396,7 +440,11 @@ public: } /// for convinient hash lookup +#ifdef HASH_CODE_CACHING + operator const Body&() const { return body; } +#else operator const Body() const { return body; } +#endif bool is_empty() const { return body.is_empty(); } size_t length() const { return body.length(); } @@ -503,9 +551,7 @@ public: prematch/match/postmatch/1/2/3/... @endverbatim */ - Table* match(Charset& source_charset, - const String& regexp, - const String* options, + Table* match(VRegex* vregex, Row_action row_action, void *info, int& matches_count) const; enum Change_case_kind { @@ -519,6 +565,7 @@ public: double as_double() const { return pa_atod(cstr(), this); } int as_int() const { return pa_atoi(cstr(), this); } bool as_bool() const { return as_int()!=0; } + const String& escape(Charset& source_charset) const; private: //disabled @@ -531,10 +578,12 @@ inline size_t get_length(S return body.length(); } +#ifndef HASH_CODE_CACHING /// simple hash code of string. used by Hash inline uint hash_code(const String::Body self) { - return self.hash_code(); + return self.get_hash_code(); } +#endif /// now that we've declared specialization we can use it