--- parser3/src/include/pa_string.h 2012/03/16 09:24:10 1.201 +++ parser3/src/include/pa_string.h 2015/10/08 18:29:15 1.211 @@ -8,7 +8,7 @@ #ifndef PA_STRING_H #define PA_STRING_H -#define IDENT_PA_STRING_H "$Id: pa_string.h,v 1.201 2012/03/16 09:24:10 moko Exp $" +#define IDENT_PA_STRING_H "$Id: pa_string.h,v 1.211 2015/10/08 18:29:15 moko Exp $" // includes #include "pa_types.h" @@ -22,7 +22,7 @@ extern "C" { // cord's author forgot to #define CORD_cat(x, y) CORD_cat_optimized(x, y) #define CORD_cat_char_star(x, y, leny) CORD_cat_char_star_optimized(x, y, leny) #endif -}; +} // defines @@ -54,10 +54,15 @@ class VRegex; int pa_atoi(const char* str, const String* problem_source=0); double pa_atod(const char* str, const String* problem_source=0); +unsigned int pa_atoui(const char *str, int base, const String* problem_source=0); +unsigned long long int pa_atoul(const char *str, int base, const String* problem_source=0); /// this is result of pos functions which mean that substr were not found #define STRING_NOT_FOUND ((size_t)-1) +/// CORD can't be empty string, thus checking it in assigment +#define AS_CORD(v) ((v) && *(v) ? (CORD)(v):0) + /** String which knows the lang of all it's langs. @@ -184,7 +189,7 @@ public: assert(alang); assert(length); - if(!opt.is_not_just_lang) + if(!opt.is_not_just_lang) { if(opt.lang) { if(opt.lang==alang) // same language? ignoring return; @@ -192,6 +197,7 @@ public: opt.lang=alang; // to uninitialized return; } + } append(current, CORD_chars((char)alang, length)); } @@ -200,7 +206,7 @@ public: assert(alang); assert(length); - if(!opt.is_not_just_lang) + if(!opt.is_not_just_lang) { if(opt.lang) { if(opt.lang==alang) // same language? ignoring return; @@ -208,6 +214,7 @@ public: opt.lang=alang; // to uninitialized return; } + } append(current, CORD_chars((char)alang, length)); } @@ -215,7 +222,7 @@ public: void appendHelper(const Body& current, Language alang, const Body &length_helper) { assert(alang); - if(!opt.is_not_just_lang) + if(!opt.is_not_just_lang) { if(opt.lang) { if(opt.lang==alang) // same language? ignoring return; @@ -223,6 +230,7 @@ public: opt.lang=alang; // to uninitialized return; } + } append(current, CORD_chars((char)alang, length_helper.length())); } @@ -331,10 +339,12 @@ public: #ifdef HASH_CODE_CACHING Body(): body(CORD_EMPTY), hash_code(0) INIT_LENGTH {} Body(CORD abody, uint ahash_code): body(abody), hash_code(ahash_code) INIT_LENGTH {} - Body(CORD abody): body(abody), hash_code(0) INIT_LENGTH { + Body(const char *abody): body(AS_CORD(abody)), hash_code(0) INIT_LENGTH {} + explicit Body(CORD abody): body(abody), hash_code(0) INIT_LENGTH { #else Body(): body(CORD_EMPTY) INIT_LENGTH {} - Body(CORD abody): body(abody) INIT_LENGTH { + Body(const char *abody): body(AS_CORD(abody)) INIT_LENGTH {} + explicit Body(CORD abody): body(abody) INIT_LENGTH { #endif #ifdef CORD_CAT_OPTIMIZATION assert(!body // no body @@ -360,14 +370,17 @@ public: bool operator! () const { return is_empty(); } - CORD get_cord() const { return body; } + inline CORD get_cord() const { return body; } uint get_hash_code() const; const char* cstr() const { #ifdef STRING_LENGTH_CACHING string_length = length(); - if(string_length) - return const_cast(this)->body=CORD_to_const_char_star(body, string_length); + if(string_length){ + const char *result=CORD_to_const_char_star(body, string_length); + const_cast(this)->body=(CORD)result; + return result; + } #endif return CORD_to_const_char_star(body, length()); } @@ -376,7 +389,7 @@ public: #ifdef STRING_LENGTH_CACHING void set_length(size_t alength){ string_length = alength; } - size_t length() const { return body ? CORD_IS_STRING(body) ? string_length ? string_length : (string_length=strlen(body)) : CORD_len(body) : 0; } + size_t length() const { return body ? CORD_IS_STRING(body) ? string_length ? string_length : (string_length=strlen((const char *)body)) : CORD_len(body) : 0; } #else size_t length() const { return CORD_len(body); } #endif @@ -389,7 +402,7 @@ public: body = CORD_cat_char_star(body, str, known_length); ZERO_LENGTH } else { - body=str; + body=(CORD)str; #ifdef STRING_LENGTH_CACHING string_length=known_length; #endif @@ -405,21 +418,24 @@ public: Body& operator << (const char* str) { append_know_length(str, strlen(str)); return *this; } - // could not figure out why this operator is needed [should do this chain: string->simple->==] bool operator < (const Body src) const { return CORD_cmp(body, src.body)<0; } bool operator > (const Body src) const { return CORD_cmp(body, src.body)>0; } bool operator <= (const Body src) const { return CORD_cmp(body, src.body)<=0; } bool operator >= (const Body src) const { return CORD_cmp(body, src.body)>=0; } + bool operator != (const Body src) const { return CORD_cmp(body, src.body)!=0; } bool operator == (const Body src) const { return CORD_cmp(body, src.body)==0; } + bool operator != (const char *src) const { return CORD_cmp(body, AS_CORD(src))!=0; } + bool operator == (const char *src) const { return CORD_cmp(body, AS_CORD(src))==0; } + int ncmp(size_t x_begin, const Body y, size_t y_begin, size_t size) const { return CORD_ncmp(body, x_begin, y.body, y_begin, size); } char fetch(size_t index) const { return CORD_fetch(body, index); } - Body mid(size_t aindex, size_t alength) const { return CORD_substr(body, aindex, alength, length()); } - size_t pos(const char* substr, size_t offset=0) const { return CORD_str(body, offset, substr, length()); } + Body mid(size_t aindex, size_t alength) const { return Body(CORD_substr(body, aindex, alength, length())); } + size_t pos(const char* substr, size_t offset=0) const { return CORD_str(body, offset, AS_CORD(substr), length()); } size_t pos(const Body substr, size_t offset=0) const { if(substr.is_empty()) return STRING_NOT_FOUND; // in this case CORD_str returns 0 [parser users got used to -1] @@ -427,14 +443,17 @@ public: // CORD_str checks for bad offset [CORD_chr does not] return CORD_str(body, offset, substr.body, length()); } - size_t pos(char c, - size_t offset=0) const { + size_t pos(char c, size_t offset=0) const { if(offset>=length()) // CORD_chr does not check that [and ABORT's in that case] return STRING_NOT_FOUND; return CORD_chr(body, offset, c); } + size_t strrpbrk(const char* chars, size_t left, size_t right) const; + + size_t rskipchars(const char* chars, size_t left, size_t right) const; + template int for_each(int (*f)(char c, I), I info) const { return CORD_iter(body, (CORD_iter_fn)f, (void*)info); @@ -447,10 +466,6 @@ public: void set_pos(CORD_pos& pos, size_t index) const { CORD_set_pos(pos, body, index); } - /*Body normalize() const { - return Body(CORD_balance(body)); - }*/ - /// @returns this or 0 or mid. if returns this or 0 out_* are not filled Body trim(Trim_kind kind=TRIM_BOTH, const char* chars=0, size_t* out_start=0, size_t* out_length=0, Charset* source_charset=0) const; @@ -459,7 +474,7 @@ public: struct C { const char *str; size_t length; - operator const char *() { return str; } + //operator const char *() { return str; } C(): str(0), length(0) {} C(const char *astr, size_t asize): str(astr), length(asize) {} }; @@ -487,15 +502,13 @@ public: static const String Empty; explicit String(){}; - explicit String(const char* cstr, Language alang=L_CLEAN){ - if(cstr && *cstr){ - body=cstr; + explicit String(const char* cstr, Language alang=L_CLEAN) : body(cstr){ + if(body.get_cord()){ langs=alang; } } - explicit String(const char* cstr, Language alang, size_t alength){ - if(cstr && *cstr){ - body=cstr; + explicit String(const char* cstr, Language alang, size_t alength) : body(cstr){ + if(body.get_cord()){ #ifdef STRING_LENGTH_CACHING body.set_length(alength); #endif @@ -503,7 +516,16 @@ public: } } - String(int value, char *format); + explicit String(C ac, Language alang=L_CLEAN) : body(ac.str){ + if(body.get_cord()){ +#ifdef STRING_LENGTH_CACHING + body.set_length(ac.length); +#endif + langs=alang; + } + } + + String(int value, const char *format); String(Body abody, Language alang): body(abody), langs(alang) { ASSERT_STRING_INVARIANT(*this); } @@ -554,6 +576,14 @@ public: const char* untaint_and_transcode_cstr(Language lang, const Request_charsets *charsets) const; + bool is_not_just_lang() const { + return langs.opt.is_not_just_lang !=0; + } + + Language just_lang() const { + return langs.opt.lang; + } + /// puts pieces to buf Cm serialize(size_t prolog_size) const; /// appends pieces from buf to self @@ -596,11 +626,14 @@ public: return *this; } - /// extracts first char of a string, if any char first_char() const { return is_empty()?0:body.fetch(0); } + char last_char() const { + return is_empty()?0:body.fetch(body.length()-1); + } + bool operator < (const String& src) const { return body (const String& src) const { return body>src.body; } bool operator <= (const String& src) const { return body<=src.body; } @@ -630,6 +663,20 @@ public: const String& substr, size_t this_offset=0, Language lang=L_UNSPECIFIED) const; + size_t strrpbrk(const char* chars, size_t left=0) const { + return (length()) ? body.strrpbrk(chars, left, length()-1) : STRING_NOT_FOUND; + } + size_t strrpbrk(const char* chars, size_t left, size_t right) const { + return body.strrpbrk(chars, left, right); + } + + size_t rskipchars(const char* chars, size_t left=0) const { + return (length()) ? body.rskipchars(chars, left, length()-1) : STRING_NOT_FOUND; + } + size_t rskipchars(const char* chars, size_t left, size_t right) const { + return body.rskipchars(chars, left, right); + } + void split(ArrayString& result, size_t& pos_after, const char* delim,