Annotation of parser3/src/include/pa_string.h, revision 1.145
1.41 paf 1: /** @file
1.43 paf 2: Parser: string class decl.
3:
1.145 ! paf 4: Copyright (c) 2001-2003 ArtLebedev Group (http://www.artlebedev.com)
1.124 paf 5: Author: Alexandr Petrosian <paf@design.ru> (http://paf.design.ru)
1.1 paf 6: */
7:
8: #ifndef PA_STRING_H
9: #define PA_STRING_H
1.140 paf 10:
1.145 ! paf 11: static const char* IDENT_STRING_H="$Date: 2003/04/21 06:37:58 $";
! 12:
! 13: // includes
1.1 paf 14:
1.4 paf 15: #include "pa_types.h"
1.145 ! paf 16: #include "pa_array.h"
! 17:
! 18: extern "C" { // cord's author forgot to do that
! 19: #define CORD_NO_IO
! 20: #include "cord.h"
! 21: };
1.4 paf 22:
1.145 ! paf 23: // forwards
1.9 paf 24:
1.145 ! paf 25: class Charset;
1.135 paf 26: class Table;
1.71 paf 27: class SQL_Connection;
1.101 parser 28: class Dictionary;
1.145 ! paf 29: class Request_charsets;
! 30: class String;
! 31: typedef Array<const String*> ArrayString;
! 32:
! 33: /// result of the pos() functions meaning that the substring was not found
! 34: #define STRING_NOT_FOUND ((size_t)-1)
! 35:
! 36: class StringBody {
! 37:
! 38: CORD body;
! 39:
! 40: public:
! 41:
! 42: StringBody(): body(CORD_EMPTY) {}
! 43: StringBody(CORD abody): body(abody) {
! 44: assert(!body // no body
! 45: || *body // ordinary string
! 46: || body[1]==1 // CONCAT_HDR
! 47: || body[1]==4 // FN_HDR
! 48: || body[1]==6 // SUBSTR_HDR
! 49: );
! 50: }
! 51: /// WARNING: length is only HELPER length, str in ANY case should be zero-terminated
! 52: StringBody(const char* str, size_t helper_length): body(CORD_EMPTY) {
! 53: append_know_length(str, helper_length?helper_length:strlen(str));
! 54: }
! 55: static StringBody Format(int value);
! 56:
! 57: void clear() { body=CORD_EMPTY; }
! 58:
! 59: bool operator! () const { return is_empty(); }
! 60:
! 61: uint hash_code() const;
! 62:
! 63: const char* cstr() const { return CORD_to_const_char_star(body); }
! 64: char* cstrm() const { return CORD_to_char_star(body); }
! 65:
! 66: size_t length() const { return CORD_len(body); }
! 67:
! 68: bool is_empty() const { return body==CORD_EMPTY; }
! 69:
! 70: void append_know_length(const char *str, size_t known_length) {
! 71: if(known_length)
! 72: body=CORD_cat_char_star(body, str, known_length);
! 73: }
! 74: void append_strdup_know_length(const char* str, size_t known_length) {
! 75: if(known_length)
! 76: append_know_length(pa_strdup(str, known_length), known_length);
! 77: }
! 78: void append(char c) { body=CORD_cat_char(body, c); }
! 79: StringBody& operator << (const StringBody src) { body=CORD_cat(body, src.body); return *this; }
! 80: StringBody& operator << (const char* str) { append_know_length(str, strlen(str)); return *this; }
! 81:
! 82: // could not figure out why this operator is needed [should do this chain: string->simple->==]
! 83: bool operator < (const StringBody src) const { return CORD_cmp(body, src.body)<0; }
! 84: bool operator > (const StringBody src) const { return CORD_cmp(body, src.body)>0; }
! 85: bool operator <= (const StringBody src) const { return CORD_cmp(body, src.body)<=0; }
! 86: bool operator >= (const StringBody src) const { return CORD_cmp(body, src.body)>=0; }
! 87: bool operator != (const StringBody src) const { return CORD_cmp(body, src.body)!=0; }
! 88: bool operator == (const StringBody src) const { return CORD_cmp(body, src.body)==0; }
! 89:
! 90: int ncmp(size_t x_begin, const StringBody y, size_t y_begin, size_t size) const {
! 91: return CORD_ncmp(body, x_begin, y.body, y_begin, size);
! 92: }
! 93:
! 94: char fetch(size_t index) const { return CORD_fetch(body, index); }
! 95: StringBody mid(size_t index, size_t length) const { return CORD_substr(body, index, length); }
! 96: size_t pos(const char* substr, size_t offset=0) const { return CORD_str(body, offset, substr); }
! 97: size_t pos(const StringBody substr, size_t offset=0) const {
! 98: if(!substr.length())
! 99: return STRING_NOT_FOUND; // in this case CORD_str returns 0 [parser users got used to -1]
! 100: return CORD_str(body, offset, substr.body);
! 101: }
! 102: size_t pos(char c,
! 103: size_t offset=0) const {
! 104: return CORD_chr(body, offset, c);
! 105: }
! 106:
! 107: template<typename I> void for_each(int (*callback)(const char* s, I), I info) const {
! 108: CORD_iter5(body, 0, 0, (CORD_batched_iter_fn)callback, info);
! 109: }
! 110:
! 111: void set_pos(CORD_pos& pos, size_t index) const { CORD_set_pos(pos, body, index); }
! 112:
! 113: StringBody normalize() const {
! 114: return StringBody(CORD_balance(body));
! 115: }
! 116:
! 117: void dump() const {
! 118: CORD_dump(body);
! 119: }
! 120: };
1.62 paf 121:
1.42 paf 122: /**
1.145 ! paf 123: String which knows the language of all it's fragments.
1.41 paf 124:
125: All pieces remember
126: - whether they are tainted or not,
127: and the language which should be used to detaint them
128: */
1.145 ! paf 129: class String: public PA_Object {
! 130:
! 131: // friend class StringBody;
! 132:
1.1 paf 133: public:
1.48 paf 134:
1.106 parser 135: /** piece is tainted or not. the language to use when detaint
136: remember to change String_Untaint_lang_name @ untaint.C along
137: */
1.145 ! paf 138: enum Language {
! 139: L_UNSPECIFIED=0, ///< zero value handy for hash lookup @see untaint_lang_name2enum
! 140: // these two must go before others, there are checks for >L_AS_IS
! 141: L_CLEAN, ///< clean
! 142: L_AS_IS, ///< leave all characters intact
1.122 paf 143:
1.145 ! paf 144: L_PASS_APPENDED,
1.41 paf 145: /**<
146: leave language built into string being appended.
147: just a flag, that value not stored
148: */
1.145 ! paf 149: L_TAINTED, ///< tainted, untaint language as assigned later
1.122 paf 150: // untaint languages. assigned by ^untaint[lang]{...}
1.145 ! paf 151: L_FILE_SPEC, ///< file specification
! 152: L_HTTP_HEADER, ///< text in HTTP response header
! 153: L_MAIL_HEADER, ///< text in mail header
! 154: L_URI, ///< text in uri
! 155: L_TABLE, ///< ^table:set body
! 156: L_SQL, ///< ^table:sql body
! 157: L_JS, ///< JavaScript code
! 158: L_XML, ///< ^dom:set xml
! 159: L_HTML, ///< HTML code (for editing)
! 160: L_OPTIMIZE_BIT = 0x8000 ///< flag, requiring cstr whitespace optimization
1.27 paf 161: };
162:
1.145 ! paf 163: struct Fragment {
! 164: Language lang; ///< untaint flag, later untaint language
! 165: size_t length; ///< length
! 166: Fragment(Language alang, size_t asize): lang(alang), length(asize) {
! 167: assert(alang!=L_UNSPECIFIED);
! 168: assert(asize!=0);
! 169: assert(asize!=(size_t)-1);
! 170: }
! 171: };
! 172:
! 173: class ArrayFragment: public Array<Fragment> {
! 174: void append(element_type src) {
! 175: *static_cast<Array<Fragment> *>(this)+=src;
! 176: }
! 177: /// hiding from accidental USE, use append_positions
! 178: void append(const ArrayFragment& src, int offset, int limit) {
! 179: static_cast<Array<Fragment> *>(this)->append(src, offset, limit);
! 180: }
! 181: public:
! 182: ArrayFragment& operator += (element_type src) {
! 183: if(size_t lcount=count()) { // not empty?
! 184: // try to join with last
! 185: Fragment& last=get_ref(lcount-1);
! 186: if(last.lang==src.lang) {
! 187: last.length+=src.length;
! 188: return *this;
! 189: }
! 190: }
! 191: append(src);
! 192: return *this;
! 193: }
! 194: void append(const ArrayFragment& src) { append(src, 0, ARRAY_OPTION_LIMIT_ALL); }
! 195: void append_positions(const ArrayFragment& src, size_t substr_begin, size_t substr_end);
! 196:
! 197: size_t length() {
! 198: size_t result=0;
! 199: for(Array_iterator<element_type> i(*this); i.has_next(); ) {
! 200: const Fragment fragment=i.next();
! 201: result+=fragment.length;
! 202: }
! 203: return result;
! 204: }
! 205: };
! 206:
! 207: struct C {
! 208: const char *str;
! 209: size_t length;
! 210: operator const char *() { return str; }
! 211: C(const char *astr, size_t asize): str(astr), length(asize) {}
! 212: };
! 213:
! 214: struct Cm {
! 215: char *str;
! 216: size_t length;
! 217: //operator char *() { return str; }
! 218: Cm(char *astr, size_t asize): str(astr), length(asize) {}
! 219: };
! 220:
! 221: private:
! 222:
! 223: StringBody body; ///< all characters of string
! 224: ArrayFragment fragments; ///< fragment language+length info
! 225:
1.8 paf 226: public:
227:
1.145 ! paf 228: explicit String(const char* cstr=0, size_t helper_length=0, bool tainted=false);
! 229: explicit String(const C cstr, bool tainted=false);
1.14 paf 230: String(const String& src);
1.145 ! paf 231: String(StringBody abody, Language alang): body(abody) {
! 232: fragments+=Fragment(alang, abody.length());
! 233: }
! 234:
! 235: #define ASSERT_STRING_INVARIANT(string) \
! 236: assert((string).body.length()==(string).fragments.length())
! 237:
! 238: /// for convinient hash lookup
! 239: operator const StringBody() const { return body; }
! 240:
! 241: bool is_empty() const { return body.is_empty(); }
! 242: size_t length() const { return body.length(); }
! 243:
! 244: /// convert to CORD. if 'lang' known, forcing 'lang' to it
! 245: StringBody cstr_to_string_body(Language lang=L_AS_IS,
! 246: SQL_Connection* connection=0,
! 247: const Request_charsets *charsets=0) const;
! 248:
! 249: /// convert to constant C string. if 'lang' known, forcing 'lang' to it
! 250: const char* cstr(Language lang=L_AS_IS,
! 251: SQL_Connection* connection=0,
! 252: const Request_charsets *charsets=0) const {
! 253: return cstr_to_string_body(lang, connection, charsets).cstr();
! 254: }
! 255: /// convert to Modifiable C string. if 'lang' known, forcing 'lang' to it
! 256: char *cstrm(Language lang=L_AS_IS,
! 257: SQL_Connection* connection=0,
! 258: const Request_charsets *charsets=0) const {
! 259: return cstr_to_string_body(lang, connection, charsets).cstrm();
1.50 paf 260: }
1.108 parser 261: /// puts pieces to buf
1.145 ! paf 262: Cm serialize(size_t prolog_size) const;
1.108 parser 263: /// appends pieces from buf to self
1.145 ! paf 264: bool deserialize(size_t prolog_size, void *buf, size_t buf_size);
! 265: /// @see StringBody::append_know_length
! 266: String& append_know_length(const char* str, size_t known_length, Language lang);
! 267: /// @see StringBody::append_help_length
! 268: String& append_help_length(const char* str, size_t helper_length, Language lang);
! 269: String& append_strdup(const char* str, size_t helper_length, Language lang);
! 270:
! 271: bool operator == (const char* y) const { return body==StringBody(y); }
! 272: bool operator != (const char* y) const { return body!=StringBody(y); }
! 273:
! 274: /// this starts with y
! 275: bool starts_with(const char* y) const {
! 276: return body.ncmp(0/*x_begin*/, StringBody(y), 0/*y_begin*/, strlen(y))==0;
! 277: }
! 278: /// x starts with this
! 279: bool this_starts(const char* x) const {
! 280: return StringBody(x).ncmp(0/*x_begin*/, body, 0/*y_begin*/, length())==0;
1.26 paf 281: }
282:
1.145 ! paf 283: String& append_to(String& dest, Language lang, bool forced) const;
! 284: String& append(const String& src, Language lang, bool forced=false) {
! 285: return src.append_to(*this, lang, forced);
! 286: }
! 287: String& operator << (const String& src) { return append(src, L_PASS_APPENDED); }
! 288: String& operator << (const char* src) { return append_help_length(src, 0, L_AS_IS); }
! 289: String& operator << (const StringBody src) {
! 290: body<<src;
! 291: fragments+=Fragment(L_AS_IS, src.length());
! 292: return *this;
! 293: }
1.100 parser 294:
1.142 paf 295: /// extracts first char of a string, if any
296: char first_char() const {
1.145 ! paf 297: return is_empty()?0:body.fetch(0);
1.142 paf 298: }
1.54 paf 299:
1.145 ! paf 300: bool operator < (const String& src) const { return body<src.body; }
! 301: bool operator > (const String& src) const { return body>src.body; }
! 302: bool operator <= (const String& src) const { return body<=src.body; }
! 303: bool operator >= (const String& src) const { return body>=src.body; }
! 304: bool operator != (const String& src) const { return body!=src.body; }
! 305: bool operator == (const String& src) const { return body==src.body; }
! 306:
1.54 paf 307: /// extracts [start, finish) piece of string
1.145 ! paf 308: String& mid(size_t substr_begin, size_t substr_end) const;
! 309:
! 310: /**
! 311: ignore lang if it's L_UNSPECIFIED
! 312: but when specified: look for substring that lies in ONE fragment in THAT lang
! 313: @return position of substr in string, -1 means "not found" [const char* version]
! 314: */
! 315: size_t pos(const StringBody substr,
! 316: size_t this_offset=0, Language lang=L_UNSPECIFIED) const;
! 317: /// String version of @see pos(const char*, int, Language)
! 318: size_t pos(const String& substr,
! 319: size_t this_offset=0, Language lang=L_UNSPECIFIED) const;
! 320: size_t pos(char c,
! 321: size_t this_offset=0) const {
! 322: return body.pos(c, this_offset);
! 323: }
1.55 paf 324:
1.145 ! paf 325: void split(ArrayString& result,
! 326: size_t& pos_after,
! 327: const char* delim,
! 328: Language lang=L_UNSPECIFIED, int limit=-1) const;
! 329: void split(ArrayString& result,
! 330: size_t& pos_after,
1.62 paf 331: const String& delim,
1.145 ! paf 332: Language lang=L_UNSPECIFIED, int limit=-1) const;
1.62 paf 333:
1.145 ! paf 334: typedef void (*Row_action)(Table& table, ArrayString* row,
1.136 paf 335: int prestart, int prefinish,
336: int poststart, int postfinish,
1.68 paf 337: void *info);
1.87 parser 338: /**
1.145 ! paf 339: @return table of found items, if any.
1.87 parser 340: table format is defined and fixed[can be used by others]:
341: @verbatim
342: prematch/match/postmatch/1/2/3/...
343: @endverbatim
344: */
1.145 ! paf 345: Table* match(Charset& source_charset,
1.64 paf 346: const String& regexp,
1.145 ! paf 347: const String* options,
1.99 parser 348: Row_action row_action, void *info,
1.145 ! paf 349: bool& just_matched) const;
1.87 parser 350: enum Change_case_kind {
351: CC_UPPER,
352: CC_LOWER
353: };
1.145 ! paf 354: String& change_case(Charset& source_charset,
1.87 parser 355: Change_case_kind kind) const;
1.145 ! paf 356: const String& replace(const Dictionary& dict) const;
1.96 parser 357: double as_double() const;
358: int as_int() const;
1.137 paf 359:
1.7 paf 360: private: //disabled
361:
1.12 paf 362: String& operator = (const String&) { return *this; }
1.7 paf 363:
1.1 paf 364: };
1.119 paf 365:
1.145 ! paf 366: /// simple hash code of string. used by Hash
! 367: inline uint hash_code(const StringBody self) {
! 368: return self.hash_code();
1.119 paf 369: }
1.1 paf 370:
371: #endif
E-mail: