|
|
/** @file
	Parser: String class part: untaint mechanism.

	Copyright(c) 2001, 2002 ArtLebedev Group (http://www.artlebedev.com)
	Author: Alexandr Petrosian <paf@design.ru> (http://paf.design.ru)
*/
1.8 paf 7:
/// revision stamp of this source file (the text between the '$' signs is a version-control keyword)
static const char* IDENT_UNTAINT_C=
	"$Date: 2002/09/24 10:24:23 $";
1.1 paf 9:
10: #include "pa_pool.h"
11: #include "pa_string.h"
12: #include "pa_hash.h"
13: #include "pa_exception.h"
1.13 paf 14: #include "pa_table.h"
1.32 paf 15: #include "pa_globals.h"
1.34 paf 16: #include "pa_sql_connection.h"
1.58 parser 17: #include "pa_dictionary.h"
1.66 parser 18: #include "pa_common.h"
1.85 paf 19: #include "pa_charset.h"
1.1 paf 20:
// Uncomment to measure how many row bytes String::append links instead of copying.
//#define DEBUG_STRING_APPENDS_VS_EXPANDS

#if defined(DEBUG_STRING_APPENDS_VS_EXPANDS)
/// running total, increased by String::append each time it takes the linking shortcut
ulong string_string_shortcut_economy=0;
#endif
26:
/*
	Escaping helpers for String::cstr_bufsize and String::store_to.

	They are textual macros on purpose: they use names that must exist
	at the place of expansion:
		row  - current string row (row->item.ptr and row->item.size are read)
		src  - cursor over the bytes being read
		dest - output cursor (in cstr_bufsize: the size accumulator)
*/

/// executes @a action once per byte of the current row, @c src pointing at that byte
#define escape(action) \
	{ \
		const char *src=row->item.ptr; \
		for(int size=row->item.size; size--; src++) \
			action \
	}

/// 'default:' branch for the per-byte switches: copies the byte unchanged
#define _default default: *dest++=*src; break

/**
	writes *src as is, or, when need_encode_func(*src) is true,
	as three bytes: @a prefix and two upper-case hexadecimal digits.

	WARNING: expands to an if/else statement FOLLOWED BY 'break'.
	When used as a loop body only the if/else is the body;
	the 'break' is executed after the loop and leaves the enclosing switch.
*/
#define encode(need_encode_func, prefix) \
	if(need_encode_func(*src)) { \
		static const char *hex="0123456789ABCDEF"; \
		char chunk[3]={prefix}; \
		chunk[1]=hex[((unsigned char)*src)/0x10]; \
		chunk[2]=hex[((unsigned char)*src)%0x10]; \
		memcpy(dest, chunk, 3);  dest+=3; \
	} else \
		*dest++=*src; \
	break

/// appends one char to the output
#define to_char(c)  *dest++=(c)

/// appends @a bsize bytes of @a b to the output; one statement, safe inside if/else
#define to_string(b, bsize) \
	do { \
		memcpy(dest, (b), (bsize)); \
		dest+=(bsize); \
	} while(0)
/**
	tells whether @a c must be encoded in a file specification.

	russian letters and space are ENABLED, only the listed chars are encoded.
	the zero char is reported as needing encoding too
	[strchr finds the terminator of the list].
*/
inline bool need_file_encode(unsigned char c) {
	const char* const forbidden=
		"*?'\"<>|"
#ifndef WIN32
		":\\~"
#endif
		;
	const bool listed=strchr(forbidden, c)!=0;
	return listed;
}
/**
	tells whether @a c must be written as %XX in an URI.

	left as is: latin letters, decimal digits and "_-./".
	everything else is encoded, the zero char included:
	strchr() treats the terminating zero as part of the list,
	so zero has to be checked explicitly or it would be emitted raw.
*/
inline bool need_uri_encode(unsigned char c) {
	const bool digit=c>='0' && c<='9';
	const bool upper=c>='A' && c<='Z';
	const bool lower=c>='a' && c<='z';
	if(digit || upper || lower)
		return false;

	return c==0 || !strchr("_-./", c);
}

/**
	tells whether @a c must be written as %XX in HTTP header field content.

	space, comma and colon are left as is,
	all other chars follow need_uri_encode rules.
	the zero char never counts as "left as is" [see strchr note in need_uri_encode].
*/
inline bool need_http_header_encode(unsigned char c) {
	if(c!=0 && strchr(" ,:", c))
		return false;

	return need_uri_encode(c);
}
1.1 paf 71:
1.56 parser 72: //
73:
/**
	short names of untaint languages for debug output.
	String::cstr_debug_origins indexes this table
	with the piece language, UL_OPTIMIZE_BIT removed.
*/
static const char * String_Untaint_lang_name[]={
	// zero value handy for hash lookup @see untaint_lang_name2enum
	"U",
	// clean
	"C",
	// tainted, untaint language as assigned later
	"T",

	// untaint languages. assigned by ^untaint[lang]{...}

	// leave language built into string being appended.
	// just a flag, that value not stored
	"P",
	// leave all characters intact
	"A",
	// file specification
	"F",
	// text in HTTP response header
	"H",
	// text in mail header
	"M",
	// text in uri
	"URI",
	// ^table:set body
	"T",
	// ^table:sql body
	"SQL",
	// JavaScript code
	"JS",
	// ^dom:set xml
	"XML",
	// HTML code (for editing)
	"HTML"
};
95:
96:
1.1 paf 97: // String
98:
/*
	HTTP grammar excerpt the quoting decision is based on:

	HTTP-header    = field-name ":" [ field-value ] CRLF

	field-name     = token
	field-value    = *( field-content | LWS )

	field-content  = <the OCTETs making up the field-value
					 and consisting of either *TEXT or combinations
					 of token, tspecials, and quoted-string>

	word           = token | quoted-string

	token          = 1*<any CHAR except CTLs or tspecials>

	tspecials      = "(" | ")" | "<" | ">" | "@"
				   | "," | ";" | ":" | "\" | <">
				   | "/" | "[" | "]" | "?" | "="
				   | "{" | "}" | SP | HT

	SP             = <US-ASCII SP, space (32)>
	HT             = <US-ASCII HT, horizontal-tab (9)>

	LWS            = [CRLF] 1*( SP | HT )
	TEXT           = <any OCTET except CTLs,
					 but including LWS>

	quoted-pair    = "\" CHAR

	check for the complete tspecials set would be:
		if(strchr("()<>@,;:\\\"/[]?={} \t", *ptr))
*/

/**
	scans @a size bytes at @a ptr and
	returns true as soon as a char demanding quoting is met:
	one of ';', '\', '"', '=', space, tab.
	[a zero byte triggers too: strchr matches the terminator of the list]
*/
inline bool need_quote_http_header(const char *ptr, size_t size) {
	// reduced tspecials, excluded ()<>@, :/ ? []{}
	const char* const triggers=";\\\"= \t";

	const char *end=ptr+size;
	while(ptr!=end) {
		if(strchr(triggers, *ptr))
			return true;
		++ptr;
	}

	return false;
}
140:
1.91 paf 141: //#include "pa_sapi.h"
142: /**
1.92 paf 143: appends other String,
1.91 paf 144: marking all tainted pieces of it with @a lang.
1.92 paf 145: or marking ALL pieces of it with a @a lang when @a forced to,
146: and propagating OPTIMIZE language bit.
1.91 paf 147:
148: using architecture advantage: after string-to-string-append string never modified.
149: algorithm:
150: if no language-change specified and src not yet appended to some other string[last_chunk!=0]
1.92 paf 151: shrinking dest last_chunk[preparing it for linking],
1.93 paf 152: ///shrinking src last_chunk[preparing it to be linked, consequent dest.appends would go there],
1.101 paf 153: linking[dest.last_chunk = src.head.chunk]
1.91 paf 154: if some language-change specified or src already appended to some other string[last_chunk==0]
155: cloning pieces.
156: */
1.77 paf 157: String& String::append(const String& src, uchar lang, bool forced) {
1.94 paf 158: if(!last_chunk) // growth stopped [we're appended as string to somebody]
1.99 paf 159: throw Exception(0,
1.94 paf 160: this,
161: "string growth stopped (append string)");
162:
1.93 paf 163: if(src.is_empty())
164: return *this;
165:
1.94 paf 166: // without language-chage, not-appended-before, big[not fitting our tail] string?
167: if(lang==UL_PASS_APPENDED
168: && src.last_chunk
169: && (uint(&last_chunk->rows[last_chunk->count]-append_here) < src.used_rows())) {
1.91 paf 170: #ifdef DEBUG_STRING_APPENDS_VS_EXPANDS
1.92 paf 171: string_string_shortcut_economy+=src.used_rows()*sizeof(String::Chunk::Row);
1.91 paf 172: #endif
1.94 paf 173:
1.93 paf 174: // using fact:
1.101 paf 175: // src.head.chunk.count initally equeals this.head.chunk.count and shrinks-only,
176: // so can't be more than this.head.chunk.count,
1.94 paf 177: // which means that we know that
1.101 paf 178: // src.head.chunk would fit into this.head.chunk
179: if(is_empty()) { // our head.chunk is empty
180: // they have more than head.chunk? we need all head.chunk : we need only filled-part of head.chunk
181: Chunk *src_head_link=src.head.chunk.rows[src.head.chunk.count].link;
182: size_t head_count=src_head_link?src.head.chunk.count:(src.append_here-src.head.chunk.rows);
183: // "your head.chunk is my head.chunk"
184: memcpy(head.chunk.rows, src.head.chunk.rows, sizeof(Chunk::Row)*(head_count));
1.94 paf 185: if(src_head_link) {
186: // "your body is my body"
1.101 paf 187: head.chunk.rows[head.chunk.count=head_count].link=src_head_link;
1.94 paf 188: // "your last_chunk is mine now"
189: last_chunk=src.last_chunk;
190: // "your append_here is mine now"
191: append_here=src.append_here;
192: } else {
193: // "your last_chunk is mine now"
1.101 paf 194: last_chunk=&head.chunk;
1.94 paf 195: // "your append_here is recalc-mine now"
1.101 paf 196: append_here=head.chunk.rows+head_count;
1.97 paf 197: }
1.101 paf 198: } else { // our head.chunk contains something
1.94 paf 199: // "chopping off my tail-reserve"
1.96 paf 200: last_chunk->count=append_here-last_chunk->rows;
1.93 paf 201: // "you is my tail"
1.101 paf 202: append_here->link=&src.head.chunk;
1.94 paf 203: // "your last_chunk is mine now"
204: last_chunk=src.last_chunk;
205: // "your append_here is mine now"
206: append_here=src.append_here;
1.93 paf 207: }
1.92 paf 208:
1.93 paf 209: // stop-growing mark
1.97 paf 210: src.last_chunk=0;
211: return *this;
1.91 paf 212: }
213:
1.77 paf 214: // manually unrolled code to avoid do{if(const)} constructs
215: if(forced)
216: STRING_SRC_FOREACH_ROW(
217: APPEND(row->item.ptr, row->item.size,
218: lang, //forcing passed lang
219: row->item.origin.file, row->item.origin.line);
220: )
221: else if(lang==UL_PASS_APPENDED)
222: STRING_SRC_FOREACH_ROW(
223: APPEND(row->item.ptr, row->item.size,
224: row->item.lang, // passing item's lang
225: row->item.origin.file, row->item.origin.line);
226: )
227: else if(lang&UL_OPTIMIZE_BIT) // main idea here
228: // tainted piece would get OPTIMIZED bit from 'lang'
229: // clean piece would be marked OPTIMIZED manually
230: // pieces with determined languages [not tainted|clean] would retain theirs langs
231: STRING_SRC_FOREACH_ROW(
232: APPEND(row->item.ptr, row->item.size,
233: row->item.lang==UL_TAINTED?lang:(
234: row->item.lang==UL_CLEAN?UL_CLEAN|UL_OPTIMIZE_BIT: // ORing with OPTIMIZED flag
235: row->item.lang
236: ),
237: row->item.origin.file, row->item.origin.line);
238: )
239: else
240: STRING_SRC_FOREACH_ROW(
241: APPEND(row->item.ptr, row->item.size,
242: row->item.lang==UL_TAINTED?lang:row->item.lang,
243: row->item.origin.file, row->item.origin.line);
244: );
1.97 paf 245: /*
1.96 paf 246: for(Chunk::Row *row=last_chunk->rows; row<append_here; row++)
247: if(row->link==(void*)0xcdcdcdcd)
1.97 paf 248: _asm int 3;*/
1.77 paf 249: return *this;
250: }
251:
1.75 paf 252: size_t String::cstr_bufsize(Untaint_lang lang,
253: SQL_Connection *connection,
1.87 paf 254: Charset *buf_charset) const {
1.77 paf 255: size_t dest=1; // for terminating 0
256: STRING_FOREACH_ROW(
257: uchar to_lang=lang==UL_UNSPECIFIED?row->item.lang:lang;
258:
259: switch(to_lang & ~UL_OPTIMIZE_BIT) {
260: case UL_CLEAN:
261: case UL_TAINTED:
262: case UL_AS_IS:
263: // clean piece
264:
265: // tainted piece, but undefined untaint language
266: // for VString.as_double of tainted values
267: // for ^process{body} evaluation
268:
269: // tainted, untaint language: as-is
270: dest+=row->item.size;
271: break;
272: case UL_FILE_SPEC:
273: // tainted, untaint language: file [name]
274: dest+=row->item.size*3/* worst: Z->%XX */;
275: break;
276: case UL_URI:
277: // tainted, untaint language: uri
1.84 paf 278: dest+=row->item.size*6*3/* worst utf8 x worst Z->%XX */;
1.77 paf 279: break;
280: case UL_HTTP_HEADER:
281: // tainted, untaint language: http-field-content-text
282: dest+=row->item.size*3/* worst: Z->%XX */;
283: break;
284: case UL_MAIL_HEADER:
285: // tainted, untaint language: mail-header
1.87 paf 286: if(buf_charset) {
1.77 paf 287: // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?=
1.87 paf 288: dest+=
289: row->item.size*3+
290: buf_charset->name().size()+MAX_STRING/* worst: =?charset?Q?=%XX?= */;
291: } else
1.75 paf 292: dest+=row->item.size;
1.77 paf 293: break;
294: case UL_TABLE:
295: // tainted, untaint language: table
296: dest+=row->item.size;
297: break;
298: case UL_SQL:
299: // tainted, untaint language: sql
300: if(connection)
301: dest+=connection->quote(0, row->item.ptr, row->item.size);
302: break;
303: case UL_JS:
304: escape(switch(*src) {
305: case '"': case '\'': case '\n': case '\\': case '\xFF':
306: dest+=2; break;
307: default:
308: dest++; break;
309: });
310: break;
311: case UL_XML:
312: escape(switch(*src) {
313: case '&': case '>': case '<': case '"': case '\'':
314: dest+= 6; break;
315: default:
316: dest++; break;
317: });
318: break;
319: case UL_HTML:
320: escape(switch(*src) {
321: case '&':
322: case '>':
323: case '<':
324: case '"':
325: dest+=6; break;
326: default:
327: dest++; break;
328: });
329: break;
1.75 paf 330: }
1.77 paf 331: );
1.75 paf 332: return dest;
1.51 parser 333: }
334:
/** http://www.ietf.org/rfc/rfc2047.txt
	RFC
	(3) As a replacement for a 'word' entity within a 'phrase', for example,
		one that precedes an address in a From, To, or Cc header.  The ABNF
		definition for 'phrase' from RFC 822 thus becomes:

		phrase = 1*( encoded-word / word )

		In this case the set of characters that may be used in a "Q"-encoded
		'encoded-word' is restricted to: <upper and lower case ASCII
		letters, decimal digits, "!", "*", "+", "-", "/", "=", and "_"
		(underscore, ASCII 95.)>.  An 'encoded-word' that appears within a
		'phrase' MUST be separated from any adjacent 'word', 'text' or
		'special' by 'linear-white-space'.
	...
	(2) The 8-bit hexadecimal value 20 (e.g., ISO-8859-1 SPACE) may be
		represented as "_" (underscore, ASCII 95.).  (This character may
		not pass through some internetwork mail gateways, but its use
		will greatly enhance readability of "Q" encoded data with mail
		readers that do not support this encoding.)  Note that the "_"
		always represents hexadecimal 20, even if the SPACE character
		occupies a different code position in the character set in use.

	paf: obviously,
		without "=", or one could not tell "=E0" from "russian letter a"
		and without "_", or it would mean 0x20

	returns true when @a c may stay as is inside a "Q"-encoded word:
	latin letters of both cases, decimal digits and "!*+-/".
	the zero char is never valid:
	strchr() would match it against the terminator of the list,
	so it is excluded explicitly.
*/
static bool mail_header_char_valid_within_Qencoded(char c) {
	if(c>='A' && c<='Z')
		return true;
	if(c>='a' && c<='z')
		return true;
	if(c>='0' && c<='9')
		return true;

	return c!=0 && strchr("!*+-/", c)!=0;
}
1.43 paf 368: char *String::store_to(char *dest, Untaint_lang lang,
369: SQL_Connection *connection,
1.100 paf 370: Charset *store_to_charset,
371: const char *store_to_charset_name) const {
1.75 paf 372: // WARNING:
373: // before any changes check cstr_bufsize first!!!
1.44 paf 374: bool whitespace=true;
1.96 paf 375: STRING_FOREACH_ROW(
1.77 paf 376: uchar to_lang=lang==UL_UNSPECIFIED?row->item.lang:lang;
377:
378: char *start=dest;
379:
380: switch(to_lang & ~UL_OPTIMIZE_BIT) {
381: case UL_CLEAN:
382: case UL_TAINTED:
383: case UL_AS_IS:
384: // clean piece
385:
386: // tainted piece, but undefined untaint language
387: // for VString.as_double of tainted values
388: // for ^process{body} evaluation
389:
390: // tainted, untaint language: as-is
391: memcpy(dest, row->item.ptr, row->item.size);
392: dest+=row->item.size;
393: break;
394: case UL_FILE_SPEC:
395: // tainted, untaint language: file [name]
1.83 paf 396: escape(
397: encode(need_file_encode, '_');
398: );
1.77 paf 399: break;
400: case UL_URI:
401: // tainted, untaint language: uri
1.85 paf 402: const void *client_ptr;
403: size_t client_size;
404: Charset::transcode(pool(),
405: pool().get_source_charset(), row->item.ptr, row->item.size,
406: pool().get_client_charset(), client_ptr, client_size);
407: {
408: const char *src=(const char *)client_ptr;
409: for(int size=client_size; size--; src++)
1.108 paf 410: encode(need_uri_encode, '%');
1.85 paf 411: }
1.77 paf 412: break;
413: case UL_HTTP_HEADER:
414: // tainted, untaint language: http-field-content-text
1.108 paf 415: escape(
416: encode(need_uri_encode, '%');
417: );
1.77 paf 418: break;
419: case UL_MAIL_HEADER:
420: // tainted, untaint language: mail-header
1.105 paf 421: // http://www.ietf.org/rfc/rfc2047.txt
1.100 paf 422: if(store_to_charset && store_to_charset_name) {
1.87 paf 423: const void *mail_ptr;
424: size_t mail_size;
425: Charset::transcode(pool(),
426: pool().get_source_charset(), row->item.ptr, row->item.size,
427: *store_to_charset, mail_ptr, mail_size);
428:
1.77 paf 429: // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?=
1.87 paf 430: const char *src=(const char *)mail_ptr;
1.77 paf 431: bool to_quoted_printable=false;
1.105 paf 432:
433: //RFC + An 'encoded-word' MUST NOT appear in any portion of an 'addr-spec'.
434: const char *tail=src+mail_size;
435: if(*--tail=='>') {
436: for(int size=mail_size-1; size--; tail--)
437: if(*tail=='<')
438: break;
439: }
1.106 paf 440: const char *stop=*tail=='<'?tail:0;
441: while(stop>src && stop[-1]==' ')
442: --stop;
1.105 paf 443:
444: bool closed=false;
1.87 paf 445: for(int size=mail_size; size--; src++) {
1.107 paf 446: if(src==stop && to_quoted_printable) {
1.105 paf 447: dest+=sprintf(dest, "?=");
448: closed=true;
1.107 paf 449: to_quoted_printable=false;
1.105 paf 450: }
451: if((!stop || src<stop) && (
1.109 ! paf 452: !to_quoted_printable && (*src & 0x80) // starting quote-printable-encoding on first 8bit char
! 453: || to_quoted_printable && !mail_header_char_valid_within_Qencoded(*src)
1.105 paf 454: )) {
1.77 paf 455: if(!to_quoted_printable) {
1.100 paf 456: dest+=sprintf(dest, "=?%s?Q?", store_to_charset_name);
1.77 paf 457: to_quoted_printable=true;
458: }
1.105 paf 459: //RFC Upper case should be used for hexadecimal digits "A" through "F"
1.109 ! paf 460: if(*src == 0x20) // RFC The 8-bit hexadecimal value 20 (e.g., ISO-8859-1 SPACE)
! 461: *dest++='_'; // RFC may be represented as "_"
! 462: else
! 463: dest+=sprintf(dest, "=%02X", *src & 0xFF);
1.105 paf 464: } else
1.109 ! paf 465: *dest++=*src;
1.44 paf 466: }
1.105 paf 467: if(to_quoted_printable && !closed) // close
1.77 paf 468: dest+=sprintf(dest, "?=");
1.87 paf 469:
1.77 paf 470: } else {
1.13 paf 471: memcpy(dest, row->item.ptr, row->item.size);
472: dest+=row->item.size;
1.1 paf 473: }
1.77 paf 474: break;
475: case UL_TABLE:
476: // tainted, untaint language: table
477: escape(switch(*src) {
478: case '\t': to_char(' '); break;
479: case '\n': to_char(' '); break;
480: _default;
481: });
482: break;
483: case UL_SQL:
484: // tainted, untaint language: sql
485: if(connection)
486: dest+=connection->quote(dest, row->item.ptr, row->item.size);
487: else
1.99 paf 488: throw Exception(0,
1.77 paf 489: this,
490: "untaint in SQL language failed - no connection specified");
491: break;
492: case UL_JS:
493: escape(switch(*src) {
494: case '"': to_string("\\\"", 2); break;
495: case '\'': to_string("\\'", 2); break;
496: case '\n': to_string("\\n", 2); break;
497: case '\\': to_string("\\\\", 2); break;
498: case '\xFF': to_string("\\\xFF", 2); break;
499: _default;
500: });
501: break;
502: case UL_XML:
503: escape(switch(*src) {
504: case '&': to_string("&", 5); break;
505: case '>': to_string(">", 4); break;
506: case '<': to_string("<", 4); break;
507: case '"': to_string(""", 6); break;
508: case '\'': to_string("'", 6); break;
509: _default;
510: });
511: break;
512: case UL_HTML:
513: escape(switch(*src) {
514: case '&': to_string("&", 5); break;
515: case '>': to_string(">", 4); break;
516: case '<': to_string("<", 4); break;
517: case '"': to_string(""", 6); break;
518: _default;
519: });
520: break;
521: default:
1.99 paf 522: throw Exception(0,
1.77 paf 523: this,
1.81 paf 524: "unknown untaint language #%d",
525: static_cast<int>(row->item.lang)); // sould never
1.77 paf 526: break; // never
1.76 paf 527: }
1.55 parser 528:
1.77 paf 529: if(to_lang & UL_OPTIMIZE_BIT) {
530: // optimizing whitespace
531: char *stop=dest; dest=start;
532: for(char *src=start; src<stop; src++)
533: switch(*src) {
534: // of all consequent white space chars leaving only first one
1.80 paf 535: case ' ': case '\r': case '\n': case '\t':
1.77 paf 536: if(!whitespace) {
537: *dest++=*src;
538: whitespace=true;
539: }
540: break;
541: default:
542: whitespace=false;
543: *dest++=*src;
544: break;
545: };
546: } else // piece without optimization
547: whitespace=false;
1.96 paf 548: );
1.78 paf 549:
1.76 paf 550: return dest;
551: }
552:
553: char *String::cstr_debug_origins() const {
1.81 paf 554: //_asm int 3;
1.76 paf 555: char *result=(char *)malloc(size()+used_rows()*MAX_STRING*2);
556: char *dest=result;
557:
1.96 paf 558: STRING_FOREACH_ROW(
559: IFNDEF_NO_STRING_ORIGIN(
560: if(row->item.origin.file)
561: dest+=sprintf(dest, ORIGIN_FILE_LINE_FORMAT,
562: row->item.origin.file,
563: 1+row->item.origin.line);
564: else
565: dest+=sprintf(dest, "<unknown>");
566: );
567: uchar show_lang=row->item.lang & ~UL_OPTIMIZE_BIT;
568: if(show_lang>=sizeof(String_Untaint_lang_name)/sizeof(String_Untaint_lang_name[0]))
1.99 paf 569: throw Exception(0,
1.96 paf 570: this,
571: "unknown untaint language #%d",
572: static_cast<int>(show_lang)); // sould never
573:
574: dest+=sprintf(dest, "#%s%s: ",
575: String_Untaint_lang_name[show_lang],
576: row->item.lang & UL_OPTIMIZE_BIT?".O":"");
577: char *dest_after_origins=dest;
1.76 paf 578:
1.96 paf 579: memcpy(dest, row->item.ptr, row->item.size);
580: dest+=row->item.size;
1.76 paf 581:
1.96 paf 582: remove_crlf(dest_after_origins, dest);
583: to_char('\n');
584: );
1.64 parser 585:
1.76 paf 586: *dest=0;
587: return result;
1.1 paf 588: }