|
|
1.7 paf 1: /** @file
	Parser: String class part: untaint mechanism.
3:
1.90 paf 4: Copyright(c) 2001, 2002 ArtLebedev Group (http://www.artlebedev.com)
1.89 paf 5: Author: Alexandr Petrosian <paf@design.ru> (http://paf.design.ru)
1.103 paf 6: */
1.8 paf 7:
/// Revision stamp of this file; the value has the form of an expanded RCS/CVS $Date$ keyword.
static const char* IDENT_UNTAINT_C="$Date: 2002/09/16 07:08:49 $";
1.1 paf 9:
10: #include "pa_pool.h"
11: #include "pa_string.h"
12: #include "pa_hash.h"
13: #include "pa_exception.h"
1.13 paf 14: #include "pa_table.h"
1.32 paf 15: #include "pa_globals.h"
1.34 paf 16: #include "pa_sql_connection.h"
1.58 parser 17: #include "pa_dictionary.h"
1.66 parser 18: #include "pa_common.h"
1.85 paf 19: #include "pa_charset.h"
1.1 paf 20:
1.95 paf 21: //#define DEBUG_STRING_APPENDS_VS_EXPANDS
1.91 paf 22:
#ifdef DEBUG_STRING_APPENDS_VS_EXPANDS
/// Debug counter: String::append adds used_rows()*sizeof(Row) of the source here
/// every time it links the source chunks instead of copying rows.
ulong string_string_shortcut_economy=0;
#endif
26:
/**
	Runs @a action once for every byte of the current row.
	Expects a variable named `row` in scope; the current byte is available
	to @a action as `*src`.
*/
#define escape(action) \
	{ \
		const char *src=row->item.ptr; \
		for(int size=row->item.size; size--; src++) \
			action \
	}

/// `default:` branch for a switch(*src) used inside escape(): copies the byte unchanged.
#define _default default: *dest++=*src; break

/**
	Writes `*src` to `dest`: as @a prefix followed by two upper-case hex digits
	when @a need_encode_func says the byte needs encoding, unchanged otherwise.

	NOTE: the trailing `break` is deliberately NOT part of the if/else.
	When encode() is the body of a for loop [as in escape(encode(...);)],
	the `break` lands after the loop and leaves the enclosing switch.
	No semicolon at the end: the caller supplies it.
*/
#define encode(need_encode_func, prefix) \
	if(need_encode_func(*src)) { \
		static const char *hex="0123456789ABCDEF"; \
		char chunk[3]={prefix}; \
		chunk[1]=hex[((unsigned char)*src)/0x10]; \
		chunk[2]=hex[((unsigned char)*src)%0x10]; \
		memcpy(dest, chunk, 3); dest+=3; \
	} else \
		*dest++=*src; \
	break

/// Appends one character to `dest`.
#define to_char(c) *dest++=(c)

/// Appends @a bsize bytes starting at @a b to `dest`.
/// Wrapped in do/while(0) so that it behaves as one statement [safe in unbraced if/else].
#define to_string(b, bsize) \
	do { \
		memcpy(dest, (b), (bsize)); \
		dest+=(bsize); \
	} while(0)

/**
	Tells whether byte @a c must be encoded inside a file specification.
	Russian letters and space are allowed as they are;
	only the characters listed below [and the zero byte] are encoded.
*/
inline bool need_file_encode(unsigned char c){
	switch(c) {
	case '\0': // zero byte counts as "needs encoding" too
	case '*': case '?': case '\'': case '"':
	case '<': case '>': case '|':
#ifndef WIN32
	case ':': case '\\': case '~':
#endif
		return true;
	default:
		return false;
	}
}
/**
	Tells whether byte @a c must be %XX-encoded inside a URI.
	Latin letters, digits and the characters "_-./" are passed as they are,
	everything else [the zero byte included] is encoded.
*/
inline bool need_uri_encode(unsigned char c){
	if((c>='0' && c<='9') || (c>='A' && c<='Z') || (c>='a' && c<='z'))
		return false;

	// strchr() treats the terminating 0 as part of the searched string,
	// so the zero byte is checked separately: it must never be passed raw
	return c==0 || !strchr("_-./", c);
}
1.36 paf 65: inline bool need_http_header_encode(unsigned char c){
1.18 paf 66: if(strchr(" , :", c))
1.5 paf 67: return false;
68:
69: return need_uri_encode(c);
1.4 paf 70: }
1.1 paf 71:
1.56 parser 72: //
73:
/**
	Short display names of untaint languages, indexed by the language code
	with UL_OPTIMIZE_BIT stripped; used by String::cstr_debug_origins.

	NOTE(review): the order presumably mirrors the untaint language enum
	declared elsewhere -- confirm before adding or reordering entries.
	NOTE(review): "T" is listed twice [tainted and table],
	so these two can not be told apart in the debug output.
*/
static const char * String_Untaint_lang_name[]={
	"U", ///< zero value handy for hash lookup @see untaint_lang_name2enum
	"C", ///< clean
	"T", ///< tainted, untaint language is assigned later
	// untaint languages. assigned by ^untaint[lang]{...}
	"P",
		/**<
			leave language built into string being appended.
			just a flag, that value is not stored
		*/
	"A", ///< leave all characters intact
	"F", ///< file specification
	"H", ///< text in HTTP response header
	"M", ///< text in mail header
	"URI", ///< text in uri
	"T", ///< ^table:set body
	"SQL", ///< ^table:sql body
	"JS", ///< JavaScript code
	"XML", ///< ^dom:set xml
	"HTML" ///< HTML code (for editing)
};
95:
96:
1.1 paf 97: // String
98:
1.41 paf 99: /*
100:
101: HTTP-header = field-name ":" [ field-value ] CRLF
102:
103: field-name = token
104: field-value = *( field-content | LWS )
105:
106: field-content = <the OCTETs making up the field-value
107: and consisting of either *TEXT or combinations
108: of token, tspecials, and quoted-string>
109:
110:
111:
112: word = token | quoted-string
113:
114: token = 1*<any CHAR except CTLs or tspecials>
115:
116:
117:
118: tspecials = "(" | ")" | "<" | ">" | "@"
119: | "," | ";" | ":" | "\" | <">
120: | "/" | "[" | "]" | "?" | "="
121: | "{" | "}" | SP | HT
122:
123: SP = <US-ASCII SP, space (32)>
124: HT = <US-ASCII HT, horizontal-tab (9)>
125:
126: LWS = [CRLF] 1*( SP | HT )
127: TEXT = <any OCTET except CTLs,
128: but including LWS>
129:
130: quoted-pair = "\" CHAR
131:
132: if(strchr("()<>@,;:\\\"/[]?={} \t", *ptr))
133: */
/**
	Tells whether the @a size bytes at @a ptr contain a character
	that forces an HTTP header value to be quoted:
	one of ; \ " = space tab [the zero byte is treated the same way].
	Excluded from the RFC tspecials list: ()<>@, :/ ? []{}
*/
inline bool need_quote_http_header(const char *ptr, size_t size) {
	const char *end=ptr+size;
	while(ptr!=end) {
		switch(*ptr++) {
		case ';': case '\\': case '"': case '=':
		case ' ': case '\t':
		case '\0':
			return true;
		default:
			break;
		}
	}
	return false;
}
140:
1.91 paf 141: //#include "pa_sapi.h"
/**
	appends other String,
	marking all tainted pieces of it with @a lang.
	or marking ALL pieces of it with a @a lang when @a forced to,
	and propagating OPTIMIZE language bit.

	using architecture advantage: after string-to-string-append string never modified.
	algorithm:
	if no language-change specified and src not yet appended to some other string[last_chunk!=0]
	[and src has more rows than there is free room in our last chunk]
		shrinking dest last_chunk[preparing it for linking],
		///shrinking src last_chunk[preparing it to be linked, consequent dest.appends would go there],
		linking[dest.last_chunk = src.head.chunk]
	if some language-change specified or src already appended to some other string[last_chunk==0]
		cloning pieces.

	@param src string to append; on the linking path its last_chunk is zeroed,
		which stops its further growth
	@param lang language to assign; UL_PASS_APPENDED keeps the languages of src pieces
	@param forced when true, every piece gets @a lang regardless of its own language
	@return *this
	@throws Exception when this string is not allowed to grow anymore [last_chunk==0]
*/
String& String::append(const String& src, uchar lang, bool forced) {
	if(!last_chunk) // growth stopped [we're appended as string to somebody]
		throw Exception(0,
			this,
			"string growth stopped (append string)");

	// nothing to append
	if(src.is_empty())
		return *this;

	// without language-change, not-appended-before, big[not fitting our tail] string?
	if(lang==UL_PASS_APPENDED
		&& src.last_chunk
		&& (uint(&last_chunk->rows[last_chunk->count]-append_here) < src.used_rows())) {
#ifdef DEBUG_STRING_APPENDS_VS_EXPANDS
		string_string_shortcut_economy+=src.used_rows()*sizeof(String::Chunk::Row);
#endif

		// using fact:
		//   src.head.chunk.count initially equals this.head.chunk.count and shrinks-only,
		//   so can't be more than this.head.chunk.count,
		// which means that we know that
		//   src.head.chunk would fit into this.head.chunk
		if(is_empty()) { // our head.chunk is empty
			// they have more than head.chunk? we need all head.chunk : we need only filled-part of head.chunk
			Chunk *src_head_link=src.head.chunk.rows[src.head.chunk.count].link;
			size_t head_count=src_head_link?src.head.chunk.count:(src.append_here-src.head.chunk.rows);
			// "your head.chunk is my head.chunk"
			memcpy(head.chunk.rows, src.head.chunk.rows, sizeof(Chunk::Row)*(head_count));
			if(src_head_link) {
				// "your body is my body"
				// [also shrinks our head.chunk.count to the copied amount]
				head.chunk.rows[head.chunk.count=head_count].link=src_head_link;
				// "your last_chunk is mine now"
				last_chunk=src.last_chunk;
				// "your append_here is mine now"
				append_here=src.append_here;
			} else {
				// "your last_chunk is mine now"
				last_chunk=&head.chunk;
				// "your append_here is recalc-mine now"
				append_here=head.chunk.rows+head_count;
			}
		} else { // our head.chunk contains something
			// "chopping off my tail-reserve"
			last_chunk->count=append_here-last_chunk->rows;
			// "you is my tail"
			append_here->link=&src.head.chunk;
			// "your last_chunk is mine now"
			last_chunk=src.last_chunk;
			// "your append_here is mine now"
			append_here=src.append_here;
		}

		// stop-growing mark
		src.last_chunk=0;
		return *this;
	}

	// copying path: every row of src is appended piece by piece,
	// the four loops differ only in the language given to each piece

	// manually unrolled code to avoid do{if(const)} constructs
	if(forced)
		STRING_SRC_FOREACH_ROW(
			APPEND(row->item.ptr, row->item.size,
				lang, //forcing passed lang
				row->item.origin.file, row->item.origin.line);
		)
	else if(lang==UL_PASS_APPENDED)
		STRING_SRC_FOREACH_ROW(
			APPEND(row->item.ptr, row->item.size,
				row->item.lang, // passing item's lang
				row->item.origin.file, row->item.origin.line);
		)
	else if(lang&UL_OPTIMIZE_BIT) // main idea here
		// tainted piece would get OPTIMIZED bit from 'lang'
		// clean piece would be marked OPTIMIZED manually
		// pieces with determined languages [not tainted|clean] would retain theirs langs
		STRING_SRC_FOREACH_ROW(
			APPEND(row->item.ptr, row->item.size,
				row->item.lang==UL_TAINTED?lang:(
					row->item.lang==UL_CLEAN?UL_CLEAN|UL_OPTIMIZE_BIT: // ORing with OPTIMIZED flag
					row->item.lang
				),
				row->item.origin.file, row->item.origin.line);
		)
	else
		// only tainted pieces get 'lang', all others retain their langs
		STRING_SRC_FOREACH_ROW(
			APPEND(row->item.ptr, row->item.size,
				row->item.lang==UL_TAINTED?lang:row->item.lang,
				row->item.origin.file, row->item.origin.line);
		);
/*
	for(Chunk::Row *row=last_chunk->rows; row<append_here; row++)
		if(row->link==(void*)0xcdcdcdcd)
			_asm int 3;*/
	return *this;
}
251:
1.75 paf 252: size_t String::cstr_bufsize(Untaint_lang lang,
253: SQL_Connection *connection,
1.87 paf 254: Charset *buf_charset) const {
1.77 paf 255: size_t dest=1; // for terminating 0
256: STRING_FOREACH_ROW(
257: uchar to_lang=lang==UL_UNSPECIFIED?row->item.lang:lang;
258:
259: switch(to_lang & ~UL_OPTIMIZE_BIT) {
260: case UL_CLEAN:
261: case UL_TAINTED:
262: case UL_AS_IS:
263: // clean piece
264:
265: // tainted piece, but undefined untaint language
266: // for VString.as_double of tainted values
267: // for ^process{body} evaluation
268:
269: // tainted, untaint language: as-is
270: dest+=row->item.size;
271: break;
272: case UL_FILE_SPEC:
273: // tainted, untaint language: file [name]
274: dest+=row->item.size*3/* worst: Z->%XX */;
275: break;
276: case UL_URI:
277: // tainted, untaint language: uri
1.84 paf 278: dest+=row->item.size*6*3/* worst utf8 x worst Z->%XX */;
1.77 paf 279: break;
280: case UL_HTTP_HEADER:
281: // tainted, untaint language: http-field-content-text
282: dest+=row->item.size*3/* worst: Z->%XX */;
283: break;
284: case UL_MAIL_HEADER:
285: // tainted, untaint language: mail-header
1.87 paf 286: if(buf_charset) {
1.77 paf 287: // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?=
1.87 paf 288: dest+=
289: row->item.size*3+
290: buf_charset->name().size()+MAX_STRING/* worst: =?charset?Q?=%XX?= */;
291: } else
1.75 paf 292: dest+=row->item.size;
1.77 paf 293: break;
294: case UL_TABLE:
295: // tainted, untaint language: table
296: dest+=row->item.size;
297: break;
298: case UL_SQL:
299: // tainted, untaint language: sql
300: if(connection)
301: dest+=connection->quote(0, row->item.ptr, row->item.size);
302: break;
303: case UL_JS:
304: escape(switch(*src) {
305: case '"': case '\'': case '\n': case '\\': case '\xFF':
306: dest+=2; break;
307: default:
308: dest++; break;
309: });
310: break;
311: case UL_XML:
312: escape(switch(*src) {
313: case '&': case '>': case '<': case '"': case '\'':
314: dest+= 6; break;
315: default:
316: dest++; break;
317: });
318: break;
319: case UL_HTML:
320: escape(switch(*src) {
321: case '&':
322: case '>':
323: case '<':
324: case '"':
325: dest+=6; break;
326: default:
327: dest++; break;
328: });
329: break;
1.75 paf 330: }
1.77 paf 331: );
1.75 paf 332: return dest;
1.51 parser 333: }
334:
1.43 paf 335: char *String::store_to(char *dest, Untaint_lang lang,
336: SQL_Connection *connection,
1.100 paf 337: Charset *store_to_charset,
338: const char *store_to_charset_name) const {
1.75 paf 339: // WARNING:
340: // before any changes check cstr_bufsize first!!!
1.44 paf 341: bool whitespace=true;
1.96 paf 342: STRING_FOREACH_ROW(
1.77 paf 343: uchar to_lang=lang==UL_UNSPECIFIED?row->item.lang:lang;
344:
345: char *start=dest;
346:
347: switch(to_lang & ~UL_OPTIMIZE_BIT) {
348: case UL_CLEAN:
349: case UL_TAINTED:
350: case UL_AS_IS:
351: // clean piece
352:
353: // tainted piece, but undefined untaint language
354: // for VString.as_double of tainted values
355: // for ^process{body} evaluation
356:
357: // tainted, untaint language: as-is
358: memcpy(dest, row->item.ptr, row->item.size);
359: dest+=row->item.size;
360: break;
361: case UL_FILE_SPEC:
362: // tainted, untaint language: file [name]
1.83 paf 363: escape(
364: encode(need_file_encode, '_');
365: );
1.77 paf 366: break;
367: case UL_URI:
368: // tainted, untaint language: uri
1.85 paf 369: const void *client_ptr;
370: size_t client_size;
371: Charset::transcode(pool(),
372: pool().get_source_charset(), row->item.ptr, row->item.size,
373: pool().get_client_charset(), client_ptr, client_size);
374: {
375: const char *src=(const char *)client_ptr;
376: for(int size=client_size; size--; src++)
1.108 ! paf 377: encode(need_uri_encode, '%');
1.85 paf 378: }
1.77 paf 379: break;
380: case UL_HTTP_HEADER:
381: // tainted, untaint language: http-field-content-text
1.108 ! paf 382: escape(
! 383: encode(need_uri_encode, '%');
! 384: );
1.77 paf 385: break;
386: case UL_MAIL_HEADER:
387: // tainted, untaint language: mail-header
1.105 paf 388: // http://www.ietf.org/rfc/rfc2047.txt
1.100 paf 389: if(store_to_charset && store_to_charset_name) {
1.87 paf 390: const void *mail_ptr;
391: size_t mail_size;
392: Charset::transcode(pool(),
393: pool().get_source_charset(), row->item.ptr, row->item.size,
394: *store_to_charset, mail_ptr, mail_size);
395:
1.77 paf 396: // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?=
1.87 paf 397: const char *src=(const char *)mail_ptr;
1.77 paf 398: bool to_quoted_printable=false;
1.105 paf 399:
400: //RFC + An 'encoded-word' MUST NOT appear in any portion of an 'addr-spec'.
401: const char *tail=src+mail_size;
402: if(*--tail=='>') {
403: for(int size=mail_size-1; size--; tail--)
404: if(*tail=='<')
405: break;
406: }
1.106 paf 407: const char *stop=*tail=='<'?tail:0;
408: while(stop>src && stop[-1]==' ')
409: --stop;
1.105 paf 410:
411: bool closed=false;
1.87 paf 412: for(int size=mail_size; size--; src++) {
1.105 paf 413: /*RFC
414: (3) 8-bit values which correspond to printable ASCII characters other
415: than "=", "?", and "_" (underscore), MAY be represented as those
416: characters. (But see section 5 for restrictions.) In
417: particular, SPACE and TAB MUST NOT be represented as themselves
418: within encoded words.
419: */
1.107 paf 420: if(src==stop && to_quoted_printable) {
1.105 paf 421: dest+=sprintf(dest, "?=");
422: closed=true;
1.107 paf 423: to_quoted_printable=false;
1.105 paf 424: }
425: if((!stop || src<stop) && (
426: (*src & 0x80) // starting quote-printable-encoding on first 8bit char
427: || (to_quoted_printable
428: && (*src==' ' || *src=='=' || *src=='?') || *src=='_')
429: )) {
1.77 paf 430: if(!to_quoted_printable) {
1.100 paf 431: dest+=sprintf(dest, "=?%s?Q?", store_to_charset_name);
1.77 paf 432: to_quoted_printable=true;
1.105 paf 433:
1.77 paf 434: }
1.105 paf 435: //RFC Upper case should be used for hexadecimal digits "A" through "F"
1.77 paf 436: dest+=sprintf(dest, "=%02X", *src & 0xFF);
1.105 paf 437: } else
1.77 paf 438: *dest++=*src;
1.44 paf 439: }
1.105 paf 440: if(to_quoted_printable && !closed) // close
1.77 paf 441: dest+=sprintf(dest, "?=");
1.87 paf 442:
1.77 paf 443: } else {
1.13 paf 444: memcpy(dest, row->item.ptr, row->item.size);
445: dest+=row->item.size;
1.1 paf 446: }
1.77 paf 447: break;
448: case UL_TABLE:
449: // tainted, untaint language: table
450: escape(switch(*src) {
451: case '\t': to_char(' '); break;
452: case '\n': to_char(' '); break;
453: _default;
454: });
455: break;
456: case UL_SQL:
457: // tainted, untaint language: sql
458: if(connection)
459: dest+=connection->quote(dest, row->item.ptr, row->item.size);
460: else
1.99 paf 461: throw Exception(0,
1.77 paf 462: this,
463: "untaint in SQL language failed - no connection specified");
464: break;
465: case UL_JS:
466: escape(switch(*src) {
467: case '"': to_string("\\\"", 2); break;
468: case '\'': to_string("\\'", 2); break;
469: case '\n': to_string("\\n", 2); break;
470: case '\\': to_string("\\\\", 2); break;
471: case '\xFF': to_string("\\\xFF", 2); break;
472: _default;
473: });
474: break;
475: case UL_XML:
476: escape(switch(*src) {
477: case '&': to_string("&", 5); break;
478: case '>': to_string(">", 4); break;
479: case '<': to_string("<", 4); break;
480: case '"': to_string(""", 6); break;
481: case '\'': to_string("'", 6); break;
482: _default;
483: });
484: break;
485: case UL_HTML:
486: escape(switch(*src) {
487: case '&': to_string("&", 5); break;
488: case '>': to_string(">", 4); break;
489: case '<': to_string("<", 4); break;
490: case '"': to_string(""", 6); break;
491: _default;
492: });
493: break;
494: default:
1.99 paf 495: throw Exception(0,
1.77 paf 496: this,
1.81 paf 497: "unknown untaint language #%d",
498: static_cast<int>(row->item.lang)); // sould never
1.77 paf 499: break; // never
1.76 paf 500: }
1.55 parser 501:
1.77 paf 502: if(to_lang & UL_OPTIMIZE_BIT) {
503: // optimizing whitespace
504: char *stop=dest; dest=start;
505: for(char *src=start; src<stop; src++)
506: switch(*src) {
507: // of all consequent white space chars leaving only first one
1.80 paf 508: case ' ': case '\r': case '\n': case '\t':
1.77 paf 509: if(!whitespace) {
510: *dest++=*src;
511: whitespace=true;
512: }
513: break;
514: default:
515: whitespace=false;
516: *dest++=*src;
517: break;
518: };
519: } else // piece without optimization
520: whitespace=false;
1.96 paf 521: );
1.78 paf 522:
1.76 paf 523: return dest;
524: }
525:
526: char *String::cstr_debug_origins() const {
1.81 paf 527: //_asm int 3;
1.76 paf 528: char *result=(char *)malloc(size()+used_rows()*MAX_STRING*2);
529: char *dest=result;
530:
1.96 paf 531: STRING_FOREACH_ROW(
532: IFNDEF_NO_STRING_ORIGIN(
533: if(row->item.origin.file)
534: dest+=sprintf(dest, ORIGIN_FILE_LINE_FORMAT,
535: row->item.origin.file,
536: 1+row->item.origin.line);
537: else
538: dest+=sprintf(dest, "<unknown>");
539: );
540: uchar show_lang=row->item.lang & ~UL_OPTIMIZE_BIT;
541: if(show_lang>=sizeof(String_Untaint_lang_name)/sizeof(String_Untaint_lang_name[0]))
1.99 paf 542: throw Exception(0,
1.96 paf 543: this,
544: "unknown untaint language #%d",
545: static_cast<int>(show_lang)); // sould never
546:
547: dest+=sprintf(dest, "#%s%s: ",
548: String_Untaint_lang_name[show_lang],
549: row->item.lang & UL_OPTIMIZE_BIT?".O":"");
550: char *dest_after_origins=dest;
1.76 paf 551:
1.96 paf 552: memcpy(dest, row->item.ptr, row->item.size);
553: dest+=row->item.size;
1.76 paf 554:
1.96 paf 555: remove_crlf(dest_after_origins, dest);
556: to_char('\n');
557: );
1.64 parser 558:
1.76 paf 559: *dest=0;
560: return result;
1.1 paf 561: }