/** @file
	Parser: String class part: untaint mechanism.

	Copyright(c) 2001 ArtLebedev Group(http://www.artlebedev.com)
	Author: Alexander Petrosyan <paf@design.ru>(http://paf.design.ru)

	$Id: untaint.C,v 1.83 2001/12/10 10:20:24 paf Exp $
*/
9:
10: #include "pa_pool.h"
11: #include "pa_string.h"
12: #include "pa_hash.h"
13: #include "pa_exception.h"
1.13 paf 14: #include "pa_table.h"
1.32 paf 15: #include "pa_globals.h"
1.34 paf 16: #include "pa_sql_connection.h"
1.58 parser 17: #include "pa_dictionary.h"
1.66 parser 18: #include "pa_common.h"
1.1 paf 19:
/**
	Runs @a action once for every byte of the current row.

	Requirements at the point of expansion:
	- @c row must be in scope; its @c item.ptr / @c item.size describe the bytes walked;
	- everything @a action refers to (normally @c dest) must be in scope as well.

	Inside @a action the current byte is @c *src.
	Expands to a braced block, so @c src and @c size do not leak outside.
*/
#define escape(action) \
	{ \
		const char *src=row->item.ptr; \
		for(int size=row->item.size; size--; src++) \
			action \
	}

/// @c default: label for a switch over @c *src: copies the byte to @c dest unchanged.
#define _default default: *dest++=*src; break

/**
	Copies @c *src to @c dest; when need_encode_func(*src) is true the byte is
	written as @a prefix followed by its two uppercase hexadecimal digits instead.

	Ends with a bare @c break [the caller supplies the ';'], so it may only be used
	where @c break is legal: inside a switch or a loop.
	The local buffer is named @c triplet so that it does not hide
	a @c chunk variable of the surrounding code.
*/
#define encode(need_encode_func, prefix) \
	if(need_encode_func(*src)) { \
		static const char *hex="0123456789ABCDEF"; \
		char triplet[3]={prefix}; \
		triplet[1]=hex[((unsigned char)*src)/0x10]; \
		triplet[2]=hex[((unsigned char)*src)%0x10]; \
		memcpy(dest, triplet, 3);  dest+=3; \
	} else \
		*dest++=*src; \
	break

/// Stores one character @a c at @c dest and advances @c dest.
#define to_char(c) *dest++=(c)

/**
	Stores @a bsize bytes starting at @a b at @c dest and advances @c dest.
	Wrapped into do/while(0) so that the two steps always act as ONE statement,
	also under an unbraced @c if. @a bsize is evaluated twice.
*/
#define to_string(b, bsize) \
	do { \
		memcpy(dest, (b), (bsize)); \
		dest+=(bsize); \
	} while(0)

/**
	Tells whether byte @a c has to be escaped inside a file specification.

	ASCII letters, digits and the characters listed below are stored as is,
	every other byte [NUL included] must be escaped.

	@param c byte to check
	@return true when the byte must be escaped
*/
inline bool need_file_encode(unsigned char c){
	// theoretical problem with, for instance, "_2B" and "." fragments,
	// they would yield the same
	// because need_file_encode('_')=false
	// but we need to delete such files somehow, getting names from ^index

	// strchr() also finds the terminating zero of the list,
	// so NUL would be reported as a safe character and end up raw in the name
	if(c==0)
		return true;

	if((c>='0' && c<='9') || (c>='A' && c<='Z') || (c>='a' && c<='z'))
		return false;

	return !strchr(
		"_./()-"
#ifdef WIN32
		":\\~"
#endif
		, c);
}
/**
	Tells whether byte @a c has to be escaped inside a URI.

	ASCII letters, digits and "_-./" are stored as is,
	every other byte [NUL included] must be escaped.

	@param c byte to check
	@return true when the byte must be escaped
*/
inline bool need_uri_encode(unsigned char c){
	// strchr() also finds the terminating zero of the list,
	// so NUL has to be rejected before the lookup
	if(c==0)
		return true;

	if((c>='0' && c<='9') || (c>='A' && c<='Z') || (c>='a' && c<='z'))
		return false;

	return !strchr("_-./", c);
}
1.36 paf 64: inline bool need_http_header_encode(unsigned char c){
1.18 paf 65: if(strchr(" , :", c))
1.5 paf 66: return false;
67:
68: return need_uri_encode(c);
1.4 paf 69: }
1.1 paf 70:
1.56 parser 71: //
72:
/// Short names of untaint languages for debug output.
/// Indexed by the language code with UL_OPTIMIZE_BIT cleared, see String::cstr_debug_origins.
static const char * String_Untaint_lang_name[]={
	"U",    ///< zero value, handy for hash lookup @see untaint_lang_name2enum
	"C",    ///< clean
	"T",    ///< tainted, untaint language to be assigned later

	// untaint languages, assigned by ^untaint[lang]{...}

	"P",    ///< keep the language built into the string being appended; only a flag, this value is not stored
	"A",    ///< leave all characters intact
	"F",    ///< file specification
	"H",    ///< text in HTTP response header
	"M",    ///< text in mail header
	"URI",  ///< text in uri
	"T",    ///< ^table:set body
	"SQL",  ///< ^table:sql body
	"JS",   ///< JavaScript code
	"XML",  ///< ^dom:set xml
	"HTML"  ///< HTML code (for editing)
};
94:
95:
1.1 paf 96: // String
97:
/*
	HTTP header grammar the quoting check below is based on:

	HTTP-header    = field-name ":" [ field-value ] CRLF

	field-name     = token
	field-value    = *( field-content | LWS )

	field-content  = <the OCTETs making up the field-value
	                 and consisting of either *TEXT or combinations
	                 of token, tspecials, and quoted-string>

	word           = token | quoted-string

	token          = 1*<any CHAR except CTLs or tspecials>

	tspecials      = "(" | ")" | "<" | ">" | "@"
	               | "," | ";" | ":" | "\" | <">
	               | "/" | "[" | "]" | "?" | "="
	               | "{" | "}" | SP | HT

	SP             = <US-ASCII SP, space (32)>
	HT             = <US-ASCII HT, horizontal-tab (9)>

	LWS            = [CRLF] 1*( SP | HT )
	TEXT           = <any OCTET except CTLs,
	                 but including LWS>

	quoted-pair    = "\" CHAR

	a test for the complete tspecials set would be:
		if(strchr("()<>@,;:\\\"/[]?={} \t", *ptr))
*/
/**
	Checks whether an HTTP header value contains a byte that requires quoting.

	Only a subset of tspecials triggers quoting: ';', '\', '"', '=', space and tab
	[the characters ()<>@, :/ ? []{} are deliberately excluded]; a NUL byte triggers it too.

	@param ptr first byte of the value
	@param size number of bytes to examine
	@return true when at least one such byte is present
*/
inline bool need_quote_http_header(const char *ptr, size_t size) {
	const char *const end=ptr+size;
	for(const char *cur=ptr; cur!=end; ++cur) {
		switch(*cur) {
		case ';': case '\\': case '"': case '=': case ' ': case '\t':
		case '\0': // zero byte counts as well
			return true;
		default:
			break;
		}
	}
	return false;
}
139:
1.77 paf 140: String& String::append(const String& src, uchar lang, bool forced) {
141: // manually unrolled code to avoid do{if(const)} constructs
142: if(forced)
143: STRING_SRC_FOREACH_ROW(
144: APPEND(row->item.ptr, row->item.size,
145: lang, //forcing passed lang
146: row->item.origin.file, row->item.origin.line);
147: )
148: else if(lang==UL_PASS_APPENDED)
149: STRING_SRC_FOREACH_ROW(
150: APPEND(row->item.ptr, row->item.size,
151: row->item.lang, // passing item's lang
152: row->item.origin.file, row->item.origin.line);
153: )
154: else if(lang&UL_OPTIMIZE_BIT) // main idea here
155: // tainted piece would get OPTIMIZED bit from 'lang'
156: // clean piece would be marked OPTIMIZED manually
157: // pieces with determined languages [not tainted|clean] would retain theirs langs
158: STRING_SRC_FOREACH_ROW(
159: APPEND(row->item.ptr, row->item.size,
160: row->item.lang==UL_TAINTED?lang:(
161: row->item.lang==UL_CLEAN?UL_CLEAN|UL_OPTIMIZE_BIT: // ORing with OPTIMIZED flag
162: row->item.lang
163: ),
164: row->item.origin.file, row->item.origin.line);
165: )
166: else
167: STRING_SRC_FOREACH_ROW(
168: APPEND(row->item.ptr, row->item.size,
169: row->item.lang==UL_TAINTED?lang:row->item.lang,
170: row->item.origin.file, row->item.origin.line);
171: );
172: break2:
173: return *this;
174: }
175:
1.75 paf 176: size_t String::cstr_bufsize(Untaint_lang lang,
177: SQL_Connection *connection,
178: const char *charset) const {
1.77 paf 179: size_t dest=1; // for terminating 0
180: STRING_FOREACH_ROW(
181: uchar to_lang=lang==UL_UNSPECIFIED?row->item.lang:lang;
182:
183: switch(to_lang & ~UL_OPTIMIZE_BIT) {
184: case UL_CLEAN:
185: case UL_TAINTED:
186: case UL_AS_IS:
187: // clean piece
188:
189: // tainted piece, but undefined untaint language
190: // for VString.as_double of tainted values
191: // for ^process{body} evaluation
192:
193: // tainted, untaint language: as-is
194: dest+=row->item.size;
195: break;
196: case UL_FILE_SPEC:
197: // tainted, untaint language: file [name]
198: dest+=row->item.size*3/* worst: Z->%XX */;
199: break;
200: case UL_URI:
201: // tainted, untaint language: uri
1.84 ! paf 202: dest+=row->item.size*6*3/* worst utf8 x worst Z->%XX */;
1.77 paf 203: break;
204: case UL_HTTP_HEADER:
205: // tainted, untaint language: http-field-content-text
206: dest+=row->item.size*3/* worst: Z->%XX */;
207: break;
208: case UL_MAIL_HEADER:
209: // tainted, untaint language: mail-header
210: if(charset) {
211: // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?=
212: dest+=row->item.size*3+MAX_STRING/* worst: =?charset?Q?=%XX?= */;
213: } else {
1.75 paf 214: dest+=row->item.size;
215: }
1.77 paf 216: break;
217: case UL_TABLE:
218: // tainted, untaint language: table
219: dest+=row->item.size;
220: break;
221: case UL_SQL:
222: // tainted, untaint language: sql
223: if(connection)
224: dest+=connection->quote(0, row->item.ptr, row->item.size);
225: break;
226: case UL_JS:
227: escape(switch(*src) {
228: case '"': case '\'': case '\n': case '\\': case '\xFF':
229: dest+=2; break;
230: default:
231: dest++; break;
232: });
233: break;
234: case UL_XML:
235: escape(switch(*src) {
236: case '&': case '>': case '<': case '"': case '\'':
237: dest+= 6; break;
238: default:
239: dest++; break;
240: });
241: break;
242: case UL_HTML:
243: escape(switch(*src) {
244: case '&':
245: case '>':
246: case '<':
247: case '"':
248: dest+=6; break;
249: default:
250: dest++; break;
251: });
252: break;
1.75 paf 253: }
1.77 paf 254: );
1.75 paf 255: break2:
256: return dest;
1.51 parser 257: }
258:
1.43 paf 259: char *String::store_to(char *dest, Untaint_lang lang,
260: SQL_Connection *connection,
261: const char *charset) const {
1.75 paf 262: // WARNING:
263: // before any changes check cstr_bufsize first!!!
1.44 paf 264: bool whitespace=true;
1.79 paf 265: // expanded STRING_FOREACH_ROW here for debugging purposes
1.78 paf 266: const Chunk *chunk=&head; \
267: do { \
268: const Chunk::Row *row=chunk->rows; \
269: for(uint i=0; i<chunk->count; i++, row++) { \
270: if(row==append_here) \
271: goto break2; \
272: \
1.77 paf 273: uchar to_lang=lang==UL_UNSPECIFIED?row->item.lang:lang;
274:
275: char *start=dest;
276:
277: switch(to_lang & ~UL_OPTIMIZE_BIT) {
278: case UL_CLEAN:
279: case UL_TAINTED:
280: case UL_AS_IS:
281: // clean piece
282:
283: // tainted piece, but undefined untaint language
284: // for VString.as_double of tainted values
285: // for ^process{body} evaluation
286:
287: // tainted, untaint language: as-is
288: memcpy(dest, row->item.ptr, row->item.size);
289: dest+=row->item.size;
290: break;
291: case UL_FILE_SPEC:
292: // tainted, untaint language: file [name]
1.83 paf 293: escape(
294: encode(need_file_encode, '_');
295: );
1.77 paf 296: break;
297: case UL_URI:
298: // tainted, untaint language: uri
299: escape(switch(*src) {
300: case ' ': to_char('+'); break;
1.83 paf 301: default: encode(need_uri_encode, '%');
1.77 paf 302: });
303: break;
304: case UL_HTTP_HEADER:
305: // tainted, untaint language: http-field-content-text
306: escape(switch(*src) {
307: case ' ': to_char('+'); break;
1.83 paf 308: default: encode(need_uri_encode, '%');
1.77 paf 309: });
310: break;
311: case UL_MAIL_HEADER:
312: // tainted, untaint language: mail-header
313: if(charset) {
314: // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?=
315: const char *src=row->item.ptr;
316: bool to_quoted_printable=false;
317: for(int size=row->item.size; size--; src++) {
318: if(*src & 0x80) {
319: if(!to_quoted_printable) {
320: dest+=sprintf(dest, "=?%.15s?Q?", charset);
321: to_quoted_printable=true;
322: }
323: dest+=sprintf(dest, "=%02X", *src & 0xFF);
324: } else {
325: *dest++=*src;
326: }
1.44 paf 327: }
1.77 paf 328: if(to_quoted_printable) // close
329: dest+=sprintf(dest, "?=");
330: } else {
1.13 paf 331: memcpy(dest, row->item.ptr, row->item.size);
332: dest+=row->item.size;
1.1 paf 333: }
1.77 paf 334: break;
335: case UL_TABLE:
336: // tainted, untaint language: table
337: escape(switch(*src) {
338: case '\t': to_char(' '); break;
339: case '\n': to_char(' '); break;
340: _default;
341: });
342: break;
343: case UL_SQL:
344: // tainted, untaint language: sql
345: if(connection)
346: dest+=connection->quote(dest, row->item.ptr, row->item.size);
347: else
348: throw Exception(0, 0,
349: this,
350: "untaint in SQL language failed - no connection specified");
351: break;
352: case UL_JS:
353: escape(switch(*src) {
354: case '"': to_string("\\\"", 2); break;
355: case '\'': to_string("\\'", 2); break;
356: case '\n': to_string("\\n", 2); break;
357: case '\\': to_string("\\\\", 2); break;
358: case '\xFF': to_string("\\\xFF", 2); break;
359: _default;
360: });
361: break;
362: case UL_XML:
363: escape(switch(*src) {
364: case '&': to_string("&", 5); break;
365: case '>': to_string(">", 4); break;
366: case '<': to_string("<", 4); break;
367: case '"': to_string(""", 6); break;
368: case '\'': to_string("'", 6); break;
369: _default;
370: });
371: break;
372: case UL_HTML:
373: escape(switch(*src) {
374: case '&': to_string("&", 5); break;
375: case '>': to_string(">", 4); break;
376: case '<': to_string("<", 4); break;
377: case '"': to_string(""", 6); break;
378: _default;
379: });
380: break;
381: default:
382: throw Exception(0, 0,
383: this,
1.81 paf 384: "unknown untaint language #%d",
385: static_cast<int>(row->item.lang)); // sould never
1.77 paf 386: break; // never
1.76 paf 387: }
1.55 parser 388:
1.77 paf 389: if(to_lang & UL_OPTIMIZE_BIT) {
390: // optimizing whitespace
391: char *stop=dest; dest=start;
392: for(char *src=start; src<stop; src++)
393: switch(*src) {
394: // of all consequent white space chars leaving only first one
1.80 paf 395: case ' ': case '\r': case '\n': case '\t':
1.77 paf 396: if(!whitespace) {
397: *dest++=*src;
398: whitespace=true;
399: }
400: break;
401: default:
402: whitespace=false;
403: *dest++=*src;
404: break;
405: };
406: } else // piece without optimization
407: whitespace=false;
1.78 paf 408:
409: } \
410: chunk=row->link; \
411: } while(chunk); \
412:
1.76 paf 413: break2:
414: return dest;
415: }
416:
417: char *String::cstr_debug_origins() const {
1.81 paf 418: //_asm int 3;
1.76 paf 419: char *result=(char *)malloc(size()+used_rows()*MAX_STRING*2);
420: char *dest=result;
421:
422: const Chunk *chunk=&head;
423: do {
424: const Chunk::Row *row=chunk->rows;
425: for(uint i=0; i<chunk->count; i++, row++) {
426: if(row==append_here)
427: goto break2;
1.55 parser 428:
1.76 paf 429: #ifndef NO_STRING_ORIGIN
430: if(row->item.origin.file)
431: dest+=sprintf(dest, ORIGIN_FILE_LINE_FORMAT,
432: row->item.origin.file,
433: 1+row->item.origin.line);
434: else
435: dest+=sprintf(dest, "<unknown>");
436: #endif
1.81 paf 437: uchar show_lang=row->item.lang & ~UL_OPTIMIZE_BIT;
438: if(show_lang>=sizeof(String_Untaint_lang_name)/sizeof(String_Untaint_lang_name[0]))
439: throw Exception(0, 0,
440: this,
441: "unknown untaint language #%d",
442: static_cast<int>(show_lang)); // sould never
443:
444: dest+=sprintf(dest, "#%s%s: ",
445: String_Untaint_lang_name[show_lang],
446: row->item.lang & UL_OPTIMIZE_BIT?".O":"");
1.76 paf 447: char *dest_after_origins=dest;
448:
449: memcpy(dest, row->item.ptr, row->item.size);
450: dest+=row->item.size;
451:
452: remove_crlf(dest_after_origins, dest);
453: to_char('\n');
1.1 paf 454: }
455: chunk=row->link;
456: } while(chunk);
1.64 parser 457:
1.1 paf 458: break2:
1.76 paf 459: *dest=0;
460: return result;
1.1 paf 461: }