--- parser3/src/main/untaint.C 2001/11/16 13:51:14 1.76 +++ parser3/src/main/untaint.C 2002/02/20 11:15:13 1.93 @@ -1,10 +1,10 @@ /** @file Parser: String class part: untaint mechanizm. - Copyright(c) 2001 ArtLebedev Group(http://www.artlebedev.com) - Author: Alexander Petrosyan (http://paf.design.ru) + Copyright(c) 2001, 2002 ArtLebedev Group (http://www.artlebedev.com) + Author: Alexandr Petrosian (http://paf.design.ru) - $Id: untaint.C,v 1.76 2001/11/16 13:51:14 paf Exp $ + $Id: untaint.C,v 1.93 2002/02/20 11:15:13 paf Exp $ */ #include "pa_pool.h" @@ -16,6 +16,14 @@ #include "pa_sql_connection.h" #include "pa_dictionary.h" #include "pa_common.h" +#include "pa_charset.h" + +#define DEBUG_STRING_APPENDS_VS_EXPANDS + +#ifdef DEBUG_STRING_APPENDS_VS_EXPANDS +ulong string_string_shortcut_economy=0; +#endif + #define escape(action) \ { \ @@ -25,7 +33,6 @@ } #define _default default: *dest++=*src; break #define encode(need_encode_func, prefix) \ - default: \ if(need_encode_func(*src)) { \ static const char *hex="0123456789ABCDEF"; \ char chunk[3]={prefix}; \ @@ -41,14 +48,20 @@ dest+=bsize; \ inline bool need_file_encode(unsigned char c){ + // theoretical problem with, for instance, "_2B" and "." fragments, + // they would yield the same + // because need_file_encode('_')=false + // but we need to delete such files somehow, getting names from ^index + if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) return false; return !strchr( + " _./()-" #ifdef WIN32 ":\\~" #endif - "./()_-", c); + , c); } inline bool need_uri_encode(unsigned char c){ if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) @@ -84,8 +97,7 @@ static const char * String_Untaint_lang_ "SQL", ///< ^table:sql body "JS", ///< JavaScript code "XML", ///< ^dom:set xml - "HTML", ///< HTML code (for editing) - "UHTML", ///< HTML code with USER chars + "HTML" ///< HTML code (for editing) }; @@ -133,272 +145,353 @@ inline bool need_quote_http_header(const return false; } -/// @test UL_OPTIMIZED_HTML optimize -size_t String::cstr_bufsize(Untaint_lang lang, - SQL_Connection *connection, - const char *charset) const { - size_t dest=1; - bool whitespace=true; - const Chunk *chunk=&head; - do { - const Chunk::Row *row=chunk->rows; - for(uint i=0; icount; i++, row++) { - if(row==append_here) - goto break2; +//#include "pa_sapi.h" +/** + appends other String, + marking all tainted pieces of it with @a lang. + or marking ALL pieces of it with a @a lang when @a forced to, + and propagating OPTIMIZE language bit. + + using architecture advantage: after string-to-string-append string never modified. + algorithm: + if no language-change specified and src not yet appended to some other string[last_chunk!=0] + shrinking dest last_chunk[preparing it for linking], + ///shrinking src last_chunk[preparing it to be linked, consequent dest.appends would go there], + linking[dest.last_chunk = src.head] + if some language-change specified or src already appended to some other string[last_chunk==0] + cloning pieces. +*/ +String& String::append(const String& src, uchar lang, bool forced) { + // should never, but just in case... + if(src.is_empty()) + return *this; + + if(lang==UL_PASS_APPENDED && src.last_chunk) { +#ifdef DEBUG_STRING_APPENDS_VS_EXPANDS + string_string_shortcut_economy+=src.used_rows()*sizeof(String::Chunk::Row); +#endif +/* + // using fact: + // src.head.count shrinks-only, + // so can't be less than this.head.count, + // which means that we know that src.head would fit into this.head + if(is_empty()) { // our head is empty + // "your head is my head" + memcpy(head.rows, src.head.rows, sizeof(Chunk::Row)*(head.count=src.head.count)); + // "your body is my body" + head.rows[head.count].link=src.head.rows[src.head.count].link; + } else { // our head contains something + // "chopping my tail-reserve" + last_chunk->count=append_here-last_chunk->rows; + // "you is my tail" + last_chunk->rows[last_chunk->count].link=src.head.rows; + } + // "your append_here is mine now" + append_here=src.append_here; + // "your last_chunk is mine now" + last_chunk=src.last_chunk; - Untaint_lang to_lang=lang==UL_UNSPECIFIED?(Untaint_lang)row->item.lang:lang; +*/ + // stop-growing mark + src.last_chunk=0; +// return; + } - switch(to_lang) { - case UL_CLEAN: - // clean piece - { // optimizing whitespace - escape(switch(*src) { - case ' ': case '\n': case '\t': - if(!whitespace) { - dest++; - whitespace=true; - } - break; - default: - whitespace=false; - dest++; - break; - }); - } - break; - case UL_TAINTED: - // tainted piece, but undefined untaint language - // for VString.as_double of tainted values - // for ^process{body} evaluation - case UL_AS_IS: - // tainted, untaint language: as-is - dest+=row->item.size; - break; - case UL_FILE_SPEC: - // tainted, untaint language: file [name] - dest+=row->item.size*3/* worst: Z->%XX */; - break; - case UL_URI: - // tainted, untaint language: uri - dest+=row->item.size*3/* worst: Z->%XX */; - break; - case UL_HTTP_HEADER: - // tainted, untaint language: http-field-content-text - dest+=row->item.size*3/* worst: Z->%XX */; - break; - case UL_MAIL_HEADER: - // tainted, untaint language: mail-header - if(charset) { - // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= - dest+=row->item.size*3+MAX_STRING/* worst: =?charset?Q?=%XX?= */; - } else { - dest+=row->item.size; - } - break; - case UL_TABLE: - // tainted, untaint language: table - dest+=row->item.size; - break; - case UL_SQL: - // tainted, untaint language: sql - if(connection) - dest+=connection->quote(0, row->item.ptr, row->item.size); - break; - case UL_JS: - escape(switch(*src) { - case '"': case '\'': case '\n': case '\\': case '\xFF': - dest+=2; break; - default: - dest++; break; - }); - break; - case UL_XML: - escape(switch(*src) { - case '&': case '>': case '<': case '"': case '\'': - dest+= 6; break; - default: - dest++; break; - }); - break; - case UL_HTML: - case UL_OPTIMIZED_HTML: - escape(switch(*src) { - case '&': - case '>': - case '<': - case '"': - dest+=6; break; - default: - dest++; break; - }); - break; - } + // manually unrolled code to avoid do{if(const)} constructs + if(forced) + STRING_SRC_FOREACH_ROW( + APPEND(row->item.ptr, row->item.size, + lang, //forcing passed lang + row->item.origin.file, row->item.origin.line); + ) + else if(lang==UL_PASS_APPENDED) + STRING_SRC_FOREACH_ROW( + APPEND(row->item.ptr, row->item.size, + row->item.lang, // passing item's lang + row->item.origin.file, row->item.origin.line); + ) + else if(lang&UL_OPTIMIZE_BIT) // main idea here + // tainted piece would get OPTIMIZED bit from 'lang' + // clean piece would be marked OPTIMIZED manually + // pieces with determined languages [not tainted|clean] would retain theirs langs + STRING_SRC_FOREACH_ROW( + APPEND(row->item.ptr, row->item.size, + row->item.lang==UL_TAINTED?lang:( + row->item.lang==UL_CLEAN?UL_CLEAN|UL_OPTIMIZE_BIT: // ORing with OPTIMIZED flag + row->item.lang + ), + row->item.origin.file, row->item.origin.line); + ) + else + STRING_SRC_FOREACH_ROW( + APPEND(row->item.ptr, row->item.size, + row->item.lang==UL_TAINTED?lang:row->item.lang, + row->item.origin.file, row->item.origin.line); + ); +break2: + return *this; +} - if((lang==UL_UNSPECIFIED?row->item.lang:lang)!=UL_CLEAN) - whitespace=false; +size_t String::cstr_bufsize(Untaint_lang lang, + SQL_Connection *connection, + Charset *buf_charset) const { + size_t dest=1; // for terminating 0 + STRING_FOREACH_ROW( + uchar to_lang=lang==UL_UNSPECIFIED?row->item.lang:lang; + + switch(to_lang & ~UL_OPTIMIZE_BIT) { + case UL_CLEAN: + case UL_TAINTED: + case UL_AS_IS: + // clean piece + + // tainted piece, but undefined untaint language + // for VString.as_double of tainted values + // for ^process{body} evaluation + + // tainted, untaint language: as-is + dest+=row->item.size; + break; + case UL_FILE_SPEC: + // tainted, untaint language: file [name] + dest+=row->item.size*3/* worst: Z->%XX */; + break; + case UL_URI: + // tainted, untaint language: uri + dest+=row->item.size*6*3/* worst utf8 x worst Z->%XX */; + break; + case UL_HTTP_HEADER: + // tainted, untaint language: http-field-content-text + dest+=row->item.size*3/* worst: Z->%XX */; + break; + case UL_MAIL_HEADER: + // tainted, untaint language: mail-header + if(buf_charset) { + // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= + dest+= + row->item.size*3+ + buf_charset->name().size()+MAX_STRING/* worst: =?charset?Q?=%XX?= */; + } else + dest+=row->item.size; + break; + case UL_TABLE: + // tainted, untaint language: table + dest+=row->item.size; + break; + case UL_SQL: + // tainted, untaint language: sql + if(connection) + dest+=connection->quote(0, row->item.ptr, row->item.size); + break; + case UL_JS: + escape(switch(*src) { + case '"': case '\'': case '\n': case '\\': case '\xFF': + dest+=2; break; + default: + dest++; break; + }); + break; + case UL_XML: + escape(switch(*src) { + case '&': case '>': case '<': case '"': case '\'': + dest+= 6; break; + default: + dest++; break; + }); + break; + case UL_HTML: + escape(switch(*src) { + case '&': + case '>': + case '<': + case '"': + dest+=6; break; + default: + dest++; break; + }); + break; } - chunk=row->link; - } while(chunk); - + ); break2: return dest; } -/// @test UL_OPTIMIZED_HTML optimize char *String::store_to(char *dest, Untaint_lang lang, SQL_Connection *connection, - const char *charset) const { + Charset *store_to_charset) const { // WARNING: // before any changes check cstr_bufsize first!!! bool whitespace=true; - const Chunk *chunk=&head; - do { - const Chunk::Row *row=chunk->rows; - for(uint i=0; icount; i++, row++) { - if(row==append_here) - goto break2; - - Untaint_lang to_lang=lang==UL_UNSPECIFIED?(Untaint_lang)row->item.lang:lang; + // expanded STRING_FOREACH_ROW here for debugging purposes + const Chunk *chunk=&head; \ + do { \ + const Chunk::Row *row=chunk->rows; \ + for(uint i=0; icount; i++, row++) { \ + if(row==append_here) \ + goto break2; \ + \ + uchar to_lang=lang==UL_UNSPECIFIED?row->item.lang:lang; + + char *start=dest; + + switch(to_lang & ~UL_OPTIMIZE_BIT) { + case UL_CLEAN: + case UL_TAINTED: + case UL_AS_IS: + // clean piece + + // tainted piece, but undefined untaint language + // for VString.as_double of tainted values + // for ^process{body} evaluation - switch(to_lang) { - case UL_CLEAN: - // clean piece - { // optimizing whitespace - escape(switch(*src) { - case ' ': case '\n': case '\t': - if(!whitespace) { - *dest++=*src; - whitespace=true; - } - break; - default: - whitespace=false; - *dest++=*src; - break; - }); - } - break; - case UL_TAINTED: - // tainted piece, but undefined untaint language - // for VString.as_double of tainted values - // for ^process{body} evaluation - case UL_AS_IS: - // tainted, untaint language: as-is - memcpy(dest, row->item.ptr, row->item.size); - dest+=row->item.size; - break; - case UL_FILE_SPEC: - // tainted, untaint language: file [name] - escape(switch(*src) { - case ' ': to_char('_'); break; - encode(need_file_encode, '+'); - }); - break; - case UL_URI: - // tainted, untaint language: uri - escape(switch(*src) { - case ' ': to_char('+'); break; - encode(need_uri_encode, '%'); - }); - break; - case UL_HTTP_HEADER: - // tainted, untaint language: http-field-content-text - escape(switch(*src) { - case ' ': to_char('+'); break; - encode(need_uri_encode, '%'); - }); - break; - case UL_MAIL_HEADER: - // tainted, untaint language: mail-header - if(charset) { - // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= - const char *src=row->item.ptr; - bool to_quoted_printable=false; - for(int size=row->item.size; size--; src++) { - if(*src & 0x80) { - if(!to_quoted_printable) { - dest+=sprintf(dest, "=?%.15s?Q?", charset); - to_quoted_printable=true; - } - dest+=sprintf(dest, "=%02X", *src & 0xFF); - } else { - *dest++=*src; + // tainted, untaint language: as-is + memcpy(dest, row->item.ptr, row->item.size); + dest+=row->item.size; + break; + case UL_FILE_SPEC: + // tainted, untaint language: file [name] + escape( + encode(need_file_encode, '_'); + ); + break; + case UL_URI: + // tainted, untaint language: uri + const void *client_ptr; + size_t client_size; + Charset::transcode(pool(), + pool().get_source_charset(), row->item.ptr, row->item.size, + pool().get_client_charset(), client_ptr, client_size); + { + const char *src=(const char *)client_ptr; + for(int size=client_size; size--; src++) + switch(*src) { + case ' ': to_char('+'); break; + default: encode(need_uri_encode, '%'); + }; + } + break; + case UL_HTTP_HEADER: + // tainted, untaint language: http-field-content-text + escape(switch(*src) { + case ' ': to_char('+'); break; + default: encode(need_uri_encode, '%'); + }); + break; + case UL_MAIL_HEADER: + // tainted, untaint language: mail-header + if(store_to_charset) { + const void *mail_ptr; + size_t mail_size; + Charset::transcode(pool(), + pool().get_source_charset(), row->item.ptr, row->item.size, + *store_to_charset, mail_ptr, mail_size); + + // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= + const char *src=(const char *)mail_ptr; + bool to_quoted_printable=false; + for(int size=mail_size; size--; src++) { + if(*src & 0x80) { + if(!to_quoted_printable) { + dest+=sprintf(dest, "=?%s?Q?", store_to_charset->name().cstr()); + to_quoted_printable=true; } + dest+=sprintf(dest, "=%02X", *src & 0xFF); + } else { + *dest++=*src; } - if(to_quoted_printable) // close - dest+=sprintf(dest, "?="); - } else { - memcpy(dest, row->item.ptr, row->item.size); - dest+=row->item.size; } - break; - case UL_TABLE: - // tainted, untaint language: table - escape(switch(*src) { - case '\t': to_char(' '); break; - case '\n': to_char(' '); break; - _default; - }); - break; - case UL_SQL: - // tainted, untaint language: sql - if(connection) - dest+=connection->quote(dest, row->item.ptr, row->item.size); - else - throw Exception(0, 0, - this, - "untaint in SQL language failed - no connection specified"); - break; - case UL_JS: - escape(switch(*src) { - case '"': to_string("\\\"", 2); break; - case '\'': to_string("\\'", 2); break; - case '\n': to_string("\\n", 2); break; - case '\\': to_string("\\\\", 2); break; - case '\xFF': to_string("\\\xFF", 2); break; - _default; - }); - break; - case UL_XML: - escape(switch(*src) { - case '&': to_string("&", 5); break; - case '>': to_string(">", 4); break; - case '<': to_string("<", 4); break; - case '"': to_string(""", 6); break; - case '\'': to_string("'", 6); break; - _default; - }); - break; - case UL_HTML: - case UL_OPTIMIZED_HTML: - escape(switch(*src) { - case '&': to_string("&", 5); break; - case '>': to_string(">", 4); break; - case '<': to_string("<", 4); break; - case '"': to_string(""", 6); break; - _default; - }); - break; - default: - throw Exception(0, 0, - this, - "unknown untaint language #%d of %d piece", - static_cast(row->item.lang), - i); // never - break; // never + if(to_quoted_printable) // close + dest+=sprintf(dest, "?="); + + } else { + memcpy(dest, row->item.ptr, row->item.size); + dest+=row->item.size; } - - if((lang==UL_UNSPECIFIED?row->item.lang:lang)!=UL_CLEAN) - whitespace=false; + break; + case UL_TABLE: + // tainted, untaint language: table + escape(switch(*src) { + case '\t': to_char(' '); break; + case '\n': to_char(' '); break; + _default; + }); + break; + case UL_SQL: + // tainted, untaint language: sql + if(connection) + dest+=connection->quote(dest, row->item.ptr, row->item.size); + else + throw Exception(0, 0, + this, + "untaint in SQL language failed - no connection specified"); + break; + case UL_JS: + escape(switch(*src) { + case '"': to_string("\\\"", 2); break; + case '\'': to_string("\\'", 2); break; + case '\n': to_string("\\n", 2); break; + case '\\': to_string("\\\\", 2); break; + case '\xFF': to_string("\\\xFF", 2); break; + _default; + }); + break; + case UL_XML: + escape(switch(*src) { + case '&': to_string("&", 5); break; + case '>': to_string(">", 4); break; + case '<': to_string("<", 4); break; + case '"': to_string(""", 6); break; + case '\'': to_string("'", 6); break; + _default; + }); + break; + case UL_HTML: + escape(switch(*src) { + case '&': to_string("&", 5); break; + case '>': to_string(">", 4); break; + case '<': to_string("<", 4); break; + case '"': to_string(""", 6); break; + _default; + }); + break; + default: + throw Exception(0, 0, + this, + "unknown untaint language #%d", + static_cast(row->item.lang)); // sould never + break; // never } - chunk=row->link; - } while(chunk); + + if(to_lang & UL_OPTIMIZE_BIT) { + // optimizing whitespace + char *stop=dest; dest=start; + for(char *src=start; srclink; \ + } while(chunk); \ break2: return dest; } char *String::cstr_debug_origins() const { + //_asm int 3; char *result=(char *)malloc(size()+used_rows()*MAX_STRING*2); char *dest=result; @@ -417,8 +510,16 @@ char *String::cstr_debug_origins() const else dest+=sprintf(dest, ""); #endif - dest+=sprintf(dest, "#%s: ", - String_Untaint_lang_name[row->item.lang]); + uchar show_lang=row->item.lang & ~UL_OPTIMIZE_BIT; + if(show_lang>=sizeof(String_Untaint_lang_name)/sizeof(String_Untaint_lang_name[0])) + throw Exception(0, 0, + this, + "unknown untaint language #%d", + static_cast(show_lang)); // sould never + + dest+=sprintf(dest, "#%s%s: ", + String_Untaint_lang_name[show_lang], + row->item.lang & UL_OPTIMIZE_BIT?".O":""); char *dest_after_origins=dest; memcpy(dest, row->item.ptr, row->item.size);