--- parser3/src/main/untaint.C 2001/11/21 08:26:55 1.78 +++ parser3/src/main/untaint.C 2002/02/20 11:15:13 1.93 @@ -1,10 +1,10 @@ /** @file Parser: String class part: untaint mechanizm. - Copyright(c) 2001 ArtLebedev Group(http://www.artlebedev.com) - Author: Alexander Petrosyan (http://paf.design.ru) + Copyright(c) 2001, 2002 ArtLebedev Group (http://www.artlebedev.com) + Author: Alexandr Petrosian (http://paf.design.ru) - $Id: untaint.C,v 1.78 2001/11/21 08:26:55 paf Exp $ + $Id: untaint.C,v 1.93 2002/02/20 11:15:13 paf Exp $ */ #include "pa_pool.h" @@ -16,6 +16,14 @@ #include "pa_sql_connection.h" #include "pa_dictionary.h" #include "pa_common.h" +#include "pa_charset.h" + +#define DEBUG_STRING_APPENDS_VS_EXPANDS + +#ifdef DEBUG_STRING_APPENDS_VS_EXPANDS +ulong string_string_shortcut_economy=0; +#endif + #define escape(action) \ { \ @@ -25,7 +33,6 @@ } #define _default default: *dest++=*src; break #define encode(need_encode_func, prefix) \ - default: \ if(need_encode_func(*src)) { \ static const char *hex="0123456789ABCDEF"; \ char chunk[3]={prefix}; \ @@ -41,14 +48,20 @@ dest+=bsize; \ inline bool need_file_encode(unsigned char c){ + // theoretical problem with, for instance, "_2B" and "." fragments, + // they would yield the same + // because need_file_encode('_')=false + // but we need to delete such files somehow, getting names from ^index + if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) return false; return !strchr( + " _./()-" #ifdef WIN32 ":\\~" #endif - "./()_-", c); + , c); } inline bool need_uri_encode(unsigned char c){ if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) @@ -84,8 +97,7 @@ static const char * String_Untaint_lang_ "SQL", ///< ^table:sql body "JS", ///< JavaScript code "XML", ///< ^dom:set xml - "HTML", ///< HTML code (for editing) - "UHTML", ///< HTML code with USER chars + "HTML" ///< HTML code (for editing) }; @@ -133,7 +145,58 @@ inline bool need_quote_http_header(const return false; } +//#include "pa_sapi.h" +/** + appends other String, + marking all tainted pieces of it with @a lang. + or marking ALL pieces of it with a @a lang when @a forced to, + and propagating OPTIMIZE language bit. + + using architecture advantage: after string-to-string-append string never modified. + algorithm: + if no language-change specified and src not yet appended to some other string[last_chunk!=0] + shrinking dest last_chunk[preparing it for linking], + ///shrinking src last_chunk[preparing it to be linked, consequent dest.appends would go there], + linking[dest.last_chunk = src.head] + if some language-change specified or src already appended to some other string[last_chunk==0] + cloning pieces. +*/ String& String::append(const String& src, uchar lang, bool forced) { + // should never, but just in case... + if(src.is_empty()) + return *this; + + if(lang==UL_PASS_APPENDED && src.last_chunk) { +#ifdef DEBUG_STRING_APPENDS_VS_EXPANDS + string_string_shortcut_economy+=src.used_rows()*sizeof(String::Chunk::Row); +#endif +/* + // using fact: + // src.head.count shrinks-only, + // so can't be less than this.head.count, + // which means that we know that src.head would fit into this.head + if(is_empty()) { // our head is empty + // "your head is my head" + memcpy(head.rows, src.head.rows, sizeof(Chunk::Row)*(head.count=src.head.count)); + // "your body is my body" + head.rows[head.count].link=src.head.rows[src.head.count].link; + } else { // our head contains something + // "chopping my tail-reserve" + last_chunk->count=append_here-last_chunk->rows; + // "you is my tail" + last_chunk->rows[last_chunk->count].link=src.head.rows; + } + // "your append_here is mine now" + append_here=src.append_here; + // "your last_chunk is mine now" + last_chunk=src.last_chunk; + +*/ + // stop-growing mark + src.last_chunk=0; +// return; + } + // manually unrolled code to avoid do{if(const)} constructs if(forced) STRING_SRC_FOREACH_ROW( @@ -171,7 +234,7 @@ break2: size_t String::cstr_bufsize(Untaint_lang lang, SQL_Connection *connection, - const char *charset) const { + Charset *buf_charset) const { size_t dest=1; // for terminating 0 STRING_FOREACH_ROW( uchar to_lang=lang==UL_UNSPECIFIED?row->item.lang:lang; @@ -195,7 +258,7 @@ size_t String::cstr_bufsize(Untaint_lang break; case UL_URI: // tainted, untaint language: uri - dest+=row->item.size*3/* worst: Z->%XX */; + dest+=row->item.size*6*3/* worst utf8 x worst Z->%XX */; break; case UL_HTTP_HEADER: // tainted, untaint language: http-field-content-text @@ -203,12 +266,13 @@ size_t String::cstr_bufsize(Untaint_lang break; case UL_MAIL_HEADER: // tainted, untaint language: mail-header - if(charset) { + if(buf_charset) { // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= - dest+=row->item.size*3+MAX_STRING/* worst: =?charset?Q?=%XX?= */; - } else { + dest+= + row->item.size*3+ + buf_charset->name().size()+MAX_STRING/* worst: =?charset?Q?=%XX?= */; + } else dest+=row->item.size; - } break; case UL_TABLE: // tainted, untaint language: table @@ -254,10 +318,11 @@ break2: char *String::store_to(char *dest, Untaint_lang lang, SQL_Connection *connection, - const char *charset) const { + Charset *store_to_charset) const { // WARNING: // before any changes check cstr_bufsize first!!! bool whitespace=true; + // expanded STRING_FOREACH_ROW here for debugging purposes const Chunk *chunk=&head; \ do { \ const Chunk::Row *row=chunk->rows; \ @@ -285,35 +350,49 @@ char *String::store_to(char *dest, Untai break; case UL_FILE_SPEC: // tainted, untaint language: file [name] - escape(switch(*src) { - case ' ': to_char('_'); break; - encode(need_file_encode, '+'); - }); + escape( + encode(need_file_encode, '_'); + ); break; case UL_URI: // tainted, untaint language: uri - escape(switch(*src) { - case ' ': to_char('+'); break; - encode(need_uri_encode, '%'); - }); + const void *client_ptr; + size_t client_size; + Charset::transcode(pool(), + pool().get_source_charset(), row->item.ptr, row->item.size, + pool().get_client_charset(), client_ptr, client_size); + { + const char *src=(const char *)client_ptr; + for(int size=client_size; size--; src++) + switch(*src) { + case ' ': to_char('+'); break; + default: encode(need_uri_encode, '%'); + }; + } break; case UL_HTTP_HEADER: // tainted, untaint language: http-field-content-text escape(switch(*src) { case ' ': to_char('+'); break; - encode(need_uri_encode, '%'); + default: encode(need_uri_encode, '%'); }); break; case UL_MAIL_HEADER: // tainted, untaint language: mail-header - if(charset) { + if(store_to_charset) { + const void *mail_ptr; + size_t mail_size; + Charset::transcode(pool(), + pool().get_source_charset(), row->item.ptr, row->item.size, + *store_to_charset, mail_ptr, mail_size); + // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= - const char *src=row->item.ptr; + const char *src=(const char *)mail_ptr; bool to_quoted_printable=false; - for(int size=row->item.size; size--; src++) { + for(int size=mail_size; size--; src++) { if(*src & 0x80) { if(!to_quoted_printable) { - dest+=sprintf(dest, "=?%.15s?Q?", charset); + dest+=sprintf(dest, "=?%s?Q?", store_to_charset->name().cstr()); to_quoted_printable=true; } dest+=sprintf(dest, "=%02X", *src & 0xFF); @@ -323,6 +402,7 @@ char *String::store_to(char *dest, Untai } if(to_quoted_printable) // close dest+=sprintf(dest, "?="); + } else { memcpy(dest, row->item.ptr, row->item.size); dest+=row->item.size; @@ -377,9 +457,8 @@ char *String::store_to(char *dest, Untai default: throw Exception(0, 0, this, - "unknown untaint language #%d of %d piece", - static_cast(row->item.lang), - i); // never + "unknown untaint language #%d", + static_cast(row->item.lang)); // sould never break; // never } @@ -389,7 +468,7 @@ char *String::store_to(char *dest, Untai for(char *src=start; src"); #endif - dest+=sprintf(dest, "#%s: ", - String_Untaint_lang_name[row->item.lang]); + uchar show_lang=row->item.lang & ~UL_OPTIMIZE_BIT; + if(show_lang>=sizeof(String_Untaint_lang_name)/sizeof(String_Untaint_lang_name[0])) + throw Exception(0, 0, + this, + "unknown untaint language #%d", + static_cast(show_lang)); // sould never + + dest+=sprintf(dest, "#%s%s: ", + String_Untaint_lang_name[show_lang], + row->item.lang & UL_OPTIMIZE_BIT?".O":""); char *dest_after_origins=dest; memcpy(dest, row->item.ptr, row->item.size);