--- parser3/src/main/untaint.C 2001/11/21 08:26:55 1.78 +++ parser3/src/main/untaint.C 2002/03/27 15:30:37 1.99 @@ -1,10 +1,10 @@ /** @file Parser: String class part: untaint mechanizm. - Copyright(c) 2001 ArtLebedev Group(http://www.artlebedev.com) - Author: Alexander Petrosyan (http://paf.design.ru) + Copyright(c) 2001, 2002 ArtLebedev Group (http://www.artlebedev.com) + Author: Alexandr Petrosian (http://paf.design.ru) - $Id: untaint.C,v 1.78 2001/11/21 08:26:55 paf Exp $ + $Id: untaint.C,v 1.99 2002/03/27 15:30:37 paf Exp $ */ #include "pa_pool.h" @@ -16,6 +16,13 @@ #include "pa_sql_connection.h" #include "pa_dictionary.h" #include "pa_common.h" +#include "pa_charset.h" + +//#define DEBUG_STRING_APPENDS_VS_EXPANDS + +#ifdef DEBUG_STRING_APPENDS_VS_EXPANDS +ulong string_string_shortcut_economy=0; +#endif #define escape(action) \ { \ @@ -25,7 +32,6 @@ } #define _default default: *dest++=*src; break #define encode(need_encode_func, prefix) \ - default: \ if(need_encode_func(*src)) { \ static const char *hex="0123456789ABCDEF"; \ char chunk[3]={prefix}; \ @@ -41,14 +47,20 @@ dest+=bsize; \ inline bool need_file_encode(unsigned char c){ + // theoretical problem with, for instance, "_2B" and "." fragments, + // they would yield the same + // because need_file_encode('_')=false + // but we need to delete such files somehow, getting names from ^index + if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) return false; return !strchr( + " _./()-" #ifdef WIN32 ":\\~" #endif - "./()_-", c); + , c); } inline bool need_uri_encode(unsigned char c){ if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) @@ -84,8 +96,7 @@ static const char * String_Untaint_lang_ "SQL", ///< ^table:sql body "JS", ///< JavaScript code "XML", ///< ^dom:set xml - "HTML", ///< HTML code (for editing) - "UHTML", ///< HTML code with USER chars + "HTML" ///< HTML code (for editing) }; @@ -133,7 +144,79 @@ inline bool need_quote_http_header(const return false; } +//#include "pa_sapi.h" +/** + appends other String, + marking all tainted pieces of it with @a lang. + or marking ALL pieces of it with a @a lang when @a forced to, + and propagating OPTIMIZE language bit. + + using architecture advantage: after string-to-string-append string never modified. + algorithm: + if no language-change specified and src not yet appended to some other string[last_chunk!=0] + shrinking dest last_chunk[preparing it for linking], + ///shrinking src last_chunk[preparing it to be linked, consequent dest.appends would go there], + linking[dest.last_chunk = src.head] + if some language-change specified or src already appended to some other string[last_chunk==0] + cloning pieces. +*/ String& String::append(const String& src, uchar lang, bool forced) { + if(!last_chunk) // growth stopped [we're appended as string to somebody] + throw Exception(0, + this, + "string growth stopped (append string)"); + + if(src.is_empty()) + return *this; + + // without language-chage, not-appended-before, big[not fitting our tail] string? + if(lang==UL_PASS_APPENDED + && src.last_chunk + && (uint(&last_chunk->rows[last_chunk->count]-append_here) < src.used_rows())) { +#ifdef DEBUG_STRING_APPENDS_VS_EXPANDS + string_string_shortcut_economy+=src.used_rows()*sizeof(String::Chunk::Row); +#endif + + // using fact: + // src.head.count initally equeals this.head.count and shrinks-only, + // so can't be more than this.head.count, + // which means that we know that + // src.head would fit into this.head + if(is_empty()) { // our head is empty + // they have more than head? we need all head : we need only filled-part of head + Chunk *src_head_link=src.head.rows[src.head.count].link; + size_t head_count=src_head_link?src.head.count:(src.append_here-src.head.rows); + // "your head is my head" + memcpy(head.rows, src.head.rows, sizeof(Chunk::Row)*(head_count)); + if(src_head_link) { + // "your body is my body" + head.rows[head.count=head_count].link=src_head_link; + // "your last_chunk is mine now" + last_chunk=src.last_chunk; + // "your append_here is mine now" + append_here=src.append_here; + } else { + // "your last_chunk is mine now" + last_chunk=&head; + // "your append_here is recalc-mine now" + append_here=head.rows+head_count; + } + } else { // our head contains something + // "chopping off my tail-reserve" + last_chunk->count=append_here-last_chunk->rows; + // "you is my tail" + append_here->link=&src.head; + // "your last_chunk is mine now" + last_chunk=src.last_chunk; + // "your append_here is mine now" + append_here=src.append_here; + } + + // stop-growing mark + src.last_chunk=0; + return *this; + } + // manually unrolled code to avoid do{if(const)} constructs if(forced) STRING_SRC_FOREACH_ROW( @@ -165,13 +248,16 @@ String& String::append(const String& src row->item.lang==UL_TAINTED?lang:row->item.lang, row->item.origin.file, row->item.origin.line); ); -break2: +/* + for(Chunk::Row *row=last_chunk->rows; rowlink==(void*)0xcdcdcdcd) + _asm int 3;*/ return *this; } size_t String::cstr_bufsize(Untaint_lang lang, SQL_Connection *connection, - const char *charset) const { + Charset *buf_charset) const { size_t dest=1; // for terminating 0 STRING_FOREACH_ROW( uchar to_lang=lang==UL_UNSPECIFIED?row->item.lang:lang; @@ -195,7 +281,7 @@ size_t String::cstr_bufsize(Untaint_lang break; case UL_URI: // tainted, untaint language: uri - dest+=row->item.size*3/* worst: Z->%XX */; + dest+=row->item.size*6*3/* worst utf8 x worst Z->%XX */; break; case UL_HTTP_HEADER: // tainted, untaint language: http-field-content-text @@ -203,12 +289,13 @@ size_t String::cstr_bufsize(Untaint_lang break; case UL_MAIL_HEADER: // tainted, untaint language: mail-header - if(charset) { + if(buf_charset) { // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= - dest+=row->item.size*3+MAX_STRING/* worst: =?charset?Q?=%XX?= */; - } else { + dest+= + row->item.size*3+ + buf_charset->name().size()+MAX_STRING/* worst: =?charset?Q?=%XX?= */; + } else dest+=row->item.size; - } break; case UL_TABLE: // tainted, untaint language: table @@ -248,23 +335,16 @@ size_t String::cstr_bufsize(Untaint_lang break; } ); -break2: return dest; } char *String::store_to(char *dest, Untaint_lang lang, SQL_Connection *connection, - const char *charset) const { + Charset *store_to_charset) const { // WARNING: // before any changes check cstr_bufsize first!!! bool whitespace=true; - const Chunk *chunk=&head; \ - do { \ - const Chunk::Row *row=chunk->rows; \ - for(uint i=0; icount; i++, row++) { \ - if(row==append_here) \ - goto break2; \ - \ + STRING_FOREACH_ROW( uchar to_lang=lang==UL_UNSPECIFIED?row->item.lang:lang; char *start=dest; @@ -285,35 +365,49 @@ char *String::store_to(char *dest, Untai break; case UL_FILE_SPEC: // tainted, untaint language: file [name] - escape(switch(*src) { - case ' ': to_char('_'); break; - encode(need_file_encode, '+'); - }); + escape( + encode(need_file_encode, '_'); + ); break; case UL_URI: // tainted, untaint language: uri - escape(switch(*src) { - case ' ': to_char('+'); break; - encode(need_uri_encode, '%'); - }); + const void *client_ptr; + size_t client_size; + Charset::transcode(pool(), + pool().get_source_charset(), row->item.ptr, row->item.size, + pool().get_client_charset(), client_ptr, client_size); + { + const char *src=(const char *)client_ptr; + for(int size=client_size; size--; src++) + switch(*src) { + case ' ': to_char('+'); break; + default: encode(need_uri_encode, '%'); + }; + } break; case UL_HTTP_HEADER: // tainted, untaint language: http-field-content-text escape(switch(*src) { case ' ': to_char('+'); break; - encode(need_uri_encode, '%'); + default: encode(need_uri_encode, '%'); }); break; case UL_MAIL_HEADER: // tainted, untaint language: mail-header - if(charset) { + if(store_to_charset) { + const void *mail_ptr; + size_t mail_size; + Charset::transcode(pool(), + pool().get_source_charset(), row->item.ptr, row->item.size, + *store_to_charset, mail_ptr, mail_size); + // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= - const char *src=row->item.ptr; + const char *src=(const char *)mail_ptr; bool to_quoted_printable=false; - for(int size=row->item.size; size--; src++) { + for(int size=mail_size; size--; src++) { if(*src & 0x80) { if(!to_quoted_printable) { - dest+=sprintf(dest, "=?%.15s?Q?", charset); + dest+=sprintf(dest, "=?%s?Q?", store_to_charset->name().cstr()); to_quoted_printable=true; } dest+=sprintf(dest, "=%02X", *src & 0xFF); @@ -323,6 +417,7 @@ char *String::store_to(char *dest, Untai } if(to_quoted_printable) // close dest+=sprintf(dest, "?="); + } else { memcpy(dest, row->item.ptr, row->item.size); dest+=row->item.size; @@ -341,7 +436,7 @@ char *String::store_to(char *dest, Untai if(connection) dest+=connection->quote(dest, row->item.ptr, row->item.size); else - throw Exception(0, 0, + throw Exception(0, this, "untaint in SQL language failed - no connection specified"); break; @@ -375,11 +470,10 @@ char *String::store_to(char *dest, Untai }); break; default: - throw Exception(0, 0, + throw Exception(0, this, - "unknown untaint language #%d of %d piece", - static_cast(row->item.lang), - i); // never + "unknown untaint language #%d", + static_cast(row->item.lang)); // sould never break; // never } @@ -389,7 +483,7 @@ char *String::store_to(char *dest, Untai for(char *src=start; srclink; \ - } while(chunk); \ - -break2: return dest; } char *String::cstr_debug_origins() const { + //_asm int 3; char *result=(char *)malloc(size()+used_rows()*MAX_STRING*2); char *dest=result; - const Chunk *chunk=&head; - do { - const Chunk::Row *row=chunk->rows; - for(uint i=0; icount; i++, row++) { - if(row==append_here) - goto break2; - -#ifndef NO_STRING_ORIGIN - if(row->item.origin.file) - dest+=sprintf(dest, ORIGIN_FILE_LINE_FORMAT, - row->item.origin.file, - 1+row->item.origin.line); - else - dest+=sprintf(dest, ""); -#endif - dest+=sprintf(dest, "#%s: ", - String_Untaint_lang_name[row->item.lang]); - char *dest_after_origins=dest; + STRING_FOREACH_ROW( +IFNDEF_NO_STRING_ORIGIN( + if(row->item.origin.file) + dest+=sprintf(dest, ORIGIN_FILE_LINE_FORMAT, + row->item.origin.file, + 1+row->item.origin.line); + else + dest+=sprintf(dest, ""); +); + uchar show_lang=row->item.lang & ~UL_OPTIMIZE_BIT; + if(show_lang>=sizeof(String_Untaint_lang_name)/sizeof(String_Untaint_lang_name[0])) + throw Exception(0, + this, + "unknown untaint language #%d", + static_cast(show_lang)); // sould never - memcpy(dest, row->item.ptr, row->item.size); - dest+=row->item.size; + dest+=sprintf(dest, "#%s%s: ", + String_Untaint_lang_name[show_lang], + row->item.lang & UL_OPTIMIZE_BIT?".O":""); + char *dest_after_origins=dest; - remove_crlf(dest_after_origins, dest); - to_char('\n'); - } - chunk=row->link; - } while(chunk); + memcpy(dest, row->item.ptr, row->item.size); + dest+=row->item.size; + + remove_crlf(dest_after_origins, dest); + to_char('\n'); + ); -break2: *dest=0; return result; }