--- parser3/src/main/untaint.C 2001/04/05 13:19:43 1.34 +++ parser3/src/main/untaint.C 2001/11/16 13:51:14 1.76 @@ -2,14 +2,11 @@ Parser: String class part: untaint mechanizm. Copyright(c) 2001 ArtLebedev Group(http://www.artlebedev.com) + Author: Alexander Petrosyan (http://paf.design.ru) - Author: Alexander Petrosyan (http://design.ru/paf) - - $Id: untaint.C,v 1.34 2001/04/05 13:19:43 paf Exp $ + $Id: untaint.C,v 1.76 2001/11/16 13:51:14 paf Exp $ */ -#include "pa_config_includes.h" - #include "pa_pool.h" #include "pa_string.h" #include "pa_hash.h" @@ -17,6 +14,8 @@ #include "pa_table.h" #include "pa_globals.h" #include "pa_sql_connection.h" +#include "pa_dictionary.h" +#include "pa_common.h" #define escape(action) \ { \ @@ -32,13 +31,13 @@ char chunk[3]={prefix}; \ chunk[1]=hex[((unsigned char)*src)/0x10]; \ chunk[2]=hex[((unsigned char)*src)%0x10]; \ - strncpy(dest, chunk, 3); dest+=3; \ + memcpy(dest, chunk, 3); dest+=3; \ } else \ *dest++=*src; \ break #define to_char(c) *dest++=c #define to_string(b, bsize) \ - strncpy(dest, b, bsize); \ + memcpy(dest, b, bsize); \ dest+=bsize; \ inline bool need_file_encode(unsigned char c){ @@ -47,9 +46,9 @@ inline bool need_file_encode(unsigned ch return !strchr( #ifdef WIN32 - ":\\" + ":\\~" #endif - "./", c); + "./()_-", c); } inline bool need_uri_encode(unsigned char c){ if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) @@ -57,47 +56,228 @@ inline bool need_uri_encode(unsigned cha return !strchr("_-./", c); } -inline bool need_header_encode(unsigned char c){ +inline bool need_http_header_encode(unsigned char c){ if(strchr(" , :", c)) return false; return need_uri_encode(c); } +// + +static const char * String_Untaint_lang_name[]={ + "U", ///< zero value handy for hash lookup @see untaint_lang_name2enum + "C", ///< clean + "T", ///< tainted, untaint language as assigned later + // untaint languages. assigned by ^untaint[lang]{...} + "P", + /**< + leave language built into string being appended. + just a flag, that value not stored + */ + "A", ///< leave all characters intact + "F", ///< file specification + "H", ///< ext in HTTP response header + "M", ///< text in mail header + "URI", ///< text in uri + "T", ///< ^table:set body + "SQL", ///< ^table:sql body + "JS", ///< JavaScript code + "XML", ///< ^dom:set xml + "HTML", ///< HTML code (for editing) + "UHTML", ///< HTML code with USER chars +}; + + // String -static bool typo_present(Array::Item *value, const void *info) { - Array *row=static_cast(value); - const char *src=static_cast(info); - - int partial; - row->get_string(0)->cmp(partial, src); - return - partial==0 || // full match - partial==1; // typo left column starts 'src' -} +/* + +HTTP-header = field-name ":" [ field-value ] CRLF + + field-name = token + field-value = *( field-content | LWS ) + + field-content = + + + +word = token | quoted-string + +token = 1* + + + +tspecials = "(" | ")" | "<" | ">" | "@" + | "," | ";" | ":" | "\" | <"> + | "/" | "[" | "]" | "?" | "=" + | "{" | "}" | SP | HT + +SP = +HT = -/** - @test optimize whitespaces for all but 'html' - @todo fix theoretical \n mem overrun in TYPO replacements +LWS = [CRLF] 1*( SP | HT ) +TEXT = + +quoted-pair = "\" CHAR + + if(strchr("()<>@,;:\\\"/[]?={} \t", *ptr)) */ -char *String::store_to(char *dest, Untaint_lang lang, SQL_Connection *connection) const { - // $MAIN:html-typo table - Table *user_typo_table=static_cast(pool().tag()); - Table *typo_table=user_typo_table?user_typo_table:default_typo_table; +inline bool need_quote_http_header(const char *ptr, size_t size) { + for(; size--; ptr++) + if(strchr(";\\\"= \t" /* excluded ()<>@, :/ ? []{} */, *ptr)) + return true; + return false; +} +/// @test UL_OPTIMIZED_HTML optimize +size_t String::cstr_bufsize(Untaint_lang lang, + SQL_Connection *connection, + const char *charset) const { + size_t dest=1; + bool whitespace=true; const Chunk *chunk=&head; do { const Chunk::Row *row=chunk->rows; - for(size_t i=0; icount; i++, row++) { + for(uint i=0; icount; i++, row++) { if(row==append_here) goto break2; - // WARNING: - // string can grow only UNTAINT_TIMES_BIGGER - switch(lang==UL_UNKNOWN?row->item.lang:lang) { + Untaint_lang to_lang=lang==UL_UNSPECIFIED?(Untaint_lang)row->item.lang:lang; + + switch(to_lang) { case UL_CLEAN: // clean piece + { // optimizing whitespace + escape(switch(*src) { + case ' ': case '\n': case '\t': + if(!whitespace) { + dest++; + whitespace=true; + } + break; + default: + whitespace=false; + dest++; + break; + }); + } + break; + case UL_TAINTED: + // tainted piece, but undefined untaint language + // for VString.as_double of tainted values + // for ^process{body} evaluation + case UL_AS_IS: + // tainted, untaint language: as-is + dest+=row->item.size; + break; + case UL_FILE_SPEC: + // tainted, untaint language: file [name] + dest+=row->item.size*3/* worst: Z->%XX */; + break; + case UL_URI: + // tainted, untaint language: uri + dest+=row->item.size*3/* worst: Z->%XX */; + break; + case UL_HTTP_HEADER: + // tainted, untaint language: http-field-content-text + dest+=row->item.size*3/* worst: Z->%XX */; + break; + case UL_MAIL_HEADER: + // tainted, untaint language: mail-header + if(charset) { + // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= + dest+=row->item.size*3+MAX_STRING/* worst: =?charset?Q?=%XX?= */; + } else { + dest+=row->item.size; + } + break; + case UL_TABLE: + // tainted, untaint language: table + dest+=row->item.size; + break; + case UL_SQL: + // tainted, untaint language: sql + if(connection) + dest+=connection->quote(0, row->item.ptr, row->item.size); + break; + case UL_JS: + escape(switch(*src) { + case '"': case '\'': case '\n': case '\\': case '\xFF': + dest+=2; break; + default: + dest++; break; + }); + break; + case UL_XML: + escape(switch(*src) { + case '&': case '>': case '<': case '"': case '\'': + dest+= 6; break; + default: + dest++; break; + }); + break; + case UL_HTML: + case UL_OPTIMIZED_HTML: + escape(switch(*src) { + case '&': + case '>': + case '<': + case '"': + dest+=6; break; + default: + dest++; break; + }); + break; + } + + if((lang==UL_UNSPECIFIED?row->item.lang:lang)!=UL_CLEAN) + whitespace=false; + } + chunk=row->link; + } while(chunk); + +break2: + return dest; +} + +/// @test UL_OPTIMIZED_HTML optimize +char *String::store_to(char *dest, Untaint_lang lang, + SQL_Connection *connection, + const char *charset) const { + // WARNING: + // before any changes check cstr_bufsize first!!! + bool whitespace=true; + const Chunk *chunk=&head; + do { + const Chunk::Row *row=chunk->rows; + for(uint i=0; icount; i++, row++) { + if(row==append_here) + goto break2; + + Untaint_lang to_lang=lang==UL_UNSPECIFIED?(Untaint_lang)row->item.lang:lang; + + switch(to_lang) { + case UL_CLEAN: + // clean piece + { // optimizing whitespace + escape(switch(*src) { + case ' ': case '\n': case '\t': + if(!whitespace) { + *dest++=*src; + whitespace=true; + } + break; + default: + whitespace=false; + *dest++=*src; + break; + }); + } + break; case UL_TAINTED: // tainted piece, but undefined untaint language // for VString.as_double of tainted values @@ -107,11 +287,11 @@ char *String::store_to(char *dest, Untai memcpy(dest, row->item.ptr, row->item.size); dest+=row->item.size; break; - case UL_FILE_NAME: + case UL_FILE_SPEC: // tainted, untaint language: file [name] escape(switch(*src) { case ' ': to_char('_'); break; - encode(need_file_encode, '-'); + encode(need_file_encode, '+'); }); break; case UL_URI: @@ -121,12 +301,37 @@ char *String::store_to(char *dest, Untai encode(need_uri_encode, '%'); }); break; - case UL_HEADER: - // tainted, untaint language: header + case UL_HTTP_HEADER: + // tainted, untaint language: http-field-content-text escape(switch(*src) { - encode(need_header_encode, '%'); + case ' ': to_char('+'); break; + encode(need_uri_encode, '%'); }); break; + case UL_MAIL_HEADER: + // tainted, untaint language: mail-header + if(charset) { + // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= + const char *src=row->item.ptr; + bool to_quoted_printable=false; + for(int size=row->item.size; size--; src++) { + if(*src & 0x80) { + if(!to_quoted_printable) { + dest+=sprintf(dest, "=?%.15s?Q?", charset); + to_quoted_printable=true; + } + dest+=sprintf(dest, "=%02X", *src & 0xFF); + } else { + *dest++=*src; + } + } + if(to_quoted_printable) // close + dest+=sprintf(dest, "?="); + } else { + memcpy(dest, row->item.ptr, row->item.size); + dest+=row->item.size; + } + break; case UL_TABLE: // tainted, untaint language: table escape(switch(*src) { @@ -140,7 +345,7 @@ char *String::store_to(char *dest, Untai if(connection) dest+=connection->quote(dest, row->item.ptr, row->item.size); else - THROW(0, 0, + throw Exception(0, 0, this, "untaint in SQL language failed - no connection specified"); break; @@ -154,96 +359,78 @@ char *String::store_to(char *dest, Untai _default; }); break; - case UL_HTML: + case UL_XML: escape(switch(*src) { case '&': to_string("&", 5); break; case '>': to_string(">", 4); break; case '<': to_string("<", 4); break; case '"': to_string(""", 6); break; - //TODO: XSLT case '\'': to_string("'", 6); break; + case '\'': to_string("'", 6); break; _default; }); break; - case UL_HTML_TYPO: { - // tainted, untaint language: html-typo - char *html_for_typo=(char *)malloc(size()*2/* '\n' -> '\' 'n' */+1); - // note: - // there still is a possibility that user - // would not replace \n as she supposed to - // and rather replace \ and n into huge strings - // thus causing memory overrun - // this can be dealed by allocating *2 memory, but that's too expensive - size_t html_for_typo_size; - { // local dest - char *dest=html_for_typo; - escape(switch(*src) { - // convinient name for typo match "\n" - case '\r': - if(typo_table) { - *dest++='\\'; *dest++='n'; // \r -> \n - if(src[1]=='\n') { // \r\n -> remove \n - size--; src++; - } - } - break; - case '\n': - if(typo_table) - to_string("\\n", 2); - break; - //TODO: XSLT case '\'': to_string("'", 6); break; - _default; - }); - *dest=0; - html_for_typo_size=dest-html_for_typo; - } - // typo table replacements - const char *src=html_for_typo; - do { - // there is a row where first column starts 'src' - if(Table::Item *item=typo_table->first_that(typo_present, src)) { - // get a=>b values - const String& a=*static_cast(item)->get_string(0); - const String& b=*static_cast(item)->get_string(1); - // empty 'a' | 'b' checks - if(a.size()==0 || b.size()==0) { - pool().set_tag(default_typo_table); // avoid recursion - THROW(0, 0, - typo_table->origin_string(), - "typo table column elements must not be empty"); - } - // overflow check: - // b allowed to be max UNTAINT_TIMES_BIGGER then a - if(b.size()>UNTAINT_TIMES_BIGGER*a.size()) { - pool().set_tag(default_typo_table); // avoid recursion - THROW(0, 0, - &b, - "is %g times longer then '%s', " - "while maximum, handled by Parser, is %d", - ((double)b.size())/a.size(), - a.cstr(), - UNTAINT_TIMES_BIGGER); - } - - // skip 'a' in 'src' - src+=a.size(); - // write 'b' to 'dest' - b.store_to(dest); - dest+=b.size(); - } else - *dest++=*src++; - } while(*src); + case UL_HTML: + case UL_OPTIMIZED_HTML: + escape(switch(*src) { + case '&': to_string("&", 5); break; + case '>': to_string(">", 4); break; + case '<': to_string("<", 4); break; + case '"': to_string(""", 6); break; + _default; + }); break; - } default: - THROW(0, 0, + throw Exception(0, 0, this, "unknown untaint language #%d of %d piece", static_cast(row->item.lang), - i); + i); // never + break; // never } + + if((lang==UL_UNSPECIFIED?row->item.lang:lang)!=UL_CLEAN) + whitespace=false; } chunk=row->link; } while(chunk); + break2: return dest; } + +char *String::cstr_debug_origins() const { + char *result=(char *)malloc(size()+used_rows()*MAX_STRING*2); + char *dest=result; + + const Chunk *chunk=&head; + do { + const Chunk::Row *row=chunk->rows; + for(uint i=0; icount; i++, row++) { + if(row==append_here) + goto break2; + +#ifndef NO_STRING_ORIGIN + if(row->item.origin.file) + dest+=sprintf(dest, ORIGIN_FILE_LINE_FORMAT, + row->item.origin.file, + 1+row->item.origin.line); + else + dest+=sprintf(dest, ""); +#endif + dest+=sprintf(dest, "#%s: ", + String_Untaint_lang_name[row->item.lang]); + char *dest_after_origins=dest; + + memcpy(dest, row->item.ptr, row->item.size); + dest+=row->item.size; + + remove_crlf(dest_after_origins, dest); + to_char('\n'); + } + chunk=row->link; + } while(chunk); + +break2: + *dest=0; + return result; +}