--- parser3/src/main/untaint.C 2001/03/12 21:54:20 1.2 +++ parser3/src/main/untaint.C 2001/05/19 19:10:20 1.50 @@ -1,120 +1,345 @@ -/* - Parser - Copyright (c) 2001 ArtLebedev Group (http://www.artlebedev.com) - Author: Alexander Petrosyan (http://design.ru/paf) +/** @file + Parser: String class part: untaint mechanizm. - $Id: untaint.C,v 1.2 2001/03/12 21:54:20 paf Exp $ -*/ + Copyright(c) 2001 ArtLebedev Group(http://www.artlebedev.com) + + Author: Alexander Petrosyan (http://design.ru/paf) -#include + $Id: untaint.C,v 1.50 2001/05/19 19:10:20 parser Exp $ +*/ #include "pa_pool.h" #include "pa_string.h" #include "pa_hash.h" #include "pa_exception.h" +#include "pa_table.h" +#include "pa_globals.h" +#include "pa_sql_connection.h" -#define escape(cases) \ +#define escape(action) \ { \ - const char *ptr=row->item.ptr; \ - int size=row->item.size; \ - for (;*ptr;ptr++) \ - switch(*ptr) { \ - cases \ - default: *copy_here++=*ptr; break; \ - } \ + const char *src=row->item.ptr; \ + for(int size=row->item.size; size--; src++) \ + action \ } -#define escape_value(a, c) case a: *copy_here++=c; break; -#define escape_subst(a, b, bsize) \ - case a: \ - { \ - strncpy(copy_here, b, bsize); \ - copy_here+=bsize; \ - } \ - break; +#define _default default: *dest++=*src; break +#define encode(need_encode_func, prefix) \ + default: \ + if(need_encode_func(*src)) { \ + static const char *hex="0123456789ABCDEF"; \ + char chunk[3]={prefix}; \ + chunk[1]=hex[((unsigned char)*src)/0x10]; \ + chunk[2]=hex[((unsigned char)*src)%0x10]; \ + strncpy(dest, chunk, 3); dest+=3; \ + } else \ + *dest++=*src; \ + break +#define to_char(c) *dest++=c +#define to_string(b, bsize) \ + strncpy(dest, b, bsize); \ + dest+=bsize; \ + +inline bool need_file_encode(unsigned char c){ + if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) + return false; + + return !strchr( +#ifdef WIN32 + ":\\~" +#endif + "./()_-", c); +} +inline bool need_uri_encode(unsigned char c){ + if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) + return false; + + return !strchr("_-./", c); +} +inline bool need_http_header_encode(unsigned char c){ + if(strchr(" , :", c)) + return false; + + return need_uri_encode(c); +} // String -char *String::cstr() const { - char *result=(char *)malloc(size()*UNTAINT_TIMES_BIGGER+1); +static bool typo_present(Array::Item *value, const void *info) { + Array *row=static_cast(value); + const char *src=static_cast(info); + + int partial; + row->get_string(0)->cmp(partial, src); + return + partial==0 || // full match + partial==1; // typo left column starts 'src' +} + +/* + +HTTP-header = field-name ":" [ field-value ] CRLF + + field-name = token + field-value = *( field-content | LWS ) + + field-content = + + + +word = token | quoted-string - char *copy_here=result; +token = 1* + + + +tspecials = "(" | ")" | "<" | ">" | "@" + | "," | ";" | ":" | "\" | <"> + | "/" | "[" | "]" | "?" | "=" + | "{" | "}" | SP | HT + +SP = +HT = + +LWS = [CRLF] 1*( SP | HT ) +TEXT = + +quoted-pair = "\" CHAR + + if(strchr("()<>@,;:\\\"/[]?={} \t", *ptr)) +*/ +inline bool need_quote_http_header(const char *ptr, size_t size) { + for(; size--; ptr++) + if(strchr(";\\\"= \t" /* excluded ()<>@, :/ ? []{} */, *ptr)) + return true; + return false; +} + +/** + @todo fix theoretical \n mem overrun in TYPO replacements +*/ +char *String::store_to(char *dest, Untaint_lang lang, + SQL_Connection *connection, + const char *charset) const { + // $MAIN:html-typo table + Table *user_typo_table=static_cast(pool().tag()); + Table *typo_table=user_typo_table?user_typo_table:default_typo_table; + + bool whitespace=true; const Chunk *chunk=&head; - // TODO: оптимизировать whitespaces для всех, кроме 'html' do { const Chunk::Row *row=chunk->rows; - for(int i=0; icount; i++) { + for(size_t i=0; icount; i++, row++) { if(row==append_here) goto break2; // WARNING: // string can grow only UNTAINT_TIMES_BIGGER - switch(row->item.lang) { - case NO: + switch(lang==UL_UNSPECIFIED?row->item.lang:lang) { + case UL_CLEAN: // clean piece - case YES: + { // optimizing whitespace + const char *src=row->item.ptr; + for(int size=row->item.size; size--; src++) + switch(*src) { + case ' ': case '\n': case '\r': case '\t': + if(!whitespace) { + *dest++=*src; + whitespace=true; + } + break; + default: + whitespace=false; + *dest++=*src; + break; + } + } + break; + case UL_TAINTED: // tainted piece, but undefined untaint language - // for VString.get_double of tainted values + // for VString.as_double of tainted values // for ^process{body} evaluation - case AS_IS: + case UL_AS_IS: // tainted, untaint language: as-is - memcpy(copy_here, row->item.ptr, row->item.size); - copy_here+=row->item.size; + memcpy(dest, row->item.ptr, row->item.size); + dest+=row->item.size; + break; + case UL_FILE_NAME: + // tainted, untaint language: file [name] + escape(switch(*src) { + case ' ': to_char('_'); break; + encode(need_file_encode, '+'); + }); + break; + case UL_URI: + // tainted, untaint language: uri + escape(switch(*src) { + case ' ': to_char('+'); break; + encode(need_uri_encode, '%'); + }); break; - case TABLE: - escape( - escape_value('\t', ' ') - escape_value('\n', ' ') - ); + case UL_HTTP_HEADER: + // tainted, untaint language: http-header + if(need_quote_http_header(row->item.ptr, row->item.size)) { + *dest++='\"'; + escape(switch(*src) { + case '\"': to_string("\\\"", 2); break; + _default; + }); + *dest++='\"'; + } else { + memcpy(dest, row->item.ptr, row->item.size); + dest+=row->item.size; + } break; - case SQL: + case UL_MAIL_HEADER: + // tainted, untaint language: mail-header + if(charset) { + // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= + const char *src=row->item.ptr; + bool to_base_64=false; + for(int size=row->item.size; size--; src++) { + if(*src & 0x80) { + if(!to_base_64) { + dest+=sprintf(dest, "=?%.15s?Q?", charset); + to_base_64=true; + } + dest+=sprintf(dest, "=%02X", *src & 0xFF); + } else { + *dest++=*src; + } + } + if(to_base_64) // close + dest+=sprintf(dest, "?="); + } else { + memcpy(dest, row->item.ptr, row->item.size); + dest+=row->item.size; + } + break; + case UL_TABLE: + // tainted, untaint language: table + escape(switch(*src) { + case '\t': to_char(' '); break; + case '\n': to_char(' '); break; + _default; + }); + break; + case UL_SQL: // tainted, untaint language: sql - // TODO: зависимость от sql сервера - memset(copy_here, '?', row->item.size); - copy_here+=row->item.size; - break; - case JS: - escape( - escape_subst('"', "\\\"", 2) - escape_subst('\'', "\\'", 2) - escape_subst('\n', "\\n", 2) - escape_subst('\r', "\\r", 2) - escape_subst('\\', "\\\\", 2) - escape_subst('я', "\\я", 2) - ); - break; - case HTML: - escape( - escape_subst('&', "&", 5) // BEFORE consequent relpaces yelding '&' - escape_subst('>', ">", 4) - escape_subst('<', "<",4) - escape_subst('"', """,6) - escape_value('\t', ' ') - //TODO: XSLT escape_subst('\'', "'", 6) - ); + if(connection) + dest+=connection->quote(dest, row->item.ptr, row->item.size); + else + THROW(0, 0, + this, + "untaint in SQL language failed - no connection specified"); + break; + case UL_JS: + escape(switch(*src) { + case '"': to_string("\\\"", 2); break; + case '\'': to_string("\\'", 2); break; + case '\n': to_string("\\n", 2); break; + case '\\': to_string("\\\\", 2); break; + case '\xFF': to_string("\\\xFF", 2); break; + _default; + }); break; - case HTML_TYPO: + case UL_HTML: + escape(switch(*src) { + case '&': to_string("&", 5); break; + case '>': to_string(">", 4); break; + case '<': to_string("<", 4); break; + case '"': to_string(""", 6); break; + //TODO: XSLT case '\'': to_string("'", 6); break; + _default; + }); + break; + case UL_USER_HTML: { // tainted, untaint language: html-typo - escape( - escape_subst('&', "&", 5) // BEFORE consequent relpaces yelding '&' - escape_subst('>', ">", 4) - escape_subst('<', "<",4) - escape_subst('"', """,6) - escape_value('\t', ' ') - //TODO: $MAIN:html-type table replace, max length(b)==UNTAINT_TIMES_BIGGER*length(a) - ); + char *html_for_typo= + (char *)malloc(row->item.size*2/* '\n' -> '\' 'n' */+1); + // note: + // there still is a possibility that user + // would not replace \n as she supposed to + // and rather replace \ and n into huge strings + // thus causing memory overrun + // this can be dealed by allocating *2 memory, but that's too expensive + size_t html_for_typo_size; + { // local dest + char *dest=html_for_typo; + escape(switch(*src) { + // convinient name for typo match "\n" + case '\r': + if(typo_table) { + *dest++='\\'; *dest++='n'; // \r -> \n + if(src[1]=='\n') { // \r\n -> remove \n + size--; src++; + } + } + break; + case '\n': + if(typo_table) + to_string("\\n", 2); + break; + //TODO: XSLT case '\'': to_string("'", 6); break; + _default; + }); + *dest=0; + html_for_typo_size=dest-html_for_typo; + } + // typo table replacements + const char *src=html_for_typo; + do { + // there is a row where first column starts 'src' + if(Table::Item *item=typo_table->first_that(typo_present, src)) { + // get a=>b values + const String& a=*static_cast(item)->get_string(0); + const String& b=*static_cast(item)->get_string(1); + // empty 'a' | 'b' checks + if(a.size()==0 || b.size()==0) { + pool().set_tag(default_typo_table); // avoid recursion + THROW(0, 0, + typo_table->origin_string(), + "typo table column elements must not be empty"); + } + // overflow check: + // b allowed to be max UNTAINT_TIMES_BIGGER then a + if(b.size()>UNTAINT_TIMES_BIGGER*a.size()) { + pool().set_tag(default_typo_table); // avoid recursion + THROW(0, 0, + &b, + "is %g times longer then '%s', " + "while maximum, handled by Parser, is %d", + ((double)b.size())/a.size(), + a.cstr(), + UNTAINT_TIMES_BIGGER); + } + + // skip 'a' in 'src' + src+=a.size(); + // write 'b' to 'dest' + b.store_to(dest); + dest+=b.size(); + } else + *dest++=*src++; + } while(*src); break; + } default: - THROW(0,0, - this, + THROW(0, 0, + this, "unknown untaint language #%d of %d piece", - static_cast(row->item.lang), - i); + static_cast(row->item.lang), + i); // never + break; // never } - row++; + + if((lang==UL_UNSPECIFIED?row->item.lang:lang)!=UL_CLEAN) + whitespace=false; } chunk=row->link; } while(chunk); break2: - *copy_here=0; - return result; + return dest; }