--- parser3/src/main/untaint.C 2001/03/20 06:45:19 1.10 +++ parser3/src/main/untaint.C 2001/07/09 16:51:54 1.54 @@ -1,61 +1,61 @@ /** @file Parser: String class part: untaint mechanizm. - Copyright (c) 2001 ArtLebedev Group (http://www.artlebedev.com) + Copyright(c) 2001 ArtLebedev Group(http://www.artlebedev.com) - Author: Alexander Petrosyan (http://design.ru/paf) - - $Id: untaint.C,v 1.10 2001/03/20 06:45:19 paf Exp $ + Author: Alexander Petrosyan (http://design.ru/paf) */ - -#include +static const char *RCSId="$Id: untaint.C,v 1.54 2001/07/09 16:51:54 parser Exp $"; #include "pa_pool.h" #include "pa_string.h" #include "pa_hash.h" #include "pa_exception.h" +#include "pa_table.h" +#include "pa_globals.h" +#include "pa_sql_connection.h" -#define escape(cases) \ +#define escape(action) \ { \ - const char *ptr=row->item.ptr; \ - for (int size=row->item.size; size--; ptr++) \ - switch(*ptr) { \ - cases \ - } \ + const char *src=row->item.ptr; \ + for(int size=row->item.size; size--; src++) \ + action \ } -#define escape_value(a, c) case a: *copy_here++=c; break -#define escape_default default: *copy_here++=*ptr; break -#define escape_subst(a, b, bsize) \ - case a: \ - strncpy(copy_here, b, bsize); \ - copy_here+=bsize; \ - break -#define escape_encode(need_encode_func, prefix) \ +#define _default default: *dest++=*src; break +#define encode(need_encode_func, prefix) \ default: \ - if(need_encode_func(*ptr)) { \ + if(need_encode_func(*src)) { \ static const char *hex="0123456789ABCDEF"; \ char chunk[3]={prefix}; \ - chunk[1]=hex[((unsigned char)*ptr)/0x10]; \ - chunk[2]=hex[((unsigned char)*ptr)%0x10]; \ - strncpy(copy_here, chunk, 3); copy_here+=3; \ + chunk[1]=hex[((unsigned char)*src)/0x10]; \ + chunk[2]=hex[((unsigned char)*src)%0x10]; \ + strncpy(dest, chunk, 3); dest+=3; \ } else \ - *copy_here++=*ptr; \ + *dest++=*src; \ break +#define to_char(c) *dest++=c +#define to_string(b, bsize) \ + strncpy(dest, b, bsize); \ + dest+=bsize; \ inline bool need_file_encode(unsigned char c){ - if ((c>='0') && (c<='9') || (c>='A') && (c<='Z') || (c>='a') && (c<='z')) + if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) return false; - return !strchr("./\\", c); + return !strchr( +#ifdef WIN32 + ":\\~" +#endif + "./()_-", c); } inline bool need_uri_encode(unsigned char c){ - if ((c>='0') && (c<='9') || (c>='A') && (c<='Z') || (c>='a') && (c<='z')) + if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) return false; return !strchr("_-./", c); } -inline bool need_header_encode(unsigned char c){ - if(strchr(" ,:", c)) +inline bool need_http_header_encode(unsigned char c){ + if(strchr(" , :", c)) return false; return need_uri_encode(c); @@ -63,111 +63,287 @@ inline bool need_header_encode(unsigned // String -/// @todo optimize whitespaces for all but 'html' -char *String::cstr() const { - char *result=(char *)malloc(size()*UNTAINT_TIMES_BIGGER+1); +static bool typo_present(Array::Item *value, const void *info) { + Array *row=static_cast(value); + const char *src=static_cast(info); + + int partial; + row->get_string(0)->cmp(partial, src); + return + partial==0 || // full match + partial==1; // typo left column starts 'src' +} + +/* + +HTTP-header = field-name ":" [ field-value ] CRLF + + field-name = token + field-value = *( field-content | LWS ) + + field-content = + + + +word = token | quoted-string + +token = 1* + + + +tspecials = "(" | ")" | "<" | ">" | "@" + | "," | ";" | ":" | "\" | <"> + | "/" | "[" | "]" | "?" | "=" + | "{" | "}" | SP | HT + +SP = +HT = + +LWS = [CRLF] 1*( SP | HT ) +TEXT = - char *copy_here=result; +quoted-pair = "\" CHAR + + if(strchr("()<>@,;:\\\"/[]?={} \t", *ptr)) +*/ +inline bool need_quote_http_header(const char *ptr, size_t size) { + for(; size--; ptr++) + if(strchr(";\\\"= \t" /* excluded ()<>@, :/ ? []{} */, *ptr)) + return true; + return false; +} + +/// @todo maybe additional check "are all pieces are clean?" would be profitable? +size_t String::cstr_bufsize(Untaint_lang lang) const { + return (lang==UL_AS_IS?size():size()*UNTAINT_TIMES_BIGGER) +1; +} + +/** @todo fix theoretical \n mem overrun in TYPO replacements +@todo rename base_64 to quoted_printable [invalid name now] +*/ +char *String::store_to(char *dest, Untaint_lang lang, + SQL_Connection *connection, + const char *charset) const { + // $MAIN:html-typo table + Table *user_typo_table=static_cast(pool().tag()); + Table *typo_table=user_typo_table?user_typo_table:default_typo_table; + + bool whitespace=true; const Chunk *chunk=&head; do { const Chunk::Row *row=chunk->rows; - for(int i=0; icount; i++) { + for(size_t i=0; icount; i++, row++) { if(row==append_here) goto break2; // WARNING: // string can grow only UNTAINT_TIMES_BIGGER - switch(row->item.lang) { - case NO: + switch(lang==UL_UNSPECIFIED?row->item.lang:lang) { + case UL_CLEAN: // clean piece - case YES: + { // optimizing whitespace + const char *src=row->item.ptr; + for(int size=row->item.size; size--; src++) + switch(*src) { + case ' ': case '\n': case '\r': case '\t': + if(!whitespace) { + *dest++=*src; + whitespace=true; + } + break; + default: + whitespace=false; + *dest++=*src; + break; + } + } + break; + case UL_TAINTED: // tainted piece, but undefined untaint language - // for VString.get_double of tainted values + // for VString.as_double of tainted values // for ^process{body} evaluation - case AS_IS: + case UL_AS_IS: // tainted, untaint language: as-is - memcpy(copy_here, row->item.ptr, row->item.size); - copy_here+=row->item.size; + memcpy(dest, row->item.ptr, row->item.size); + dest+=row->item.size; break; - case FILE_NAME: + case UL_FILE_NAME: // tainted, untaint language: file [name] - escape( - escape_value(' ', '_'); - escape_encode(need_file_encode, '-'); - ); + escape(switch(*src) { + case ' ': to_char('_'); break; + encode(need_file_encode, '+'); + }); break; - case URI: + case UL_URI: // tainted, untaint language: uri - escape( - escape_value(' ', '+'); - escape_encode(need_uri_encode, '%'); - ); - break; - case HEADER: - // tainted, untaint language: header - escape( - escape_encode(need_header_encode, '%'); - ); - break; - case TABLE: - escape( - escape_value('\t', ' '); - escape_value('\n', ' '); - escape_default; - ); + escape(switch(*src) { + case ' ': to_char('+'); break; + encode(need_uri_encode, '%'); + }); + break; + case UL_HTTP_HEADER: + // tainted, untaint language: http-header + if(need_quote_http_header(row->item.ptr, row->item.size)) { + *dest++='\"'; + escape(switch(*src) { + case '\"': to_string("\\\"", 2); break; + _default; + }); + *dest++='\"'; + } else { + memcpy(dest, row->item.ptr, row->item.size); + dest+=row->item.size; + } + break; + case UL_MAIL_HEADER: + // tainted, untaint language: mail-header + if(charset) { + // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= + const char *src=row->item.ptr; + bool to_base_64=false; + for(int size=row->item.size; size--; src++) { + if(*src & 0x80) { + if(!to_base_64) { + dest+=sprintf(dest, "=?%.15s?Q?", charset); + to_base_64=true; + } + dest+=sprintf(dest, "=%02X", *src & 0xFF); + } else { + *dest++=*src; + } + } + if(to_base_64) // close + dest+=sprintf(dest, "?="); + } else { + memcpy(dest, row->item.ptr, row->item.size); + dest+=row->item.size; + } + break; + case UL_TABLE: + // tainted, untaint language: table + escape(switch(*src) { + case '\t': to_char(' '); break; + case '\n': to_char(' '); break; + _default; + }); break; - case SQL: + case UL_SQL: // tainted, untaint language: sql - // TODO: зависимость от sql сервера - memset(copy_here, '?', row->item.size); - copy_here+=row->item.size; - break; - case JS: - escape( - escape_subst('"', "\\\"", 2); - escape_subst('\'', "\\'", 2); - escape_subst('\n', "\\n", 2); - escape_subst('\r', "\\r", 2); - escape_subst('\\', "\\\\", 2); - escape_subst('я', "\\я", 2); - escape_default; - ); - break; - case HTML: - escape( - escape_subst('&', "&", 5); // BEFORE consequent relpaces yelding '&' - escape_subst('>', ">", 4); - escape_subst('<', "<",4); - escape_subst('"', """,6); - escape_value('\t', ' '); - //TODO: XSLT escape_subst('\'', "'", 6) - escape_default; - ); + if(connection) + dest+=connection->quote(dest, row->item.ptr, row->item.size); + else + THROW(0, 0, + this, + "untaint in SQL language failed - no connection specified"); break; - case HTML_TYPO: + case UL_JS: + escape(switch(*src) { + case '"': to_string("\\\"", 2); break; + case '\'': to_string("\\'", 2); break; + case '\n': to_string("\\n", 2); break; + case '\\': to_string("\\\\", 2); break; + case '\xFF': to_string("\\\xFF", 2); break; + _default; + }); + break; + case UL_HTML: + escape(switch(*src) { + case '&': to_string("&", 5); break; + case '>': to_string(">", 4); break; + case '<': to_string("<", 4); break; + case '"': to_string(""", 6); break; + //TODO: XSLT case '\'': to_string("'", 6); break; + _default; + }); + break; + case UL_USER_HTML: { // tainted, untaint language: html-typo - escape( - escape_subst('&', "&", 5); // BEFORE consequent relpaces yelding '&' - escape_subst('>', ">", 4); - escape_subst('<', "<",4); - escape_subst('"', """,6); - escape_value('\t', ' '); - //TODO: $MAIN:html-type table replace, max length(b)==UNTAINT_TIMES_BIGGER*length(a) - escape_default; - ); + char *html_for_typo= + (char *)malloc(row->item.size*2/* '\n' -> '\' 'n' */+1); + // note: + // there still is a possibility that user + // would not replace \n as she supposed to + // and rather replace \ and n into huge strings + // thus causing memory overrun + // this can be dealed by allocating *2 memory, but that's too expensive + size_t html_for_typo_size; + { // local dest + char *dest=html_for_typo; + escape(switch(*src) { + // convinient name for typo match "\n" + case '\r': + if(typo_table) { + *dest++='\\'; *dest++='n'; // \r -> \n + if(src[1]=='\n') { // \r\n -> remove \n + size--; src++; + } + } + break; + case '\n': + if(typo_table) + to_string("\\n", 2); + break; + //TODO: XSLT case '\'': to_string("'", 6); break; + _default; + }); + *dest=0; + html_for_typo_size=dest-html_for_typo; + } + // typo table replacements + const char *src=html_for_typo; + do { + // there is a row where first column starts 'src' + if(Table::Item *item=typo_table->first_that(typo_present, src)) { + // get a=>b values + const String& a=*static_cast(item)->get_string(0); + const String& b=*static_cast(item)->get_string(1); + // empty 'a' | 'b' checks + if(a.size()==0 || b.size()==0) { + pool().set_tag(default_typo_table); // avoid recursion + THROW(0, 0, + typo_table->origin_string(), + "typo table column elements must not be empty"); + } + // overflow check: + // b allowed to be max UNTAINT_TIMES_BIGGER then a + if(b.size()>UNTAINT_TIMES_BIGGER*a.size()) { + pool().set_tag(default_typo_table); // avoid recursion + THROW(0, 0, + &b, + "is %g times longer then '%s', " + "while maximum, handled by Parser, is %d", + ((double)b.size())/a.size(), + a.cstr(), + UNTAINT_TIMES_BIGGER); + } + + // skip 'a' in 'src' + src+=a.size(); + // write 'b' to 'dest' + b.store_to(dest); + dest+=b.size(); + } else + *dest++=*src++; + } while(*src); break; + } default: - THROW(0,0, - this, + THROW(0, 0, + this, "unknown untaint language #%d of %d piece", - static_cast(row->item.lang), - i); + static_cast(row->item.lang), + i); // never + break; // never } - row++; + + if((lang==UL_UNSPECIFIED?row->item.lang:lang)!=UL_CLEAN) + whitespace=false; } chunk=row->link; } while(chunk); break2: - *copy_here=0; - return result; + return dest; }