--- parser3/src/main/untaint.C 2001/03/24 19:12:20 1.13 +++ parser3/src/main/untaint.C 2001/11/21 08:26:55 1.78 @@ -2,35 +2,28 @@ Parser: String class part: untaint mechanizm. Copyright(c) 2001 ArtLebedev Group(http://www.artlebedev.com) + Author: Alexander Petrosyan (http://paf.design.ru) - Author: Alexander Petrosyan (http://design.ru/paf) - - $Id: untaint.C,v 1.13 2001/03/24 19:12:20 paf Exp $ + $Id: untaint.C,v 1.78 2001/11/21 08:26:55 paf Exp $ */ -#include "pa_config_includes.h" - #include "pa_pool.h" #include "pa_string.h" #include "pa_hash.h" #include "pa_exception.h" #include "pa_table.h" +#include "pa_globals.h" +#include "pa_sql_connection.h" +#include "pa_dictionary.h" +#include "pa_common.h" -#define escape(cases) \ +#define escape(action) \ { \ const char *src=row->item.ptr; \ for(int size=row->item.size; size--; src++) \ - switch(*src) { \ - cases \ - } \ + action \ } -#define to_char(a, c) case a: *dest++=c; break #define _default default: *dest++=*src; break -#define to_string(a, b, bsize) \ - case a: \ - strncpy(dest, b, bsize); \ - dest+=bsize; \ - break #define encode(need_encode_func, prefix) \ default: \ if(need_encode_func(*src)) { \ @@ -38,16 +31,24 @@ char chunk[3]={prefix}; \ chunk[1]=hex[((unsigned char)*src)/0x10]; \ chunk[2]=hex[((unsigned char)*src)%0x10]; \ - strncpy(dest, chunk, 3); dest+=3; \ + memcpy(dest, chunk, 3); dest+=3; \ } else \ *dest++=*src; \ break +#define to_char(c) *dest++=c +#define to_string(b, bsize) \ + memcpy(dest, b, bsize); \ + dest+=bsize; \ inline bool need_file_encode(unsigned char c){ if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) return false; - return !strchr("./", c); + return !strchr( +#ifdef WIN32 + ":\\~" +#endif + "./()_-", c); } inline bool need_uri_encode(unsigned char c){ if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) @@ -55,176 +56,394 @@ inline bool need_uri_encode(unsigned cha return !strchr("_-./", c); } -inline bool need_header_encode(unsigned char c){ - if(strchr(" ,:", c)) +inline bool need_http_header_encode(unsigned char c){ + if(strchr(" , :", c)) return false; return need_uri_encode(c); } +// + +static const char * String_Untaint_lang_name[]={ + "U", ///< zero value handy for hash lookup @see untaint_lang_name2enum + "C", ///< clean + "T", ///< tainted, untaint language as assigned later + // untaint languages. assigned by ^untaint[lang]{...} + "P", + /**< + leave language built into string being appended. + just a flag, that value not stored + */ + "A", ///< leave all characters intact + "F", ///< file specification + "H", ///< ext in HTTP response header + "M", ///< text in mail header + "URI", ///< text in uri + "T", ///< ^table:set body + "SQL", ///< ^table:sql body + "JS", ///< JavaScript code + "XML", ///< ^dom:set xml + "HTML", ///< HTML code (for editing) + "UHTML", ///< HTML code with USER chars +}; + + // String -static bool typo_present(Array::Item *value, const void *info) { - Array *row=static_cast(value); - const char *src=static_cast(info); - - int partial; - row->get_string(0)->cmp(src, partial); - return partial==-1; +/* + +HTTP-header = field-name ":" [ field-value ] CRLF + + field-name = token + field-value = *( field-content | LWS ) + + field-content = + + + +word = token | quoted-string + +token = 1* + + + +tspecials = "(" | ")" | "<" | ">" | "@" + | "," | ";" | ":" | "\" | <"> + | "/" | "[" | "]" | "?" | "=" + | "{" | "}" | SP | HT + +SP = +HT = + +LWS = [CRLF] 1*( SP | HT ) +TEXT = + +quoted-pair = "\" CHAR + + if(strchr("()<>@,;:\\\"/[]?={} \t", *ptr)) +*/ +inline bool need_quote_http_header(const char *ptr, size_t size) { + for(; size--; ptr++) + if(strchr(";\\\"= \t" /* excluded ()<>@, :/ ? []{} */, *ptr)) + return true; + return false; } -/// @todo optimize whitespaces for all but 'html' -char *String::store_to(char *dest) const { - // $MAIN:html-typo table - Table *typo_table=static_cast(pool().tag()); +String& String::append(const String& src, uchar lang, bool forced) { + // manually unrolled code to avoid do{if(const)} constructs + if(forced) + STRING_SRC_FOREACH_ROW( + APPEND(row->item.ptr, row->item.size, + lang, //forcing passed lang + row->item.origin.file, row->item.origin.line); + ) + else if(lang==UL_PASS_APPENDED) + STRING_SRC_FOREACH_ROW( + APPEND(row->item.ptr, row->item.size, + row->item.lang, // passing item's lang + row->item.origin.file, row->item.origin.line); + ) + else if(lang&UL_OPTIMIZE_BIT) // main idea here + // tainted piece would get OPTIMIZED bit from 'lang' + // clean piece would be marked OPTIMIZED manually + // pieces with determined languages [not tainted|clean] would retain theirs langs + STRING_SRC_FOREACH_ROW( + APPEND(row->item.ptr, row->item.size, + row->item.lang==UL_TAINTED?lang:( + row->item.lang==UL_CLEAN?UL_CLEAN|UL_OPTIMIZE_BIT: // ORing with OPTIMIZED flag + row->item.lang + ), + row->item.origin.file, row->item.origin.line); + ) + else + STRING_SRC_FOREACH_ROW( + APPEND(row->item.ptr, row->item.size, + row->item.lang==UL_TAINTED?lang:row->item.lang, + row->item.origin.file, row->item.origin.line); + ); +break2: + return *this; +} +size_t String::cstr_bufsize(Untaint_lang lang, + SQL_Connection *connection, + const char *charset) const { + size_t dest=1; // for terminating 0 + STRING_FOREACH_ROW( + uchar to_lang=lang==UL_UNSPECIFIED?row->item.lang:lang; + + switch(to_lang & ~UL_OPTIMIZE_BIT) { + case UL_CLEAN: + case UL_TAINTED: + case UL_AS_IS: + // clean piece + + // tainted piece, but undefined untaint language + // for VString.as_double of tainted values + // for ^process{body} evaluation + + // tainted, untaint language: as-is + dest+=row->item.size; + break; + case UL_FILE_SPEC: + // tainted, untaint language: file [name] + dest+=row->item.size*3/* worst: Z->%XX */; + break; + case UL_URI: + // tainted, untaint language: uri + dest+=row->item.size*3/* worst: Z->%XX */; + break; + case UL_HTTP_HEADER: + // tainted, untaint language: http-field-content-text + dest+=row->item.size*3/* worst: Z->%XX */; + break; + case UL_MAIL_HEADER: + // tainted, untaint language: mail-header + if(charset) { + // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= + dest+=row->item.size*3+MAX_STRING/* worst: =?charset?Q?=%XX?= */; + } else { + dest+=row->item.size; + } + break; + case UL_TABLE: + // tainted, untaint language: table + dest+=row->item.size; + break; + case UL_SQL: + // tainted, untaint language: sql + if(connection) + dest+=connection->quote(0, row->item.ptr, row->item.size); + break; + case UL_JS: + escape(switch(*src) { + case '"': case '\'': case '\n': case '\\': case '\xFF': + dest+=2; break; + default: + dest++; break; + }); + break; + case UL_XML: + escape(switch(*src) { + case '&': case '>': case '<': case '"': case '\'': + dest+= 6; break; + default: + dest++; break; + }); + break; + case UL_HTML: + escape(switch(*src) { + case '&': + case '>': + case '<': + case '"': + dest+=6; break; + default: + dest++; break; + }); + break; + } + ); +break2: + return dest; +} + +char *String::store_to(char *dest, Untaint_lang lang, + SQL_Connection *connection, + const char *charset) const { + // WARNING: + // before any changes check cstr_bufsize first!!! + bool whitespace=true; + const Chunk *chunk=&head; \ + do { \ + const Chunk::Row *row=chunk->rows; \ + for(uint i=0; icount; i++, row++) { \ + if(row==append_here) \ + goto break2; \ + \ + uchar to_lang=lang==UL_UNSPECIFIED?row->item.lang:lang; + + char *start=dest; + + switch(to_lang & ~UL_OPTIMIZE_BIT) { + case UL_CLEAN: + case UL_TAINTED: + case UL_AS_IS: + // clean piece + + // tainted piece, but undefined untaint language + // for VString.as_double of tainted values + // for ^process{body} evaluation + + // tainted, untaint language: as-is + memcpy(dest, row->item.ptr, row->item.size); + dest+=row->item.size; + break; + case UL_FILE_SPEC: + // tainted, untaint language: file [name] + escape(switch(*src) { + case ' ': to_char('_'); break; + encode(need_file_encode, '+'); + }); + break; + case UL_URI: + // tainted, untaint language: uri + escape(switch(*src) { + case ' ': to_char('+'); break; + encode(need_uri_encode, '%'); + }); + break; + case UL_HTTP_HEADER: + // tainted, untaint language: http-field-content-text + escape(switch(*src) { + case ' ': to_char('+'); break; + encode(need_uri_encode, '%'); + }); + break; + case UL_MAIL_HEADER: + // tainted, untaint language: mail-header + if(charset) { + // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= + const char *src=row->item.ptr; + bool to_quoted_printable=false; + for(int size=row->item.size; size--; src++) { + if(*src & 0x80) { + if(!to_quoted_printable) { + dest+=sprintf(dest, "=?%.15s?Q?", charset); + to_quoted_printable=true; + } + dest+=sprintf(dest, "=%02X", *src & 0xFF); + } else { + *dest++=*src; + } + } + if(to_quoted_printable) // close + dest+=sprintf(dest, "?="); + } else { + memcpy(dest, row->item.ptr, row->item.size); + dest+=row->item.size; + } + break; + case UL_TABLE: + // tainted, untaint language: table + escape(switch(*src) { + case '\t': to_char(' '); break; + case '\n': to_char(' '); break; + _default; + }); + break; + case UL_SQL: + // tainted, untaint language: sql + if(connection) + dest+=connection->quote(dest, row->item.ptr, row->item.size); + else + throw Exception(0, 0, + this, + "untaint in SQL language failed - no connection specified"); + break; + case UL_JS: + escape(switch(*src) { + case '"': to_string("\\\"", 2); break; + case '\'': to_string("\\'", 2); break; + case '\n': to_string("\\n", 2); break; + case '\\': to_string("\\\\", 2); break; + case '\xFF': to_string("\\\xFF", 2); break; + _default; + }); + break; + case UL_XML: + escape(switch(*src) { + case '&': to_string("&", 5); break; + case '>': to_string(">", 4); break; + case '<': to_string("<", 4); break; + case '"': to_string(""", 6); break; + case '\'': to_string("'", 6); break; + _default; + }); + break; + case UL_HTML: + escape(switch(*src) { + case '&': to_string("&", 5); break; + case '>': to_string(">", 4); break; + case '<': to_string("<", 4); break; + case '"': to_string(""", 6); break; + _default; + }); + break; + default: + throw Exception(0, 0, + this, + "unknown untaint language #%d of %d piece", + static_cast(row->item.lang), + i); // never + break; // never + } + + if(to_lang & UL_OPTIMIZE_BIT) { + // optimizing whitespace + char *stop=dest; dest=start; + for(char *src=start; srclink; \ + } while(chunk); \ + +break2: + return dest; +} + +char *String::cstr_debug_origins() const { + char *result=(char *)malloc(size()+used_rows()*MAX_STRING*2); + char *dest=result; + const Chunk *chunk=&head; do { const Chunk::Row *row=chunk->rows; - for(int i=0; icount; i++) { + for(uint i=0; icount; i++, row++) { if(row==append_here) goto break2; - // WARNING: - // string can grow only UNTAINT_TIMES_BIGGER - switch(row->item.lang) { - case UL_NO: - // clean piece - case UL_YES: - // tainted piece, but undefined untaint language - // for VString.get_double of tainted values - // for ^process{body} evaluation - case UL_AS_IS: - // tainted, untaint language: as-is - memcpy(dest, row->item.ptr, row->item.size); - dest+=row->item.size; - break; - case UL_FILE_NAME: - // tainted, untaint language: file [name] - escape( - to_char(' ', '_'); - encode(need_file_encode, '-'); - ); - break; - case UL_URI: - // tainted, untaint language: uri - escape( - to_char(' ', '+'); - encode(need_uri_encode, '%'); - ); - break; - case UL_HEADER: - // tainted, untaint language: header - escape( - encode(need_header_encode, '%'); - ); - break; - case UL_TABLE: - escape( - to_char('\t', ' '); - to_char('\n', ' '); - _default; - ); - break; - case UL_SQL: - // tainted, untaint language: sql - // TODO: зависимость от sql сервера - memset(dest, '?', row->item.size); - dest+=row->item.size; - break; - case UL_JS: - escape( - to_string('"', "\\\"", 2); - to_string('\'', "\\'", 2); - to_string('\n', "\\n", 2); - to_string('\r', "\\r", 2); - to_string('\\', "\\\\", 2); - to_string('\xFF', "\\\xFF", 2); - _default; - ); - break; - case UL_HTML: - escape( - to_string('&', "&", 5); // BEFORE consequent relpaces yelding '&' - to_string('>', ">", 4); - to_string('<', "<",4); - to_string('"', """,6); - to_char('\t', ' '); - //TODO: XSLT to_string('\'', "'", 6) - _default; - ); - break; - case UL_HTML_TYPO: { - // tainted, untaint language: html-typo - char *html=(char *)malloc(size()*6/*""" the longest possible*/+1); - size_t html_size; - { // local dest - char *dest=html; - escape( - to_string('&', "&", 5); // BEFORE consequent relpaces yelding '&' - to_string('>', ">", 4); - to_string('<', "<",4); - to_string('"', """,6); - to_char('\t', ' '); - _default; - ); - *dest=0; - html_size=dest-html; - } - // typo table replacements - if(typo_table) { - const char *src=html; - do { - // there is a row where first column starts 'src' - if(Table::Item *item=typo_table->first_that(typo_present, src)) { - // get a=>b values - const String& a=*static_cast(item)->get_string(0); - const String& b=*static_cast(item)->get_string(1); - // empty 'a' check - if(a.size()==0) { - pool().set_tag(0); // avoid recursion - THROW(0, 0, - &a, - "typo table first column elements must not be empty"); - } - // overflow check: - // b allowed to be max UNTAINT_TIMES_BIGGER then a - if(b.size()>UNTAINT_TIMES_BIGGER*a.size()) { - pool().set_tag(0); // avoid recursion - THROW(0, 0, - &b, - "is %g times longer then '%s', " - "while maximum, handled by Parser, is %d", - ((double)b.size())/a.size(), - a.cstr(), - UNTAINT_TIMES_BIGGER); - } - - // skip 'a' in 'src' - src+=a.size(); - // write 'b' to 'dest' - b.store_to(dest); - dest+=b.size(); - } else - *dest++=*src++; - } while(*src); - } else { - memcpy(dest, html, html_size); - dest+=html_size; - } - break; - } - default: - THROW(0,0, - this, - "unknown untaint language #%d of %d piece", - static_cast(row->item.lang), - i); - } - row++; +#ifndef NO_STRING_ORIGIN + if(row->item.origin.file) + dest+=sprintf(dest, ORIGIN_FILE_LINE_FORMAT, + row->item.origin.file, + 1+row->item.origin.line); + else + dest+=sprintf(dest, ""); +#endif + dest+=sprintf(dest, "#%s: ", + String_Untaint_lang_name[row->item.lang]); + char *dest_after_origins=dest; + + memcpy(dest, row->item.ptr, row->item.size); + dest+=row->item.size; + + remove_crlf(dest_after_origins, dest); + to_char('\n'); } chunk=row->link; } while(chunk); + break2: - return dest; + *dest=0; + return result; }