--- parser3/src/main/untaint.C 2001/03/12 21:54:20 1.2 +++ parser3/src/main/untaint.C 2010/10/28 21:49:46 1.158 @@ -1,120 +1,635 @@ -/* - Parser - Copyright (c) 2001 ArtLebedev Group (http://www.artlebedev.com) - Author: Alexander Petrosyan (http://design.ru/paf) +/** @file + Parser: String class part: untaint mechanizm. - $Id: untaint.C,v 1.2 2001/03/12 21:54:20 paf Exp $ + Copyright(c) 2001-2005 ArtLebedev Group (http://www.artlebedev.com) + Author: Alexandr Petrosian (http://paf.design.ru) */ -#include +static const char * const IDENT_UNTAINT_C="$Date: 2010/10/28 21:49:46 $"; + -#include "pa_pool.h" #include "pa_string.h" #include "pa_hash.h" #include "pa_exception.h" +#include "pa_table.h" +#include "pa_globals.h" +#include "pa_dictionary.h" +#include "pa_common.h" +#include "pa_charset.h" +#include "pa_request_charsets.h" +#include "pa_sapi.h" -#define escape(cases) \ - { \ - const char *ptr=row->item.ptr; \ - int size=row->item.size; \ - for (;*ptr;ptr++) \ - switch(*ptr) { \ - cases \ - default: *copy_here++=*ptr; break; \ - } \ - } -#define escape_value(a, c) case a: *copy_here++=c; break; -#define escape_subst(a, b, bsize) \ - case a: \ - { \ - strncpy(copy_here, b, bsize); \ - copy_here+=bsize; \ +extern "C" { // author forgot to do that +#include "ec.h" +} + +#include "pa_sql_connection.h" + +// defines + +#undef CORD_ec_append +// redefining to intercept flushes and implement whitespace optimization +// of all consequent white space chars leaving only first one +#define CORD_ec_append(x, c) \ + { \ + bool skip=false; \ + if(optimize) switch(c) { \ + case ' ': case '\n': case '\t': \ + if(whitespace) \ + skip=true; /*skipping subsequent*/ \ + else \ + whitespace=true; \ + break; \ + default: \ + whitespace=false; \ + break; \ + } \ + if(!skip) { \ + if ((x)[0].ec_bufptr == (x)[0].ec_buf + CORD_BUFSZ) { \ + CORD_ec_flush_buf(x); \ } \ - break; + *((x)[0].ec_bufptr)++ = (c); \ + } \ + } + + +#define escape_fragment(action) \ + for(; fragment_length--; CORD_next(info->pos)) { \ + char c=CORD_pos_fetch(info->pos); \ + action \ + } +#define _default CORD_ec_append(info->result, c) +#define encode(need_encode_func, prefix, otherwise) \ + if(need_encode_func(c)) { \ + static const char* hex="0123456789ABCDEF"; \ + CORD_ec_append(info->result, prefix); \ + CORD_ec_append(info->result, hex[((unsigned char)c)/0x10]); \ + CORD_ec_append(info->result, hex[((unsigned char)c)%0x10]); \ + } else \ + CORD_ec_append(info->result, otherwise); +#define to_char(c) { CORD_ec_append(info->result, c); whitespace=false; } +#define to_string(s) { CORD_ec_append_cord(info->result, s); whitespace=false; } + +inline bool need_file_encode(unsigned char c){ + // russian letters and space ENABLED + // encoding only these... + return strchr( + "*?'\"<>|" +#ifndef WIN32 + ":\\" +#endif + , c)!=0; +} + +inline bool need_uri_encode(unsigned char c){ + if((c>='0') && (c<='9') || (c>='A') && (c<='Z') || (c>='a') && (c<='z')) + return false; + + return !strchr("_-./*", c); +} + +inline bool need_regex_escape(unsigned char c){ + return strchr("\\^$.[]|()?*+{}-", c)!=0; +} + +inline bool need_parser_code_escape(unsigned char c){ + return strchr("^$;@()[]{}:#\"", c)!=0; +} // String -char *String::cstr() const { - char *result=(char *)malloc(size()*UNTAINT_TIMES_BIGGER+1); +/* +HTTP-header = field-name ":" [ field-value ] CRLF - char *copy_here=result; - const Chunk *chunk=&head; - // TODO: оптимизировать whitespaces для всех, кроме 'html' - do { - const Chunk::Row *row=chunk->rows; - for(int i=0; icount; i++) { - if(row==append_here) - goto break2; - - // WARNING: - // string can grow only UNTAINT_TIMES_BIGGER - switch(row->item.lang) { - case NO: - // clean piece - case YES: - // tainted piece, but undefined untaint language - // for VString.get_double of tainted values - // for ^process{body} evaluation - case AS_IS: - // tainted, untaint language: as-is - memcpy(copy_here, row->item.ptr, row->item.size); - copy_here+=row->item.size; - break; - case TABLE: - escape( - escape_value('\t', ' ') - escape_value('\n', ' ') - ); - break; - case SQL: - // tainted, untaint language: sql - // TODO: зависимость от sql сервера - memset(copy_here, '?', row->item.size); - copy_here+=row->item.size; - break; - case JS: - escape( - escape_subst('"', "\\\"", 2) - escape_subst('\'', "\\'", 2) - escape_subst('\n', "\\n", 2) - escape_subst('\r', "\\r", 2) - escape_subst('\\', "\\\\", 2) - escape_subst('я', "\\я", 2) - ); +field-name = token +field-value = *( field-content | LWS ) + +field-content = + + +token = 1* +word = token | quoted-string +quoted-string = ( <"> *(qdtext | quoted-pair ) <"> ) +qdtext = > +quoted-pair = "\" CHAR + +OCTET = +CHAR = + +tspecials = "(" | ")" | "<" | ">" | "@" + | "," | ";" | ":" | "\" | <"> + | "/" | "[" | "]" | "?" | "=" + | "{" | "}" | SP | HT + +SP = +HT = + +LWS = [CRLF] 1*( SP | HT ) +TEXT = +CTL = + + + if(strchr("()<>@,;:\\\"/[]?={} \t", *ptr)) +*/ +inline bool need_quote_http_header(const char* ptr, size_t size) { + for(; size--; ptr++) + if(strchr(";\\\"= \t" /* excluded ()<>@, :/ ? []{} */, *ptr)) + return true; + return false; +} + +#ifndef DOXYGEN +struct Append_fragment_info { + String::Language lang; + String::Languages* dest_languages; + size_t dest_body_plan_length; +}; +#endif +int append_fragment_optimizing(char alang, size_t asize, Append_fragment_info* info) { + const String::Language lang=(String::Language)(unsigned char)alang; + // main idea here: + // tainted piece would get OPTIMIZED bit from 'lang' + // clean piece would be marked OPTIMIZED manually + // pieces with determined languages [not tainted|clean] would retain theirs langs + info->dest_languages->append(info->dest_body_plan_length, + lang==String::L_TAINTED? + info->lang + :lang==String::L_CLEAN? + (String::Language)(String::L_CLEAN|String::L_OPTIMIZE_BIT) // ORing with OPTIMIZED flag + :lang, + asize); + info->dest_body_plan_length+=asize; + + return 0; // 0=continue +} +int append_fragment_nonoptimizing(char alang, size_t asize, Append_fragment_info* info) { + const String::Language lang=(String::Language)(unsigned char)alang; + // The core idea: tainted pieces got marked with context's lang + info->dest_languages->append(info->dest_body_plan_length, + lang==String::L_TAINTED? + info->lang + :lang, + asize); + info->dest_body_plan_length+=asize; + + return 0; // 0=continue +} + +/** + appends to other String, + marking all tainted pieces of it with @a lang. + or marking ALL pieces of it with a @a lang when @a forced to, + and propagating OPTIMIZE language bit. +*/ +String& String::append_to(String& dest, Language ilang, bool forced) const { + if(is_empty()) + return dest; + + // first: fragment infos + + if(ilang==L_PASS_APPENDED) // without language-change? + dest.langs.appendHelper(dest.body, langs, body); + else if(forced) //forcing passed lang? + dest.langs.appendHelper(dest.body, ilang, body); + else { + if(langs.opt.is_not_just_lang){ + Append_fragment_info info={ilang, &dest.langs, dest.body.length()}; + langs.for_each(body, ilang&L_OPTIMIZE_BIT? + append_fragment_optimizing + :append_fragment_nonoptimizing, &info); + } else { + Language lang=langs.opt.lang; + // see append_fragment_* for explanation + if(ilang&L_OPTIMIZE_BIT){ + dest.langs.appendHelper(dest.body, + lang==String::L_TAINTED? + ilang + :lang==String::L_CLEAN? + (String::Language)(String::L_CLEAN|String::L_OPTIMIZE_BIT) + :lang, + body); + } else { + dest.langs.appendHelper(dest.body, lang==String::L_TAINTED ? ilang:lang, body); + } + } + } + + // next: letters + dest.body<. An 'encoded-word' that appears within a + 'phrase' MUST be separated from any adjacent 'word', 'text' or + 'special' by 'linear-white-space'. +... + (2) The 8-bit hexadecimal value 20 (e.g., ISO-8859-1 SPACE) may be + represented as "_" (underscore, ASCII 95.). (This character may + not pass through some internetwork mail gateways, but its use + will greatly enhance readability of "Q" encoded data with mail + readers that do not support this encoding.) Note that the "_" + always represents hexadecimal 20, even if the SPACE character + occupies a different code position in the character set in use. + + paf: obviously, + without "=", or one could not differ "=E0" and "russian letter a" + and without "_", or in would mean 0x20 +*/ +inline bool mail_header_char_valid_within_Qencoded(char c) { + return c>='A' && c<='Z' + || c>='a' && c<='Z' + || c>='0' && c<='9' + || strchr("!*+-/", c); +} +inline bool addr_spec_soon(const char *src) { + for(char c; (c=*src); src++) + if(c=='<') + return true; + else if(!(c==' ' || c=='\t')) + return false; + return false; +} +/** + RFC + Upper case should be used for hexadecimal digits "A" through "F" + The 8-bit hexadecimal value 20 (e.g., ISO-8859-1 SPACE) + may be represented as "_" +*/ +inline bool mail_header_nonspace_char(char c) { + return c != 0x20; +} + +inline void ec_append(CORD_ec& result, bool& optimize, bool& whitespace, CORD_pos pos, size_t size) { + while(size--) { + CORD_ec_append(result, CORD_pos_fetch(pos)); + CORD_next(pos); + } +} +inline void pa_CORD_pos_advance(CORD_pos pos, size_t n) { + while(true) { + long avail=CORD_pos_chars_left(pos); + if(avail<=0) { + CORD_next(pos); + if(!--n) break; - case HTML: - escape( - escape_subst('&', "&", 5) // BEFORE consequent relpaces yelding '&' - escape_subst('>', ">", 4) - escape_subst('<', "<",4) - escape_subst('"', """,6) - escape_value('\t', ' ') - //TODO: XSLT escape_subst('\'', "'", 6) - ); + } else if((size_t)avail=n + CORD_pos_advance(pos, n); + break; + } + } +} + +#ifndef DOXYGEN +struct Cstr_to_string_body_block_info { + // input + String::Language lang; + SQL_Connection* connection; + const Request_charsets* charsets; + const String::Body* body; + + // output + CORD_ec result; + + // private + CORD_pos pos; + size_t fragment_begin; + bool whitespace; + const char* exception; +}; +#endif + +// @todo: replace info->body->mid with something that uses info->pos +int cstr_to_string_body_block(String::Language to_lang, size_t fragment_length, Cstr_to_string_body_block_info* info) { + bool& whitespace=info->whitespace; + size_t fragment_end=info->fragment_begin+fragment_length; + //fprintf(stderr, "%d, %d =%s=\n", to_lang, fragment_length, info->body->cstr()); + + bool optimize=(to_lang & String::L_OPTIMIZE_BIT)!=0; + if(!optimize) + whitespace=false; + + switch(to_lang & ~String::L_OPTIMIZE_BIT) { + case String::L_CLEAN: + case String::L_TAINTED: + case String::L_AS_IS: + // clean piece + + // tainted piece, but undefined untaint language + // for VString.as_double of tainted values + // for ^process{body} evaluation + + // tainted, untaint language: as-is + ec_append(info->result, optimize, whitespace, info->pos, fragment_length); + break; + case String::L_FILE_SPEC: + // tainted, untaint language: file [name] + { + escape_fragment( + encode(need_file_encode, '_', c); + ); + } + break; + case String::L_URI: + // tainted, untaint language: uri + { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + String::C output(fragment_str, fragment_length); + if(info->charsets) + output=Charset::transcode(output, + info->charsets->source(), + info->charsets->client()); + + char c; + for(const char* src=output.str; (c=*src++); ) + encode(need_uri_encode, '%', c); + } + break; + case String::L_HTTP_HEADER: + // tainted, untaint language: http-field-content-text + escape_fragment( + encode(need_uri_encode, '%', c); + ); + break; + case String::L_MAIL_HEADER: + // tainted, untaint language: mail-header + // http://www.ietf.org/rfc/rfc2047.txt + if(info->charsets) { + size_t mail_size; + const char *mail_ptr= + info->body->mid(info->fragment_begin, mail_size=fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, mail_size); + + const char* charset_name=info->charsets->mail().NAME().cstr(); + + // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= + bool to_quoted_printable=false; + + bool email=false; + uchar c; + for(const char* src=mail_ptr; (c=(uchar)*src++); ) { + if(c=='\r' || c=='\n') + c=' '; + if(to_quoted_printable && (c==',' || c == '"' || addr_spec_soon(src-1/*position to 'c'*/))) { + email=c=='<'; + to_string("?="); + to_quoted_printable=false; + } + //RFC + An 'encoded-word' MUST NOT appear in any portion of an 'addr-spec'. + if(!email && ( + !to_quoted_printable && (c & 0x80) // starting quote-printable-encoding on first 8bit char + || to_quoted_printable && !mail_header_char_valid_within_Qencoded(c) + )) { + if(!to_quoted_printable) { + to_string("=?"); + to_string(charset_name); + to_string("?Q?"); + to_quoted_printable=true; + } + encode(mail_header_nonspace_char, '=', '_'); + } else + to_char(c); + if(c=='>') + email=false; + } + if(to_quoted_printable) // close + to_string("?="); + + } else + ec_append(info->result, optimize, whitespace, info->pos, fragment_length); + break; + case String::L_SQL: + // tainted, untaint language: sql + if(info->connection) { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + + to_string(info->connection->quote(fragment_str, fragment_length)); + } else { + info->exception="untaint in SQL language failed - no connection specified"; + info->fragment_begin=fragment_end; + return 1; // stop processing. can't throw exception here + } + break; + case String::L_JS: + escape_fragment(switch(c) { + case '\n': to_string("\\n"); break; + case '"': to_string("\\\""); break; + case '\'': to_string("\\'"); break; + case '\\': to_string("\\\\"); break; + case '\xFF': to_string("\\\xFF"); break; + case '\r': to_string("\\r"); break; + default: _default; break; + }); + break; + case String::L_XML: + // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + escape_fragment(switch(c) { + case '\x20': + case '\x9': + case '\xA': + case '\xD': // this is usually removed on input + _default; break; - case HTML_TYPO: - // tainted, untaint language: html-typo - escape( - escape_subst('&', "&", 5) // BEFORE consequent relpaces yelding '&' - escape_subst('>', ">", 4) - escape_subst('<', "<",4) - escape_subst('"', """,6) - escape_value('\t', ' ') - //TODO: $MAIN:html-type table replace, max length(b)==UNTAINT_TIMES_BIGGER*length(a) - ); + case '&': to_string("&"); break; + case '>': to_string(">"); break; + case '<': to_string("<"); break; + case '"': to_string("""); break; + case '\'': to_string("'"); break; + default: + if(((unsigned char)c)<0x20) { + // fixing it, so that libxml would not result + // in fatal error parsing text + // though it really violates standard. + // to indicate there were an error + // replace bad char not to it's code, + // which we can do, + // but rather to '!' to show that input were actually + // invalid. + // life: shows that MSIE can somehow garble form values + // so that they contain these chars. + to_char('!'); + } else { + _default; + } break; - default: - THROW(0,0, - this, - "unknown untaint language #%d of %d piece", - static_cast(row->item.lang), - i); + }); + break; + case String::L_HTML: + escape_fragment(switch(c) { + case '&': to_string("&"); break; + case '>': to_string(">"); break; + case '<': to_string("<"); break; + case '"': to_string("""); break; + default: _default; break; + }); + break; + case String::L_REGEX: + // tainted, untaint language: regex + escape_fragment( + if(need_regex_escape(c)) + to_char('\\') + _default; + ); + break; + case String::L_JSON: + // tainted, untaint language: json + // escape '"' '\' '/' '\n' '\t' '\r' '\b' '\f' chars and escape chars as \uXXXX if output charset != UTF-8 + { + if(info->charsets->client().isUTF8()){ + // escaping to \uXXXX is not needed + escape_fragment(switch(c) { + case '\n': to_string("\\n"); break; + case '"' : to_string("\\\""); break; + case '\\': to_string("\\\\"); break; + case '/' : to_string("\\/"); break; + case '\t': to_string("\\t"); break; + case '\r': to_string("\\r"); break; + case '\b': to_string("\\b"); break; + case '\f': to_string("\\f"); break; + default : _default; break; + }); + } else { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + String::C output(fragment_str, fragment_length); + + output=Charset::escape_JSON(output, info->charsets->source()); + to_string(output); } - row++; } - chunk=row->link; - } while(chunk); -break2: - *copy_here=0; - return result; + break; + case String::L_HTTP_COOKIE: + // tainted, untaint language: cookie (3.3.0 and higher: %uXXXX in UTF-8) + { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + String::C output(fragment_str, fragment_length); + + output=Charset::escape(output, info->charsets->source()); + //throw Exception(0, 0, output); + to_string(output); + + } + break; + case String::L_PARSER_CODE: + // for auto-untaint in process + escape_fragment( + if(need_parser_code_escape(c)) + to_char('^'); + _default; + ); + break; + default: + SAPI::abort("unknown untaint language #%d", + static_cast(to_lang)); // should never + break; // never + } + + info->fragment_begin=fragment_end; + + return 0; // 0=continue +} + + +String::Body String::cstr_to_string_body_taint(Language lang, SQL_Connection* connection, const Request_charsets *charsets) const { + if(is_empty()) + return String::Body(); + + Cstr_to_string_body_block_info info; + // input + info.lang=lang; + info.connection=connection; + info.charsets=charsets; + info.body=&body; + // output + CORD_ec_init(info.result); + // private + body.set_pos(info.pos, 0); + info.fragment_begin=0; + info.exception=0; + info.whitespace=true; + + cstr_to_string_body_block(lang, length(), &info); + + if(info.exception) + throw Exception(0, + 0, + info.exception); + + return String::Body(CORD_ec_to_cord(info.result), info.fragment_begin); +} + +int cstr_to_string_body_block_untaint(char alang, size_t fragment_length, Cstr_to_string_body_block_info* info){ + const String::Language lang=(String::Language)(unsigned char)alang; + // see append_fragment_* for explanation + if(info->lang&String::L_OPTIMIZE_BIT) + return cstr_to_string_body_block( + lang==String::L_TAINTED? + info->lang + :lang==String::L_CLEAN? + (String::Language)(String::L_CLEAN|String::L_OPTIMIZE_BIT) + :lang, + fragment_length, info); + else + return cstr_to_string_body_block(lang==String::L_TAINTED ? info->lang:lang, fragment_length, info); +} + +String::Body String::cstr_to_string_body_untaint(Language lang, SQL_Connection* connection, const Request_charsets *charsets) const { + if(is_empty()) + return String::Body(); + + Cstr_to_string_body_block_info info; + // input + info.lang=lang; + info.connection=connection; + info.charsets=charsets; + info.body=&body; + // output + CORD_ec_init(info.result); + // private + body.set_pos(info.pos, 0); + info.fragment_begin=0; + info.exception=0; + info.whitespace=true; + + langs.for_each(body, cstr_to_string_body_block_untaint, &info); + + if(info.exception) + throw Exception(0, + 0, + info.exception); + + return String::Body(CORD_ec_to_cord(info.result), info.fragment_begin); +} + +const char* String::untaint_and_transcode_cstr(Language lang, const Request_charsets *charsets) const { + if(charsets && &charsets->source() != &charsets->client()){ + // Note: L_URI is allready transcoded during untaint, but transcode does not affect %XX + return Charset::transcode(cstr_to_string_body_untaint(lang, 0, charsets), charsets->source(), charsets->client()).cstr(); + } else + return cstr_to_string_body_untaint(lang, 0, charsets).cstr(); }