--- parser3/src/main/untaint.C 2009/07/06 08:45:35 1.148 +++ parser3/src/main/untaint.C 2013/10/14 21:17:38 1.165 @@ -1,11 +1,11 @@ /** @file Parser: String class part: untaint mechanizm. - Copyright(c) 2001-2005 ArtLebedev Group (http://www.artlebedev.com) + Copyright (c) 2001-2012 Art. Lebedev Studio (http://www.artlebedev.com) Author: Alexandr Petrosian (http://paf.design.ru) */ -static const char * const IDENT_UNTAINT_C="$Date: 2009/07/06 08:45:35 $"; +volatile const char * IDENT_UNTAINT_C="$Id: untaint.C,v 1.165 2013/10/14 21:17:38 moko Exp $"; #include "pa_string.h" @@ -61,10 +61,9 @@ extern "C" { // author forgot to do that #define _default CORD_ec_append(info->result, c) #define encode(need_encode_func, prefix, otherwise) \ if(need_encode_func(c)) { \ - static const char* hex="0123456789ABCDEF"; \ CORD_ec_append(info->result, prefix); \ - CORD_ec_append(info->result, hex[((unsigned char)c)/0x10]); \ - CORD_ec_append(info->result, hex[((unsigned char)c)%0x10]); \ + CORD_ec_append(info->result, hex_digits[((unsigned char)c) >> 4]); \ + CORD_ec_append(info->result, hex_digits[((unsigned char)c) & 0x0F]); \ } else \ CORD_ec_append(info->result, otherwise); #define to_char(c) { CORD_ec_append(info->result, c); whitespace=false; } @@ -74,7 +73,7 @@ inline bool need_file_encode(unsigned ch // russian letters and space ENABLED // encoding only these... return strchr( - "*?'\"<>|" + "*?\"<>|" #ifndef WIN32 ":\\" #endif @@ -82,56 +81,51 @@ inline bool need_file_encode(unsigned ch } inline bool need_uri_encode(unsigned char c){ - if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) - return false; - - return !strchr("_-./", c); -} - -inline bool need_http_header_encode(unsigned char c){ - if(strchr(" , :", c)) - return false; - - return need_uri_encode(c); + return !(pa_isalnum(c) || strchr("_-./*", c)); } inline bool need_regex_escape(unsigned char c){ return strchr("\\^$.[]|()?*+{}-", c)!=0; } +inline bool need_parser_code_escape(unsigned char c){ + return strchr("^$;@()[]{}:#\"", c)!=0; +} + // String /* HTTP-header = field-name ":" [ field-value ] CRLF - field-name = token - field-value = *( field-content | LWS ) +field-name = token +field-value = *( field-content | LWS ) - field-content = - -word = token | quoted-string - token = 1* +word = token | quoted-string +quoted-string = ( <"> *(qdtext | quoted-pair ) <"> ) +qdtext = > +quoted-pair = "\" CHAR - +OCTET = +CHAR = tspecials = "(" | ")" | "<" | ">" | "@" - | "," | ";" | ":" | "\" | <"> - | "/" | "[" | "]" | "?" | "=" - | "{" | "}" | SP | HT + | "," | ";" | ":" | "\" | <"> + | "/" | "[" | "]" | "?" | "=" + | "{" | "}" | SP | HT SP = HT = LWS = [CRLF] 1*( SP | HT ) -TEXT = +TEXT = +CTL = -quoted-pair = "\" CHAR if(strchr("()<>@,;:\\\"/[]?={} \t", *ptr)) */ @@ -253,10 +247,7 @@ RFC and without "_", or in would mean 0x20 */ inline bool mail_header_char_valid_within_Qencoded(char c) { - return c>='A' && c<='Z' - || c>='a' && c<='Z' - || c>='0' && c<='9' - || strchr("!*+-/", c); + return (pa_isalnum((unsigned char)c) || strchr("!*+-/", c)); } inline bool addr_spec_soon(const char *src) { for(char c; (c=*src); src++) @@ -317,6 +308,8 @@ struct Cstr_to_string_body_block_info { const char* exception; }; #endif + +// @todo: replace info->body->mid with something that uses info->pos int cstr_to_string_body_block(String::Language to_lang, size_t fragment_length, Cstr_to_string_body_block_info* info) { bool& whitespace=info->whitespace; size_t fragment_end=info->fragment_begin+fragment_length; @@ -342,49 +335,35 @@ int cstr_to_string_body_block(String::La case String::L_FILE_SPEC: // tainted, untaint language: file [name] { - bool is1251=(info->charsets && info->charsets->source().NAME()=="WINDOWS-1251"); escape_fragment( - // Macintosh has problems with small Russian letter 'r' - if( is1251 && c=='\xF0' ) { - // fixing that letter for most common charset - to_char('p'); - } else // fallback to default - encode(need_file_encode, '_', c); + encode(need_file_encode, '_', c); ); } break; - case String::L_FILE_POST: - { - escape_fragment(switch(c) { - case '\0': to_string("\\0"); break; - case '\\': to_string("\\\\"); break; - default: _default; break; - }); - } - break; case String::L_URI: - // tainted, untaint language: uri - { - const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); - // skip source [we use recoded version] - pa_CORD_pos_advance(info->pos, fragment_length); - String::C output(fragment_str, fragment_length); - if(info->charsets) - output=Charset::transcode(output, - info->charsets->source(), - info->charsets->client()); - - char c; - for(const char* src=output.str; (c=*src++); ) - encode(need_uri_encode, '%', c); + // tainted, untaint language: uri + { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + String::C output(fragment_str, fragment_length); + if(info->charsets) + output=Charset::transcode(output, + info->charsets->source(), + info->charsets->client()); + + char c; + for(const char* src=output.str; (c=*src++); ) + encode(need_uri_encode, '%', c); } break; case String::L_HTTP_HEADER: // tainted, untaint language: http-field-content-text - // the same as L_URI BUT not transcoded into $response:charset before encoding - escape_fragment( - encode(need_uri_encode, '%', c); - ); + escape_fragment(switch(c) { + case '\n': + case '\r': to_string(" "); break; + default: _default; break; + }); break; case String::L_MAIL_HEADER: // tainted, untaint language: mail-header @@ -509,19 +488,62 @@ int cstr_to_string_body_block(String::La _default; ); break; + case String::L_JSON: + // tainted, untaint language: json + // escape '"' '\' '/' '\n' '\t' '\r' '\b' '\f' chars and escape chars as \uXXXX if output charset != UTF-8 + { + if(info->charsets==NULL || info->charsets->client().isUTF8()){ + // escaping to \uXXXX is not needed + escape_fragment(switch(c) { + case '\n': to_string("\\n"); break; + case '"' : to_string("\\\""); break; + case '\\': to_string("\\\\"); break; + case '/' : to_string("\\/"); break; + case '\t': to_string("\\t"); break; + case '\r': to_string("\\r"); break; + case '\b': to_string("\\b"); break; + case '\f': to_string("\\f"); break; + default: + if((unsigned char)c < 0x20){ + to_string("\\u00"); + to_char(hex_digits[((unsigned char)c) >> 4]); + to_char(hex_digits[((unsigned char)c) & 0x0F]); + } else { + _default; + } + break; + }); + } else { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + String::C output(fragment_str, fragment_length); + + output=Charset::escape_JSON(output, info->charsets->source()); + to_string(output); + } + } + break; case String::L_HTTP_COOKIE: // tainted, untaint language: cookie (3.3.0 and higher: %uXXXX in UTF-8) - { + if(info->charsets) { const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); // skip source [we use recoded version] pa_CORD_pos_advance(info->pos, fragment_length); String::C output(fragment_str, fragment_length); output=Charset::escape(output, info->charsets->source()); - //throw Exception(0, 0, output); to_string(output); - - } + } else + ec_append(info->result, optimize, whitespace, info->pos, fragment_length); + break; + case String::L_PARSER_CODE: + // for auto-untaint in process + escape_fragment( + if(need_parser_code_escape(c)) + to_char('^'); + _default; + ); break; default: SAPI::abort("unknown untaint language #%d", @@ -535,37 +557,6 @@ int cstr_to_string_body_block(String::La } -int cstr_to_string_body_block_default(char alang, size_t fragment_length, Cstr_to_string_body_block_info* info){ - return cstr_to_string_body_block(info->lang==String::L_UNSPECIFIED ? (String::Language)(unsigned char)alang : info->lang, fragment_length, info); -} - -String::Body String::cstr_to_string_body(Language lang, SQL_Connection* connection, const Request_charsets *charsets) const { - if(is_empty()) - return String::Body(); - - Cstr_to_string_body_block_info info; - // input - info.lang=lang; - info.connection=connection; - info.charsets=charsets; - info.body=&body; - // output - CORD_ec_init(info.result); - // private - body.set_pos(info.pos, 0); - info.fragment_begin=0; - info.exception=0; - info.whitespace=true; - - langs.for_each(body, cstr_to_string_body_block_default, &info); - if(info.exception) - throw Exception(0, - 0, - info.exception); - - return String::Body(CORD_ec_to_cord(info.result)); -} - String::Body String::cstr_to_string_body_taint(Language lang, SQL_Connection* connection, const Request_charsets *charsets) const { if(is_empty()) return String::Body(); @@ -628,6 +619,7 @@ String::Body String::cstr_to_string_body info.whitespace=true; langs.for_each(body, cstr_to_string_body_block_untaint, &info); + if(info.exception) throw Exception(0, 0, @@ -635,3 +627,11 @@ String::Body String::cstr_to_string_body return String::Body(CORD_ec_to_cord(info.result)); } + +const char* String::untaint_and_transcode_cstr(Language lang, const Request_charsets *charsets) const { + if(charsets && &charsets->source() != &charsets->client()){ + // Note: L_URI is allready transcoded during untaint, but transcode does not affect %XX + return Charset::transcode(cstr_to_string_body_untaint(lang, 0, charsets), charsets->source(), charsets->client()).cstr(); + } else + return cstr_to_string_body_untaint(lang, 0, charsets).cstr(); +}