--- parser3/src/main/untaint.C 2009/04/15 07:48:13 1.145 +++ parser3/src/main/untaint.C 2015/04/08 18:08:53 1.166 @@ -1,11 +1,11 @@ /** @file Parser: String class part: untaint mechanizm. - Copyright(c) 2001-2005 ArtLebedev Group (http://www.artlebedev.com) + Copyright (c) 2001-2012 Art. Lebedev Studio (http://www.artlebedev.com) Author: Alexandr Petrosian (http://paf.design.ru) */ -static const char * const IDENT_UNTAINT_C="$Date: 2009/04/15 07:48:13 $"; +volatile const char * IDENT_UNTAINT_C="$Id: untaint.C,v 1.166 2015/04/08 18:08:53 moko Exp $"; #include "pa_string.h" @@ -61,10 +61,9 @@ extern "C" { // author forgot to do that #define _default CORD_ec_append(info->result, c) #define encode(need_encode_func, prefix, otherwise) \ if(need_encode_func(c)) { \ - static const char* hex="0123456789ABCDEF"; \ CORD_ec_append(info->result, prefix); \ - CORD_ec_append(info->result, hex[((unsigned char)c)/0x10]); \ - CORD_ec_append(info->result, hex[((unsigned char)c)%0x10]); \ + CORD_ec_append(info->result, hex_digits[((unsigned char)c) >> 4]); \ + CORD_ec_append(info->result, hex_digits[((unsigned char)c) & 0x0F]); \ } else \ CORD_ec_append(info->result, otherwise); #define to_char(c) { CORD_ec_append(info->result, c); whitespace=false; } @@ -74,7 +73,7 @@ inline bool need_file_encode(unsigned ch // russian letters and space ENABLED // encoding only these... return strchr( - "*?'\"<>|" + "*?\"<>|" #ifndef WIN32 ":\\" #endif @@ -82,56 +81,51 @@ inline bool need_file_encode(unsigned ch } inline bool need_uri_encode(unsigned char c){ - if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) - return false; - - return !strchr("_-./", c); -} - -inline bool need_http_header_encode(unsigned char c){ - if(strchr(" , :", c)) - return false; - - return need_uri_encode(c); + return !(pa_isalnum(c) || strchr("_-./*", c)); } inline bool need_regex_escape(unsigned char c){ return strchr("\\^$.[]|()?*+{}-", c)!=0; } +inline bool need_parser_code_escape(unsigned char c){ + return strchr("^$;@()[]{}:#\"", c)!=0; +} + // String /* HTTP-header = field-name ":" [ field-value ] CRLF - field-name = token - field-value = *( field-content | LWS ) +field-name = token +field-value = *( field-content | LWS ) - field-content = - -word = token | quoted-string - token = 1* +word = token | quoted-string +quoted-string = ( <"> *(qdtext | quoted-pair ) <"> ) +qdtext = > +quoted-pair = "\" CHAR - +OCTET = +CHAR = tspecials = "(" | ")" | "<" | ">" | "@" - | "," | ";" | ":" | "\" | <"> - | "/" | "[" | "]" | "?" | "=" - | "{" | "}" | SP | HT + | "," | ";" | ":" | "\" | <"> + | "/" | "[" | "]" | "?" | "=" + | "{" | "}" | SP | HT SP = HT = LWS = [CRLF] 1*( SP | HT ) -TEXT = +TEXT = +CTL = -quoted-pair = "\" CHAR if(strchr("()<>@,;:\\\"/[]?={} \t", *ptr)) */ @@ -185,21 +179,37 @@ int append_fragment_nonoptimizing(char a or marking ALL pieces of it with a @a lang when @a forced to, and propagating OPTIMIZE language bit. */ -String& String::append_to(String& dest, Language lang, bool forced) const { +String& String::append_to(String& dest, Language ilang, bool forced) const { if(is_empty()) return dest; // first: fragment infos - if(lang==L_PASS_APPENDED) // without language-change? - dest.langs.appendHelper(dest.body, body, langs); + if(ilang==L_PASS_APPENDED) // without language-change? + dest.langs.appendHelper(dest.body, langs, body); else if(forced) //forcing passed lang? - dest.langs.append(dest.body, lang, length()); - else { - Append_fragment_info info={lang, &dest.langs, dest.body.length()}; - langs.for_each(body, lang&L_OPTIMIZE_BIT? - append_fragment_optimizing - :append_fragment_nonoptimizing, &info); + dest.langs.appendHelper(dest.body, ilang, body); + else { + if(langs.opt.is_not_just_lang){ + Append_fragment_info info={ilang, &dest.langs, dest.body.length()}; + langs.for_each(body, ilang&L_OPTIMIZE_BIT? + append_fragment_optimizing + :append_fragment_nonoptimizing, &info); + } else { + Language lang=langs.opt.lang; + // see append_fragment_* for explanation + if(ilang&L_OPTIMIZE_BIT){ + dest.langs.appendHelper(dest.body, + lang==String::L_TAINTED? + ilang + :lang==String::L_CLEAN? + (String::Language)(String::L_CLEAN|String::L_OPTIMIZE_BIT) + :lang, + body); + } else { + dest.langs.appendHelper(dest.body, lang==String::L_TAINTED ? ilang:lang, body); + } + } } // next: letters @@ -237,10 +247,7 @@ RFC and without "_", or in would mean 0x20 */ inline bool mail_header_char_valid_within_Qencoded(char c) { - return c>='A' && c<='Z' - || c>='a' && c<='Z' - || c>='0' && c<='9' - || strchr("!*+-/", c); + return (pa_isalnum((unsigned char)c) || strchr("!*+-/", c)); } inline bool addr_spec_soon(const char *src) { for(char c; (c=*src); src++) @@ -301,13 +308,13 @@ struct Cstr_to_string_body_block_info { const char* exception; }; #endif -int cstr_to_string_body_block(char alang, size_t fragment_length, Cstr_to_string_body_block_info* info) { - const String::Language fragment_lang=(String::Language)(unsigned char)alang; + +// @todo: replace info->body->mid with something that uses info->pos +int cstr_to_string_body_block(String::Language to_lang, size_t fragment_length, Cstr_to_string_body_block_info* info) { bool& whitespace=info->whitespace; size_t fragment_end=info->fragment_begin+fragment_length; - //fprintf(stderr, "%d, %d =%s=\n", fragment_lang, fragment_length, info->body->cstr()); + //fprintf(stderr, "%d, %d =%s=\n", to_lang, fragment_length, info->body->cstr()); - String::Language to_lang=info->lang==String::L_UNSPECIFIED?fragment_lang:info->lang; bool optimize=(to_lang & String::L_OPTIMIZE_BIT)!=0; if(!optimize) whitespace=false; @@ -328,49 +335,35 @@ int cstr_to_string_body_block(char alang case String::L_FILE_SPEC: // tainted, untaint language: file [name] { - bool is1251=(info->charsets && info->charsets->source().NAME()=="WINDOWS-1251"); escape_fragment( - // Macintosh has problems with small Russian letter 'r' - if( is1251 && c=='\xF0' ) { - // fixing that letter for most common charset - to_char('p'); - } else // fallback to default - encode(need_file_encode, '_', c); + encode(need_file_encode, '_', c); ); } break; - case String::L_FILE_POST: - { - escape_fragment(switch(c) { - case '\0': to_string("\\0"); break; - case '\\': to_string("\\\\"); break; - default: _default; break; - }); - } - break; case String::L_URI: - // tainted, untaint language: uri - { - const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); - // skip source [we use recoded version] - pa_CORD_pos_advance(info->pos, fragment_length); - String::C output(fragment_str, fragment_length); - if(info->charsets) - output=Charset::transcode(output, - info->charsets->source(), - info->charsets->client()); - - char c; - for(const char* src=output.str; (c=*src++); ) - encode(need_uri_encode, '%', c); + // tainted, untaint language: uri + { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + String::C output(fragment_str, fragment_length); + if(info->charsets) + output=Charset::transcode(output, + info->charsets->source(), + info->charsets->client()); + + char c; + for(const char* src=output.str; (c=*src++); ) + encode(need_uri_encode, '%', c); } break; case String::L_HTTP_HEADER: // tainted, untaint language: http-field-content-text - // the same as L_URI BUT not transcoded into $response:charset before encoding - escape_fragment( - encode(need_uri_encode, '%', c); - ); + escape_fragment(switch(c) { + case '\n': + case '\r': to_string(" "); break; + default: _default; break; + }); break; case String::L_MAIL_HEADER: // tainted, untaint language: mail-header @@ -399,8 +392,8 @@ int cstr_to_string_body_block(char alang } //RFC + An 'encoded-word' MUST NOT appear in any portion of an 'addr-spec'. if(!email && ( - !to_quoted_printable && (c & 0x80) // starting quote-printable-encoding on first 8bit char - || to_quoted_printable && !mail_header_char_valid_within_Qencoded(c) + ( !to_quoted_printable && (c & 0x80) ) // starting quote-printable-encoding on first 8bit char + || ( to_quoted_printable && !mail_header_char_valid_within_Qencoded(c) ) )) { if(!to_quoted_printable) { to_string("=?"); @@ -441,6 +434,7 @@ int cstr_to_string_body_block(char alang case '\'': to_string("\\'"); break; case '\\': to_string("\\\\"); break; case '\xFF': to_string("\\\xFF"); break; + case '\r': to_string("\\r"); break; default: _default; break; }); break; @@ -494,19 +488,62 @@ int cstr_to_string_body_block(char alang _default; ); break; + case String::L_JSON: + // tainted, untaint language: json + // escape '"' '\' '/' '\n' '\t' '\r' '\b' '\f' chars and escape chars as \uXXXX if output charset != UTF-8 + { + if(info->charsets==NULL || info->charsets->client().isUTF8()){ + // escaping to \uXXXX is not needed + escape_fragment(switch(c) { + case '\n': to_string("\\n"); break; + case '"' : to_string("\\\""); break; + case '\\': to_string("\\\\"); break; + case '/' : to_string("\\/"); break; + case '\t': to_string("\\t"); break; + case '\r': to_string("\\r"); break; + case '\b': to_string("\\b"); break; + case '\f': to_string("\\f"); break; + default: + if((unsigned char)c < 0x20){ + to_string("\\u00"); + to_char(hex_digits[((unsigned char)c) >> 4]); + to_char(hex_digits[((unsigned char)c) & 0x0F]); + } else { + _default; + } + break; + }); + } else { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + String::C output(fragment_str, fragment_length); + + output=Charset::escape_JSON(output, info->charsets->source()); + to_string(output); + } + } + break; case String::L_HTTP_COOKIE: // tainted, untaint language: cookie (3.3.0 and higher: %uXXXX in UTF-8) - { + if(info->charsets) { const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); // skip source [we use recoded version] pa_CORD_pos_advance(info->pos, fragment_length); String::C output(fragment_str, fragment_length); output=Charset::escape(output, info->charsets->source()); - //throw Exception(0, 0, output); to_string(output); - - } + } else + ec_append(info->result, optimize, whitespace, info->pos, fragment_length); + break; + case String::L_PARSER_CODE: + // for auto-untaint in process + escape_fragment( + if(need_parser_code_escape(c)) + to_char('^'); + _default; + ); break; default: SAPI::abort("unknown untaint language #%d", @@ -520,9 +557,7 @@ int cstr_to_string_body_block(char alang } -String::Body String::cstr_to_string_body(Language lang, - SQL_Connection* connection, - const Request_charsets *charsets) const { +String::Body String::cstr_to_string_body_taint(Language lang, SQL_Connection* connection, const Request_charsets *charsets) const { if(is_empty()) return String::Body(); @@ -540,12 +575,63 @@ String::Body String::cstr_to_string_body info.exception=0; info.whitespace=true; - langs.for_each(body, cstr_to_string_body_block, &info); - if(info.exception){ + cstr_to_string_body_block(lang, length(), &info); + + if(info.exception) throw Exception(0, 0, info.exception); - } return String::Body(CORD_ec_to_cord(info.result)); } + +int cstr_to_string_body_block_untaint(char alang, size_t fragment_length, Cstr_to_string_body_block_info* info){ + const String::Language lang=(String::Language)(unsigned char)alang; + // see append_fragment_* for explanation + if(info->lang&String::L_OPTIMIZE_BIT) + return cstr_to_string_body_block( + lang==String::L_TAINTED? + info->lang + :lang==String::L_CLEAN? + (String::Language)(String::L_CLEAN|String::L_OPTIMIZE_BIT) + :lang, + fragment_length, info); + else + return cstr_to_string_body_block(lang==String::L_TAINTED ? info->lang:lang, fragment_length, info); +} + +String::Body String::cstr_to_string_body_untaint(Language lang, SQL_Connection* connection, const Request_charsets *charsets) const { + if(is_empty()) + return String::Body(); + + Cstr_to_string_body_block_info info; + // input + info.lang=lang; + info.connection=connection; + info.charsets=charsets; + info.body=&body; + // output + CORD_ec_init(info.result); + // private + body.set_pos(info.pos, 0); + info.fragment_begin=0; + info.exception=0; + info.whitespace=true; + + langs.for_each(body, cstr_to_string_body_block_untaint, &info); + + if(info.exception) + throw Exception(0, + 0, + info.exception); + + return String::Body(CORD_ec_to_cord(info.result)); +} + +const char* String::untaint_and_transcode_cstr(Language lang, const Request_charsets *charsets) const { + if(charsets && &charsets->source() != &charsets->client()){ + // Note: L_URI is allready transcoded during untaint, but transcode does not affect %XX + return Charset::transcode(cstr_to_string_body_untaint(lang, 0, charsets), charsets->source(), charsets->client()).cstr(); + } else + return cstr_to_string_body_untaint(lang, 0, charsets).cstr(); +}