--- parser3/src/main/untaint.C 2004/02/06 09:40:28 1.128 +++ parser3/src/main/untaint.C 2015/10/03 00:03:49 1.167 @@ -1,11 +1,11 @@ /** @file Parser: String class part: untaint mechanizm. - Copyright(c) 2001-2003 ArtLebedev Group (http://www.artlebedev.com) + Copyright (c) 2001-2012 Art. Lebedev Studio (http://www.artlebedev.com) Author: Alexandr Petrosian (http://paf.design.ru) */ -static const char * const IDENT_UNTAINT_C="$Date: 2004/02/06 09:40:28 $"; +volatile const char * IDENT_UNTAINT_C="$Id: untaint.C,v 1.167 2015/10/03 00:03:49 moko Exp $"; #include "pa_string.h" @@ -34,7 +34,7 @@ extern "C" { // author forgot to do that { \ bool skip=false; \ if(optimize) switch(c) { \ - case ' ': case '\r': case '\n': case '\t': \ + case ' ': case '\n': case '\t': \ if(whitespace) \ skip=true; /*skipping subsequent*/ \ else \ @@ -52,45 +52,50 @@ extern "C" { // author forgot to do that } \ } - -#define escape(action) \ +#define escape_fragment(action) \ for(; fragment_length--; CORD_next(info->pos)) { \ char c=CORD_pos_fetch(info->pos); \ action \ } -#define _default default: CORD_ec_append(info->result, c); break + #define encode(need_encode_func, prefix, otherwise) \ if(need_encode_func(c)) { \ - static const char* hex="0123456789ABCDEF"; \ CORD_ec_append(info->result, prefix); \ - CORD_ec_append(info->result, hex[((unsigned char)c)/0x10]); \ - CORD_ec_append(info->result, hex[((unsigned char)c)%0x10]); \ + to_hex(c); \ } else \ CORD_ec_append(info->result, otherwise); + +#define to_hex(c) \ + { \ + CORD_ec_append(info->result, hex_digits[((unsigned char)c) >> 4]); \ + CORD_ec_append(info->result, hex_digits[((unsigned char)c) & 0x0F]); \ + } + #define to_char(c) { CORD_ec_append(info->result, c); whitespace=false; } #define to_string(s) { CORD_ec_append_cord(info->result, s); whitespace=false; } +#define _default CORD_ec_append(info->result, c) inline bool need_file_encode(unsigned char c){ // russian letters and space ENABLED // encoding only these... return strchr( - "*?'\"<>|" + "*?\"<>|" #ifndef WIN32 - ":\\" + ":\\" #endif - , c)!=0; + , c)!=0; } + inline bool need_uri_encode(unsigned char c){ - if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) - return false; + return !(pa_isalnum(c) || strchr("_-./*", c)); +} - return !strchr("_-./", c); +inline bool need_regex_escape(unsigned char c){ + return strchr("\\^$.[]|()?*+{}-", c)!=0; } -inline bool need_http_header_encode(unsigned char c){ - if(strchr(" , :", c)) - return false; - return need_uri_encode(c); +inline bool need_parser_code_escape(unsigned char c){ + return strchr("^$;@()[]{}:#\"", c)!=0; } // String @@ -98,34 +103,35 @@ inline bool need_http_header_encode(unsi /* HTTP-header = field-name ":" [ field-value ] CRLF - field-name = token - field-value = *( field-content | LWS ) +field-name = token +field-value = *( field-content | LWS ) - field-content = - -word = token | quoted-string - token = 1* +word = token | quoted-string +quoted-string = ( <"> *(qdtext | quoted-pair ) <"> ) +qdtext = > +quoted-pair = "\" CHAR - +OCTET = +CHAR = tspecials = "(" | ")" | "<" | ">" | "@" - | "," | ";" | ":" | "\" | <"> - | "/" | "[" | "]" | "?" | "=" - | "{" | "}" | SP | HT + | "," | ";" | ":" | "\" | <"> + | "/" | "[" | "]" | "?" | "=" + | "{" | "}" | SP | HT SP = HT = LWS = [CRLF] 1*( SP | HT ) -TEXT = +TEXT = +CTL = -quoted-pair = "\" CHAR if(strchr("()<>@,;:\\\"/[]?={} \t", *ptr)) */ @@ -179,21 +185,37 @@ int append_fragment_nonoptimizing(char a or marking ALL pieces of it with a @a lang when @a forced to, and propagating OPTIMIZE language bit. */ -String& String::append_to(String& dest, Language lang, bool forced) const { +String& String::append_to(String& dest, Language ilang, bool forced) const { if(is_empty()) return dest; // first: fragment infos - if(lang==L_PASS_APPENDED) // without language-change? - dest.langs.append(dest.body, body.length(), langs); + if(ilang==L_PASS_APPENDED) // without language-change? + dest.langs.appendHelper(dest.body, langs, body); else if(forced) //forcing passed lang? - dest.langs.append(dest.body, lang, length()); - else { - Append_fragment_info info={lang, &dest.langs, dest.body.length()}; - langs.for_each(body, lang&L_OPTIMIZE_BIT? - append_fragment_optimizing - :append_fragment_nonoptimizing, &info); + dest.langs.appendHelper(dest.body, ilang, body); + else { + if(langs.opt.is_not_just_lang){ + Append_fragment_info info={ilang, &dest.langs, dest.body.length()}; + langs.for_each(body, ilang&L_OPTIMIZE_BIT? + append_fragment_optimizing + :append_fragment_nonoptimizing, &info); + } else { + Language lang=langs.opt.lang; + // see append_fragment_* for explanation + if(ilang&L_OPTIMIZE_BIT){ + dest.langs.appendHelper(dest.body, + lang==String::L_TAINTED? + ilang + :lang==String::L_CLEAN? + (String::Language)(String::L_CLEAN|String::L_OPTIMIZE_BIT) + :lang, + body); + } else { + dest.langs.appendHelper(dest.body, lang==String::L_TAINTED ? ilang:lang, body); + } + } } // next: letters @@ -231,10 +253,7 @@ RFC and without "_", or in would mean 0x20 */ inline bool mail_header_char_valid_within_Qencoded(char c) { - return c>='A' && c<='Z' - || c>='a' && c<='Z' - || c>='0' && c<='9' - || strchr("!*+-/", c); + return (pa_isalnum((unsigned char)c) || strchr("!*+-/", c)); } inline bool addr_spec_soon(const char *src) { for(char c; (c=*src); src++) @@ -292,16 +311,16 @@ struct Cstr_to_string_body_block_info { CORD_pos pos; size_t fragment_begin; bool whitespace; + const char* exception; }; #endif -int cstr_to_string_body_block(char alang, size_t fragment_length, Cstr_to_string_body_block_info* info) { - const String::Language fragment_lang=(String::Language)(unsigned char)alang; + +// @todo: replace info->body->mid with something that uses info->pos +int cstr_to_string_body_block(String::Language to_lang, size_t fragment_length, Cstr_to_string_body_block_info* info) { bool& whitespace=info->whitespace; size_t fragment_end=info->fragment_begin+fragment_length; - //fprintf(stderr, "%d, %d\n", fragment.lang, fragment.length); - + //fprintf(stderr, "%d, %d =%s=\n", to_lang, fragment_length, info->body->cstr()); - String::Language to_lang=info->lang==String::L_UNSPECIFIED?fragment_lang:info->lang; bool optimize=(to_lang & String::L_OPTIMIZE_BIT)!=0; if(!optimize) whitespace=false; @@ -321,37 +340,36 @@ int cstr_to_string_body_block(char alang break; case String::L_FILE_SPEC: // tainted, untaint language: file [name] - escape( - // Macintosh has problems with small Russian letter 'r' - if( c=='\xF0' && info->charsets && info->charsets->source().NAME()=="WINDOWS-1251" ) { - // fixing that letter for most common charset - to_char('p'); - } else // fallback to default + { + escape_fragment( encode(need_file_encode, '_', c); - ); + ); + } break; case String::L_URI: - // tainted, untaint language: uri - { - const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); - // skip source [we use recoded version] - pa_CORD_pos_advance(info->pos, fragment_length); - String::C output(fragment_str, fragment_length); - if(info->charsets) - output=Charset::transcode(output, - info->charsets->source(), - info->charsets->client()); - - char c; - for(const char* src=output.str; (c=*src++); ) - encode(need_uri_encode, '%', c); + // tainted, untaint language: uri + { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + String::C output(fragment_str, fragment_length); + if(info->charsets) + output=Charset::transcode(output, + info->charsets->source(), + info->charsets->client()); + + char c; + for(const char* src=output.str; (c=*src++); ) + encode(need_uri_encode, '%', c); } break; case String::L_HTTP_HEADER: // tainted, untaint language: http-field-content-text - escape( - encode(need_uri_encode, '%', c); - ); + escape_fragment(switch(c) { + case '\n': + case '\r': to_string(" "); break; + default: _default; break; + }); break; case String::L_MAIL_HEADER: // tainted, untaint language: mail-header @@ -371,15 +389,17 @@ int cstr_to_string_body_block(char alang bool email=false; uchar c; for(const char* src=mail_ptr; (c=(uchar)*src++); ) { - //RFC + An 'encoded-word' MUST NOT appear in any portion of an 'addr-spec'. - if(to_quoted_printable && (c==',' || addr_spec_soon(src) || c == '"')) { + if(c=='\r' || c=='\n') + c=' '; + if(to_quoted_printable && (c==',' || c == '"' || addr_spec_soon(src-1/*position to 'c'*/))) { email=c=='<'; to_string("?="); to_quoted_printable=false; } + //RFC + An 'encoded-word' MUST NOT appear in any portion of an 'addr-spec'. if(!email && ( - !to_quoted_printable && (c & 0x80) // starting quote-printable-encoding on first 8bit char - || to_quoted_printable && !mail_header_char_valid_within_Qencoded(c) + ( !to_quoted_printable && (c & 0x80) ) // starting quote-printable-encoding on first 8bit char + || ( to_quoted_printable && !mail_header_char_valid_within_Qencoded(c) ) )) { if(!to_quoted_printable) { to_string("=?"); @@ -399,14 +419,6 @@ int cstr_to_string_body_block(char alang } else ec_append(info->result, optimize, whitespace, info->pos, fragment_length); break; - case String::L_TABLE: - // tainted, untaint language: table - escape(switch(c) { - case '\t': to_char(' '); break; - case '\n': to_char(' '); break; - _default; - }); - break; case String::L_SQL: // tainted, untaint language: sql if(info->connection) { @@ -415,40 +427,148 @@ int cstr_to_string_body_block(char alang pa_CORD_pos_advance(info->pos, fragment_length); to_string(info->connection->quote(fragment_str, fragment_length)); - } else - throw Exception(0, - 0, - "untaint in SQL language failed - no connection specified"); + } else { + info->exception="untaint in SQL language failed - no connection specified"; + info->fragment_begin=fragment_end; + return 1; // stop processing. can't throw exception here + } break; case String::L_JS: - escape(switch(c) { + escape_fragment(switch(c) { + case '\n': to_string("\\n"); break; case '"': to_string("\\\""); break; case '\'': to_string("\\'"); break; - case '\n': to_string("\\n"); break; case '\\': to_string("\\\\"); break; case '\xFF': to_string("\\\xFF"); break; - _default; + case '\r': to_string("\\r"); break; + default: _default; break; }); break; case String::L_XML: - escape(switch(c) { + // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + escape_fragment(switch(c) { + case '\x20': + case '\x9': + case '\xA': + case '\xD': // this is usually removed on input + _default; + break; case '&': to_string("&"); break; case '>': to_string(">"); break; case '<': to_string("<"); break; case '"': to_string("""); break; case '\'': to_string("'"); break; - _default; + default: + if(((unsigned char)c)<0x20) { + // fixing it, so that libxml would not result + // in fatal error parsing text + // though it really violates standard. + // to indicate there were an error + // replace bad char not to it's code, + // which we can do, + // but rather to '!' to show that input were actually + // invalid. + // life: shows that MSIE can somehow garble form values + // so that they contain these chars. + to_char('!'); + } else { + _default; + } + break; }); break; case String::L_HTML: - escape(switch(c) { + escape_fragment(switch(c) { case '&': to_string("&"); break; case '>': to_string(">"); break; case '<': to_string("<"); break; case '"': to_string("""); break; - _default; + default: _default; break; }); break; + case String::L_REGEX: + // tainted, untaint language: regex + escape_fragment( + if(need_regex_escape(c)) + to_char('\\') + _default; + ); + break; + case String::L_JSON: + // tainted, untaint language: json + // escape '"' '\' '/' '\n' '\t' '\r' '\b' '\f' chars and escape chars as \uXXXX if output charset != UTF-8 + { + if(info->charsets==NULL || info->charsets->client().isUTF8()){ + // escaping to \uXXXX is not needed + escape_fragment(switch((unsigned char)c) { + case '\n': to_string("\\n"); break; + case '"' : to_string("\\\""); break; + case '\\': to_string("\\\\"); break; + case '/' : to_string("\\/"); break; + case '\t': to_string("\\t"); break; + case '\r': to_string("\\r"); break; + case '\b': to_string("\\b"); break; + case '\f': to_string("\\f"); break; + case 0xE2: // \u2028 and \u2029 (line/paragraph separators), check bug #1023 + if(info->charsets && info->charsets->source().isUTF8() && fragment_length>=2){ + CORD_next(info->pos); + char c1=CORD_pos_fetch(info->pos); + CORD_next(info->pos); + char c2=CORD_pos_fetch(info->pos); + if((unsigned char)c1 == 0x80 && ((unsigned char)c2 >= 0xA8 && (unsigned char)c2 <= 0xAF)){ + to_string("\\u20"); + to_hex(((unsigned char)c2-0x80)); + } else { + CORD_ec_append(info->result, c); + CORD_ec_append(info->result, c1); + CORD_ec_append(info->result, c2); + } + fragment_length-=2; + } else { + _default; + } + break; + default: + if((unsigned char)c < 0x20){ + to_string("\\u00"); + to_hex(c); + } else { + _default; + } + break; + }); + } else { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + String::C output(fragment_str, fragment_length); + + output=Charset::escape_JSON(output, info->charsets->source()); + to_string(output); + } + } + break; + case String::L_HTTP_COOKIE: + // tainted, untaint language: cookie (3.3.0 and higher: %uXXXX in UTF-8) + if(info->charsets) { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + String::C output(fragment_str, fragment_length); + + output=Charset::escape(output, info->charsets->source()); + to_string(output); + } else + ec_append(info->result, optimize, whitespace, info->pos, fragment_length); + break; + case String::L_PARSER_CODE: + // for auto-untaint in process + escape_fragment( + if(need_parser_code_escape(c)) + to_char('^'); + _default; + ); + break; default: SAPI::abort("unknown untaint language #%d", static_cast(to_lang)); // should never @@ -461,9 +581,52 @@ int cstr_to_string_body_block(char alang } -String::Body String::cstr_to_string_body(Language lang, - SQL_Connection* connection, - const Request_charsets *charsets) const { +String::Body String::cstr_to_string_body_taint(Language lang, SQL_Connection* connection, const Request_charsets *charsets) const { + if(is_empty()) + return String::Body(); + + Cstr_to_string_body_block_info info; + // input + info.lang=lang; + info.connection=connection; + info.charsets=charsets; + info.body=&body; + // output + CORD_ec_init(info.result); + // private + body.set_pos(info.pos, 0); + info.fragment_begin=0; + info.exception=0; + info.whitespace=true; + + cstr_to_string_body_block(lang, length(), &info); + + if(info.exception) + throw Exception(0, + 0, + info.exception); + + return String::Body(CORD_ec_to_cord(info.result)); +} + +int cstr_to_string_body_block_untaint(char alang, size_t fragment_length, Cstr_to_string_body_block_info* info){ + const String::Language lang=(String::Language)(unsigned char)alang; + // see append_fragment_* for explanation + if(info->lang&String::L_OPTIMIZE_BIT) + return cstr_to_string_body_block( + lang==String::L_TAINTED? + info->lang + :lang==String::L_CLEAN? + (String::Language)(String::L_CLEAN|String::L_OPTIMIZE_BIT) + :lang, + fragment_length, info); + else + return cstr_to_string_body_block(lang==String::L_TAINTED ? info->lang:lang, fragment_length, info); +} + +String::Body String::cstr_to_string_body_untaint(Language lang, SQL_Connection* connection, const Request_charsets *charsets) const { + if(is_empty()) + return String::Body(); Cstr_to_string_body_block_info info; // input @@ -476,10 +639,23 @@ String::Body String::cstr_to_string_body // private body.set_pos(info.pos, 0); info.fragment_begin=0; + info.exception=0; info.whitespace=true; - if(!is_empty()) - langs.for_each(body, cstr_to_string_body_block, &info); + langs.for_each(body, cstr_to_string_body_block_untaint, &info); + + if(info.exception) + throw Exception(0, + 0, + info.exception); return String::Body(CORD_ec_to_cord(info.result)); } + +const char* String::untaint_and_transcode_cstr(Language lang, const Request_charsets *charsets) const { + if(charsets && &charsets->source() != &charsets->client()){ + // Note: L_URI is allready transcoded during untaint, but transcode does not affect %XX + return Charset::transcode(cstr_to_string_body_untaint(lang, 0, charsets), charsets->source(), charsets->client()).cstr(); + } else + return cstr_to_string_body_untaint(lang, 0, charsets).cstr(); +}