--- parser3/src/main/untaint.C 2009/10/15 01:07:54 1.155 +++ parser3/src/main/untaint.C 2020/08/13 11:44:20 1.175 @@ -1,11 +1,11 @@ /** @file Parser: String class part: untaint mechanizm. - Copyright(c) 2001-2005 ArtLebedev Group (http://www.artlebedev.com) + Copyright (c) 2001-2017 Art. Lebedev Studio (http://www.artlebedev.com) Author: Alexandr Petrosian (http://paf.design.ru) */ -static const char * const IDENT_UNTAINT_C="$Date: 2009/10/15 01:07:54 $"; +volatile const char * IDENT_UNTAINT_C="$Id: untaint.C,v 1.175 2020/08/13 11:44:20 moko Exp $"; #include "pa_string.h" @@ -20,7 +20,7 @@ static const char * const IDENT_UNTAINT_ #include "pa_sapi.h" extern "C" { // author forgot to do that -#include "ec.h" +#include "../lib/cord/include/ec.h" } #include "pa_sql_connection.h" @@ -52,29 +52,34 @@ extern "C" { // author forgot to do that } \ } - #define escape_fragment(action) \ for(; fragment_length--; CORD_next(info->pos)) { \ char c=CORD_pos_fetch(info->pos); \ action \ } -#define _default CORD_ec_append(info->result, c) + #define encode(need_encode_func, prefix, otherwise) \ if(need_encode_func(c)) { \ - static const char* hex="0123456789ABCDEF"; \ CORD_ec_append(info->result, prefix); \ - CORD_ec_append(info->result, hex[((unsigned char)c)/0x10]); \ - CORD_ec_append(info->result, hex[((unsigned char)c)%0x10]); \ + to_hex(c); \ } else \ CORD_ec_append(info->result, otherwise); + +#define to_hex(c) \ + { \ + CORD_ec_append(info->result, hex_digits[((unsigned char)c) >> 4]); \ + CORD_ec_append(info->result, hex_digits[((unsigned char)c) & 0x0F]); \ + } + #define to_char(c) { CORD_ec_append(info->result, c); whitespace=false; } -#define to_string(s) { CORD_ec_append_cord(info->result, s); whitespace=false; } +#define to_string(s) { CORD_ec_append_cord(info->result, (CORD)(s)); whitespace=false; } +#define _default CORD_ec_append(info->result, c) inline bool need_file_encode(unsigned char c){ // russian letters and space ENABLED // encoding only these... return strchr( - "*?'\"<>|" + "*?\"<>|" #ifndef WIN32 ":\\" #endif @@ -82,17 +87,7 @@ inline bool need_file_encode(unsigned ch } inline bool need_uri_encode(unsigned char c){ - if((c>='0') && (c<='9') || (c>='A') && (c<='Z') || (c>='a') && (c<='z')) - return false; - - return !strchr("_-./", c); -} - -inline bool need_http_header_encode(unsigned char c){ - if(strchr(" , :", c)) - return false; - - return need_uri_encode(c); + return !(pa_isalnum(c) || strchr("_-./*", c)); } inline bool need_regex_escape(unsigned char c){ @@ -184,6 +179,25 @@ int append_fragment_nonoptimizing(char a return 0; // 0=continue } + +/** + appends to other String without language change +*/ + +String& String::append_to(String& dest) const { + if(is_empty()) + return dest; + + // first: fragment infos + dest.langs.appendHelper(dest.body, langs, body); + + // next: letters + dest.body<='A' && c<='Z' - || c>='a' && c<='Z' - || c>='0' && c<='9' - || strchr("!*+-/", c); + return (pa_isalnum((unsigned char)c) || strchr("!*+-/", c)); } inline bool addr_spec_soon(const char *src) { for(char c; (c=*src); src++) @@ -354,37 +363,28 @@ int cstr_to_string_body_block(String::La ); } break; - case String::L_FILE_POST: - { - escape_fragment(switch(c) { - case '\0': to_string("\\0"); break; - case '\\': to_string("\\\\"); break; - default: _default; break; - }); - } - break; case String::L_URI: - // tainted, untaint language: uri - { - const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); - // skip source [we use recoded version] - pa_CORD_pos_advance(info->pos, fragment_length); - String::C output(fragment_str, fragment_length); - if(info->charsets) - output=Charset::transcode(output, - info->charsets->source(), - info->charsets->client()); - - char c; - for(const char* src=output.str; (c=*src++); ) - encode(need_uri_encode, '%', c); + // tainted, untaint language: uri + { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + String::C output(fragment_str, fragment_length); + if(info->charsets) + output=Charset::transcode(output, info->charsets->source(), info->charsets->client()); + + char c; + for(const char* src=output.str; (c=*src++); ) + encode(need_uri_encode, '%', c); } break; case String::L_HTTP_HEADER: // tainted, untaint language: http-field-content-text - escape_fragment( - encode(need_uri_encode, '%', c); - ); + escape_fragment(switch(c) { + case '\n': + case '\r': to_string(" "); break; + default: _default; break; + }); break; case String::L_MAIL_HEADER: // tainted, untaint language: mail-header @@ -403,18 +403,18 @@ int cstr_to_string_body_block(String::La bool email=false; uchar c; - for(const char* src=mail_ptr; (c=(uchar)*src++); ) { + for(const char* src=mail_ptr; c=(uchar)*src; src++) { if(c=='\r' || c=='\n') c=' '; - if(to_quoted_printable && (c==',' || c == '"' || addr_spec_soon(src-1/*position to 'c'*/))) { + if(to_quoted_printable && (c==',' || c == '"' || addr_spec_soon(src))) { email=c=='<'; to_string("?="); to_quoted_printable=false; } //RFC + An 'encoded-word' MUST NOT appear in any portion of an 'addr-spec'. if(!email && ( - !to_quoted_printable && (c & 0x80) // starting quote-printable-encoding on first 8bit char - || to_quoted_printable && !mail_header_char_valid_within_Qencoded(c) + ( !to_quoted_printable && (c & 0x80 || (c == ' ' && src == mail_ptr) ) ) // starting quote-printable-encoding on first 8bit char or leading space (issue #123) + || ( to_quoted_printable && !mail_header_char_valid_within_Qencoded(c) ) )) { if(!to_quoted_printable) { to_string("=?"); @@ -509,19 +509,66 @@ int cstr_to_string_body_block(String::La _default; ); break; + case String::L_JSON: + // tainted, untaint language: json + // escape '"' '\' '/' '\n' '\t' '\r' '\b' '\f' chars and escape chars as \uXXXX if output charset != UTF-8 + { + if(info->charsets==NULL || info->charsets->client().isUTF8()){ + // escaping to \uXXXX is not needed + escape_fragment(switch((unsigned char)c) { + case '\n': to_string("\\n"); break; + case '"' : to_string("\\\""); break; + case '\\': to_string("\\\\"); break; + case '/' : to_string("\\/"); break; + case '\t': to_string("\\t"); break; + case '\r': to_string("\\r"); break; + case '\b': to_string("\\b"); break; + case '\f': to_string("\\f"); break; + case 0xE2: // \u2028 and \u2029 (line/paragraph separators), check bug #1023 + if(info->charsets && info->charsets->source().isUTF8() && fragment_length>=2){ + CORD_next(info->pos); + char c1=CORD_pos_fetch(info->pos); + CORD_next(info->pos); + char c2=CORD_pos_fetch(info->pos); + if((unsigned char)c1 == 0x80 && ((unsigned char)c2 >= 0xA8 && (unsigned char)c2 <= 0xAF)){ + to_string("\\u20"); + to_hex(((unsigned char)c2-0x80)); + } else { + CORD_ec_append(info->result, c); + CORD_ec_append(info->result, c1); + CORD_ec_append(info->result, c2); + } + fragment_length-=2; + } else { + _default; + } + break; + default: + if((unsigned char)c < 0x20){ + to_string("\\u00"); + to_hex(c); + } else { + _default; + } + break; + }); + } else { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + to_string(Charset::escape_JSON(String::C(fragment_str, fragment_length), info->charsets->source()).str); + } + } + break; case String::L_HTTP_COOKIE: // tainted, untaint language: cookie (3.3.0 and higher: %uXXXX in UTF-8) - { + if(info->charsets) { const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); // skip source [we use recoded version] pa_CORD_pos_advance(info->pos, fragment_length); - String::C output(fragment_str, fragment_length); - - output=Charset::escape(output, info->charsets->source()); - //throw Exception(0, 0, output); - to_string(output); - - } + to_string(Charset::escape(String::C(fragment_str, fragment_length), info->charsets->source()).str); + } else + ec_append(info->result, optimize, whitespace, info->pos, fragment_length); break; case String::L_PARSER_CODE: // for auto-untaint in process @@ -532,8 +579,7 @@ int cstr_to_string_body_block(String::La ); break; default: - SAPI::abort("unknown untaint language #%d", - static_cast(to_lang)); // should never + SAPI::die("unknown untaint language #%d", static_cast(to_lang)); // should never break; // never } @@ -568,7 +614,7 @@ String::Body String::cstr_to_string_body 0, info.exception); - return String::Body(CORD_ec_to_cord(info.result), info.fragment_begin); + return String::Body(CORD_ec_to_cord(info.result)); } int cstr_to_string_body_block_untaint(char alang, size_t fragment_length, Cstr_to_string_body_block_info* info){ @@ -611,7 +657,7 @@ String::Body String::cstr_to_string_body 0, info.exception); - return String::Body(CORD_ec_to_cord(info.result), info.fragment_begin); + return String::Body(CORD_ec_to_cord(info.result)); } const char* String::untaint_and_transcode_cstr(Language lang, const Request_charsets *charsets) const {