--- parser3/src/main/untaint.C 2001/11/21 08:26:55 1.78 +++ parser3/src/main/untaint.C 2010/10/28 21:49:46 1.158 @@ -1,449 +1,635 @@ /** @file Parser: String class part: untaint mechanizm. - Copyright(c) 2001 ArtLebedev Group(http://www.artlebedev.com) - Author: Alexander Petrosyan (http://paf.design.ru) - - $Id: untaint.C,v 1.78 2001/11/21 08:26:55 paf Exp $ + Copyright(c) 2001-2005 ArtLebedev Group (http://www.artlebedev.com) + Author: Alexandr Petrosian (http://paf.design.ru) */ -#include "pa_pool.h" +static const char * const IDENT_UNTAINT_C="$Date: 2010/10/28 21:49:46 $"; + + #include "pa_string.h" #include "pa_hash.h" #include "pa_exception.h" #include "pa_table.h" #include "pa_globals.h" -#include "pa_sql_connection.h" #include "pa_dictionary.h" #include "pa_common.h" +#include "pa_charset.h" +#include "pa_request_charsets.h" +#include "pa_sapi.h" -#define escape(action) \ - { \ - const char *src=row->item.ptr; \ - for(int size=row->item.size; size--; src++) \ - action \ +extern "C" { // author forgot to do that +#include "ec.h" +} + +#include "pa_sql_connection.h" + +// defines + +#undef CORD_ec_append +// redefining to intercept flushes and implement whitespace optimization +// of all consequent white space chars leaving only first one +#define CORD_ec_append(x, c) \ + { \ + bool skip=false; \ + if(optimize) switch(c) { \ + case ' ': case '\n': case '\t': \ + if(whitespace) \ + skip=true; /*skipping subsequent*/ \ + else \ + whitespace=true; \ + break; \ + default: \ + whitespace=false; \ + break; \ + } \ + if(!skip) { \ + if ((x)[0].ec_bufptr == (x)[0].ec_buf + CORD_BUFSZ) { \ + CORD_ec_flush_buf(x); \ + } \ + *((x)[0].ec_bufptr)++ = (c); \ + } \ + } + + +#define escape_fragment(action) \ + for(; fragment_length--; CORD_next(info->pos)) { \ + char c=CORD_pos_fetch(info->pos); \ + action \ } -#define _default default: *dest++=*src; break -#define encode(need_encode_func, prefix) \ - default: \ - if(need_encode_func(*src)) { \ - static const char 
*hex="0123456789ABCDEF"; \ - char chunk[3]={prefix}; \ - chunk[1]=hex[((unsigned char)*src)/0x10]; \ - chunk[2]=hex[((unsigned char)*src)%0x10]; \ - memcpy(dest, chunk, 3); dest+=3; \ - } else \ - *dest++=*src; \ - break -#define to_char(c) *dest++=c -#define to_string(b, bsize) \ - memcpy(dest, b, bsize); \ - dest+=bsize; \ +#define _default CORD_ec_append(info->result, c) +#define encode(need_encode_func, prefix, otherwise) \ + if(need_encode_func(c)) { \ + static const char* hex="0123456789ABCDEF"; \ + CORD_ec_append(info->result, prefix); \ + CORD_ec_append(info->result, hex[((unsigned char)c)/0x10]); \ + CORD_ec_append(info->result, hex[((unsigned char)c)%0x10]); \ + } else \ + CORD_ec_append(info->result, otherwise); +#define to_char(c) { CORD_ec_append(info->result, c); whitespace=false; } +#define to_string(s) { CORD_ec_append_cord(info->result, s); whitespace=false; } inline bool need_file_encode(unsigned char c){ - if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) - return false; - - return !strchr( -#ifdef WIN32 - ":\\~" + // russian letters and space ENABLED + // encoding only these... + return strchr( + "*?'\"<>|" +#ifndef WIN32 + ":\\" #endif - "./()_-", c); + , c)!=0; } + inline bool need_uri_encode(unsigned char c){ - if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) + if((c>='0') && (c<='9') || (c>='A') && (c<='Z') || (c>='a') && (c<='z')) return false; - return !strchr("_-./", c); + return !strchr("_-./*", c); } -inline bool need_http_header_encode(unsigned char c){ - if(strchr(" , :", c)) - return false; - return need_uri_encode(c); +inline bool need_regex_escape(unsigned char c){ + return strchr("\\^$.[]|()?*+{}-", c)!=0; } -// - -static const char * String_Untaint_lang_name[]={ - "U", ///< zero value handy for hash lookup @see untaint_lang_name2enum - "C", ///< clean - "T", ///< tainted, untaint language as assigned later - // untaint languages. 
assigned by ^untaint[lang]{...} - "P", - /**< - leave language built into string being appended. - just a flag, that value not stored - */ - "A", ///< leave all characters intact - "F", ///< file specification - "H", ///< ext in HTTP response header - "M", ///< text in mail header - "URI", ///< text in uri - "T", ///< ^table:set body - "SQL", ///< ^table:sql body - "JS", ///< JavaScript code - "XML", ///< ^dom:set xml - "HTML", ///< HTML code (for editing) - "UHTML", ///< HTML code with USER chars -}; - +inline bool need_parser_code_escape(unsigned char c){ + return strchr("^$;@()[]{}:#\"", c)!=0; +} // String /* - HTTP-header = field-name ":" [ field-value ] CRLF - field-name = token - field-value = *( field-content | LWS ) +field-name = token +field-value = *( field-content | LWS ) - field-content = - -word = token | quoted-string - token = 1* +word = token | quoted-string +quoted-string = ( <"> *(qdtext | quoted-pair ) <"> ) +qdtext = > +quoted-pair = "\" CHAR - +OCTET = +CHAR = tspecials = "(" | ")" | "<" | ">" | "@" - | "," | ";" | ":" | "\" | <"> - | "/" | "[" | "]" | "?" | "=" - | "{" | "}" | SP | HT + | "," | ";" | ":" | "\" | <"> + | "/" | "[" | "]" | "?" | "=" + | "{" | "}" | SP | HT SP = HT = LWS = [CRLF] 1*( SP | HT ) -TEXT = +TEXT = +CTL = -quoted-pair = "\" CHAR if(strchr("()<>@,;:\\\"/[]?={} \t", *ptr)) */ -inline bool need_quote_http_header(const char *ptr, size_t size) { +inline bool need_quote_http_header(const char* ptr, size_t size) { for(; size--; ptr++) if(strchr(";\\\"= \t" /* excluded ()<>@, :/ ? 
[]{} */, *ptr)) return true; return false; } -String& String::append(const String& src, uchar lang, bool forced) { - // manually unrolled code to avoid do{if(const)} constructs - if(forced) - STRING_SRC_FOREACH_ROW( - APPEND(row->item.ptr, row->item.size, - lang, //forcing passed lang - row->item.origin.file, row->item.origin.line); - ) - else if(lang==UL_PASS_APPENDED) - STRING_SRC_FOREACH_ROW( - APPEND(row->item.ptr, row->item.size, - row->item.lang, // passing item's lang - row->item.origin.file, row->item.origin.line); - ) - else if(lang&UL_OPTIMIZE_BIT) // main idea here - // tainted piece would get OPTIMIZED bit from 'lang' - // clean piece would be marked OPTIMIZED manually - // pieces with determined languages [not tainted|clean] would retain theirs langs - STRING_SRC_FOREACH_ROW( - APPEND(row->item.ptr, row->item.size, - row->item.lang==UL_TAINTED?lang:( - row->item.lang==UL_CLEAN?UL_CLEAN|UL_OPTIMIZE_BIT: // ORing with OPTIMIZED flag - row->item.lang - ), - row->item.origin.file, row->item.origin.line); - ) - else - STRING_SRC_FOREACH_ROW( - APPEND(row->item.ptr, row->item.size, - row->item.lang==UL_TAINTED?lang:row->item.lang, - row->item.origin.file, row->item.origin.line); - ); -break2: - return *this; +#ifndef DOXYGEN +struct Append_fragment_info { + String::Language lang; + String::Languages* dest_languages; + size_t dest_body_plan_length; +}; +#endif +int append_fragment_optimizing(char alang, size_t asize, Append_fragment_info* info) { + const String::Language lang=(String::Language)(unsigned char)alang; + // main idea here: + // tainted piece would get OPTIMIZED bit from 'lang' + // clean piece would be marked OPTIMIZED manually + // pieces with determined languages [not tainted|clean] would retain theirs langs + info->dest_languages->append(info->dest_body_plan_length, + lang==String::L_TAINTED? + info->lang + :lang==String::L_CLEAN? 
+ (String::Language)(String::L_CLEAN|String::L_OPTIMIZE_BIT) // ORing with OPTIMIZED flag + :lang, + asize); + info->dest_body_plan_length+=asize; + + return 0; // 0=continue } +int append_fragment_nonoptimizing(char alang, size_t asize, Append_fragment_info* info) { + const String::Language lang=(String::Language)(unsigned char)alang; + // The core idea: tainted pieces got marked with context's lang + info->dest_languages->append(info->dest_body_plan_length, + lang==String::L_TAINTED? + info->lang + :lang, + asize); + info->dest_body_plan_length+=asize; -size_t String::cstr_bufsize(Untaint_lang lang, - SQL_Connection *connection, - const char *charset) const { - size_t dest=1; // for terminating 0 - STRING_FOREACH_ROW( - uchar to_lang=lang==UL_UNSPECIFIED?row->item.lang:lang; - - switch(to_lang & ~UL_OPTIMIZE_BIT) { - case UL_CLEAN: - case UL_TAINTED: - case UL_AS_IS: - // clean piece - - // tainted piece, but undefined untaint language - // for VString.as_double of tainted values - // for ^process{body} evaluation + return 0; // 0=continue +} - // tainted, untaint language: as-is - dest+=row->item.size; - break; - case UL_FILE_SPEC: - // tainted, untaint language: file [name] - dest+=row->item.size*3/* worst: Z->%XX */; - break; - case UL_URI: - // tainted, untaint language: uri - dest+=row->item.size*3/* worst: Z->%XX */; - break; - case UL_HTTP_HEADER: - // tainted, untaint language: http-field-content-text - dest+=row->item.size*3/* worst: Z->%XX */; - break; - case UL_MAIL_HEADER: - // tainted, untaint language: mail-header - if(charset) { - // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= - dest+=row->item.size*3+MAX_STRING/* worst: =?charset?Q?=%XX?= */; +/** + appends to other String, + marking all tainted pieces of it with @a lang. + or marking ALL pieces of it with a @a lang when @a forced to, + and propagating OPTIMIZE language bit. 
+*/ +String& String::append_to(String& dest, Language ilang, bool forced) const { + if(is_empty()) + return dest; + + // first: fragment infos + + if(ilang==L_PASS_APPENDED) // without language-change? + dest.langs.appendHelper(dest.body, langs, body); + else if(forced) //forcing passed lang? + dest.langs.appendHelper(dest.body, ilang, body); + else { + if(langs.opt.is_not_just_lang){ + Append_fragment_info info={ilang, &dest.langs, dest.body.length()}; + langs.for_each(body, ilang&L_OPTIMIZE_BIT? + append_fragment_optimizing + :append_fragment_nonoptimizing, &info); + } else { + Language lang=langs.opt.lang; + // see append_fragment_* for explanation + if(ilang&L_OPTIMIZE_BIT){ + dest.langs.appendHelper(dest.body, + lang==String::L_TAINTED? + ilang + :lang==String::L_CLEAN? + (String::Language)(String::L_CLEAN|String::L_OPTIMIZE_BIT) + :lang, + body); } else { - dest+=row->item.size; + dest.langs.appendHelper(dest.body, lang==String::L_TAINTED ? ilang:lang, body); } - break; - case UL_TABLE: - // tainted, untaint language: table - dest+=row->item.size; - break; - case UL_SQL: - // tainted, untaint language: sql - if(connection) - dest+=connection->quote(0, row->item.ptr, row->item.size); - break; - case UL_JS: - escape(switch(*src) { - case '"': case '\'': case '\n': case '\\': case '\xFF': - dest+=2; break; - default: - dest++; break; - }); - break; - case UL_XML: - escape(switch(*src) { - case '&': case '>': case '<': case '"': case '\'': - dest+= 6; break; - default: - dest++; break; - }); - break; - case UL_HTML: - escape(switch(*src) { - case '&': - case '>': - case '<': - case '"': - dest+=6; break; - default: - dest++; break; - }); - break; } - ); -break2: + } + + // next: letters + dest.body<rows; \ - for(uint i=0; icount; i++, row++) { \ - if(row==append_here) \ - goto break2; \ - \ - uchar to_lang=lang==UL_UNSPECIFIED?row->item.lang:lang; - - char *start=dest; - - switch(to_lang & ~UL_OPTIMIZE_BIT) { - case UL_CLEAN: - case UL_TAINTED: - case UL_AS_IS: - 
// clean piece - - // tainted piece, but undefined untaint language - // for VString.as_double of tainted values - // for ^process{body} evaluation - - // tainted, untaint language: as-is - memcpy(dest, row->item.ptr, row->item.size); - dest+=row->item.size; - break; - case UL_FILE_SPEC: - // tainted, untaint language: file [name] - escape(switch(*src) { - case ' ': to_char('_'); break; - encode(need_file_encode, '+'); - }); - break; - case UL_URI: - // tainted, untaint language: uri - escape(switch(*src) { - case ' ': to_char('+'); break; - encode(need_uri_encode, '%'); - }); - break; - case UL_HTTP_HEADER: - // tainted, untaint language: http-field-content-text - escape(switch(*src) { - case ' ': to_char('+'); break; - encode(need_uri_encode, '%'); - }); +/** http://www.ietf.org/rfc/rfc2047.txt +RFC +(3) As a replacement for a 'word' entity within a 'phrase', for example, + one that precedes an address in a From, To, or Cc header. The ABNF + definition for 'phrase' from RFC 822 thus becomes: + + phrase = 1*( encoded-word / word ) + + In this case the set of characters that may be used in a "Q"-encoded + 'encoded-word' is restricted to: . An 'encoded-word' that appears within a + 'phrase' MUST be separated from any adjacent 'word', 'text' or + 'special' by 'linear-white-space'. +... + (2) The 8-bit hexadecimal value 20 (e.g., ISO-8859-1 SPACE) may be + represented as "_" (underscore, ASCII 95.). (This character may + not pass through some internetwork mail gateways, but its use + will greatly enhance readability of "Q" encoded data with mail + readers that do not support this encoding.) Note that the "_" + always represents hexadecimal 20, even if the SPACE character + occupies a different code position in the character set in use. 
+ + paf: obviously, + without "=", or one could not differ "=E0" and "russian letter a" + and without "_", or in would mean 0x20 +*/ +inline bool mail_header_char_valid_within_Qencoded(char c) { + return c>='A' && c<='Z' /* A-Z */ + || c>='a' && c<='z' /* a-z */ + || c>='0' && c<='9' /* 0-9 */ + || strchr("!*+-/", c); /* remaining RFC 2047 5(3) chars, minus "=" and "_" */ +} +inline bool addr_spec_soon(const char *src) { + for(char c; (c=*src); src++) + if(c=='<') + return true; + else if(!(c==' ' || c=='\t')) + return false; + return false; +} +/** + RFC + Upper case should be used for hexadecimal digits "A" through "F" + The 8-bit hexadecimal value 20 (e.g., ISO-8859-1 SPACE) + may be represented as "_" +*/ +inline bool mail_header_nonspace_char(char c) { + return c != 0x20; +} + +inline void ec_append(CORD_ec& result, bool& optimize, bool& whitespace, CORD_pos pos, size_t size) { + while(size--) { + CORD_ec_append(result, CORD_pos_fetch(pos)); + CORD_next(pos); + } +} +inline void pa_CORD_pos_advance(CORD_pos pos, size_t n) { + while(true) { + long avail=CORD_pos_chars_left(pos); + if(avail<=0) { + CORD_next(pos); + if(!--n) + break; + } else if((size_t)avail=n + CORD_pos_advance(pos, n); break; - case UL_MAIL_HEADER: - // tainted, untaint language: mail-header - if(charset) { - // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= - const char *src=row->item.ptr; - bool to_quoted_printable=false; - for(int size=row->item.size; size--; src++) { - if(*src & 0x80) { - if(!to_quoted_printable) { - dest+=sprintf(dest, "=?%.15s?Q?", charset); - to_quoted_printable=true; - } - dest+=sprintf(dest, "=%02X", *src & 0xFF); - } else { - *dest++=*src; + } + } +} + +#ifndef DOXYGEN +struct Cstr_to_string_body_block_info { + // input + String::Language lang; + SQL_Connection* connection; + const Request_charsets* charsets; + const String::Body* body; + + // output + CORD_ec result; + + // private + CORD_pos pos; + size_t fragment_begin; + bool whitespace; + const char* exception; +}; +#endif + +// @todo: replace info->body->mid with something that uses 
info->pos +int cstr_to_string_body_block(String::Language to_lang, size_t fragment_length, Cstr_to_string_body_block_info* info) { + bool& whitespace=info->whitespace; + size_t fragment_end=info->fragment_begin+fragment_length; + //fprintf(stderr, "%d, %d =%s=\n", to_lang, fragment_length, info->body->cstr()); + + bool optimize=(to_lang & String::L_OPTIMIZE_BIT)!=0; + if(!optimize) + whitespace=false; + + switch(to_lang & ~String::L_OPTIMIZE_BIT) { + case String::L_CLEAN: + case String::L_TAINTED: + case String::L_AS_IS: + // clean piece + + // tainted piece, but undefined untaint language + // for VString.as_double of tainted values + // for ^process{body} evaluation + + // tainted, untaint language: as-is + ec_append(info->result, optimize, whitespace, info->pos, fragment_length); + break; + case String::L_FILE_SPEC: + // tainted, untaint language: file [name] + { + escape_fragment( + encode(need_file_encode, '_', c); + ); + } + break; + case String::L_URI: + // tainted, untaint language: uri + { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + String::C output(fragment_str, fragment_length); + if(info->charsets) + output=Charset::transcode(output, + info->charsets->source(), + info->charsets->client()); + + char c; + for(const char* src=output.str; (c=*src++); ) + encode(need_uri_encode, '%', c); + } + break; + case String::L_HTTP_HEADER: + // tainted, untaint language: http-field-content-text + escape_fragment( + encode(need_uri_encode, '%', c); + ); + break; + case String::L_MAIL_HEADER: + // tainted, untaint language: mail-header + // http://www.ietf.org/rfc/rfc2047.txt + if(info->charsets) { + size_t mail_size; + const char *mail_ptr= + info->body->mid(info->fragment_begin, mail_size=fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, mail_size); + + const char* 
charset_name=info->charsets->mail().NAME().cstr(); + + // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= + bool to_quoted_printable=false; + + bool email=false; + uchar c; + for(const char* src=mail_ptr; (c=(uchar)*src++); ) { + if(c=='\r' || c=='\n') + c=' '; + if(to_quoted_printable && (c==',' || c == '"' || addr_spec_soon(src-1/*position to 'c'*/))) { + email=c=='<'; + to_string("?="); + to_quoted_printable=false; + } + //RFC + An 'encoded-word' MUST NOT appear in any portion of an 'addr-spec'. + if(!email && ( + !to_quoted_printable && (c & 0x80) // starting quote-printable-encoding on first 8bit char + || to_quoted_printable && !mail_header_char_valid_within_Qencoded(c) + )) { + if(!to_quoted_printable) { + to_string("=?"); + to_string(charset_name); + to_string("?Q?"); + to_quoted_printable=true; } + encode(mail_header_nonspace_char, '=', '_'); + } else + to_char(c); + if(c=='>') + email=false; + } + if(to_quoted_printable) // close + to_string("?="); + + } else + ec_append(info->result, optimize, whitespace, info->pos, fragment_length); + break; + case String::L_SQL: + // tainted, untaint language: sql + if(info->connection) { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + + to_string(info->connection->quote(fragment_str, fragment_length)); + } else { + info->exception="untaint in SQL language failed - no connection specified"; + info->fragment_begin=fragment_end; + return 1; // stop processing. 
can't throw exception here + } + break; + case String::L_JS: + escape_fragment(switch(c) { + case '\n': to_string("\\n"); break; + case '"': to_string("\\\""); break; + case '\'': to_string("\\'"); break; + case '\\': to_string("\\\\"); break; + case '\xFF': to_string("\\\xFF"); break; + case '\r': to_string("\\r"); break; + default: _default; break; + }); + break; + case String::L_XML: + // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + escape_fragment(switch(c) { + case '\x20': + case '\x9': + case '\xA': + case '\xD': // this is usually removed on input + _default; + break; + case '&': to_string("&"); break; + case '>': to_string(">"); break; + case '<': to_string("<"); break; + case '"': to_string("""); break; + case '\'': to_string("'"); break; + default: + if(((unsigned char)c)<0x20) { + // fixing it, so that libxml would not result + // in fatal error parsing text + // though it really violates standard. + // to indicate there were an error + // replace bad char not to it's code, + // which we can do, + // but rather to '!' to show that input were actually + // invalid. + // life: shows that MSIE can somehow garble form values + // so that they contain these chars. 
+ to_char('!'); + } else { + _default; } - if(to_quoted_printable) // close - dest+=sprintf(dest, "?="); + break; + }); + break; + case String::L_HTML: + escape_fragment(switch(c) { + case '&': to_string("&"); break; + case '>': to_string(">"); break; + case '<': to_string("<"); break; + case '"': to_string("""); break; + default: _default; break; + }); + break; + case String::L_REGEX: + // tainted, untaint language: regex + escape_fragment( + if(need_regex_escape(c)) + to_char('\\') + _default; + ); + break; + case String::L_JSON: + // tainted, untaint language: json + // escape '"' '\' '/' '\n' '\t' '\r' '\b' '\f' chars and escape chars as \uXXXX if output charset != UTF-8 + { + if(info->charsets->client().isUTF8()){ + // escaping to \uXXXX is not needed + escape_fragment(switch(c) { + case '\n': to_string("\\n"); break; + case '"' : to_string("\\\""); break; + case '\\': to_string("\\\\"); break; + case '/' : to_string("\\/"); break; + case '\t': to_string("\\t"); break; + case '\r': to_string("\\r"); break; + case '\b': to_string("\\b"); break; + case '\f': to_string("\\f"); break; + default : _default; break; + }); } else { - memcpy(dest, row->item.ptr, row->item.size); - dest+=row->item.size; + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + String::C output(fragment_str, fragment_length); + + output=Charset::escape_JSON(output, info->charsets->source()); + to_string(output); } - break; - case UL_TABLE: - // tainted, untaint language: table - escape(switch(*src) { - case '\t': to_char(' '); break; - case '\n': to_char(' '); break; - _default; - }); - break; - case UL_SQL: - // tainted, untaint language: sql - if(connection) - dest+=connection->quote(dest, row->item.ptr, row->item.size); - else - throw Exception(0, 0, - this, - "untaint in SQL language failed - no connection specified"); - break; - case UL_JS: - 
escape(switch(*src) { - case '"': to_string("\\\"", 2); break; - case '\'': to_string("\\'", 2); break; - case '\n': to_string("\\n", 2); break; - case '\\': to_string("\\\\", 2); break; - case '\xFF': to_string("\\\xFF", 2); break; - _default; - }); - break; - case UL_XML: - escape(switch(*src) { - case '&': to_string("&", 5); break; - case '>': to_string(">", 4); break; - case '<': to_string("<", 4); break; - case '"': to_string(""", 6); break; - case '\'': to_string("'", 6); break; - _default; - }); - break; - case UL_HTML: - escape(switch(*src) { - case '&': to_string("&", 5); break; - case '>': to_string(">", 4); break; - case '<': to_string("<", 4); break; - case '"': to_string(""", 6); break; - _default; - }); - break; - default: - throw Exception(0, 0, - this, - "unknown untaint language #%d of %d piece", - static_cast(row->item.lang), - i); // never - break; // never } + break; + case String::L_HTTP_COOKIE: + // tainted, untaint language: cookie (3.3.0 and higher: %uXXXX in UTF-8) + { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + String::C output(fragment_str, fragment_length); + + output=Charset::escape(output, info->charsets->source()); + //throw Exception(0, 0, output); + to_string(output); - if(to_lang & UL_OPTIMIZE_BIT) { - // optimizing whitespace - char *stop=dest; dest=start; - for(char *src=start; src(to_lang)); // should never + break; // never + } - } \ - chunk=row->link; \ - } while(chunk); \ + info->fragment_begin=fragment_end; -break2: - return dest; + return 0; // 0=continue } -char *String::cstr_debug_origins() const { - char *result=(char *)malloc(size()+used_rows()*MAX_STRING*2); - char *dest=result; - - const Chunk *chunk=&head; - do { - const Chunk::Row *row=chunk->rows; - for(uint i=0; icount; i++, row++) { - if(row==append_here) - goto break2; - -#ifndef NO_STRING_ORIGIN - if(row->item.origin.file) 
- dest+=sprintf(dest, ORIGIN_FILE_LINE_FORMAT, - row->item.origin.file, - 1+row->item.origin.line); - else - dest+=sprintf(dest, ""); -#endif - dest+=sprintf(dest, "#%s: ", - String_Untaint_lang_name[row->item.lang]); - char *dest_after_origins=dest; - memcpy(dest, row->item.ptr, row->item.size); - dest+=row->item.size; +String::Body String::cstr_to_string_body_taint(Language lang, SQL_Connection* connection, const Request_charsets *charsets) const { + if(is_empty()) + return String::Body(); + + Cstr_to_string_body_block_info info; + // input + info.lang=lang; + info.connection=connection; + info.charsets=charsets; + info.body=&body; + // output + CORD_ec_init(info.result); + // private + body.set_pos(info.pos, 0); + info.fragment_begin=0; + info.exception=0; + info.whitespace=true; + + cstr_to_string_body_block(lang, length(), &info); + + if(info.exception) + throw Exception(0, + 0, + info.exception); - remove_crlf(dest_after_origins, dest); - to_char('\n'); - } - chunk=row->link; - } while(chunk); + return String::Body(CORD_ec_to_cord(info.result), info.fragment_begin); +} + +int cstr_to_string_body_block_untaint(char alang, size_t fragment_length, Cstr_to_string_body_block_info* info){ + const String::Language lang=(String::Language)(unsigned char)alang; + // see append_fragment_* for explanation + if(info->lang&String::L_OPTIMIZE_BIT) + return cstr_to_string_body_block( + lang==String::L_TAINTED? + info->lang + :lang==String::L_CLEAN? + (String::Language)(String::L_CLEAN|String::L_OPTIMIZE_BIT) + :lang, + fragment_length, info); + else + return cstr_to_string_body_block(lang==String::L_TAINTED ? 
info->lang:lang, fragment_length, info); +} + +String::Body String::cstr_to_string_body_untaint(Language lang, SQL_Connection* connection, const Request_charsets *charsets) const { + if(is_empty()) + return String::Body(); + + Cstr_to_string_body_block_info info; + // input + info.lang=lang; + info.connection=connection; + info.charsets=charsets; + info.body=&body; + // output + CORD_ec_init(info.result); + // private + body.set_pos(info.pos, 0); + info.fragment_begin=0; + info.exception=0; + info.whitespace=true; + + langs.for_each(body, cstr_to_string_body_block_untaint, &info); + + if(info.exception) + throw Exception(0, + 0, + info.exception); + + return String::Body(CORD_ec_to_cord(info.result), info.fragment_begin); +} -break2: - *dest=0; - return result; +const char* String::untaint_and_transcode_cstr(Language lang, const Request_charsets *charsets) const { + if(charsets && &charsets->source() != &charsets->client()){ + // Note: L_URI is allready transcoded during untaint, but transcode does not affect %XX + return Charset::transcode(cstr_to_string_body_untaint(lang, 0, charsets), charsets->source(), charsets->client()).cstr(); + } else + return cstr_to_string_body_untaint(lang, 0, charsets).cstr(); }