--- parser3/src/main/untaint.C 2002/08/01 11:26:51 1.103 +++ parser3/src/main/untaint.C 2010/12/29 12:17:58 1.161 @@ -1,542 +1,636 @@ /** @file Parser: String class part: untaint mechanizm. - Copyright(c) 2001, 2002 ArtLebedev Group (http://www.artlebedev.com) + Copyright(c) 2001-2005 ArtLebedev Group (http://www.artlebedev.com) Author: Alexandr Petrosian (http://paf.design.ru) */ -static const char* IDENT_UNTAINT_C="$Id: untaint.C,v 1.103 2002/08/01 11:26:51 paf Exp $"; +static const char * const IDENT_UNTAINT_C="$Date: 2010/12/29 12:17:58 $"; + -#include "pa_pool.h" #include "pa_string.h" #include "pa_hash.h" #include "pa_exception.h" #include "pa_table.h" #include "pa_globals.h" -#include "pa_sql_connection.h" #include "pa_dictionary.h" #include "pa_common.h" #include "pa_charset.h" +#include "pa_request_charsets.h" +#include "pa_sapi.h" -//#define DEBUG_STRING_APPENDS_VS_EXPANDS +extern "C" { // author forgot to do that +#include "ec.h" +} -#ifdef DEBUG_STRING_APPENDS_VS_EXPANDS -ulong string_string_shortcut_economy=0; -#endif +#include "pa_sql_connection.h" + +// defines -#define escape(action) \ - { \ - const char *src=row->item.ptr; \ - for(int size=row->item.size; size--; src++) \ - action \ +#undef CORD_ec_append +// redefining to intercept flushes and implement whitespace optimization +// of all consequent white space chars leaving only first one +#define CORD_ec_append(x, c) \ + { \ + bool skip=false; \ + if(optimize) switch(c) { \ + case ' ': case '\n': case '\t': \ + if(whitespace) \ + skip=true; /*skipping subsequent*/ \ + else \ + whitespace=true; \ + break; \ + default: \ + whitespace=false; \ + break; \ + } \ + if(!skip) { \ + if ((x)[0].ec_bufptr == (x)[0].ec_buf + CORD_BUFSZ) { \ + CORD_ec_flush_buf(x); \ + } \ + *((x)[0].ec_bufptr)++ = (c); \ + } \ + } + + +#define escape_fragment(action) \ + for(; fragment_length--; CORD_next(info->pos)) { \ + char c=CORD_pos_fetch(info->pos); \ + action \ } -#define _default default: *dest++=*src; break -#define encode(need_encode_func, prefix) \ - if(need_encode_func(*src)) { \ - static const char *hex="0123456789ABCDEF"; \ - char chunk[3]={prefix}; \ - chunk[1]=hex[((unsigned char)*src)/0x10]; \ - chunk[2]=hex[((unsigned char)*src)%0x10]; \ - memcpy(dest, chunk, 3); dest+=3; \ - } else \ - *dest++=*src; \ - break -#define to_char(c) *dest++=c -#define to_string(b, bsize) \ - memcpy(dest, b, bsize); \ - dest+=bsize; \ +#define _default CORD_ec_append(info->result, c) +#define encode(need_encode_func, prefix, otherwise) \ + if(need_encode_func(c)) { \ + static const char* hex="0123456789ABCDEF"; \ + CORD_ec_append(info->result, prefix); \ + CORD_ec_append(info->result, hex[((unsigned char)c)/0x10]); \ + CORD_ec_append(info->result, hex[((unsigned char)c)%0x10]); \ + } else \ + CORD_ec_append(info->result, otherwise); +#define to_char(c) { CORD_ec_append(info->result, c); whitespace=false; } +#define to_string(s) { CORD_ec_append_cord(info->result, s); whitespace=false; } inline bool need_file_encode(unsigned char c){ - // theoretical problem with, for instance, "_2B" and "." fragments, - // they would yield the same - // because need_file_encode('_')=false - // but we need to delete such files somehow, getting names from ^index - - if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) - return false; - - return !strchr( - " _./()-" -#ifdef WIN32 - ":\\~" + // russian letters and space ENABLED + // encoding only these... + return strchr( + "*?'\"<>|" +#ifndef WIN32 + ":\\" #endif - , c); + , c)!=0; } + inline bool need_uri_encode(unsigned char c){ - if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) + if((c>='0') && (c<='9') || (c>='A') && (c<='Z') || (c>='a') && (c<='z')) return false; - return !strchr("_-./", c); + return !strchr("_-./*", c); } -inline bool need_http_header_encode(unsigned char c){ - if(strchr(" , :", c)) - return false; - return need_uri_encode(c); +inline bool need_regex_escape(unsigned char c){ + return strchr("\\^$.[]|()?*+{}-", c)!=0; } -// - -static const char * String_Untaint_lang_name[]={ - "U", ///< zero value handy for hash lookup @see untaint_lang_name2enum - "C", ///< clean - "T", ///< tainted, untaint language as assigned later - // untaint languages. assigned by ^untaint[lang]{...} - "P", - /**< - leave language built into string being appended. - just a flag, that value not stored - */ - "A", ///< leave all characters intact - "F", ///< file specification - "H", ///< ext in HTTP response header - "M", ///< text in mail header - "URI", ///< text in uri - "T", ///< ^table:set body - "SQL", ///< ^table:sql body - "JS", ///< JavaScript code - "XML", ///< ^dom:set xml - "HTML" ///< HTML code (for editing) -}; - +inline bool need_parser_code_escape(unsigned char c){ + return strchr("^$;@()[]{}:#\"", c)!=0; +} // String /* - HTTP-header = field-name ":" [ field-value ] CRLF - field-name = token - field-value = *( field-content | LWS ) +field-name = token +field-value = *( field-content | LWS ) - field-content = - -word = token | quoted-string - token = 1* +word = token | quoted-string +quoted-string = ( <"> *(qdtext | quoted-pair ) <"> ) +qdtext = > +quoted-pair = "\" CHAR - +OCTET = +CHAR = tspecials = "(" | ")" | "<" | ">" | "@" - | "," | ";" | ":" | "\" | <"> - | "/" | "[" | "]" | "?" | "=" - | "{" | "}" | SP | HT + | "," | ";" | ":" | "\" | <"> + | "/" | "[" | "]" | "?" | "=" + | "{" | "}" | SP | HT SP = HT = LWS = [CRLF] 1*( SP | HT ) -TEXT = +TEXT = +CTL = -quoted-pair = "\" CHAR if(strchr("()<>@,;:\\\"/[]?={} \t", *ptr)) */ -inline bool need_quote_http_header(const char *ptr, size_t size) { +inline bool need_quote_http_header(const char* ptr, size_t size) { for(; size--; ptr++) if(strchr(";\\\"= \t" /* excluded ()<>@, :/ ? []{} */, *ptr)) return true; return false; } -//#include "pa_sapi.h" +#ifndef DOXYGEN +struct Append_fragment_info { + String::Language lang; + String::Languages* dest_languages; + size_t dest_body_plan_length; +}; +#endif +int append_fragment_optimizing(char alang, size_t asize, Append_fragment_info* info) { + const String::Language lang=(String::Language)(unsigned char)alang; + // main idea here: + // tainted piece would get OPTIMIZED bit from 'lang' + // clean piece would be marked OPTIMIZED manually + // pieces with determined languages [not tainted|clean] would retain theirs langs + info->dest_languages->append(info->dest_body_plan_length, + lang==String::L_TAINTED? + info->lang + :lang==String::L_CLEAN? + (String::Language)(String::L_CLEAN|String::L_OPTIMIZE_BIT) // ORing with OPTIMIZED flag + :lang, + asize); + info->dest_body_plan_length+=asize; + + return 0; // 0=continue +} +int append_fragment_nonoptimizing(char alang, size_t asize, Append_fragment_info* info) { + const String::Language lang=(String::Language)(unsigned char)alang; + // The core idea: tainted pieces got marked with context's lang + info->dest_languages->append(info->dest_body_plan_length, + lang==String::L_TAINTED? + info->lang + :lang, + asize); + info->dest_body_plan_length+=asize; + + return 0; // 0=continue +} + /** - appends other String, + appends to other String, marking all tainted pieces of it with @a lang. or marking ALL pieces of it with a @a lang when @a forced to, and propagating OPTIMIZE language bit. - - using architecture advantage: after string-to-string-append string never modified. - algorithm: - if no language-change specified and src not yet appended to some other string[last_chunk!=0] - shrinking dest last_chunk[preparing it for linking], - ///shrinking src last_chunk[preparing it to be linked, consequent dest.appends would go there], - linking[dest.last_chunk = src.head.chunk] - if some language-change specified or src already appended to some other string[last_chunk==0] - cloning pieces. */ -String& String::append(const String& src, uchar lang, bool forced) { - if(!last_chunk) // growth stopped [we're appended as string to somebody] - throw Exception(0, - this, - "string growth stopped (append string)"); - - if(src.is_empty()) - return *this; - - // without language-chage, not-appended-before, big[not fitting our tail] string? - if(lang==UL_PASS_APPENDED - && src.last_chunk - && (uint(&last_chunk->rows[last_chunk->count]-append_here) < src.used_rows())) { -#ifdef DEBUG_STRING_APPENDS_VS_EXPANDS - string_string_shortcut_economy+=src.used_rows()*sizeof(String::Chunk::Row); -#endif +String& String::append_to(String& dest, Language ilang, bool forced) const { + if(is_empty()) + return dest; - // using fact: - // src.head.chunk.count initally equeals this.head.chunk.count and shrinks-only, - // so can't be more than this.head.chunk.count, - // which means that we know that - // src.head.chunk would fit into this.head.chunk - if(is_empty()) { // our head.chunk is empty - // they have more than head.chunk? we need all head.chunk : we need only filled-part of head.chunk - Chunk *src_head_link=src.head.chunk.rows[src.head.chunk.count].link; - size_t head_count=src_head_link?src.head.chunk.count:(src.append_here-src.head.chunk.rows); - // "your head.chunk is my head.chunk" - memcpy(head.chunk.rows, src.head.chunk.rows, sizeof(Chunk::Row)*(head_count)); - if(src_head_link) { - // "your body is my body" - head.chunk.rows[head.chunk.count=head_count].link=src_head_link; - // "your last_chunk is mine now" - last_chunk=src.last_chunk; - // "your append_here is mine now" - append_here=src.append_here; + // first: fragment infos + + if(ilang==L_PASS_APPENDED) // without language-change? + dest.langs.appendHelper(dest.body, langs, body); + else if(forced) //forcing passed lang? + dest.langs.appendHelper(dest.body, ilang, body); + else { + if(langs.opt.is_not_just_lang){ + Append_fragment_info info={ilang, &dest.langs, dest.body.length()}; + langs.for_each(body, ilang&L_OPTIMIZE_BIT? + append_fragment_optimizing + :append_fragment_nonoptimizing, &info); + } else { + Language lang=langs.opt.lang; + // see append_fragment_* for explanation + if(ilang&L_OPTIMIZE_BIT){ + dest.langs.appendHelper(dest.body, + lang==String::L_TAINTED? + ilang + :lang==String::L_CLEAN? + (String::Language)(String::L_CLEAN|String::L_OPTIMIZE_BIT) + :lang, + body); } else { - // "your last_chunk is mine now" - last_chunk=&head.chunk; - // "your append_here is recalc-mine now" - append_here=head.chunk.rows+head_count; + dest.langs.appendHelper(dest.body, lang==String::L_TAINTED ? ilang:lang, body); } - } else { // our head.chunk contains something - // "chopping off my tail-reserve" - last_chunk->count=append_here-last_chunk->rows; - // "you is my tail" - append_here->link=&src.head.chunk; - // "your last_chunk is mine now" - last_chunk=src.last_chunk; - // "your append_here is mine now" - append_here=src.append_here; } - - // stop-growing mark - src.last_chunk=0; - return *this; } - // manually unrolled code to avoid do{if(const)} constructs - if(forced) - STRING_SRC_FOREACH_ROW( - APPEND(row->item.ptr, row->item.size, - lang, //forcing passed lang - row->item.origin.file, row->item.origin.line); - ) - else if(lang==UL_PASS_APPENDED) - STRING_SRC_FOREACH_ROW( - APPEND(row->item.ptr, row->item.size, - row->item.lang, // passing item's lang - row->item.origin.file, row->item.origin.line); - ) - else if(lang&UL_OPTIMIZE_BIT) // main idea here - // tainted piece would get OPTIMIZED bit from 'lang' - // clean piece would be marked OPTIMIZED manually - // pieces with determined languages [not tainted|clean] would retain theirs langs - STRING_SRC_FOREACH_ROW( - APPEND(row->item.ptr, row->item.size, - row->item.lang==UL_TAINTED?lang:( - row->item.lang==UL_CLEAN?UL_CLEAN|UL_OPTIMIZE_BIT: // ORing with OPTIMIZED flag - row->item.lang - ), - row->item.origin.file, row->item.origin.line); - ) - else - STRING_SRC_FOREACH_ROW( - APPEND(row->item.ptr, row->item.size, - row->item.lang==UL_TAINTED?lang:row->item.lang, - row->item.origin.file, row->item.origin.line); - ); -/* - for(Chunk::Row *row=last_chunk->rows; rowlink==(void*)0xcdcdcdcd) - _asm int 3;*/ - return *this; -} - -size_t String::cstr_bufsize(Untaint_lang lang, - SQL_Connection *connection, - Charset *buf_charset) const { - size_t dest=1; // for terminating 0 - STRING_FOREACH_ROW( - uchar to_lang=lang==UL_UNSPECIFIED?row->item.lang:lang; - - switch(to_lang & ~UL_OPTIMIZE_BIT) { - case UL_CLEAN: - case UL_TAINTED: - case UL_AS_IS: - // clean piece - - // tainted piece, but undefined untaint language - // for VString.as_double of tainted values - // for ^process{body} evaluation + // next: letters + dest.body<item.size; - break; - case UL_FILE_SPEC: - // tainted, untaint language: file [name] - dest+=row->item.size*3/* worst: Z->%XX */; - break; - case UL_URI: - // tainted, untaint language: uri - dest+=row->item.size*6*3/* worst utf8 x worst Z->%XX */; - break; - case UL_HTTP_HEADER: - // tainted, untaint language: http-field-content-text - dest+=row->item.size*3/* worst: Z->%XX */; - break; - case UL_MAIL_HEADER: - // tainted, untaint language: mail-header - if(buf_charset) { - // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= - dest+= - row->item.size*3+ - buf_charset->name().size()+MAX_STRING/* worst: =?charset?Q?=%XX?= */; - } else - dest+=row->item.size; - break; - case UL_TABLE: - // tainted, untaint language: table - dest+=row->item.size; - break; - case UL_SQL: - // tainted, untaint language: sql - if(connection) - dest+=connection->quote(0, row->item.ptr, row->item.size); - break; - case UL_JS: - escape(switch(*src) { - case '"': case '\'': case '\n': case '\\': case '\xFF': - dest+=2; break; - default: - dest++; break; - }); - break; - case UL_XML: - escape(switch(*src) { - case '&': case '>': case '<': case '"': case '\'': - dest+= 6; break; - default: - dest++; break; - }); - break; - case UL_HTML: - escape(switch(*src) { - case '&': - case '>': - case '<': - case '"': - dest+=6; break; - default: - dest++; break; - }); - break; - } - ); + ASSERT_STRING_INVARIANT(dest); return dest; } -char *String::store_to(char *dest, Untaint_lang lang, - SQL_Connection *connection, - Charset *store_to_charset, - const char *store_to_charset_name) const { - // WARNING: - // before any changes check cstr_bufsize first!!! - bool whitespace=true; - STRING_FOREACH_ROW( - uchar to_lang=lang==UL_UNSPECIFIED?row->item.lang:lang; - - char *start=dest; - - switch(to_lang & ~UL_OPTIMIZE_BIT) { - case UL_CLEAN: - case UL_TAINTED: - case UL_AS_IS: - // clean piece - - // tainted piece, but undefined untaint language - // for VString.as_double of tainted values - // for ^process{body} evaluation - - // tainted, untaint language: as-is - memcpy(dest, row->item.ptr, row->item.size); - dest+=row->item.size; +/** http://www.ietf.org/rfc/rfc2047.txt +RFC +(3) As a replacement for a 'word' entity within a 'phrase', for example, + one that precedes an address in a From, To, or Cc header. The ABNF + definition for 'phrase' from RFC 822 thus becomes: + + phrase = 1*( encoded-word / word ) + + In this case the set of characters that may be used in a "Q"-encoded + 'encoded-word' is restricted to: . An 'encoded-word' that appears within a + 'phrase' MUST be separated from any adjacent 'word', 'text' or + 'special' by 'linear-white-space'. +... + (2) The 8-bit hexadecimal value 20 (e.g., ISO-8859-1 SPACE) may be + represented as "_" (underscore, ASCII 95.). (This character may + not pass through some internetwork mail gateways, but its use + will greatly enhance readability of "Q" encoded data with mail + readers that do not support this encoding.) Note that the "_" + always represents hexadecimal 20, even if the SPACE character + occupies a different code position in the character set in use. + + paf: obviously, + without "=", or one could not differ "=E0" and "russian letter a" + and without "_", or in would mean 0x20 +*/ +inline bool mail_header_char_valid_within_Qencoded(char c) { + return c>='A' && c<='Z' + || c>='a' && c<='Z' + || c>='0' && c<='9' + || strchr("!*+-/", c); +} +inline bool addr_spec_soon(const char *src) { + for(char c; (c=*src); src++) + if(c=='<') + return true; + else if(!(c==' ' || c=='\t')) + return false; + return false; +} +/** + RFC + Upper case should be used for hexadecimal digits "A" through "F" + The 8-bit hexadecimal value 20 (e.g., ISO-8859-1 SPACE) + may be represented as "_" +*/ +inline bool mail_header_nonspace_char(char c) { + return c != 0x20; +} + +inline void ec_append(CORD_ec& result, bool& optimize, bool& whitespace, CORD_pos pos, size_t size) { + while(size--) { + CORD_ec_append(result, CORD_pos_fetch(pos)); + CORD_next(pos); + } +} +inline void pa_CORD_pos_advance(CORD_pos pos, size_t n) { + while(true) { + long avail=CORD_pos_chars_left(pos); + if(avail<=0) { + CORD_next(pos); + if(!--n) + break; + } else if((size_t)avail=n + CORD_pos_advance(pos, n); break; - case UL_FILE_SPEC: - // tainted, untaint language: file [name] - escape( - encode(need_file_encode, '_'); + } + } +} + +#ifndef DOXYGEN +struct Cstr_to_string_body_block_info { + // input + String::Language lang; + SQL_Connection* connection; + const Request_charsets* charsets; + const String::Body* body; + + // output + CORD_ec result; + + // private + CORD_pos pos; + size_t fragment_begin; + bool whitespace; + const char* exception; +}; +#endif + +// @todo: replace info->body->mid with something that uses info->pos +int cstr_to_string_body_block(String::Language to_lang, size_t fragment_length, Cstr_to_string_body_block_info* info) { + bool& whitespace=info->whitespace; + size_t fragment_end=info->fragment_begin+fragment_length; + //fprintf(stderr, "%d, %d =%s=\n", to_lang, fragment_length, info->body->cstr()); + + bool optimize=(to_lang & String::L_OPTIMIZE_BIT)!=0; + if(!optimize) + whitespace=false; + + switch(to_lang & ~String::L_OPTIMIZE_BIT) { + case String::L_CLEAN: + case String::L_TAINTED: + case String::L_AS_IS: + // clean piece + + // tainted piece, but undefined untaint language + // for VString.as_double of tainted values + // for ^process{body} evaluation + + // tainted, untaint language: as-is + ec_append(info->result, optimize, whitespace, info->pos, fragment_length); + break; + case String::L_FILE_SPEC: + // tainted, untaint language: file [name] + { + escape_fragment( + encode(need_file_encode, '_', c); ); - break; - case UL_URI: - // tainted, untaint language: uri - const void *client_ptr; - size_t client_size; - Charset::transcode(pool(), - pool().get_source_charset(), row->item.ptr, row->item.size, - pool().get_client_charset(), client_ptr, client_size); - { - const char *src=(const char *)client_ptr; - for(int size=client_size; size--; src++) - switch(*src) { - case ' ': to_char('+'); break; - default: encode(need_uri_encode, '%'); - }; - } - break; - case UL_HTTP_HEADER: - // tainted, untaint language: http-field-content-text - escape(switch(*src) { - case ' ': to_char('+'); break; - default: encode(need_uri_encode, '%'); - }); - break; - case UL_MAIL_HEADER: - // tainted, untaint language: mail-header - if(store_to_charset && store_to_charset_name) { - const void *mail_ptr; - size_t mail_size; - Charset::transcode(pool(), - pool().get_source_charset(), row->item.ptr, row->item.size, - *store_to_charset, mail_ptr, mail_size); - - // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= - const char *src=(const char *)mail_ptr; - bool to_quoted_printable=false; - for(int size=mail_size; size--; src++) { - if((*src & 0x80) // starting quote-printable-encoding on first 8bit char - || (to_quoted_printable && (*src=='?' || *src=='=')) // additionally encoding '?' and '|' - ) { - if(!to_quoted_printable) { - dest+=sprintf(dest, "=?%s?Q?", store_to_charset_name); - to_quoted_printable=true; - } - dest+=sprintf(dest, "=%02X", *src & 0xFF); - } else { - *dest++=*src; + } + break; + case String::L_URI: + // tainted, untaint language: uri + { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + String::C output(fragment_str, fragment_length); + if(info->charsets) + output=Charset::transcode(output, + info->charsets->source(), + info->charsets->client()); + + char c; + for(const char* src=output.str; (c=*src++); ) + encode(need_uri_encode, '%', c); + } + break; + case String::L_HTTP_HEADER: + // tainted, untaint language: http-field-content-text + escape_fragment(switch(c) { + case '\n': + case '\r': to_string(" "); break; + default: _default; break; + }); + break; + case String::L_MAIL_HEADER: + // tainted, untaint language: mail-header + // http://www.ietf.org/rfc/rfc2047.txt + if(info->charsets) { + size_t mail_size; + const char *mail_ptr= + info->body->mid(info->fragment_begin, mail_size=fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, mail_size); + + const char* charset_name=info->charsets->mail().NAME().cstr(); + + // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= + bool to_quoted_printable=false; + + bool email=false; + uchar c; + for(const char* src=mail_ptr; (c=(uchar)*src++); ) { + if(c=='\r' || c=='\n') + c=' '; + if(to_quoted_printable && (c==',' || c == '"' || addr_spec_soon(src-1/*position to 'c'*/))) { + email=c=='<'; + to_string("?="); + to_quoted_printable=false; + } + //RFC + An 'encoded-word' MUST NOT appear in any portion of an 'addr-spec'. + if(!email && ( + !to_quoted_printable && (c & 0x80) // starting quote-printable-encoding on first 8bit char + || to_quoted_printable && !mail_header_char_valid_within_Qencoded(c) + )) { + if(!to_quoted_printable) { + to_string("=?"); + to_string(charset_name); + to_string("?Q?"); + to_quoted_printable=true; } + encode(mail_header_nonspace_char, '=', '_'); + } else + to_char(c); + if(c=='>') + email=false; + } + if(to_quoted_printable) // close + to_string("?="); + + } else + ec_append(info->result, optimize, whitespace, info->pos, fragment_length); + break; + case String::L_SQL: + // tainted, untaint language: sql + if(info->connection) { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + + to_string(info->connection->quote(fragment_str, fragment_length)); + } else { + info->exception="untaint in SQL language failed - no connection specified"; + info->fragment_begin=fragment_end; + return 1; // stop processing. can't throw exception here + } + break; + case String::L_JS: + escape_fragment(switch(c) { + case '\n': to_string("\\n"); break; + case '"': to_string("\\\""); break; + case '\'': to_string("\\'"); break; + case '\\': to_string("\\\\"); break; + case '\xFF': to_string("\\\xFF"); break; + case '\r': to_string("\\r"); break; + default: _default; break; + }); + break; + case String::L_XML: + // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + escape_fragment(switch(c) { + case '\x20': + case '\x9': + case '\xA': + case '\xD': // this is usually removed on input + _default; + break; + case '&': to_string("&"); break; + case '>': to_string(">"); break; + case '<': to_string("<"); break; + case '"': to_string("""); break; + case '\'': to_string("'"); break; + default: + if(((unsigned char)c)<0x20) { + // fixing it, so that libxml would not result + // in fatal error parsing text + // though it really violates standard. + // to indicate there were an error + // replace bad char not to it's code, + // which we can do, + // but rather to '!' to show that input were actually + // invalid. + // life: shows that MSIE can somehow garble form values + // so that they contain these chars. + to_char('!'); + } else { + _default; } - if(to_quoted_printable) // close - dest+=sprintf(dest, "?="); - + break; + }); + break; + case String::L_HTML: + escape_fragment(switch(c) { + case '&': to_string("&"); break; + case '>': to_string(">"); break; + case '<': to_string("<"); break; + case '"': to_string("""); break; + default: _default; break; + }); + break; + case String::L_REGEX: + // tainted, untaint language: regex + escape_fragment( + if(need_regex_escape(c)) + to_char('\\') + _default; + ); + break; + case String::L_JSON: + // tainted, untaint language: json + // escape '"' '\' '/' '\n' '\t' '\r' '\b' '\f' chars and escape chars as \uXXXX if output charset != UTF-8 + { + if(info->charsets==NULL || info->charsets->client().isUTF8()){ + // escaping to \uXXXX is not needed + escape_fragment(switch(c) { + case '\n': to_string("\\n"); break; + case '"' : to_string("\\\""); break; + case '\\': to_string("\\\\"); break; + case '/' : to_string("\\/"); break; + case '\t': to_string("\\t"); break; + case '\r': to_string("\\r"); break; + case '\b': to_string("\\b"); break; + case '\f': to_string("\\f"); break; + default : _default; break; + }); } else { - memcpy(dest, row->item.ptr, row->item.size); - dest+=row->item.size; + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + String::C output(fragment_str, fragment_length); + + output=Charset::escape_JSON(output, info->charsets->source()); + to_string(output); } - break; - case UL_TABLE: - // tainted, untaint language: table - escape(switch(*src) { - case '\t': to_char(' '); break; - case '\n': to_char(' '); break; - _default; - }); - break; - case UL_SQL: - // tainted, untaint language: sql - if(connection) - dest+=connection->quote(dest, row->item.ptr, row->item.size); - else - throw Exception(0, - this, - "untaint in SQL language failed - no connection specified"); - break; - case UL_JS: - escape(switch(*src) { - case '"': to_string("\\\"", 2); break; - case '\'': to_string("\\'", 2); break; - case '\n': to_string("\\n", 2); break; - case '\\': to_string("\\\\", 2); break; - case '\xFF': to_string("\\\xFF", 2); break; - _default; - }); - break; - case UL_XML: - escape(switch(*src) { - case '&': to_string("&", 5); break; - case '>': to_string(">", 4); break; - case '<': to_string("<", 4); break; - case '"': to_string(""", 6); break; - case '\'': to_string("'", 6); break; - _default; - }); - break; - case UL_HTML: - escape(switch(*src) { - case '&': to_string("&", 5); break; - case '>': to_string(">", 4); break; - case '<': to_string("<", 4); break; - case '"': to_string(""", 6); break; - _default; - }); - break; - default: - throw Exception(0, - this, - "unknown untaint language #%d", - static_cast(row->item.lang)); // sould never - break; // never } + break; + case String::L_HTTP_COOKIE: + // tainted, untaint language: cookie (3.3.0 and higher: %uXXXX in UTF-8) + if(info->charsets) { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + String::C output(fragment_str, fragment_length); + + output=Charset::escape(output, info->charsets->source()); + to_string(output); + } else + ec_append(info->result, optimize, whitespace, info->pos, fragment_length); + break; + case String::L_PARSER_CODE: + // for auto-untaint in process + escape_fragment( + if(need_parser_code_escape(c)) + to_char('^'); + _default; + ); + break; + default: + SAPI::abort("unknown untaint language #%d", + static_cast(to_lang)); // should never + break; // never + } - if(to_lang & UL_OPTIMIZE_BIT) { - // optimizing whitespace - char *stop=dest; dest=start; - for(char *src=start; srcfragment_begin=fragment_end; - return dest; + return 0; // 0=continue } -char *String::cstr_debug_origins() const { - //_asm int 3; - char *result=(char *)malloc(size()+used_rows()*MAX_STRING*2); - char *dest=result; - - STRING_FOREACH_ROW( -IFNDEF_NO_STRING_ORIGIN( - if(row->item.origin.file) - dest+=sprintf(dest, ORIGIN_FILE_LINE_FORMAT, - row->item.origin.file, - 1+row->item.origin.line); - else - dest+=sprintf(dest, ""); -); - uchar show_lang=row->item.lang & ~UL_OPTIMIZE_BIT; - if(show_lang>=sizeof(String_Untaint_lang_name)/sizeof(String_Untaint_lang_name[0])) - throw Exception(0, - this, - "unknown untaint language #%d", - static_cast(show_lang)); // sould never - - dest+=sprintf(dest, "#%s%s: ", - String_Untaint_lang_name[show_lang], - row->item.lang & UL_OPTIMIZE_BIT?".O":""); - char *dest_after_origins=dest; - - memcpy(dest, row->item.ptr, row->item.size); - dest+=row->item.size; - - remove_crlf(dest_after_origins, dest); - to_char('\n'); - ); - *dest=0; - return result; +String::Body String::cstr_to_string_body_taint(Language lang, SQL_Connection* connection, const Request_charsets *charsets) const { + if(is_empty()) + return String::Body(); + + Cstr_to_string_body_block_info info; + // input + info.lang=lang; + info.connection=connection; + info.charsets=charsets; + info.body=&body; + // output + CORD_ec_init(info.result); + // private + body.set_pos(info.pos, 0); + info.fragment_begin=0; + info.exception=0; + info.whitespace=true; + + cstr_to_string_body_block(lang, length(), &info); + + if(info.exception) + throw Exception(0, + 0, + info.exception); + + return String::Body(CORD_ec_to_cord(info.result)); +} + +int cstr_to_string_body_block_untaint(char alang, size_t fragment_length, Cstr_to_string_body_block_info* info){ + const String::Language lang=(String::Language)(unsigned char)alang; + // see append_fragment_* for explanation + if(info->lang&String::L_OPTIMIZE_BIT) + return cstr_to_string_body_block( + lang==String::L_TAINTED? + info->lang + :lang==String::L_CLEAN? + (String::Language)(String::L_CLEAN|String::L_OPTIMIZE_BIT) + :lang, + fragment_length, info); + else + return cstr_to_string_body_block(lang==String::L_TAINTED ? info->lang:lang, fragment_length, info); +} + +String::Body String::cstr_to_string_body_untaint(Language lang, SQL_Connection* connection, const Request_charsets *charsets) const { + if(is_empty()) + return String::Body(); + + Cstr_to_string_body_block_info info; + // input + info.lang=lang; + info.connection=connection; + info.charsets=charsets; + info.body=&body; + // output + CORD_ec_init(info.result); + // private + body.set_pos(info.pos, 0); + info.fragment_begin=0; + info.exception=0; + info.whitespace=true; + + langs.for_each(body, cstr_to_string_body_block_untaint, &info); + + if(info.exception) + throw Exception(0, + 0, + info.exception); + + return String::Body(CORD_ec_to_cord(info.result)); +} + +const char* String::untaint_and_transcode_cstr(Language lang, const Request_charsets *charsets) const { + if(charsets && &charsets->source() != &charsets->client()){ + // Note: L_URI is allready transcoded during untaint, but transcode does not affect %XX + return Charset::transcode(cstr_to_string_body_untaint(lang, 0, charsets), charsets->source(), charsets->client()).cstr(); + } else + return cstr_to_string_body_untaint(lang, 0, charsets).cstr(); }