--- parser3/src/main/untaint.C 2001/10/29 16:59:30 1.73 +++ parser3/src/main/untaint.C 2004/01/22 08:28:20 1.122.2.2 @@ -1,54 +1,89 @@ /** @file Parser: String class part: untaint mechanizm. - Copyright(c) 2001 ArtLebedev Group(http://www.artlebedev.com) - Author: Alexander Petrosyan (http://design.ru/paf) - - $Id: untaint.C,v 1.73 2001/10/29 16:59:30 paf Exp $ + Copyright(c) 2001-2003 ArtLebedev Group (http://www.artlebedev.com) + Author: Alexandr Petrosian (http://paf.design.ru) */ -#include "pa_pool.h" +static const char* IDENT_UNTAINT_C="$Date: 2004/01/22 08:28:20 $"; + + #include "pa_string.h" #include "pa_hash.h" #include "pa_exception.h" #include "pa_table.h" #include "pa_globals.h" -#include "pa_sql_connection.h" #include "pa_dictionary.h" #include "pa_common.h" +#include "pa_charset.h" +#include "pa_request_charsets.h" +#include "pa_sapi.h" + +extern "C" { // author forgot to do that +#include "ec.h" +} + +#define PA_SQL + +#ifdef PA_SQL +#include "pa_sql_connection.h" +#endif + +// defines + + +#undef CORD_ec_append +// redefining to intercept flushes and implement whitespace optimization +// of all consequent white space chars leaving only first one +#define CORD_ec_append(x, c) \ + { \ + bool skip=false; \ + if(optimize) switch(c) { \ + case ' ': case '\r': case '\n': case '\t': \ + if(whitespace) \ + skip=true; /*skipping subsequent*/ \ + else \ + whitespace=true; \ + break; \ + default: \ + whitespace=false; \ + break; \ + } \ + if(!skip) { \ + if ((x)[0].ec_bufptr == (x)[0].ec_buf + CORD_BUFSZ) { \ + CORD_ec_flush_buf(x); \ + } \ + *((x)[0].ec_bufptr)++ = (c); \ + } \ + } + #define escape(action) \ - { \ - const char *src=row->item.ptr; \ - for(int size=row->item.size; size--; src++) \ - action \ + for(; fragment_length--; CORD_next(info->pos)) { \ + char c=CORD_pos_fetch(info->pos); \ + action \ } -#define _default default: *dest++=*src; break -#define encode(need_encode_func, prefix) \ - default: \ - if(need_encode_func(*src)) { \ - static const char *hex="0123456789ABCDEF"; \ - char chunk[3]={prefix}; \ - chunk[1]=hex[((unsigned char)*src)/0x10]; \ - chunk[2]=hex[((unsigned char)*src)%0x10]; \ - memcpy(dest, chunk, 3); dest+=3; \ - } else \ - *dest++=*src; \ - break -#define to_char(c) *dest++=c -#define to_string(b, bsize) \ - memcpy(dest, b, bsize); \ - dest+=bsize; \ +#define _default default: CORD_ec_append(info->result, c); break +#define encode(need_encode_func, prefix, otherwise) \ + if(need_encode_func(c)) { \ + static const char* hex="0123456789ABCDEF"; \ + CORD_ec_append(info->result, prefix); \ + CORD_ec_append(info->result, hex[((unsigned char)c)/0x10]); \ + CORD_ec_append(info->result, hex[((unsigned char)c)%0x10]); \ + } else \ + CORD_ec_append(info->result, otherwise); +#define to_char(c) { CORD_ec_append(info->result, c); whitespace=false; } +#define to_string(s) { CORD_ec_append_cord(info->result, s); whitespace=false; } inline bool need_file_encode(unsigned char c){ - if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) - return false; - - return !strchr( -#ifdef WIN32 - ":\\~" + // russian letters and space ENABLED + // encoding only these... + return strchr( + "*?'\"<>|" +#ifndef WIN32 + ":\\" #endif - "./()_-", c); + , c)!=0; } inline bool need_uri_encode(unsigned char c){ if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) @@ -65,7 +100,7 @@ inline bool need_http_header_encode(unsi // -static const char * String_Untaint_lang_name[]={ +static const char* String_Untaint_lang_name[]={ "U", ///< zero value handy for hash lookup @see untaint_lang_name2enum "C", ///< clean "T", ///< tainted, untaint language as assigned later @@ -84,24 +119,12 @@ static const char * String_Untaint_lang_ "SQL", ///< ^table:sql body "JS", ///< JavaScript code "XML", ///< ^dom:set xml - "HTML", ///< HTML code (for editing) - "UHTML", ///< HTML code with USER chars + "HTML" ///< HTML code (for editing) }; // String -static bool typo_present(Array::Item *value, const void *info) { - Array *row=static_cast(value); - const char *src=static_cast(info); - - int partial; - row->get_string(0)->cmp(partial, src); - return - partial==0 || // full match - partial==1; // typo left column starts 'src' -} - /* HTTP-header = field-name ":" [ field-value ] CRLF @@ -137,269 +160,359 @@ quoted-pair = "\" CHAR if(strchr("()<>@,;:\\\"/[]?={} \t", *ptr)) */ -inline bool need_quote_http_header(const char *ptr, size_t size) { +inline bool need_quote_http_header(const char* ptr, size_t size) { for(; size--; ptr++) if(strchr(";\\\"= \t" /* excluded ()<>@, :/ ? []{} */, *ptr)) return true; return false; } -/** @todo maybe additional check "are all pieces are clean?" would be profitable? - @todo fix potential forigins_mode buf overrun +#ifndef DOXYGEN +struct Append_fragment_info { + String::Language lang; + String::Languages* dest_languages; + size_t dest_body_plan_length; +}; +#endif +int append_fragment_optimizing(char alang, size_t asize, Append_fragment_info* info) { + const String::Language lang=(String::Language)(unsigned char)alang; + // main idea here: + // tainted piece would get OPTIMIZED bit from 'lang' + // clean piece would be marked OPTIMIZED manually + // pieces with determined languages [not tainted|clean] would retain theirs langs + info->dest_languages->append(info->dest_body_plan_length, + lang==String::L_TAINTED? + info->lang + :lang==String::L_CLEAN? + (String::Language)(String::L_CLEAN|String::L_OPTIMIZE_BIT) // ORing with OPTIMIZED flag + :lang, + asize); + info->dest_body_plan_length+=asize; + + return 0; // 0=continue +} +int append_fragment_nonoptimizing(char alang, size_t asize, Append_fragment_info* info) { + const String::Language lang=(String::Language)(unsigned char)alang; + // The core idea: tainted pieces got marked with context's lang + info->dest_languages->append(info->dest_body_plan_length, + lang==String::L_TAINTED? + info->lang + :lang, + asize); + info->dest_body_plan_length+=asize; + + return 0; // 0=continue +} + +/** + appends to other String, + marking all tainted pieces of it with @a lang. + or marking ALL pieces of it with a @a lang when @a forced to, + and propagating OPTIMIZE language bit. */ -size_t String::cstr_bufsize(Untaint_lang lang) const { - return ( - lang==UL_AS_IS? - size() - : - size() - *UNTAINT_TIMES_BIGGER - *(forigins_mode?10:1) - ) - +1; +String& String::append_to(String& dest, Language lang, bool forced) const { + if(is_empty()) + return dest; + + // first: fragment infos + + if(lang==L_PASS_APPENDED) // without language-change? + dest.langs.append(dest.body, body.length(), langs); + else if(forced) //forcing passed lang? + dest.langs.append(dest.body, lang, length()); + else { + Append_fragment_info info={lang, &dest.langs, dest.body.length()}; + langs.for_each(body, lang&L_OPTIMIZE_BIT? + append_fragment_optimizing + :append_fragment_nonoptimizing, &info); + } + + // next: letters + dest.body<. An 'encoded-word' that appears within a + 'phrase' MUST be separated from any adjacent 'word', 'text' or + 'special' by 'linear-white-space'. +... + (2) The 8-bit hexadecimal value 20 (e.g., ISO-8859-1 SPACE) may be + represented as "_" (underscore, ASCII 95.). (This character may + not pass through some internetwork mail gateways, but its use + will greatly enhance readability of "Q" encoded data with mail + readers that do not support this encoding.) Note that the "_" + always represents hexadecimal 20, even if the SPACE character + occupies a different code position in the character set in use. + + paf: obviously, + without "=", or one could not differ "=E0" and "russian letter a" + and without "_", or in would mean 0x20 */ -char *String::store_to(char *dest, Untaint_lang lang, - SQL_Connection *connection, - const char *charset) const { - // $MAIN:html-typo table - Dictionary *user_typo_dict=static_cast(pool().tag()); - Dictionary *typo_dict=user_typo_dict?user_typo_dict:default_typo_dict; - - bool whitespace=true; - const Chunk *chunk=&head; - do { - const Chunk::Row *row=chunk->rows; - for(uint i=0; icount; i++, row++) { - if(row==append_here) - goto break2; - - Untaint_lang to_lang=lang==UL_UNSPECIFIED?(Untaint_lang)row->item.lang:lang; - - char *dest_before_origins=dest; - - if(forigins_mode) { -#ifndef NO_STRING_ORIGIN - if(row->item.origin.file) - dest+=sprintf(dest, "%s(%d)", - row->item.origin.file, - 1+row->item.origin.line); - else - dest+=sprintf(dest, "unknown"); -#endif - dest+=sprintf(dest, "#%s: ", - String_Untaint_lang_name[to_lang]); - } - char *dest_after_origins=dest; +inline bool mail_header_char_valid_within_Qencoded(char c) { + return c>='A' && c<='Z' + || c>='a' && c<='Z' + || c>='0' && c<='9' + || strchr("!*+-/", c); +} +inline bool addr_spec_soon(const char *src) { + for(char c; c=*src; src++) + if(c=='<') + return true; + else if(!(c==' ' || c=='\t')) + return false; + return false; +} +/** + RFC + Upper case should be used for hexadecimal digits "A" through "F" + The 8-bit hexadecimal value 20 (e.g., ISO-8859-1 SPACE) + may be represented as "_" +*/ +inline bool mail_header_nonspace_char(char c) { + return c != 0x20; +} +inline void ec_append(CORD_ec& result, bool& optimize, bool& whitespace, CORD_pos pos, size_t size) { + while(size--) { + CORD_ec_append(result, CORD_pos_fetch(pos)); + CORD_next(pos); + } +} +inline void pa_CORD_pos_advance(CORD_pos pos, size_t n) { + while(true) { + size_t avail=CORD_pos_chars_left(pos); + if(avail==0) { + CORD_next(pos); + if(!--n) + break; + } else if(avail=n + CORD_pos_advance(pos, n); + break; + } + } +} - // WARNING: - // string can grow only UNTAINT_TIMES_BIGGER - switch(to_lang) { - case UL_CLEAN: - // clean piece - { // optimizing whitespace - const char *src=row->item.ptr; - for(int size=row->item.size; size--; src++) - switch(*src) { - case ' ': case '\n': case '\t': - if(!whitespace) { - *dest++=*src; - whitespace=true; - } - break; - default: - whitespace=false; - *dest++=*src; - break; - } +#ifndef DOXYGEN +struct Cstr_to_string_body_block_info { + // input + String::Language lang; + SQL_Connection* connection; + const Request_charsets* charsets; + const String::Body* body; + + // output + CORD_ec result; + + // private + CORD_pos pos; + size_t fragment_begin; + bool whitespace; +}; +#endif +int cstr_to_string_body_block(char alang, size_t fragment_length, Cstr_to_string_body_block_info* info) { + const String::Language fragment_lang=(String::Language)(unsigned char)alang; + bool& whitespace=info->whitespace; + size_t fragment_end=info->fragment_begin+fragment_length; + //fprintf(stderr, "%d, %d\n", fragment.lang, fragment.length); + + + String::Language to_lang=info->lang==String::L_UNSPECIFIED?fragment_lang:info->lang; + bool optimize=(to_lang & String::L_OPTIMIZE_BIT)!=0; + if(!optimize) + whitespace=false; + + switch(to_lang & ~String::L_OPTIMIZE_BIT) { + case String::L_CLEAN: + case String::L_TAINTED: + case String::L_AS_IS: + // clean piece + + // tainted piece, but undefined untaint language + // for VString.as_double of tainted values + // for ^process{body} evaluation + + // tainted, untaint language: as-is + ec_append(info->result, optimize, whitespace, info->pos, fragment_length); + break; + case String::L_FILE_SPEC: + // tainted, untaint language: file [name] + escape( + // Macintosh has problems with small Russian letter 'r' + if( c=='\xF0' && info->charsets && info->charsets->source().NAME()=="WINDOWS-1251" ) { + // fixing that letter for most common charset + to_char('p'); + } else // fallback to default + encode(need_file_encode, '_', c); + ); + break; + case String::L_URI: + // tainted, untaint language: uri + { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + String::C output(fragment_str, fragment_length); + if(info->charsets) + output=Charset::transcode(output, + info->charsets->source(), + info->charsets->client()); + + char c; + for(const char* src=output.str; c=*src++; ) + encode(need_uri_encode, '%', c); + } + break; + case String::L_HTTP_HEADER: + // tainted, untaint language: http-field-content-text + escape( + encode(need_uri_encode, '%', c); + ); + break; + case String::L_MAIL_HEADER: + // tainted, untaint language: mail-header + // http://www.ietf.org/rfc/rfc2047.txt + if(info->charsets) { + size_t mail_size; + const char *mail_ptr= + info->body->mid(info->fragment_begin, mail_size=fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, mail_size); + + const char* charset_name=info->charsets->mail().NAME().cstr(); + + // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= + bool to_quoted_printable=false; + + bool email=false; + uchar c; + for(const char* src=mail_ptr; c=(uchar)*src++; ) { + //RFC + An 'encoded-word' MUST NOT appear in any portion of an 'addr-spec'. + if(to_quoted_printable && (c==',' || addr_spec_soon(src) || c == '"')) { + email=c=='<'; + to_string("?="); + to_quoted_printable=false; } - break; - case UL_TAINTED: - // tainted piece, but undefined untaint language - // for VString.as_double of tainted values - // for ^process{body} evaluation - case UL_AS_IS: - // tainted, untaint language: as-is - memcpy(dest, row->item.ptr, row->item.size); - dest+=row->item.size; - break; - case UL_FILE_SPEC: - // tainted, untaint language: file [name] - escape(switch(*src) { - case ' ': to_char('_'); break; - encode(need_file_encode, '+'); - }); - break; - case UL_URI: - // tainted, untaint language: uri - escape(switch(*src) { - case ' ': to_char('+'); break; - encode(need_uri_encode, '%'); - }); - break; - case UL_HTTP_HEADER: - // tainted, untaint language: http-field-content-text - escape(switch(*src) { - case ' ': to_char('+'); break; - encode(need_uri_encode, '%'); - }); - break; - case UL_MAIL_HEADER: - // tainted, untaint language: mail-header - if(charset) { - // Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?= - const char *src=row->item.ptr; - bool to_quoted_printable=false; - for(int size=row->item.size; size--; src++) { - if(*src & 0x80) { - if(!to_quoted_printable) { - dest+=sprintf(dest, "=?%.15s?Q?", charset); - to_quoted_printable=true; - } - dest+=sprintf(dest, "=%02X", *src & 0xFF); - } else { - *dest++=*src; - } + if(!email && ( + !to_quoted_printable && (c & 0x80) // starting quote-printable-encoding on first 8bit char + || to_quoted_printable && !mail_header_char_valid_within_Qencoded(c) + )) { + if(!to_quoted_printable) { + to_string("=?"); + to_string(charset_name); + to_string("?Q?"); + to_quoted_printable=true; } - if(to_quoted_printable) // close - dest+=sprintf(dest, "?="); - } else { - memcpy(dest, row->item.ptr, row->item.size); - dest+=row->item.size; - } - break; - case UL_TABLE: - // tainted, untaint language: table - escape(switch(*src) { - case '\t': to_char(' '); break; - case '\n': to_char(' '); break; - _default; - }); - break; - case UL_SQL: - // tainted, untaint language: sql - if(connection) - dest+=connection->quote(dest, row->item.ptr, row->item.size); - else - throw Exception(0, 0, - this, - "untaint in SQL language failed - no connection specified"); - break; - case UL_JS: - escape(switch(*src) { - case '"': to_string("\\\"", 2); break; - case '\'': to_string("\\'", 2); break; - case '\n': to_string("\\n", 2); break; - case '\\': to_string("\\\\", 2); break; - case '\xFF': to_string("\\\xFF", 2); break; - _default; - }); - break; - case UL_XML: - escape(switch(*src) { - case '&': to_string("&", 5); break; - case '>': to_string(">", 4); break; - case '<': to_string("<", 4); break; - case '"': to_string(""", 6); break; - case '\'': to_string("'", 6); break; - _default; - }); - break; - case UL_HTML: - escape(switch(*src) { - case '&': to_string("&", 5); break; - case '>': to_string(">", 4); break; - case '<': to_string("<", 4); break; - case '"': to_string(""", 6); break; - _default; - }); - break; - case UL_USER_HTML: { - // tainted, untaint language: html-typo - if(!typo_dict) // never, always has default - throw Exception(0, 0, - this, - "untaint to user-html lang failed, no typo table"); - - char *html_for_typo= - (char *)malloc(row->item.size*2/* '\n' -> '\' 'n' */+1,16); - // note: - // there still is a possibility that user - // would not replace \n as she supposed to - // and rather replace \ and n into huge strings - // thus causing memory overrun - // this can be dealed by allocating *2 memory, but that's too expensive - size_t html_for_typo_size; - { // local dest - char *dest=html_for_typo; - escape(switch(*src) { - // convinient name for typo match "\n" - case '\n': - to_string("\\n", 2); - break; - _default; - }); - *dest=0; - html_for_typo_size=dest-html_for_typo; - } - // typo table replacements - const char *src=html_for_typo; - do { - // there is a row where first column starts 'src' - if(Table::Item *item=typo_dict->first_that_starts(src)) { - // get a=>b values - const String& a=*static_cast(item)->get_string(0); - const String& b=*static_cast(item)->get_string(1); - // overflow check: - // b allowed to be max UNTAINT_TIMES_BIGGER then a - if(b.size()>UNTAINT_TIMES_BIGGER*a.size()) { - pool().set_tag(0); // avoid recursion - throw Exception(0, 0, - &b, - "is %g times longer then '%s', " - "while maximum, handled by Parser, is %d", - ((double)b.size())/a.size(), - a.cstr(), - UNTAINT_TIMES_BIGGER); - } - - // skip 'a' in 'src' - src+=a.size(); - // write 'b' to 'dest' - b.store_to(dest); - // skip 'b' in 'dest' - dest+=b.size(); - } else - *dest++=*src++; - } while(*src); - break; - } - default: - throw Exception(0, 0, - this, - "unknown untaint language #%d of %d piece", - static_cast(row->item.lang), - i); // never - break; // never + encode(mail_header_nonspace_char, '=', '_'); + } else + to_char(c); + if(c=='>') + email=false; } + if(to_quoted_printable) // close + to_string("?="); + + } else + ec_append(info->result, optimize, whitespace, info->pos, fragment_length); + break; + case String::L_TABLE: + // tainted, untaint language: table + escape(switch(c) { + case '\t': to_char(' '); break; + case '\n': to_char(' '); break; + _default; + }); + break; +#ifdef PA_SQL + case String::L_SQL: + // tainted, untaint language: sql + if(info->connection) { + const char *fragment_str=info->body->mid(info->fragment_begin, fragment_length).cstr(); + // skip source [we use recoded version] + pa_CORD_pos_advance(info->pos, fragment_length); + + to_string(info->connection->quote(fragment_str, fragment_length)); + } else + throw Exception(0, + 0, + "untaint in SQL language failed - no connection specified"); + break; +#endif + case String::L_JS: + escape(switch(c) { + case '"': to_string("\\\""); break; + case '\'': to_string("\\'"); break; + case '\n': to_string("\\n"); break; + case '\\': to_string("\\\\"); break; + case '\xFF': to_string("\\\xFF"); break; + _default; + }); + break; + case String::L_XML: + escape(switch(c) { + case '&': to_string("&"); break; + case '>': to_string(">"); break; + case '<': to_string("<"); break; + case '"': to_string("""); break; + case '\'': to_string("'"); break; + _default; + }); + break; + case String::L_HTML: + escape(switch(c) { + case '&': to_string("&"); break; + case '>': to_string(">"); break; + case '<': to_string("<"); break; + case '"': to_string("""); break; + _default; + }); + break; + default: + assert(!"should never"); + SAPI::die("unknown untaint language #%d", + static_cast(to_lang)); // should never + break; // never + } - if((lang==UL_UNSPECIFIED?row->item.lang:lang)!=UL_CLEAN) - whitespace=false; + info->fragment_begin=fragment_end; - if(forigins_mode) - if(dest==dest_after_origins) // never moved==optimized space - dest=dest_before_origins; - else { - remove_crlf(dest_after_origins, dest); + return 0; // 0=continue +} - to_char('\n'); - } - } - chunk=row->link; - } while(chunk); -break2: - return dest; +String::Body String::cstr_to_string_body(Language lang, + SQL_Connection* connection, + const Request_charsets *charsets) const { + + Cstr_to_string_body_block_info info; + // input + info.lang=lang; + info.connection=connection; + info.charsets=charsets; + info.body=&body; + // output + CORD_ec_init(info.result); + // private + body.set_pos(info.pos, 0); + info.fragment_begin=0; + info.whitespace=true; + + if(!is_empty()) + langs.for_each(body, cstr_to_string_body_block, &info); + + return String::Body(CORD_ec_to_cord(info.result)); }