--- parser3/src/main/untaint.C	2001/03/20 06:45:19	1.10
+++ parser3/src/main/untaint.C	2001/07/09 16:51:54	1.54
@@ -1,61 +1,61 @@
 /** @file
 	Parser: String class part: untaint mechanizm.
 
-	Copyright (c) 2001 ArtLebedev Group (http://www.artlebedev.com)
+	Copyright(c) 2001 ArtLebedev Group(http://www.artlebedev.com)
 
-	Author: Alexander Petrosyan <paf@design.ru> (http://design.ru/paf)
-
-	$Id: untaint.C,v 1.10 2001/03/20 06:45:19 paf Exp $
+	Author: Alexander Petrosyan <paf@design.ru>(http://design.ru/paf)
 */
-
-#include <string.h>
+static const char *RCSId="$Id: untaint.C,v 1.54 2001/07/09 16:51:54 parser Exp $"; 
 
 #include "pa_pool.h"
 #include "pa_string.h"
 #include "pa_hash.h"
 #include "pa_exception.h"
+#include "pa_table.h"
+#include "pa_globals.h"
+#include "pa_sql_connection.h"
 
-#define escape(cases) \
+#define escape(action) \
 	{ \
-		const char *ptr=row->item.ptr; \
-		for (int size=row->item.size; size--; ptr++) \
-			switch(*ptr) { \
-				cases \
-			} \
+		const char *src=row->item.ptr; \
+		for(int size=row->item.size; size--; src++) \
+			action \
 	}
-#define escape_value(a, c)  case a: *copy_here++=c; break
-#define escape_default  default: *copy_here++=*ptr; break
-#define escape_subst(a, b, bsize)  \
-		case a: \
-			strncpy(copy_here, b, bsize); \
-			copy_here+=bsize; \
-		break
-#define escape_encode(need_encode_func, prefix)  \
+#define _default  default: *dest++=*src; break
+#define encode(need_encode_func, prefix)  \
 		default: \
-			if(need_encode_func(*ptr)) { \
+			if(need_encode_func(*src)) { \
 				static const char *hex="0123456789ABCDEF"; \
 				char chunk[3]={prefix}; \
-				chunk[1]=hex[((unsigned char)*ptr)/0x10]; \
-				chunk[2]=hex[((unsigned char)*ptr)%0x10]; \
-				strncpy(copy_here, chunk, 3);  copy_here+=3; \
+				chunk[1]=hex[((unsigned char)*src)/0x10]; \
+				chunk[2]=hex[((unsigned char)*src)%0x10]; \
+				strncpy(dest, chunk, 3);  dest+=3; \
 			} else \
-				*copy_here++=*ptr; \
+				*dest++=*src; \
 			break
+#define to_char(c)  *dest++=c
+#define to_string(b, bsize)  \
+		strncpy(dest, b, bsize); \
+		dest+=bsize; \
 
 inline bool need_file_encode(unsigned char c){
-    if ((c>='0') && (c<='9') || (c>='A') && (c<='Z') || (c>='a') && (c<='z')) 
+    if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) 
 		return false;
 
-    return !strchr("./\\", c);
+    return !strchr(
+#ifdef WIN32
+		":\\~"
+#endif
+		"./()_-", c);
 }
 inline bool need_uri_encode(unsigned char c){
-    if ((c>='0') && (c<='9') || (c>='A') && (c<='Z') || (c>='a') && (c<='z')) 
+    if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) 
 		return false;
 
     return !strchr("_-./", c);
 }
-inline bool need_header_encode(unsigned char c){
-    if(strchr(" ,:", c))
+inline bool need_http_header_encode(unsigned char c){
+    if(strchr(" , :", c))
 		return false;
 
 	return need_uri_encode(c);
@@ -63,111 +63,287 @@ inline bool need_header_encode(unsigned
 
 // String
 
-/// @todo optimize whitespaces for all but 'html'
-char *String::cstr() const {
-	char *result=(char *)malloc(size()*UNTAINT_TIMES_BIGGER+1);
+static bool typo_present(Array::Item *value, const void *info) {
+	Array *row=static_cast<Array *>(value);
+	const char *src=static_cast<const char *>(info);
+
+	int partial;
+	row->get_string(0)->cmp(partial, src);
+	return 
+		partial==0 || // full match
+		partial==1; // typo left column starts 'src'
+}
+
+/*
+
+HTTP-header    = field-name ":" [ field-value ] CRLF
+
+       field-name     = token
+       field-value    = *( field-content | LWS )
+
+       field-content  = <the OCTETs making up the field-value
+                        and consisting of either *TEXT or combinations
+                        of token, tspecials, and quoted-string>
+
+
+
+word           = token | quoted-string
+
+token          = 1*<any CHAR except CTLs or tspecials>
+
+
+
+tspecials      = "(" | ")" | "<" | ">" | "@"
+                      | "," | ";" | ":" | "\" | <">
+                      | "/" | "[" | "]" | "?" | "="
+                      | "{" | "}" | SP | HT
+
+SP             = <US-ASCII SP, space (32)>
+HT             = <US-ASCII HT, horizontal-tab (9)>
+
+LWS            = [CRLF] 1*( SP | HT )
+TEXT           = <any OCTET except CTLs,
+                        but including LWS>
 
-	char *copy_here=result;
+quoted-pair    = "\" CHAR
+
+  if(strchr("()<>@,;:\\\"/[]?={} \t", *ptr))
+*/
+inline bool need_quote_http_header(const char *ptr, size_t size) {
+	for(; size--; ptr++)
+		if(strchr(";\\\"= \t" /* excluded ()<>@, :/ ? []{} */, *ptr))
+			return true;
+	return false;
+}
+
+/// @todo maybe an additional check "are all pieces clean?" would be profitable?
+size_t String::cstr_bufsize(Untaint_lang lang) const {
+	return (lang==UL_AS_IS?size():size()*UNTAINT_TIMES_BIGGER) +1;
+}
+
+/** @todo fix theoretical \n mem overrun in TYPO replacements
+@todo rename base_64 to quoted_printable [invalid name now]
+*/
+char *String::store_to(char *dest, Untaint_lang lang, 
+					   SQL_Connection *connection,
+					   const char *charset) const {
+	// $MAIN:html-typo table
+	Table *user_typo_table=static_cast<Table *>(pool().tag());
+	Table *typo_table=user_typo_table?user_typo_table:default_typo_table;
+
+	bool whitespace=true;
 	const Chunk *chunk=&head; 
 	do {
 		const Chunk::Row *row=chunk->rows;
-		for(int i=0; i<chunk->count; i++) {
+		for(size_t i=0; i<chunk->count; i++, row++) {
 			if(row==append_here)
 				goto break2;
 
 			// WARNING:
 			//	string can grow only UNTAINT_TIMES_BIGGER
-			switch(row->item.lang) {
-			case NO:
+			switch(lang==UL_UNSPECIFIED?row->item.lang:lang) {
+			case UL_CLEAN:
 				// clean piece
-			case YES:
+				{ // optimizing whitespace
+					const char *src=row->item.ptr; 
+					for(int size=row->item.size; size--; src++)
+						switch(*src) {
+						case ' ': case '\n': case '\r': case '\t':
+							if(!whitespace) {
+								*dest++=*src;
+								whitespace=true;
+							}
+							break;
+						default:
+							whitespace=false;
+							*dest++=*src;
+							break;
+						}
+				}
+				break;
+			case UL_TAINTED:
 				// tainted piece, but undefined untaint language
-				// for VString.get_double of tainted values
+				// for VString.as_double of tainted values
 				// for ^process{body} evaluation
-			case AS_IS:
+			case UL_AS_IS:
 				// tainted, untaint language: as-is
-				memcpy(copy_here, row->item.ptr, row->item.size); 
-				copy_here+=row->item.size;
+				memcpy(dest, row->item.ptr, row->item.size); 
+				dest+=row->item.size;
 				break;
-			case FILE_NAME:
+			case UL_FILE_NAME:
 				// tainted, untaint language: file [name]
-				escape(
-					escape_value(' ', '_');
-					escape_encode(need_file_encode, '-');
-				);
+				escape(switch(*src) {
+					case ' ': to_char('_');  break;
+					encode(need_file_encode, '+');
+				});
 				break;
-			case URI:
+			case UL_URI:
 				// tainted, untaint language: uri
-				escape(
-					escape_value(' ', '+');
-					escape_encode(need_uri_encode, '%');
-				);
-				break;
-			case HEADER:
-				// tainted, untaint language: header
-				escape(
-					escape_encode(need_header_encode, '%');
-				);
-				break;
-			case TABLE: 
-				escape(
-					escape_value('\t', ' ');
-					escape_value('\n', ' ');
-					escape_default;
-				);
+				escape(switch(*src) {
+					case ' ': to_char('+');  break;
+					encode(need_uri_encode, '%');
+				});
+				break;
+			case UL_HTTP_HEADER:
+				// tainted, untaint language: http-header
+				if(need_quote_http_header(row->item.ptr, row->item.size)) {
+					*dest++='\"';
+					escape(switch(*src) {
+						case '\"': to_string("\\\"", 2);  break;
+						_default;
+					});
+					*dest++='\"';
+				} else {
+					memcpy(dest, row->item.ptr, row->item.size); 
+					dest+=row->item.size;
+				}
+				break;
+			case UL_MAIL_HEADER:
+				// tainted, untaint language: mail-header
+				if(charset) {
+					// Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?=
+					const char *src=row->item.ptr; 
+					bool to_base_64=false;
+					for(int size=row->item.size; size--; src++) {
+						if(*src & 0x80) {
+							if(!to_base_64) {
+								dest+=sprintf(dest, "=?%.15s?Q?", charset);
+								to_base_64=true;
+							}
+							dest+=sprintf(dest, "=%02X", *src & 0xFF);
+						} else {
+							*dest++=*src;						
+						}
+					}
+					if(to_base_64) // close
+						dest+=sprintf(dest, "?=");
+				} else {
+					memcpy(dest, row->item.ptr, row->item.size); 
+					dest+=row->item.size;
+				}
+				break;
+			case UL_TABLE: 
+				// tainted, untaint language: table
+				escape(switch(*src) {
+					case '\t': to_char(' ');  break;
+					case '\n': to_char(' ');  break;
+					_default;
+				});
 				break;
-			case SQL:
+			case UL_SQL:
 				// tainted, untaint language: sql
-				// TODO: зависимость от sql сервера
-				memset(copy_here, '?', row->item.size); 
-				copy_here+=row->item.size;
-				break;
-			case JS:
-				escape(
-					escape_subst('"', "\\\"", 2);
-					escape_subst('\'', "\\'", 2);
-					escape_subst('\n', "\\n", 2);
-					escape_subst('\r', "\\r", 2);
-					escape_subst('\\', "\\\\", 2);
-					escape_subst('я', "\\я", 2);
-					escape_default;
-				);
-				break;
-			case HTML:
-				escape(
-					escape_subst('&', "&amp;", 5); // BEFORE consequent relpaces yelding '&'
-					escape_subst('>', "&gt;", 4);
-					escape_subst('<', "&lt;",4);
-					escape_subst('"', "&quot;",6);
-					escape_value('\t', ' ');
-					//TODO: XSLT escape_subst('\'', "&apos;", 6)
-					escape_default;
-				);
+				if(connection)
+					dest+=connection->quote(dest, row->item.ptr, row->item.size);
+				else
+					THROW(0, 0,
+						this,
+						"untaint in SQL language failed - no connection specified");
 				break;
-			case HTML_TYPO: 
+			case UL_JS:
+				escape(switch(*src) {
+					case '"': to_string("\\\"", 2);  break;
+					case '\'': to_string("\\'", 2);  break;
+					case '\n': to_string("\\n", 2);  break;
+					case '\\': to_string("\\\\", 2);  break;
+					case '\xFF': to_string("\\\xFF", 2);  break;
+					_default;
+				});
+				break;
+			case UL_HTML:
+				escape(switch(*src) {
+					case '&': to_string("&amp;", 5);  break;
+					case '>': to_string("&gt;", 4);  break;
+					case '<': to_string("&lt;", 4);  break;
+					case '"': to_string("&quot;", 6);  break;
+					//TODO: XSLT case '\'': to_string("&apos;", 6);  break;
+					_default;
+				});
+				break;
+			case UL_USER_HTML: {
 				// tainted, untaint language: html-typo
-				escape(
-					escape_subst('&', "&amp;", 5); // BEFORE consequent relpaces yelding '&'
-					escape_subst('>', "&gt;", 4);
-					escape_subst('<', "&lt;",4);
-					escape_subst('"', "&quot;",6);
-					escape_value('\t', ' ');
-					//TODO: $MAIN:html-type table replace, max length(b)==UNTAINT_TIMES_BIGGER*length(a)
-					escape_default;
-				);
+				char *html_for_typo=
+					(char *)malloc(row->item.size*2/* '\n' -> '\' 'n' */+1);
+				// note:
+				//   there is still a possibility that the user
+				//   would not replace \n as she is supposed to
+				//   and would rather replace \ and n with huge strings,
+				//   thus causing a memory overrun;
+				//   this could be dealt with by allocating *2 memory, but that's too expensive
+				size_t html_for_typo_size;
+				{ // local dest
+					char *dest=html_for_typo;
+					escape(switch(*src) {
+						// convenient name for typo match "\n"
+						case '\r': 
+							if(typo_table) {
+								*dest++='\\';  *dest++='n'; // \r -> \n
+								if(src[1]=='\n') { // \r\n -> remove \n
+									size--; src++;
+								}
+							}
+							break;
+						case '\n': 
+							if(typo_table)
+								to_string("\\n", 2);
+							break;
+						//TODO: XSLT case '\'': to_string("&apos;", 6);  break;
+						_default;
+					});
+					*dest=0;
+					html_for_typo_size=dest-html_for_typo;
+				}
+				// typo table replacements
+				const char *src=html_for_typo;
+				do {
+					// there is a row where first column starts 'src'
+					if(Table::Item *item=typo_table->first_that(typo_present, src)) {
+						// get a=>b values
+						const String& a=*static_cast<Array *>(item)->get_string(0);
+						const String& b=*static_cast<Array *>(item)->get_string(1);
+						// empty 'a' | 'b' checks
+						if(a.size()==0 || b.size()==0) {
+							pool().set_tag(default_typo_table); // avoid recursion
+							THROW(0, 0, 
+								typo_table->origin_string(), 
+								"typo table column elements must not be empty");
+						}
+						// overflow check:
+						//   b is allowed to be at most UNTAINT_TIMES_BIGGER times longer than a
+						if(b.size()>UNTAINT_TIMES_BIGGER*a.size()) {
+							pool().set_tag(default_typo_table); // avoid recursion
+							THROW(0, 0, 
+								&b, 
+								"is %g times longer then '%s', "
+								"while maximum, handled by Parser, is %d", 
+									((double)b.size())/a.size(), 
+									a.cstr(), 
+									UNTAINT_TIMES_BIGGER);
+						}
+						
+						// skip 'a' in 'src'
+						src+=a.size();
+						// write 'b' to 'dest'
+						b.store_to(dest);
+						dest+=b.size();
+					} else
+						*dest++=*src++;
+				} while(*src);
 				break;
+				}
 			default:
-				THROW(0,0,
-					this,
+				THROW(0, 0, 
+					this, 
 					"unknown untaint language #%d of %d piece", 
-						static_cast<int>(row->item.lang),
-						i);
+						static_cast<int>(row->item.lang), 
+						i); // never
+				break; // never
 			}
-			row++;
+
+			if((lang==UL_UNSPECIFIED?row->item.lang:lang)!=UL_CLEAN)
+				whitespace=false;
 		}
 		chunk=row->link;
 	} while(chunk);
 break2:
-	*copy_here=0;
-	return result;
+	return dest;
 }