--- parser3/src/main/untaint.C	2001/04/05 13:19:43	1.34
+++ parser3/src/main/untaint.C	2001/11/16 13:51:14	1.76
@@ -2,14 +2,11 @@
 	Parser: String class part: untaint mechanizm.
 
 	Copyright(c) 2001 ArtLebedev Group(http://www.artlebedev.com)
+	Author: Alexander Petrosyan <paf@design.ru>(http://paf.design.ru)
 
-	Author: Alexander Petrosyan <paf@design.ru>(http://design.ru/paf)
-
-	$Id: untaint.C,v 1.34 2001/04/05 13:19:43 paf Exp $
+	$Id: untaint.C,v 1.76 2001/11/16 13:51:14 paf Exp $
 */
 
-#include "pa_config_includes.h"
-
 #include "pa_pool.h"
 #include "pa_string.h"
 #include "pa_hash.h"
@@ -17,6 +14,8 @@
 #include "pa_table.h"
 #include "pa_globals.h"
 #include "pa_sql_connection.h"
+#include "pa_dictionary.h"
+#include "pa_common.h"
 
 #define escape(action) \
 	{ \
@@ -32,13 +31,13 @@
 				char chunk[3]={prefix}; \
 				chunk[1]=hex[((unsigned char)*src)/0x10]; \
 				chunk[2]=hex[((unsigned char)*src)%0x10]; \
-				strncpy(dest, chunk, 3);  dest+=3; \
+				memcpy(dest, chunk, 3);  dest+=3; \
 			} else \
 				*dest++=*src; \
 			break
 #define to_char(c)  *dest++=c
 #define to_string(b, bsize)  \
-		strncpy(dest, b, bsize); \
+		memcpy(dest, b, bsize); \
 		dest+=bsize; \
 
 inline bool need_file_encode(unsigned char c){
@@ -47,9 +46,9 @@ inline bool need_file_encode(unsigned ch
 
     return !strchr(
 #ifdef WIN32
-		":\\"
+		":\\~"
 #endif
-		"./", c);
+		"./()_-", c);
 }
 inline bool need_uri_encode(unsigned char c){
     if((c>='0') &&(c<='9') ||(c>='A') &&(c<='Z') ||(c>='a') &&(c<='z')) 
@@ -57,47 +56,228 @@ inline bool need_uri_encode(unsigned cha
 
     return !strchr("_-./", c);
 }
-inline bool need_header_encode(unsigned char c){
+inline bool need_http_header_encode(unsigned char c){ // false for ' ', ',', ':' (and for 0: strchr also matches the terminator); anything else is decided by need_uri_encode
     if(strchr(" , :", c))
 		return false;
 
 	return need_uri_encode(c);
 }
 
+// Short names of the untaint languages; cstr_debug_origins indexes this table with row->item.lang.
+
+static const char * String_Untaint_lang_name[]={
+	"U", ///< zero value, handy for hash lookup @see untaint_lang_name2enum
+	"C", ///< clean
+	"T",  ///< tainted; the untaint language is assigned later
+	// untaint languages, assigned by ^untaint[lang]{...}
+	"P",
+		/**<
+			keep the language already built into the string being appended.
+			just a flag; that value is not stored
+		*/
+	"A",     ///< leave all characters intact
+	"F", ///< file specification
+	"H",    ///< text in HTTP response header
+	"M",    ///< text in mail header
+	"URI",       ///< text in uri
+	"T",     ///< ^table:set body; NOTE(review): same name as the tainted entry above -- confirm the clash is intended
+	"SQL",       ///< ^table:sql body
+	"JS",        ///< JavaScript code
+	"XML",		///< ^dom:set xml
+	"HTML",      ///< HTML code (for editing)
+	"UHTML", ///< HTML code with USER chars
+};
+
+
 // String
 
-static bool typo_present(Array::Item *value, const void *info) {
-	Array *row=static_cast<Array *>(value);
-	const char *src=static_cast<const char *>(info);
-
-	int partial;
-	row->get_string(0)->cmp(partial, src);
-	return 
-		partial==0 || // full match
-		partial==1; // typo left column starts 'src'
-}
+/*
+
+HTTP-header    = field-name ":" [ field-value ] CRLF
+
+       field-name     = token
+       field-value    = *( field-content | LWS )
+
+       field-content  = <the OCTETs making up the field-value
+                        and consisting of either *TEXT or combinations
+                        of token, tspecials, and quoted-string>
+
+
+
+word           = token | quoted-string
+
+token          = 1*<any CHAR except CTLs or tspecials>
+
+
+
+tspecials      = "(" | ")" | "<" | ">" | "@"
+                      | "," | ";" | ":" | "\" | <">
+                      | "/" | "[" | "]" | "?" | "="
+                      | "{" | "}" | SP | HT
+
+SP             = <US-ASCII SP, space (32)>
+HT             = <US-ASCII HT, horizontal-tab (9)>
 
-/**
-	@test optimize whitespaces for all but 'html'
-	@todo fix theoretical \n mem overrun in TYPO replacements
+LWS            = [CRLF] 1*( SP | HT )
+TEXT           = <any OCTET except CTLs,
+                        but including LWS>
+
+quoted-pair    = "\" CHAR
+
+  if(strchr("()<>@,;:\\\"/[]?={} \t", *ptr))
 */
-char *String::store_to(char *dest, Untaint_lang lang, SQL_Connection *connection) const {
-	// $MAIN:html-typo table
-	Table *user_typo_table=static_cast<Table *>(pool().tag());
-	Table *typo_table=user_typo_table?user_typo_table:default_typo_table;
+inline bool need_quote_http_header(const char *ptr, size_t size) { // true if any of the 'size' bytes at ptr is ; \ " = space or tab
+	for(; size--; ptr++)
+		if(strchr(";\\\"= \t" /* tspecials left out of the check: ()<>@, :/ ? []{} */, *ptr)) // a 0 byte matches too: strchr finds the terminator
+			return true;
+	return false;
+}
 
+/// Sums a per-piece size estimate for untainting into @a lang (worst case for the encoding languages); meant to be kept in step with store_to(). @test UL_OPTIMIZED_HTML optimize
+size_t String::cstr_bufsize(Untaint_lang lang,
+							SQL_Connection *connection,
+							const char *charset) const {
+	size_t dest=1; // byte counter, not a pointer; starts at 1 -- presumably room for the terminating 0, confirm with callers
+	bool whitespace=true; // true while the last counted char was whitespace; starting true drops leading whitespace of clean pieces
 	const Chunk *chunk=&head; 
 	do {
 		const Chunk::Row *row=chunk->rows;
-		for(size_t i=0; i<chunk->count; i++, row++) {
+		for(uint i=0; i<chunk->count; i++, row++) {
 			if(row==append_here)
 				goto break2;
 
-			// WARNING:
-			//	string can grow only UNTAINT_TIMES_BIGGER
-			switch(lang==UL_UNKNOWN?row->item.lang:lang) {
+			Untaint_lang to_lang=lang==UL_UNSPECIFIED?(Untaint_lang)row->item.lang:lang; // UL_UNSPECIFIED: use each piece's own language
+
+			switch(to_lang) {
 			case UL_CLEAN:
 				// clean piece
+				{ // optimizing whitespace: a run of ' ', '\n', '\t' is counted as one char
+					escape(switch(*src) {
+						case ' ': case '\n': case '\t':
+							if(!whitespace) {
+								dest++;
+								whitespace=true;
+							}
+							break;
+						default:
+							whitespace=false;
+							dest++;
+							break;
+					});
+				}
+				break;
+			case UL_TAINTED:
+				// tainted piece, but undefined untaint language
+				// for VString.as_double of tainted values
+				// for ^process{body} evaluation
+			case UL_AS_IS:
+				// tainted, untaint language: as-is
+				dest+=row->item.size;
+				break;
+			case UL_FILE_SPEC:
+				// tainted, untaint language: file [name]
+				dest+=row->item.size*3/* worst: Z->%XX */;
+				break;
+			case UL_URI:
+				// tainted, untaint language: uri
+				dest+=row->item.size*3/* worst: Z->%XX */;
+				break;
+			case UL_HTTP_HEADER:
+				// tainted, untaint language: http-field-content-text
+				dest+=row->item.size*3/* worst: Z->%XX */;
+				break;
+			case UL_MAIL_HEADER:
+				// tainted, untaint language: mail-header
+				if(charset) {
+					// Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?=
+					dest+=row->item.size*3+MAX_STRING/* worst: =?charset?Q?=%XX?= */;
+				} else {
+					dest+=row->item.size;
+				}
+				break;
+			case UL_TABLE: 
+				// tainted, untaint language: table
+				dest+=row->item.size;
+				break;
+			case UL_SQL:
+				// tainted, untaint language: sql
+				if(connection)
+					dest+=connection->quote(0, row->item.ptr, row->item.size); // null destination: presumably quote() only reports the needed length -- confirm
+				break;
+			case UL_JS:
+				escape(switch(*src) {
+					case '"': case '\'': case '\n': case '\\': case '\xFF':
+						dest+=2;  break;
+					default: 
+						dest++;  break;
+				});
+				break;
+			case UL_XML:
+				escape(switch(*src) {
+					case '&': case '>': case '<': case '"': case '\'': 
+						dest+= 6;  break;
+					default: 
+						dest++;  break;
+				});
+				break;
+			case UL_HTML:
+			case UL_OPTIMIZED_HTML:
+				escape(switch(*src) {
+					case '&': 
+					case '>': 
+					case '<': 
+					case '"': 
+						dest+=6;  break;
+					default: 
+						dest++;  break;
+				});
+				break;
+			}
+
+			if((lang==UL_UNSPECIFIED?row->item.lang:lang)!=UL_CLEAN)
+				whitespace=false; // any non-clean piece ends the current whitespace run
+		}
+		chunk=row->link;
+	} while(chunk);
+
+break2:
+	return dest;
+}
+
+/// @test UL_OPTIMIZED_HTML optimize
+char *String::store_to(char *dest, Untaint_lang lang, 
+					   SQL_Connection *connection,
+					   const char *charset) const {
+	// WARNING:
+	//	 before any changes check cstr_bufsize first!!!
+	bool whitespace=true;
+	const Chunk *chunk=&head; 
+	do {
+		const Chunk::Row *row=chunk->rows;
+		for(uint i=0; i<chunk->count; i++, row++) {
+			if(row==append_here)
+				goto break2;
+
+			Untaint_lang to_lang=lang==UL_UNSPECIFIED?(Untaint_lang)row->item.lang:lang;
+
+			switch(to_lang) {
+			case UL_CLEAN:
+				// clean piece
+				{ // optimizing whitespace
+					escape(switch(*src) {
+						case ' ': case '\n': case '\t':
+							if(!whitespace) {
+								*dest++=*src;
+								whitespace=true;
+							}
+							break;
+						default:
+							whitespace=false;
+							*dest++=*src;
+							break;
+					});
+				}
+				break;
 			case UL_TAINTED:
 				// tainted piece, but undefined untaint language
 				// for VString.as_double of tainted values
@@ -107,11 +287,11 @@ char *String::store_to(char *dest, Untai
 				memcpy(dest, row->item.ptr, row->item.size); 
 				dest+=row->item.size;
 				break;
-			case UL_FILE_NAME:
+			case UL_FILE_SPEC:
 				// tainted, untaint language: file [name]
 				escape(switch(*src) {
 					case ' ': to_char('_');  break;
-					encode(need_file_encode, '-');
+					encode(need_file_encode, '+');
 				});
 				break;
 			case UL_URI:
@@ -121,12 +301,37 @@ char *String::store_to(char *dest, Untai
 					encode(need_uri_encode, '%');
 				});
 				break;
-			case UL_HEADER:
-				// tainted, untaint language: header
+			case UL_HTTP_HEADER:
+				// tainted, untaint language: http-field-content-text
 				escape(switch(*src) {
-					encode(need_header_encode, '%');
+					case ' ': to_char('+');  break;
+					encode(need_uri_encode, '%');
 				});
 				break;
+			case UL_MAIL_HEADER:
+				// tainted, untaint language: mail-header
+				if(charset) {
+					// Subject: Re: parser3: =?koi8-r?Q?=D3=C5=CD=C9=CE=C1=D2?=
+					const char *src=row->item.ptr; 
+					bool to_quoted_printable=false;
+					for(int size=row->item.size; size--; src++) {
+						if(*src & 0x80) {
+							if(!to_quoted_printable) {
+								dest+=sprintf(dest, "=?%.15s?Q?", charset);
+								to_quoted_printable=true;
+							}
+							dest+=sprintf(dest, "=%02X", *src & 0xFF);
+						} else {
+							*dest++=*src;						
+						}
+					}
+					if(to_quoted_printable) // close
+						dest+=sprintf(dest, "?=");
+				} else {
+					memcpy(dest, row->item.ptr, row->item.size); 
+					dest+=row->item.size;
+				}
+				break;
 			case UL_TABLE: 
 				// tainted, untaint language: table
 				escape(switch(*src) {
@@ -140,7 +345,7 @@ char *String::store_to(char *dest, Untai
 				if(connection)
 					dest+=connection->quote(dest, row->item.ptr, row->item.size);
 				else
-					THROW(0, 0,
+					throw Exception(0, 0,
 						this,
 						"untaint in SQL language failed - no connection specified");
 				break;
@@ -154,96 +359,78 @@ char *String::store_to(char *dest, Untai
 					_default;
 				});
 				break;
-			case UL_HTML:
+			case UL_XML:
 				escape(switch(*src) {
 					case '&': to_string("&amp;", 5);  break;
 					case '>': to_string("&gt;", 4);  break;
 					case '<': to_string("&lt;", 4);  break;
 					case '"': to_string("&quot;", 6);  break;
-					//TODO: XSLT case '\'': to_string("&apos;", 6);  break;
+					case '\'': to_string("&apos;", 6);  break;
 					_default;
 				});
 				break;
-			case UL_HTML_TYPO: {
-				// tainted, untaint language: html-typo
-				char *html_for_typo=(char *)malloc(size()*2/* '\n' -> '\' 'n' */+1);
-				// note:
-				//   there still is a possibility that user 
-				//   would not replace \n as she supposed to
-				//   and rather replace \ and n into huge strings
-				//   thus causing memory overrun
-				//   this can be dealed by allocating *2 memory, but that's too expensive
-				size_t html_for_typo_size;
-				{ // local dest
-					char *dest=html_for_typo;
-					escape(switch(*src) {
-						// convinient name for typo match "\n"
-						case '\r': 
-							if(typo_table) {
-								*dest++='\\';  *dest++='n'; // \r -> \n
-								if(src[1]=='\n') { // \r\n -> remove \n
-									size--; src++;
-								}
-							}
-							break;
-						case '\n': 
-							if(typo_table)
-								to_string("\\n", 2);
-							break;
-						//TODO: XSLT case '\'': to_string("&apos;", 6);  break;
-						_default;
-					});
-					*dest=0;
-					html_for_typo_size=dest-html_for_typo;
-				}
-				// typo table replacements
-				const char *src=html_for_typo;
-				do {
-					// there is a row where first column starts 'src'
-					if(Table::Item *item=typo_table->first_that(typo_present, src)) {
-						// get a=>b values
-						const String& a=*static_cast<Array *>(item)->get_string(0);
-						const String& b=*static_cast<Array *>(item)->get_string(1);
-						// empty 'a' | 'b' checks
-						if(a.size()==0 || b.size()==0) {
-							pool().set_tag(default_typo_table); // avoid recursion
-							THROW(0, 0, 
-								typo_table->origin_string(), 
-								"typo table column elements must not be empty");
-						}
-						// overflow check:
-						//   b allowed to be max UNTAINT_TIMES_BIGGER then a
-						if(b.size()>UNTAINT_TIMES_BIGGER*a.size()) {
-							pool().set_tag(default_typo_table); // avoid recursion
-							THROW(0, 0, 
-								&b, 
-								"is %g times longer then '%s', "
-								"while maximum, handled by Parser, is %d", 
-								((double)b.size())/a.size(), 
-								a.cstr(), 
-								UNTAINT_TIMES_BIGGER);
-						}
-						
-						// skip 'a' in 'src'
-						src+=a.size();
-						// write 'b' to 'dest'
-						b.store_to(dest);
-						dest+=b.size();
-					} else
-						*dest++=*src++;
-				} while(*src);
+			case UL_HTML:
+			case UL_OPTIMIZED_HTML:
+				escape(switch(*src) {
+					case '&': to_string("&amp;", 5);  break;
+					case '>': to_string("&gt;", 4);  break;
+					case '<': to_string("&lt;", 4);  break;
+					case '"': to_string("&quot;", 6);  break;
+					_default;
+				});
 				break;
-				}
 			default:
-				THROW(0, 0, 
+				throw Exception(0, 0, 
 					this, 
 					"unknown untaint language #%d of %d piece", 
 						static_cast<int>(row->item.lang), 
-						i);
+						i); // never
+				break; // never
 			}
+
+			if((lang==UL_UNSPECIFIED?row->item.lang:lang)!=UL_CLEAN)
+				whitespace=false;
 		}
 		chunk=row->link;
 	} while(chunk);
+
 break2:
 	return dest;
 }
+
+char *String::cstr_debug_origins() const { // debug dump of every piece; returns a 0-terminated buffer obtained from malloc()
+	char *result=(char *)malloc(size()+used_rows()*MAX_STRING*2+1/* terminating 0: needed when there are no rows at all */);
+	char *dest=result;
+	// one output line per piece: [origin]#<language name>: <piece text>\n
+	const Chunk *chunk=&head; 
+	do {
+		const Chunk::Row *row=chunk->rows;
+		for(uint i=0; i<chunk->count; i++, row++) {
+			if(row==append_here)
+				goto break2;
+
+#ifndef NO_STRING_ORIGIN
+			if(row->item.origin.file)
+				dest+=sprintf(dest, ORIGIN_FILE_LINE_FORMAT,
+					row->item.origin.file,
+					1+row->item.origin.line);
+			else
+				dest+=sprintf(dest, "<unknown>");
+#endif
+			dest+=sprintf(dest, "#%s: ",
+				String_Untaint_lang_name[row->item.lang]);
+			char *dest_after_origins=dest;
+			// raw piece bytes, no untainting applied
+			memcpy(dest, row->item.ptr, row->item.size); 
+			dest+=row->item.size;
+			// presumably flattens CR/LF inside the copied piece so it stays on one line -- confirm in pa_common
+			remove_crlf(dest_after_origins, dest);
+			to_char('\n');
+		}
+		chunk=row->link;
+	} while(chunk);
+
+break2:
+	*dest=0;
+	return result;
+}