--- parser3/src/main/pa_http.C	2009/07/15 13:00:24	1.31
+++ parser3/src/main/pa_http.C	2011/02/18 06:03:53	1.52
@@ -5,7 +5,7 @@
 	Author: Alexandr Petrosian <paf@design.ru> (http://paf.design.ru)
  */
 
-static const char * const IDENT_HTTP_C="$Date: 2009/07/15 13:00:24 $"; 
+static const char * const IDENT_HTTP_C="$Date: 2011/02/18 06:03:53 $"; 
 
 #include "pa_http.h"
 #include "pa_common.h"
@@ -66,7 +66,7 @@ size_t guess_content_length(char* buf) {
 	char* ptr;
 	if((ptr=strstr(buf, "Content-Length:"))) // Apache
 		goto found;
-	if((ptr=strstr(buf, "content-length:"))) // Parser 3
+	if((ptr=strstr(buf, "content-length:"))) // Parser 3 before 3.4.0
 		goto found;
 	if((ptr=strstr(buf, "Content-length:"))) // maybe 1
 		goto found;
@@ -75,7 +75,7 @@ size_t guess_content_length(char* buf) {
 	return 0;
 found:
 	char *error_pos;
-	size_t result=(size_t)strtol(ptr+15/*strlen("CONTENT-LENGTH:")*/, &error_pos, 0);
+	size_t result=(size_t)strtol(ptr+15/*strlen("Content-Length:")*/, &error_pos, 0);
 	
 	const size_t reasonable_initial_max=0x400*0x400*10 /*10M*/;
 	if(result>reasonable_initial_max) // sanity check
@@ -85,8 +85,8 @@ found:
 
 static int http_read_response(char*& response, size_t& response_size, int sock, bool fail_on_status_ne_200) {
 	int result=0;
-	// fetching some to local buffer, guessing on possible content-length	
-	response_size=0x400*20; // initial size if content-length could not be determined	
+	// fetching some to local buffer, guessing on possible Content-Length
+	response_size=0x400*20; // initial size if Content-Length could not be determined	
 	const size_t preview_size=0x400*20;
 	char preview_buf[preview_size+1/*terminator*/];  // 20K buffer to preview headers
 	ssize_t received_size=recv(sock, preview_buf, preview_size, 0); 
@@ -135,9 +135,9 @@ static int http_read_response(char*& res
 		// we use terminator byte for two purposes here:
 		// 1. we return there zero always, not knowing: maybe they would want to create String form $file.body?
 		//     invariant: all Strings should have zero-terminated buffers
-		// 2. we use that out-of-size byte to detect if our content-length guess was wrong
+		// 2. we use that out-of-size byte to detect if our Content-Length guess was wrong
 		//    when recv gets more than we expected
-		//    a) we know that the content-length guess was wrong
+		//    a) we know that the Content-Length guess was wrong
 		//    b) we have space to put the first byte of extra data
 		//    c) we use less code to detect normal situation: on last while-cycle recv expected to just return 0
 		while(true) {
@@ -194,29 +194,6 @@ static void timeout_handler(int /*sig*/)
 }
 #endif
 
-static size_t file_untaint(const char* str, size_t len) {
-	// untaint file from L_FILE_POST encoding
-	char* j=(char *)str;
-	const char* end=str+len-1;
-	for(const char* i=str; i<=end; i++, j++){
-		if(*i=='\\' && i!=end){
-			switch(*(i+1)){
-				case '0':
-					*j='\0';
-					i++;
-					continue;
-				case '\\':
-					*j='\\';
-					i++;
-					continue;
-			}
-		}
-		if(i!=j)
-			*j=*i;
-	}
-	return j-str; // new length
-} 
-
 static int http_request(char*& response, size_t& response_size,
 			const char* host, short port, 
 			const char* request, size_t request_size,
@@ -313,25 +290,44 @@ static int http_request(char*& response,
 struct Http_pass_header_info {
 	Request_charsets* charsets;
 	String* request;
-	bool user_agent_specified;
-	bool content_type_specified;
+	bool* user_agent_specified; // out-flag: http_pass_header sets *flag=true when a User-Agent header is passed
+	bool* content_type_specified; // out-flag: http_pass_header sets *flag=true when a Content-Type header is passed
+	bool* content_type_url_encoded; // out-flag: result of the form-urlencoded prefix test on the passed Content-Type value
 };
 #endif
-static void http_pass_header(HashStringValue::key_type name, 
-				HashStringValue::value_type value, 
+
+char *pa_http_safe_header_name(const char *name) { // returns a pa_strdup'ed copy of name limited to letters, digits, '-' and '_'
+	char *result=pa_strdup(name);
+	char *n=result;
+	if(*n && !pa_isalpha((unsigned char)*n)) // empty name: leave the terminator alone, otherwise the scan below would run past the buffer
+		*n++ = '_'; // a non-letter first character is replaced
+	for(; *n; ++n) {
+		if (!pa_isalnum((unsigned char)*n) && *n != '-' && *n != '_')
+			*n = '_'; // every other disallowed character is replaced in place, the length never changes
+	}
+	return result;
+}
+
+static void http_pass_header(HashStringValue::key_type aname, // hash for_each callback: appends one "Name: value" CRLF line to *info->request
+				HashStringValue::value_type avalue, 
 				Http_pass_header_info *info) {
 
-	String aname=String(name, String::L_URI);
+	const char* name_cstr=aname.cstr();
+	// a Content-Length entry is skipped: nothing is appended and no flag is touched for it
+	if(strcasecmp(name_cstr, HTTP_CONTENT_LENGTH)==0)
+		return;
+	// the emitted name goes through capitalize() and pa_http_safe_header_name(); the value is rendered as L_HTTP_HEADER
+	String name=String(pa_http_safe_header_name(capitalize(name_cstr)), String::L_AS_IS);
+	String value=attributed_meaning_to_string(*avalue, String::L_HTTP_HEADER, true);
 
-	*info->request << aname << ": "
-		<< attributed_meaning_to_string(*value, String::L_URI, false)
-		<< CRLF; 
+	*info->request << name << ": " << value << CRLF;
 	
-	const String::Body name_upper=aname.change_case(info->charsets->source(), String::CC_UPPER);
-	if(name_upper==HTTP_USER_AGENT_UPPER)
-		info->user_agent_specified=true;
-	if(name_upper==HTTP_CONTENT_TYPE_UPPER)
-		info->content_type_specified=true;
+	if(strcasecmp(name_cstr, HTTP_USER_AGENT)==0) // flags are matched on the original name, ignoring case
+		*info->user_agent_specified=true;
+	if(strcasecmp(name_cstr, HTTP_CONTENT_TYPE)==0){
+		*info->content_type_specified=true; // NOTE(review): the flag pointers are dereferenced unchecked, they must be non-null here
+		*info->content_type_url_encoded=StrStartFromNC(value.cstr(), HTTP_CONTENT_TYPE_FORM_URLENCODED); // presumably a case-insensitive prefix test - confirm
+	}
 }
 
 static void http_pass_cookie(HashStringValue::key_type name, 
@@ -399,27 +395,63 @@ static void form_value2string(
 const char* pa_form2string(HashStringValue& form, Request_charsets& charsets) {
 	String string;
 	form.for_each<String*>(form_value2string, &string);
-	return string.untaint_cstr(String::L_AS_IS, 0, &charsets);
+	return string.untaint_and_transcode_cstr(String::L_URI, &charsets); // the collected fields are flattened in the L_URI language with the given charsets
 }
 
 struct FormPart {
 	Request* r;
 	const char* boundary;
-	String string;
+	String* string; // text collected since the last binary block; form_file_value2part swaps in a fresh String after each file body
 	Form_table_value2string_info* info;
+	// BinaryBlock: one contiguous piece of the request body with an explicit byte length
+	struct BinaryBlock{
+		const char* ptr;
+		size_t length;
+		// ptr must stay declared before length: the first constructor computes length(strlen(ptr)) from the already initialized ptr
+		BinaryBlock(String* astring, Request* r): ptr(astring->untaint_and_transcode_cstr(String::L_AS_IS, &r->charsets)), length(strlen(ptr)){}
+		BinaryBlock(const char* aptr, size_t alength): ptr(aptr), length(alength){}
+	};
+
+	Array<BinaryBlock> blocks;
+	// info is zeroed explicitly (the code this constructor replaces assigned NULL); form_value2part points it at its table info
+	FormPart(Request* ar, const char* aboundary): r(ar), boundary(aboundary), string(new String()), info(0){}
+	// post(): returns the whole body and stores its size in bytes in length
+	const char *post(size_t &length){
+		if(blocks.count()){
+			blocks+=BinaryBlock(string, r); // flush the pending text as the last block (appended again on every call)
+
+			length=0;
+			for(size_t i=0; i<blocks.count(); i++)
+				length+=blocks[i].length;
+
+			char *result=(char *)pa_malloc_atomic(length); // exactly length bytes are requested, no terminator is written
+			char *ptr=result;
+
+			for(size_t i=0; i<blocks.count(); i++){
+				memcpy(ptr, blocks[i].ptr, blocks[i].length);
+				ptr+=blocks[i].length;
+			}
+
+			return result;
+		} else {
+			BinaryBlock result(string, r); // no binary blocks were added: the flattened text is the whole body
+			length=result.length;
+			return result.ptr;
+		}
+	}
 };
 
 static void form_part_boundary_header(FormPart& part, String::Body name, const char* file_name=0){
-	part.string << "--" << part.boundary
-				<< CRLF HTTP_CONTENT_DISPOSITION ": form-data; name=\"" 
-				<< Charset::transcode(name, part.r->charsets.source(), part.r->charsets.client())
+	*part.string << "--" << part.boundary // opens a part: "--" + boundary, then the form-data disposition line
+				<< CRLF CONTENT_DISPOSITION_CAPITALIZED ": form-data; name=\"" 
+				<< name // appended as is: Charset::transcode is no longer called at this point
 				<< "\"";
 	if(file_name){
 		if(strcmp(file_name, NONAME_DAT)!=0)
-			part.string << "; filename=\"" << file_name << "\"";
-		part.string << CRLF HTTP_CONTENT_TYPE ": " << part.r->mime_type_of(file_name);
+			*part.string << "; filename=\"" << file_name << "\""; // the filename attribute is left out when the name equals NONAME_DAT
+		*part.string << CRLF HTTP_CONTENT_TYPE_CAPITALIZED ": " << part.r->mime_type_of(file_name); // type line is written for every file part
 	}
-	part.string << CRLF CRLF;
+	*part.string << CRLF CRLF; // ends the last header line and adds the empty line before the part data
 }
 
 static void form_string_value2part(
@@ -428,7 +460,7 @@ static void form_string_value2part(
 				FormPart& part)
 {
 	form_part_boundary_header(part, key);
-	part.string << Charset::transcode(value, part.r->charsets.source(), part.r->charsets.client()) << CRLF;
+	*part.string << value << CRLF;
 }
 
 static void form_file_value2part(
@@ -437,8 +469,10 @@ static void form_file_value2part(
 				FormPart& part)
 {
 	form_part_boundary_header(part, key, vfile.fields().get(name_name)->as_string().cstr());
-	part.string.append_know_length(vfile.value_ptr(), vfile.value_size(), String::L_FILE_POST);
-	part.string << CRLF;
+	part.blocks+=FormPart::BinaryBlock(part.string, part.r);
+	part.blocks+=FormPart::BinaryBlock(vfile.value_ptr(), vfile.value_size());
+	part.string=new String();
+	*part.string << CRLF;
 }
 
 static void form_table_value2part(Table::element_type row, FormPart* part) {
@@ -453,10 +487,10 @@ static void form_value2part(
 	if(const String* svalue=value->get_string())
 		form_string_value2part(key, *svalue, part);
 	else if(Table* tvalue=value->get_table()) {
-		Form_table_value2string_info info(key, part.string);
+		Form_table_value2string_info info(key, *part.string);
 		part.info = &info;
 		tvalue->for_each(form_table_value2part, &part);
-	} else if(VFile* vfile=static_cast<VFile *>(value->as("file", false))){
+	} else if(VFile* vfile=static_cast<VFile *>(value->as("file"))){
 		form_file_value2part(key, *vfile, part);
 	} else
 		throw Exception(PARSER_RUNTIME,
@@ -465,14 +499,11 @@ static void form_value2part(
 }
 
 const char* pa_form2string_multipart(HashStringValue& form, Request& r, const char* boundary, size_t& post_size){
-	FormPart formpart;
-	formpart.r=&r;
-	formpart.boundary=boundary;
-	formpart.info=NULL;
+	FormPart formpart(&r, boundary); // the constructor stores r and boundary and allocates the first text String
 	form.for_each<FormPart&>(form_value2part, formpart);
-	formpart.string << "--" << boundary << "--";
-	post_size=formpart.string.length(); // very surprizing, but it calculates correct post_size even with binary files!
-	return formpart.string.untaint_cstr(String::L_AS_IS); // without transcoding
+	*formpart.string << "--" << boundary << "--"; // closing delimiter written after the last part
+	// @todo: return binary blocks here to save memory in pa_internal_file_read_http
+	return formpart.post(post_size); // post() joins text and binary blocks and stores the total byte count in post_size
 }
 
 static void find_headers_end(char* p,
@@ -503,11 +534,10 @@ File_read_http_result pa_internal_file_r
 	File_read_http_result result;
 	char host[MAX_STRING];
 	const char* uri; 
-	short port;
+	short port=80;
 	const char* method="GET";
 	bool method_is_get=true;
 	HashStringValue* form=0;
-	const char* body_cstr=0;
 	int timeout_secs=2;
 	bool fail_on_status_ne_200=true;
 	bool omit_post_charset=false;
@@ -571,9 +601,7 @@ File_read_http_result pa_internal_file_r
 		}
 
 		if(valid_options!=options->count())
-			throw Exception(PARSER_RUNTIME,
-				0,
-				"invalid option passed");
+			throw Exception(PARSER_RUNTIME, 0, CALLED_WITH_INVALID_OPTION);
 	}
 	if(!asked_remote_charset) // defaulting to $request:charset
 		asked_remote_charset=&(r.charsets).source();
@@ -607,12 +635,13 @@ File_read_http_result pa_internal_file_r
 	//preparing request
 	String& connect_string=*new String(file_spec);
 
-	String request_head_and_body;
+	const char* request;
+	size_t request_size;
 	{
 		// influence URLencoding of tainted pieces to String::L_URI lang
 		Temp_client_charset temp(r.charsets, *asked_remote_charset);
 
-		const char* connect_string_cstr=connect_string.untaint_cstr(String::L_URI, 0, &(r.charsets));
+		const char* connect_string_cstr=connect_string.untaint_and_transcode_cstr(String::L_URI, &(r.charsets));
 
 		const char* current=connect_string_cstr;
 		if(strncmp(current, "http://", 7)!=0)
@@ -622,11 +651,16 @@ File_read_http_result pa_internal_file_r
 		current+=7;
 
 		strncpy(host, current, sizeof(host)-1);  host[sizeof(host)-1]=0;
-		char* host_uri=lsplit(host, '/'); 
-		uri=host_uri?current+(host_uri-1-host):"/"; 
-		char* port_cstr=lsplit(host, ':'); 
-		char* error_pos=0;
-		port=port_cstr?(short)strtol(port_cstr, &error_pos, 0):80;
+		char* host_uri=lsplit(host, '/');
+		uri=host_uri?current+(host_uri-1-host):"/";
+		char* port_cstr=lsplit(host, ':');
+		
+		if (port_cstr){
+			char* error_pos=0;
+			port=(short)strtol(port_cstr, &error_pos, 10);
+			if(port==0 || *error_pos)
+				throw Exception(PARSER_RUNTIME, &connect_string, "invalid port number '%s'", port_cstr);
+		}
 
 		// making request head
 		String head;
@@ -634,7 +668,10 @@ File_read_http_result pa_internal_file_r
 		if(method_is_get && form)
 			head << (strchr(uri, '?')!=0?"&":"?") << pa_form2string(*form, r.charsets);
 
-		head <<" HTTP/1.0" CRLF "host: "<< host << CRLF;
+		head <<" HTTP/1.0" CRLF "Host: "<< host;
+		if (port != 80)
+			head << ":" << port_cstr;
+		head << CRLF;
 
 		char* boundary=0;
 
@@ -650,95 +687,109 @@ File_read_http_result pa_internal_file_r
 				uuid.node[3], uuid.node[4], uuid.node[5]);
 		}
 
+		String user_headers;
+		bool user_agent_specified=false;
+		bool content_type_specified=false;
+		bool content_type_url_encoded=false;
+		if(vheaders && !vheaders->is_string()) { // allow empty
+			if(HashStringValue *headers=vheaders->get_hash()) {
+				Http_pass_header_info info={
+					&(r.charsets),
+					&user_headers,
+					&user_agent_specified,
+					&content_type_specified,
+					&content_type_url_encoded};
+				headers->for_each<Http_pass_header_info*>(http_pass_header, &info); 
+			} else
+				throw Exception(PARSER_RUNTIME, 
+					0,
+					"headers param must be hash"); 
+		};
+
+		const char* request_body=0;
 		size_t post_size=0;
 		if(form && !method_is_get) {
-			head << HTTP_CONTENT_TYPE ": " << (multipart ? HTTP_CONTENT_TYPE_MULTIPART_FORMDATA : HTTP_CONTENT_TYPE_FORM_URLENCODED);
+			head << "Content-Type: " << (multipart ? HTTP_CONTENT_TYPE_MULTIPART_FORMDATA : HTTP_CONTENT_TYPE_FORM_URLENCODED);
 
 			if(!omit_post_charset)
 				head << "; charset=" << asked_remote_charset->NAME_CSTR();
 
 			if(multipart) {
 				head << "; boundary=" << boundary;
-				body_cstr=pa_form2string_multipart(*form, r/*charsets & mime_type needed*/, boundary, post_size/*correct post_size returned here*/);
+				request_body=pa_form2string_multipart(*form, r/*charsets & mime_type needed*/, boundary, post_size/*correct post_size returned here*/);
 			} else {
-				body_cstr=pa_form2string(*form, r.charsets);
-				post_size=strlen(body_cstr);
+				request_body=pa_form2string(*form, r.charsets);
+				post_size=strlen(request_body);
 			}
 			head << CRLF;
-		} else if (vbody) {
-			// transcode tainted pieces and then URI-encode them
-			body_cstr=vbody->as_string().untaint_cstr(String::L_AS_IS, 0, &(r.charsets));
-
-			// now transcode is needed only if own content-type was specified _and_ clean chars with code>127 are in the body
-			// @todo: I don't like the current behaviour
-			body_cstr=Charset::transcode(
-				String::C(body_cstr, strlen(body_cstr)),
-				r.charsets.source(),
-				*asked_remote_charset
-			);
-			post_size=strlen(body_cstr);
+		} else if(vbody) {
+			// $.body was specified
+			if(content_type_url_encoded){
+				// transcode + url-encode
+				request_body=vbody->as_string().untaint_and_transcode_cstr(String::L_URI, &(r.charsets));
+			} else {
+				// content-type != application/x-www-form-urlencoded -> transcode only, don't url-encode!
+				request_body=Charset::transcode(
+					String::C(vbody->as_string().cstr(), vbody->as_string().length()),
+					r.charsets.source(),
+					*asked_remote_charset
+				);
+			}
+			post_size=strlen(request_body);
 		}
 
 		// http://www.ietf.org/rfc/rfc2617.txt
 		if(const String* authorization_field_value=basic_authorization_field(user_cstr, password_cstr))
-			head<<"authorization: "<<*authorization_field_value<<CRLF;
+			head << "Authorization: " << *authorization_field_value << CRLF;
+
+		head << user_headers;
 
-		bool user_agent_specified=false;
-		bool content_type_specified=false;
-		if(vheaders && !vheaders->is_string()) { // allow empty
-			if(HashStringValue *headers=vheaders->get_hash()) {
-				Http_pass_header_info info={&(r.charsets), &head, false};
-				headers->for_each<Http_pass_header_info*>(http_pass_header, &info); 
-				user_agent_specified=info.user_agent_specified;
-				content_type_specified=info.content_type_specified;
-			} else
-				throw Exception(PARSER_RUNTIME, 
-					&connect_string,
-					"headers param must be hash"); 
-		};
 		if(!user_agent_specified) // defaulting
-			head << HTTP_USER_AGENT ": " DEFAULT_USER_AGENT CRLF;
+			head << "User-Agent: " DEFAULT_USER_AGENT CRLF;
 
 		if(form && !method_is_get && content_type_specified) // POST + form + content-type was specified
 			throw Exception(PARSER_RUNTIME,
-				&connect_string,
+				0,
 				"$.content-type can't be specified with method POST"); 
 
 		if(vcookies && !vcookies->is_string()){ // allow empty
 			if(HashStringValue* cookies=vcookies->get_hash()) {
-				head << "cookie: ";
-				Http_pass_header_info info={&(r.charsets), &head, false};
+				head << "Cookie: ";
+				Http_pass_header_info info={&(r.charsets), &head, 0, 0, 0};
 				cookies->for_each<Http_pass_header_info*>(http_pass_cookie, &info); 
 				head << CRLF;
 			} else
 				throw Exception(PARSER_RUNTIME, 
-					&connect_string,
-					"cookies param must be hash"); 
+					0,
+					"cookies param must be hash");
 		}
 
-		if(body_cstr)
-			head << "content-length: " << format(post_size, "%u") << CRLF;
-
-		// head + end of header
-		request_head_and_body << head.untaint_cstr(String::L_AS_IS, 0, &(r.charsets)) << CRLF;
+		if(request_body)
+			head << "Content-Length: " << format(post_size, "%u") << CRLF;
+		
+		head << CRLF;
+		
+		const char *request_head=head.untaint_and_transcode_cstr(String::L_URI, &(r.charsets));
 
-		// body
-		if(body_cstr)
-			request_head_and_body << body_cstr;
+		if(request_body){
+			size_t head_size = strlen(request_head);
+			request_size=post_size + head_size;
+			char *ptr=(char *)pa_malloc_atomic(request_size);
+			memcpy(ptr, request_head, head_size);
+			memcpy(ptr+head_size, request_body, post_size);
+			request=ptr;
+		} else {
+			request_size=strlen(request_head);
+			request=request_head;
+		}
 	}
 	
-	const char* request_cstr=request_head_and_body.cstr();
-	size_t request_size=strlen(request_cstr);
-
-	if(multipart)
-		request_size=file_untaint(request_cstr, request_size);
-
 	char* response;
 	size_t response_size;
 
 	// sending request
 	int status_code=http_request(response, response_size,
-		host, port, request_cstr, request_size,
+		host, port, request, request_size,
 		timeout_secs, fail_on_status_ne_200); 
 	
 	// processing results	
@@ -776,7 +827,7 @@ File_read_http_result pa_internal_file_r
 			const String::Body HEADER_NAME=line.mid(0, pos).change_case(r.charsets.source(), String::CC_UPPER);
 			const String& HEADER_VALUE=line.mid(pos+1, line.length()).trim(String::TRIM_BOTH, " \t\r");
 			if(as_text && HEADER_NAME==HTTP_CONTENT_TYPE_UPPER)
-				real_remote_charset=detect_charset(HEADER_VALUE.cstr(), true/*already uppercased*/);
+				real_remote_charset=detect_charset(HEADER_VALUE.cstr());
 
 			// tables
 			{
@@ -809,6 +860,8 @@ File_read_http_result pa_internal_file_r
 		// skip UTF-8 signature (BOM code)
 		raw_body+=3;
 		raw_body_size-=3;
+		if(!real_remote_charset)
+			real_remote_charset=&UTF8_charset;
 	}
 
 	// output response