--- parser3/src/main/pa_http.C 2009/02/01 09:32:47 1.24 +++ parser3/src/main/pa_http.C 2009/08/30 05:28:49 1.37 @@ -1,11 +1,11 @@ /** @file Parser: http support functions. - Copyright(c) 2001-2005 ArtLebedev Group (http://www.artlebedev.com) + Copyright(c) 2001-2009 ArtLebedev Group (http://www.artlebedev.com) Author: Alexandr Petrosian (http://paf.design.ru) */ -static const char * const IDENT_HTTP_C="$Date: 2009/02/01 09:32:47 $"; +static const char * const IDENT_HTTP_C="$Date: 2009/08/30 05:28:49 $"; #include "pa_http.h" #include "pa_common.h" @@ -41,7 +41,6 @@ static const char * const IDENT_HTTP_C=" #undef CRLF #define CRLF "\r\n" -#define DCRLF "\r\n\r\n" static bool set_addr(struct sockaddr_in *addr, const char* host, const short port){ memset(addr, 0, sizeof(*addr)); @@ -67,7 +66,7 @@ size_t guess_content_length(char* buf) { char* ptr; if((ptr=strstr(buf, "Content-Length:"))) // Apache goto found; - if((ptr=strstr(buf, "content-length:"))) // Parser 3 + if((ptr=strstr(buf, "content-length:"))) // Parser 3 before 3.4.0 goto found; if((ptr=strstr(buf, "Content-length:"))) // maybe 1 goto found; @@ -76,7 +75,7 @@ size_t guess_content_length(char* buf) { return 0; found: char *error_pos; - size_t result=(size_t)strtol(ptr+15/*strlen("CONTENT-LENGTH:")*/, &error_pos, 0); + size_t result=(size_t)strtol(ptr+15/*strlen("Content-Length:")*/, &error_pos, 0); const size_t reasonable_initial_max=0x400*0x400*10 /*10M*/; if(result>reasonable_initial_max) // sanity check @@ -86,8 +85,8 @@ found: static int http_read_response(char*& response, size_t& response_size, int sock, bool fail_on_status_ne_200) { int result=0; - // fetching some to local buffer, guessing on possible content-length - response_size=0x400*20; // initial size if content-length could not be determined + // fetching some to local buffer, guessing on possible Content-Length + response_size=0x400*20; // initial size if Content-Length could not be determined const size_t preview_size=0x400*20; char preview_buf[preview_size+1/*terminator*/]; // 20K buffer to preview headers ssize_t received_size=recv(sock, preview_buf, preview_size, 0); @@ -136,9 +135,9 @@ static int http_read_response(char*& res // we use terminator byte for two purposes here: // 1. we return there zero always, not knowing: maybe they would want to create String form $file.body? // invariant: all Strings should have zero-terminated buffers - // 2. we use that out-of-size byte to detect if our content-length guess was wrong + // 2. we use that out-of-size byte to detect if our Content-Length guess was wrong // when recv gets more than we expected - // a) we know that the content-length guess was wrong + // a) we know that the Content-Length guess was wrong // b) we have space to put the first byte of extra data // c) we use less code to detect normal situation: on last while-cycle recv expected to just return 0 while(true) { @@ -314,25 +313,27 @@ static int http_request(char*& response, struct Http_pass_header_info { Request_charsets* charsets; String* request; - bool user_agent_specified; - bool content_type_specified; + bool* user_agent_specified; + bool* content_type_specified; + bool* content_type_url_encoded; }; #endif -static void http_pass_header(HashStringValue::key_type name, - HashStringValue::value_type value, +static void http_pass_header(HashStringValue::key_type aname, + HashStringValue::value_type avalue, Http_pass_header_info *info) { - String aname=String(name, String::L_URI); + String name=String(aname, String::L_URI); + String value=attributed_meaning_to_string(*avalue, String::L_URI, false); - *info->request << aname << ": " - << attributed_meaning_to_string(*value, String::L_URI, false) - << CRLF; + *info->request << name << ": " << value << CRLF; - const String::Body name_upper=aname.change_case(info->charsets->source(), String::CC_UPPER); - if(name_upper==HTTP_USER_AGENT_UPPER) - info->user_agent_specified=true; - if(name_upper==HTTP_CONTENT_TYPE_UPPER) - info->content_type_specified=true; + const String::Body NAME=name.change_case(info->charsets->source(), String::CC_UPPER); + if(NAME==HTTP_USER_AGENT_UPPER) + *info->user_agent_specified=true; + if(NAME==HTTP_CONTENT_TYPE_UPPER){ + *info->content_type_specified=true; + *info->content_type_url_encoded=StrStartFromNC(value.cstr(), HTTP_CONTENT_TYPE_FORM_URLENCODED); + } } static void http_pass_cookie(HashStringValue::key_type name, @@ -340,7 +341,7 @@ static void http_pass_cookie(HashStringV Http_pass_header_info *info) { *info->request << String(name, String::L_HTTP_COOKIE) << "=" - << attributed_meaning_to_string(*value, String::L_HTTP_COOKIE, false) + << attributed_meaning_to_string(*value, String::L_HTTP_COOKIE, true) << "; "; } @@ -366,9 +367,7 @@ static void form_string_value2string( const String& value, String& result) { - result << String(key, String::L_URI) << "="; - result.append(value, String::L_URI, true); - result << "&"; + result << String(key, String::L_URI) << "=" << String(value, String::L_URI) << "&"; } #ifndef DOXYGEN @@ -402,7 +401,7 @@ static void form_value2string( const char* pa_form2string(HashStringValue& form, Request_charsets& charsets) { String string; form.for_each(form_value2string, &string); - return string.cstr(String::L_UNSPECIFIED, 0, &charsets); + return string.transcode_and_untaint_cstr(String::L_URI, &charsets); } struct FormPart { @@ -412,33 +411,34 @@ struct FormPart { Form_table_value2string_info* info; }; -static void form_part_boundary_header(FormPart& part, String name, const char* file_name=0){ - part.string << "--" << part.boundary; - part.string << CRLF HTTP_CONTENT_DISPOSITION ": form-data; name=\"" << name << "\""; +static void form_part_boundary_header(FormPart& part, String::Body name, const char* file_name=0){ + part.string << "--" << part.boundary + << CRLF CONTENT_DISPOSITION ": form-data; name=\"" + << Charset::transcode(name, part.r->charsets.source(), part.r->charsets.client()) + << "\""; if(file_name){ if(strcmp(file_name, NONAME_DAT)!=0) part.string << "; filename=\"" << file_name << "\""; part.string << CRLF HTTP_CONTENT_TYPE ": " << part.r->mime_type_of(file_name); } - part.string << DCRLF; + part.string << CRLF CRLF; } static void form_string_value2part( - HashStringValue::key_type key, - const String& value, - FormPart& part) + HashStringValue::key_type key, + const String& value, + FormPart& part) { - form_part_boundary_header(part, String(key, String::L_URI)); - part.string.append(value, String::L_AS_IS, true); - part.string << CRLF; + form_part_boundary_header(part, key); + part.string << Charset::transcode(value, part.r->charsets.source(), part.r->charsets.client()) << CRLF; } static void form_file_value2part( - HashStringValue::key_type key, - VFile& vfile, - FormPart& part) + HashStringValue::key_type key, + VFile& vfile, + FormPart& part) { - form_part_boundary_header(part, String(key, String::L_URI), vfile.fields().get(name_name)->as_string().cstr()); + form_part_boundary_header(part, key, vfile.fields().get(name_name)->as_string().cstr()); part.string.append_know_length(vfile.value_ptr(), vfile.value_size(), String::L_FILE_POST); part.string << CRLF; } @@ -448,9 +448,9 @@ static void form_table_value2part(Table: } static void form_value2part( - HashStringValue::key_type key, - HashStringValue::value_type value, - FormPart& part) + HashStringValue::key_type key, + HashStringValue::value_type value, + FormPart& part) { if(const String* svalue=value->get_string()) form_string_value2part(key, *svalue, part); @@ -458,7 +458,7 @@ static void form_value2part( Form_table_value2string_info info(key, part.string); part.info = &info; tvalue->for_each(form_table_value2part, &part); - } else if(VFile* vfile=static_cast(value->as("file", false))){ + } else if(VFile* vfile=static_cast(value->as("file"))){ form_file_value2part(key, *vfile, part); } else throw Exception(PARSER_RUNTIME, @@ -473,8 +473,8 @@ const char* pa_form2string_multipart(Has formpart.info=NULL; form.for_each(form_value2part, formpart); formpart.string << "--" << boundary << "--"; - post_size=formpart.string.length(); - return formpart.string.cstr(String::L_UNSPECIFIED, 0, &(r.charsets)); + post_size=formpart.string.length(); // very surprizing, but it calculates correct post_size even with binary files! + return formpart.string.untaint_cstr(String::L_AS_IS); // without transcoding } static void find_headers_end(char* p, @@ -607,16 +607,14 @@ File_read_http_result pa_internal_file_r } //preparing request - String& connect_string=*new String; - // not in ^sql{... L_SQL ...} spirit, but closer to ^file::load one - connect_string.append(file_spec, String::L_URI); // tainted pieces -> URI pieces + String& connect_string=*new String(file_spec); String request_head_and_body; { // influence URLencoding of tainted pieces to String::L_URI lang Temp_client_charset temp(r.charsets, *asked_remote_charset); - const char* connect_string_cstr=connect_string.cstr(String::L_UNSPECIFIED, 0, &(r.charsets)); + const char* connect_string_cstr=connect_string.transcode_and_untaint_cstr(String::L_URI, &(r.charsets)); const char* current=connect_string_cstr; if(strncmp(current, "http://", 7)!=0) @@ -626,23 +624,21 @@ File_read_http_result pa_internal_file_r current+=7; strncpy(host, current, sizeof(host)-1); host[sizeof(host)-1]=0; - char* host_uri=lsplit(host, '/'); - uri=host_uri?current+(host_uri-1-host):"/"; - char* port_cstr=lsplit(host, ':'); + char* host_uri=lsplit(host, '/'); + uri=host_uri?current+(host_uri-1-host):"/"; + char* port_cstr=lsplit(host, ':'); char* error_pos=0; port=port_cstr?(short)strtol(port_cstr, &error_pos, 0):80; - bool uri_has_query_string=strchr(uri, '?')!=0; - // making request head String head; head << method << " " << uri; - if(form && method_is_get) - head << (uri_has_query_string?"&":"?") << pa_form2string(*form, r.charsets); + if(method_is_get && form) + head << (strchr(uri, '?')!=0?"&":"?") << pa_form2string(*form, r.charsets); - head <<" HTTP/1.0" CRLF "host: "<< host << CRLF; + head <<" HTTP/1.0" CRLF "Host: "<< host << CRLF; - char* boundary; + char* boundary=0; if(multipart){ uuid uuid=get_uuid(); @@ -656,97 +652,107 @@ File_read_http_result pa_internal_file_r uuid.node[3], uuid.node[4], uuid.node[5]); } + String user_headers; + bool user_agent_specified=false; + bool content_type_specified=false; + bool content_type_url_encoded=false; + if(vheaders && !vheaders->is_string()) { // allow empty + if(HashStringValue *headers=vheaders->get_hash()) { + Http_pass_header_info info={ + &(r.charsets), + &user_headers, + &user_agent_specified, + &content_type_specified, + &content_type_url_encoded}; + headers->for_each(http_pass_header, &info); + } else + throw Exception(PARSER_RUNTIME, + 0, + "headers param must be hash"); + }; + size_t post_size=0; if(form && !method_is_get) { - head << HTTP_CONTENT_TYPE ": "; + head << HTTP_CONTENT_TYPE ": " << (multipart ? HTTP_CONTENT_TYPE_MULTIPART_FORMDATA : HTTP_CONTENT_TYPE_FORM_URLENCODED); + + if(!omit_post_charset) + head << "; charset=" << asked_remote_charset->NAME_CSTR(); + if(multipart) { - head << HTTP_CONTENT_TYPE_MULTIPART_FORMDATA "; boundary=" << boundary << CRLF; - // !!! charset? - body_cstr=pa_form2string_multipart(*form, r, boundary, post_size); + head << "; boundary=" << boundary; + body_cstr=pa_form2string_multipart(*form, r/*charsets & mime_type needed*/, boundary, post_size/*correct post_size returned here*/); } else { - head << HTTP_CONTENT_TYPE_FORM_URLENCODED; - if(!omit_post_charset) - head << "; charset=" << asked_remote_charset->NAME_CSTR() << ";"; - head << CRLF; body_cstr=pa_form2string(*form, r.charsets); post_size=strlen(body_cstr); } - } else if (vbody) { - body_cstr=vbody->as_string().cstr(String::L_UNSPECIFIED, 0, &(r.charsets)); - // needed for transcoded $.body[] first of all - body_cstr=Charset::transcode( - String::C(body_cstr, strlen(body_cstr)), - r.charsets.source(), - *asked_remote_charset - ); + head << CRLF; + } else if(vbody) { + if(content_type_url_encoded){ + // transcode + url-encode + body_cstr=vbody->as_string().transcode_and_untaint_cstr(String::L_URI, &(r.charsets)); + } else { + // content-type != application/x-www-form-urlencoded -> transcode only, don't url-encode! + body_cstr=Charset::transcode( + String::C(vbody->as_string().cstr(), vbody->as_string().length()), + r.charsets.source(), + *asked_remote_charset + ); + } + post_size=strlen(body_cstr); } // http://www.ietf.org/rfc/rfc2617.txt if(const String* authorization_field_value=basic_authorization_field(user_cstr, password_cstr)) - head<<"authorization: "<<*authorization_field_value<is_string()) { // allow empty - if(HashStringValue *headers=vheaders->get_hash()) { - Http_pass_header_info info={&(r.charsets), &head, false}; - headers->for_each(http_pass_header, &info); - user_agent_specified=info.user_agent_specified; - content_type_specified=info.content_type_specified; - } else - throw Exception(PARSER_RUNTIME, - &connect_string, - "headers param must be hash"); - }; if(!user_agent_specified) // defaulting head << HTTP_USER_AGENT ": " DEFAULT_USER_AGENT CRLF; if(form && !method_is_get && content_type_specified) // POST + form + content-type was specified throw Exception(PARSER_RUNTIME, - &connect_string, + 0, "$.content-type can't be specified with method POST"); if(vcookies && !vcookies->is_string()){ // allow empty if(HashStringValue* cookies=vcookies->get_hash()) { - head << "cookie: "; - Http_pass_header_info info={&(r.charsets), &head, false}; + head << "Cookie: "; + Http_pass_header_info info={&(r.charsets), &head, 0, 0, 0}; cookies->for_each(http_pass_cookie, &info); head << CRLF; } else throw Exception(PARSER_RUNTIME, - &connect_string, + 0, "cookies param must be hash"); } - if(body_cstr) { - head << "content-length: " << format(post_size, "%u") << CRLF; - } - - const char* head_cstr=head.cstr(String::L_UNSPECIFIED, 0, &(r.charsets)); + if(body_cstr) + head << HTTP_CONTENT_LENGTH << ": " << format(post_size, "%u") << CRLF; // head + end of header - request_head_and_body << head_cstr << CRLF; + request_head_and_body << head.untaint_cstr(String::L_AS_IS, 0, &(r.charsets)) << CRLF; // body if(body_cstr) request_head_and_body << body_cstr; } - //sending request - char* response; - size_t response_size; - - const char* request=request_head_and_body.cstr(); - size_t request_size=strlen(request); + const char* request_cstr=request_head_and_body.cstr(); + size_t request_size=strlen(request_cstr); if(multipart) - request_size=file_untaint(request, request_size); + request_size=file_untaint(request_cstr, request_size); + + char* response; + size_t response_size; + // sending request int status_code=http_request(response, response_size, - host, port, request, request_size, + host, port, request_cstr, request_size, timeout_secs, fail_on_status_ne_200); - //processing results + // processing results char* raw_body; size_t raw_body_size; char* headers_end_at; find_headers_end(response, @@ -761,7 +767,7 @@ File_read_http_result pa_internal_file_r if(headers_end_at) { *headers_end_at=0; - const String header_block(String::C(response, headers_end_at-response), true); + const String header_block(String::C(response, headers_end_at-response), String::L_TAINTED); ArrayString aheaders; HashStringValue& tables=vtables->hash(); @@ -769,7 +775,7 @@ File_read_http_result pa_internal_file_r size_t pos_after=0; header_block.split(aheaders, pos_after, "\n"); - //processing headers + // processing headers size_t aheaders_count=aheaders.count(); for(size_t i=1; i