|
|
| version 1.89, 2001/05/21 16:38:46 | version 1.209, 2007/04/20 10:19:06 |
|---|---|
| Line 1 | Line 1 |
| /** @file | /** @file |
| Parser: string class. @see untasize_t.C. | Parser: string class. @see untalength_t.C. |
| Copyright (c) 2001 ArtLebedev Group (http://www.artlebedev.com) | Copyright (c) 2001-2005 ArtLebedev Group (http://www.artlebedev.com) |
| Author: Alexandr Petrosian <paf@design.ru> (http://paf.design.ru) | |
| Author: Alexander Petrosyan <paf@design.ru> (http://design.ru/paf) | |
| $Id$ | |
| */ | */ |
| #include "pa_config_includes.h" | static const char * const IDENT_STRING_C="$Date$"; |
| #include "pcre.h" | #include "pcre.h" |
| #include "internal.h" | |
| #include "pa_pool.h" | |
| #include "pa_string.h" | #include "pa_string.h" |
| #include "pa_hash.h" | |
| #include "pa_exception.h" | #include "pa_exception.h" |
| #include "pa_common.h" | |
| #include "pa_array.h" | |
| #include "pa_globals.h" | |
| #include "pa_table.h" | #include "pa_table.h" |
| #include "pa_dictionary.h" | |
| #include "pa_charset.h" | |
| String::String(Pool& apool, const char *src, size_t src_size, bool tainted) : | const String String::Empty; |
| Pooled(apool) { | |
| last_chunk=&head; | |
| head.count=CR_PREALLOCATED_COUNT; | |
| append_here=head.rows; | |
| head.preallocated_link=0; | |
| link_row=&head.rows[head.count]; | |
| fused_rows=fsize=0; | |
| if(src) | |
| if(tainted) | |
| APPEND_TAINTED(src, src_size, 0, 0); | |
| else | |
| APPEND_CLEAN(src, src_size, 0, 0); | |
| } | |
| void String::expand() { | |
| size_t new_chunk_count=last_chunk->count+CR_GROW_COUNT; | |
| last_chunk=static_cast<Chunk *>( | |
| malloc(sizeof(size_t)+sizeof(Chunk::Row)*new_chunk_count+sizeof(Chunk *))); | |
| last_chunk->count=new_chunk_count; | |
| link_row->link=last_chunk; | |
| append_here=last_chunk->rows; | |
| link_row=&last_chunk->rows[last_chunk->count]; | |
| link_row->link=0; | |
| } | |
| String::String(const String& src) : Pooled(src.pool()) { | int pa_atoi(const char* str, const String* problem_source) { |
| head.count=CR_PREALLOCATED_COUNT; | if(!str) |
| return 0; | |
| size_t src_used_rows=src.fused_rows; | |
| if(src_used_rows<=head.count) { | while(*str && isspace((unsigned char)*str)) |
| // all new rows fit size_to preallocated area | str++; |
| size_t curr_chunk_rows=head.count; | if(!*str) |
| memcpy(head.rows, src.head.rows, sizeof(Chunk::Row)*src_used_rows); | return 0; |
| append_here=&head.rows[src_used_rows]; | |
| link_row=&head.rows[curr_chunk_rows]; | int result; |
| } else { | char *error_pos; |
| // warning: | bool negative=false; |
| // heavily relies on the fact | if(str[0]=='-') { |
| // "preallocated area is the same for all strings" | negative=true; |
| // | str++; |
| // info: | } else if(str[0]=='+') { |
| // allocating only enough mem to fit src string rows | str++; |
| // next append would allocate a new chunk | |
| // | |
| // new rows don't fit size_to preallocated area: splitting size_to two chunks | |
| // preallocated chunk src to constructing head | |
| memcpy(head.rows, src.head.rows, sizeof(Chunk::Row)*head.count); | |
| // remaining rows size_to new_chunk | |
| size_t curr_chunk_rows=src_used_rows-head.count; | |
| Chunk *new_chunk=static_cast<Chunk *>( | |
| malloc(sizeof(size_t)+sizeof(Chunk::Row)*curr_chunk_rows+sizeof(Chunk *))); | |
| new_chunk->count=curr_chunk_rows; | |
| head.preallocated_link=new_chunk; | |
| append_here=link_row=&new_chunk->rows[new_chunk->count]; | |
| Chunk *old_chunk=src.head.preallocated_link; | |
| Chunk::Row *new_rows=new_chunk->rows; | |
| size_t rows_left_to_copy=new_chunk->count; | |
| while(true) { | |
| size_t old_count=old_chunk->count; | |
| Chunk *next_chunk=old_chunk->rows[old_count].link; | |
| if(next_chunk) { | |
| // not last source chunk | |
| // taking it all | |
| memcpy(new_rows, old_chunk->rows, sizeof(Chunk::Row)*old_count); | |
| new_rows+=old_count; | |
| rows_left_to_copy-=old_count; | |
| old_chunk=next_chunk; | |
| } else { | |
| // the last source chunk | |
| // taking only those rows of chunk that _left_to_copy | |
| memcpy(new_rows, old_chunk->rows, sizeof(Chunk::Row)*rows_left_to_copy); | |
| break; | |
| } | |
| } | |
| } | } |
| link_row->link=0; | // 0xABC |
| fused_rows=src_used_rows; | if(str[0]=='0') |
| fsize=src.fsize; | if(str[1]=='x' || str[1]=='X') |
| } | result=(int)(unsigned long)strtol(str, &error_pos, 0); |
| else { | |
| String& String::append(const String& src, Untaint_lang lang, bool forced) { | // skip leading 0000, to disable octal interpretation |
| const Chunk *chunk=&src.head; | do str++; while(*str=='0'); |
| do { | result=(int)strtol(str, &error_pos, 0); |
| const Chunk::Row *row=chunk->rows; | } |
| for(size_t i=0; i<chunk->count; i++, row++) { | else |
| if(row==src.append_here) | result=(int)strtol(str, &error_pos, 0); |
| goto break2; | if(negative) |
| result=-result; | |
| APPEND(row->item.ptr, row->item.size, | |
| (lang!=UL_PASS_APPENDED && (row->item.lang==UL_TAINTED || forced))?lang:row->item.lang, | while(char c=*error_pos++) |
| row->item.origin.file, row->item.origin.line); | if(!isspace((unsigned char)c)) |
| } | throw Exception("number.format", |
| chunk=row->link; | problem_source, |
| } while(chunk); | problem_source?"invalid number (int)": "'%s' is invalid number (int)", str); |
| break2: | |
| return *this; | return result; |
| } | } |
| String& String::real_append(STRING_APPEND_PARAMS) { | double pa_atod(const char* str, const String* problem_source) { |
| if(!src) | if(!str) |
| return *this; | return 0; |
| if(!size) | |
| size=strlen(src); | while(*str && isspace((unsigned char)*str)) |
| if(!size) | str++; |
| return *this; | if(!*str) |
| return 0; | |
| if(chunk_is_full()) | double result; |
| expand(); | char *error_pos; |
| bool negative=false; | |
| if(str[0]=='-') { | |
| negative=true; | |
| str++; | |
| } else if(str[0]=='+') { | |
| str++; | |
| } | |
| // 0xABC | |
| if(str[0]=='0') | |
| if(str[1]=='x' || str[1]=='X') | |
| result=(double)(unsigned long)strtol(str, &error_pos, 0); | |
| else { | |
| // skip leading 0000, to disable octal interpretation | |
| do str++; while(*str=='0'); | |
| result=(double)strtod(str, &error_pos); | |
| } | |
| else | |
| result=(double)strtod(str, &error_pos); | |
| if(negative) | |
| result=-result; | |
| while(char c=*error_pos++) | |
| if(!isspace((unsigned char)c)) | |
| throw Exception("number.format", | |
| problem_source, | |
| problem_source?"invalid number (double)": "'%s' is invalid number (double)", str); | |
| append_here->item.ptr=src; | return result; |
| fsize+=append_here->item.size=size; | } |
| append_here->item.lang=lang; | |
| #ifndef NO_STRING_ORIGIN | |
| append_here->item.origin.file=file; | |
| append_here->item.origin.line=line; | |
| #endif | |
| append_here++; fused_rows++; | |
| return *this; | // cord lib extension |
| #ifndef DOXYGEN | |
| typedef struct { | |
| ssize_t countdown; | |
| int target; /* Character we're looking for */ | |
| } chr_data; | |
| #endif | |
| static int CORD_range_contains_chr_greater_then_proc(char c, size_t size, void* client_data) | |
| { | |
| register chr_data * d = (chr_data *)client_data; | |
| if (d -> countdown<=0) return(2); | |
| d -> countdown -= size; | |
| if (c > d -> target) return(1); | |
| return(0); | |
| } | } |
| int CORD_range_contains_chr_greater_then(CORD x, size_t i, size_t n, int c) | |
| { | |
| chr_data d; | |
| uint String::hash_code() const { | d.countdown = n; |
| uint result=0; | d.target = c; |
| return(CORD_block_iter(x, i, CORD_range_contains_chr_greater_then_proc, &d) == 1/*alternatives: 0 normally ended, 2=struck 'n'*/); | |
| } | |
| const Chunk *chunk=&head; | static int CORD_block_count_proc(char /*c*/, size_t /*size*/, void* client_data) |
| do { | { |
| const Chunk::Row *row=chunk->rows; | int* result=(int*)client_data; |
| for(size_t i=0; i<chunk->count; i++) { | (*result)++; |
| if(row==append_here) | return(0); // 0=continue |
| goto break2; | } |
| size_t CORD_block_count(CORD x) | |
| result=Hash::generic_code(result, row->item.ptr, row->item.size); | { |
| row++; | size_t result=0; |
| } | CORD_block_iter(x, 0, CORD_block_count_proc, &result); |
| chunk=row->link; | return result; |
| } while(chunk); | |
| break2: | |
| return result; | |
| } | } |
| /// @todo move 'lang' skipping to pos | // helpers |
| int String::cmp(int& partial, const String& src, | |
| size_t this_offset, Untaint_lang lang) const { | |
| partial=-1; | |
| this_offset=min(this_offset, size()-1); | |
| const Chunk *a_chunk=&head; | |
| const Chunk *b_chunk=&src.head; | |
| const Chunk::Row *a_row=a_chunk->rows; | |
| const Chunk::Row *b_row=b_chunk->rows; | |
| size_t a_offset=this_offset; | |
| size_t b_offset=0; | |
| Chunk::Row *a_end=append_here; | |
| Chunk::Row *b_end=src.append_here; | |
| size_t a_countdown=a_chunk->count; | |
| size_t b_countdown=b_chunk->count; | |
| size_t result; | |
| size_t pos=0; | |
| bool a_break=size()==0; | |
| bool b_break=size()==0; | |
| if(!(a_break || b_break)) while(true) { | |
| if(pos+a_row->item.size > this_offset) { | |
| if(lang!=UL_UNSPECIFIED && a_row->item.lang!=lang) | |
| return -1; // wrong lang -- bail out | |
| int size_diff= | |
| (a_row->item.size-a_offset)- | |
| (b_row->item.size-b_offset); | |
| if(size_diff==0) { // a has same size as b | |
| result=memcmp(a_row->item.ptr+a_offset, b_row->item.ptr+b_offset, | |
| a_row->item.size-a_offset); | |
| if(result) | |
| return result; | |
| pos+=a_row->item.size; | |
| a_row++; a_countdown--; a_offset=0; | |
| b_row++; b_countdown--; b_offset=0; | |
| } else if (size_diff>0) { // a longer | |
| result=memcmp(a_row->item.ptr+a_offset, b_row->item.ptr+b_offset, | |
| b_row->item.size-b_offset); | |
| if(result) | |
| return result; | |
| a_offset+=b_row->item.size-b_offset; | |
| b_row++; b_countdown--; b_offset=0; | |
| } else { // b longer | |
| result=memcmp(a_row->item.ptr+a_offset, b_row->item.ptr+b_offset, | |
| a_row->item.size-a_offset); | |
| if(result) | |
| return result; | |
| b_offset+=a_row->item.size-a_offset; | |
| pos+=a_row->item.size; | |
| a_row++; a_countdown--; a_offset=0; | |
| } | |
| if(b_break=b_row==b_end) { | |
| a_break=a_row==a_end; | |
| break; | |
| } | |
| if(!b_countdown) { | |
| b_chunk=b_row->link; | |
| b_row=b_chunk->rows; | |
| b_countdown=b_chunk->count; | |
| } | |
| } else { | |
| a_offset-=a_row->item.size; | |
| pos+=a_row->item.size; | |
| a_row++; a_countdown--; | |
| } | |
| if(a_break=a_row==a_end) { | /// String::match uses this as replace & global search table columns |
| b_break=b_row==b_end; | |
| break; | const int MAX_MATCH_GROUPS=100; |
| } | |
| if(!a_countdown) { | class String_match_table_template_columns: public ArrayString { |
| a_chunk=a_row->link; | public: |
| a_row=a_chunk->rows; | String_match_table_template_columns() { |
| a_countdown=a_chunk->count; | *this+=new String("prematch"); |
| *this+=new String("match"); | |
| *this+=new String("postmatch"); | |
| for(int i=0; i<MAX_MATCH_GROUPS; i++) { | |
| *this+=new String(String::Body::Format(1+i), String::L_CLEAN); | |
| } | } |
| } | } |
| if(a_break==b_break) { // ended simultaneously | }; |
| partial=0; return 0; | |
| } else if(a_break) { // first bytes equal, but a ended before b | Table string_match_table_template(new String_match_table_template_columns); |
| partial=1; return -1; | |
| } else { | // String::Body methods |
| partial=2; return +1; | |
| } | String::Body String::Body::Format(int value) { |
| char local[MAX_NUMBER]; | |
| size_t length=snprintf(local, MAX_NUMBER, "%d", value); | |
| return String::Body(pa_strdup(local, length), length); | |
| } | } |
| /// @todo move 'lang' skipping to pos | String::Body String::Body::trim(String::Trim_kind kind, const char* chars, |
| int String::cmp(int& partial, const char* b_ptr, size_t src_size, | size_t* out_start, size_t* out_length) const { |
| size_t this_offset, Untaint_lang lang) const { | size_t our_length=length(); |
| partial=-1; | if(!our_length) |
| size_t b_size=src_size?src_size:b_ptr?strlen(b_ptr):0; | return *this; |
| this_offset=min(this_offset, size()-1); | if(!chars) |
| chars=" \t\n"; // white space | |
| const Chunk *a_chunk=&head; | |
| const Chunk::Row *a_row=a_chunk->rows; | |
| size_t a_offset=this_offset; | |
| size_t b_offset=0; | |
| Chunk::Row *a_end=append_here; | |
| size_t a_countdown=a_chunk->count; | |
| size_t pos=0; | |
| bool a_break=size()==0; | |
| bool b_break=b_size==0; | |
| if(!(a_break || b_break)) while(true) { | |
| if(pos+a_row->item.size > this_offset) { | |
| if(lang!=UL_UNSPECIFIED && a_row->item.lang!=lang) | |
| return -1; // wrong lang -- bail out | |
| int size_diff= | |
| (a_row->item.size-a_offset)- | |
| (b_size-b_offset); | |
| if(size_diff==0) { // a has same size as b | |
| if(size_t result=memcmp(a_row->item.ptr+a_offset, b_ptr+b_offset, | |
| a_row->item.size-a_offset)!=0) | |
| return result; | |
| pos+=a_row->item.size; | |
| a_row++; a_countdown--; a_offset=0; | |
| b_break=true; | |
| } else if (size_diff>0) { // a longer | |
| if(size_t result=memcmp(a_row->item.ptr+a_offset, b_ptr+b_offset, | |
| b_size-b_offset)!=0) | |
| return result; | |
| a_offset+=b_size-b_offset; | |
| b_break=true; | |
| } else { // b longer | |
| if(size_t result=memcmp(a_row->item.ptr+a_offset, b_ptr+b_offset, | |
| a_row->item.size-a_offset)!=0) | |
| return result; | |
| b_offset+=a_row->item.size-a_offset; | |
| pos+=a_row->item.size; | |
| a_row++; a_countdown--; a_offset=0; | |
| } | |
| } else { | |
| a_offset-=a_row->item.size; | |
| pos+=a_row->item.size; | |
| a_row++; a_countdown--; | |
| } | |
| a_break=a_row==a_end; | size_t start=0; |
| if(a_break || b_break) | size_t end=our_length; |
| break; | // from left... |
| if(!a_countdown) { | if(kind!=TRIM_END) { |
| a_chunk=a_row->link; | CORD_pos pos; set_pos(pos, 0); |
| a_row=a_chunk->rows; | while(true) { |
| a_countdown=a_chunk->count; | char c=CORD_pos_fetch(pos); |
| if(strchr(chars, c)) { | |
| if(++start==our_length) | |
| return 0; // all chars are empty, just return empty string | |
| } else | |
| break; | |
| CORD_next(pos); | |
| } | } |
| } | } |
| if(a_break==b_break) { // ended simultaneously | // from right.. |
| partial=0; return 0; | if(kind!=TRIM_START) { |
| } else if(a_break) { // first bytes equal, but a ended before b | CORD_pos pos; set_pos(pos, end-1); |
| partial=1; return -1; | while(true) { |
| } else { | char c=CORD_pos_fetch(pos); |
| partial=2; return +1; | if(strchr(chars, c)) { |
| if(--end==0) // optimization: NO need to check for 'end>=start', that's(<) impossible | |
| return 0; // all chars are empty, just return empty string | |
| } else | |
| break; | |
| CORD_prev(pos); | |
| } | |
| } | } |
| if(start==0 && end==our_length) // nobody moved a thing | |
| return *this; | |
| if(out_start) | |
| *out_start=start; | |
| size_t new_length=end-start; | |
| if(out_length) | |
| *out_length=new_length; | |
| return mid(start, new_length); | |
| } | |
| static int CORD_batched_iter_fn_generic_hash_code(char c, void * client_data) { | |
| uint& result=*static_cast<uint*>(client_data); | |
| generic_hash_code(result, c); | |
| return 0; | |
| } | |
| static int CORD_batched_iter_fn_generic_hash_code(const char* s, void * client_data) { | |
| uint& result=*static_cast<uint*>(client_data); | |
| generic_hash_code(result, s); | |
| return 0; | |
| }; | |
| uint String::Body::hash_code() const { | |
| uint result=0; | |
| CORD_iter5(body, 0, | |
| CORD_batched_iter_fn_generic_hash_code, | |
| CORD_batched_iter_fn_generic_hash_code, &result); | |
| return result; | |
| } | } |
| #ifndef NO_STRING_ORIGIN | // String methods |
| const Origin& String::origin() const { | |
| if(!fused_rows) | String::String(const char* cstr, size_t helper_length, bool tainted): body(CORD_EMPTY) { |
| THROW(0, 0, | append_help_length(cstr, helper_length, tainted?L_TAINTED:L_CLEAN); |
| 0, | } |
| "String::origin() of empty string called"); | String::String(const String::C cstr, bool tainted): body(CORD_EMPTY) { |
| append_know_length(cstr.str, cstr.length, tainted?L_TAINTED:L_CLEAN); | |
| // determining origin by last appended piece | |
| // because first one frequently constant. | |
| // ex: ^load[/file] "document_root" + "/file" | |
| // when last peice is constant, | |
| // ex: parser_root_auto_path{dynamic} / auto.p{const} | |
| // using first piece | |
| Origin& last_origin=append_here[-1].item.origin; | |
| return last_origin.file ? last_origin : head.rows[0].item.origin; | |
| } | } |
| #endif | |
| String& String::mid(size_t start, size_t finish) const { | String& String::append_know_length(const char* str, size_t known_length, Language lang) { |
| start=max(0, start); | if(!known_length) |
| finish=min(size(), finish); | return *this; |
| if(start==finish) | |
| return *empty_string; | // first: langs |
| langs.append(body, lang, known_length); | |
| String& result=*NEW String(pool()); | // next: letters themselves |
| body.append_know_length(str, known_length); | |
| size_t pos=0; | |
| const Chunk *chunk=&head; | ASSERT_STRING_INVARIANT(*this); |
| do { | return *this; |
| const Chunk::Row *row=chunk->rows; | } |
| for(size_t i=0; i<chunk->count; pos+=row->item.size, i++, row++) { | String& String::append_help_length(const char* str, size_t helper_length, Language lang) { |
| if(row==append_here) | if(!str) |
| goto break2; | return *this; |
| size_t known_length=helper_length?helper_length:strlen(str); | |
| size_t item_finish=pos+row->item.size; | if(!known_length) |
| if(item_finish > start) { // started now or already? | return *this; |
| bool started=result.size()==0; // started now? | |
| bool finished=finish <= item_finish; // finished now? | return append_know_length(str, known_length, lang); |
| size_t offset=started?start-pos:0; | } |
| size_t size=finished?finish-pos:row->item.size; | String& String::append_strdup(const char* str, size_t helper_length, Language lang) { |
| result.APPEND( | size_t known_length=helper_length?helper_length:strlen(str); |
| row->item.ptr+offset, size-offset, | if(!known_length) |
| row->item.lang, | return *this; |
| row->item.origin.file, row->item.origin.line); | |
| if(finished) | // first: langs |
| goto break2; | langs.append(body, lang, known_length); |
| } | // next: letters themselves |
| } | body.append_strdup_know_length(str, known_length); |
| chunk=row->link; | |
| } while(chunk); | ASSERT_STRING_INVARIANT(*this); |
| break2: | return *this; |
| // SAPI::log(pool(), "piece of '%s' from %d to %d is '%s'", | } |
| //cstr(), start, finish, result.cstr()); | |
| /// @todo check in doc: whether it documents NOW bad situation "abc".mid(-1, 3) =were?="ab" | |
| String& String::mid(size_t substr_begin, size_t substr_end) const { | |
| String& result=*new String; | |
| size_t self_length=length(); | |
| substr_begin=min(substr_begin, self_length); | |
| substr_end=min(max(substr_end, substr_begin), self_length); | |
| size_t substr_length=substr_end-substr_begin; | |
| if(!substr_length) | |
| return result; | |
| // first: their langs | |
| result.langs.append(result.body, langs, substr_begin, substr_length); | |
| // next: letters themselves | |
| result.body=body.mid(substr_begin, substr_length); | |
| ASSERT_STRING_INVARIANT(result); | |
| return result; | return result; |
| } | } |
| int String::pos(const String& substr, | size_t String::pos(const String::Body substr, size_t this_offset, Language lang) const { |
| size_t result, Untaint_lang lang) const { | size_t substr_length=substr.length(); |
| for(; result<size(); result++) { | while(true) { |
| int partial; cmp(partial, substr, result, lang); | size_t substr_begin=body.pos(substr, this_offset); |
| if( | |
| partial==0 || // full match | if(substr_begin==CORD_NOT_FOUND) |
| partial==2) // 'substr' starts 'this'+'result' | return STRING_NOT_FOUND; |
| return result; | |
| if(langs.check_lang(lang, substr_begin, substr_length)) | |
| return substr_begin; | |
| this_offset=substr_begin+substr_length; | |
| } | } |
| return -1; | |
| } | } |
| int String::pos(const char *substr, size_t substr_size, | size_t String::pos(const String& substr, |
| size_t result, Untaint_lang lang) const { | size_t this_offset, Language lang) const { |
| for(; result<size(); result++) { | return pos(substr.body, this_offset, lang); |
| int partial; cmp(partial, substr, substr_size, result, lang); | |
| if( | |
| partial==0 || // full match | |
| partial==2) // 'substr' starts 'this'+'result' | |
| return result; | |
| } | |
| return -1; | |
| } | } |
| void String::split(Array& result, | void String::split(ArrayString& result, |
| size_t* pos_after_ref, | size_t& pos_after, |
| const char *delim, size_t delim_size, | const char* delim, |
| Untaint_lang lang, int limit) const { | Language lang, int limit) const { |
| if(delim_size) { | size_t self_length=length(); |
| size_t pos_after=pos_after_ref?*pos_after_ref:0; | if(size_t delim_length=strlen(delim)) { |
| int pos_before; | size_t pos_before; |
| // while we have 'delim'... | // while we have 'delim'... |
| for(; (pos_before=pos(delim, delim_size, pos_after, lang))>=0 && limit; limit--) { | for(; (pos_before=pos(delim, pos_after, lang))!=STRING_NOT_FOUND && limit; limit--) { |
| result+=&mid(pos_after, pos_before); | result+=&mid(pos_after, pos_before); |
| pos_after=pos_before+delim_size; | pos_after=pos_before+delim_length; |
| } | } |
| // last piece | // last piece |
| if(pos_after<size() && limit) { | if(pos_after<self_length && limit) { |
| result+=&mid(pos_after, size()); | result+=&mid(pos_after, self_length); |
| pos_after=size(); | pos_after=self_length; |
| } | } |
| if(pos_after_ref) | |
| *pos_after_ref=pos_after; | |
| } else { // empty delim | } else { // empty delim |
| result+=this; | result+=this; |
| if(pos_after_ref) | pos_after+=self_length; |
| *pos_after_ref+=size(); | |
| } | } |
| } | } |
| void String::split(Array& result, | void String::split(ArrayString& result, |
| size_t* pos_after_ref, | size_t& pos_after, |
| const String& delim, Untaint_lang lang, | const String& delim, Language lang, |
| int limit) const { | int limit) const { |
| if(delim.size()) { | if(!delim.is_empty()) { |
| size_t pos_after=pos_after_ref?*pos_after_ref:0; | size_t pos_before; |
| int pos_before; | |
| // while we have 'delim'... | // while we have 'delim'... |
| for(; (pos_before=pos(delim, pos_after, lang))>=0 && limit; limit--) { | for(; (pos_before=pos(delim, pos_after, lang))!=STRING_NOT_FOUND && limit; limit--) { |
| result+=&mid(pos_after, pos_before); | result+=&mid(pos_after, pos_before); |
| pos_after=pos_before+delim.size(); | pos_after=pos_before+delim.length(); |
| } | } |
| // last piece | // last piece |
| if(pos_after<size() && limit) { | if(pos_after<length() && limit) { |
| result+=&mid(pos_after, size()); | result+=&mid(pos_after, length()); |
| pos_after=size(); | pos_after=length(); |
| } | } |
| if(pos_after_ref) | |
| *pos_after_ref=pos_after; | |
| } else { // empty delim | } else { // empty delim |
| result+=this; | result+=this; |
| if(pos_after_ref) | pos_after+=length(); |
| *pos_after_ref+=size(); | |
| } | } |
| } | } |
| static void regex_options(char *options, int *result){ | enum Match_feature { |
| MF_NEED_PRE_POST_MATCH = 0x01, | |
| MF_JUST_COUNT_MATCHES = 0x02 | |
| }; | |
| static void regex_options(const String* options, int* result, int* match_features){ | |
| struct Regex_option { | struct Regex_option { |
| char key; | const char* keyL; |
| int clear, set; | const char* keyU; |
| int clear; | |
| int set; | |
| int *result; | int *result; |
| int flag; | |
| } regex_option[]={ | } regex_option[]={ |
| {'i', 0, PCRE_CASELESS, result}, // a=A | {"i", "I", 0, PCRE_CASELESS, result, 0}, // a=A |
| {'s', 0, PCRE_DOTALL, result}, // \n\n$ [default] | {"s", "S", 0, PCRE_DOTALL, result, 0}, // \n\n$ [default] |
| {'x', 0, PCRE_EXTENDED, result}, // whitespace in regex ignored | {"x", "U", 0, PCRE_EXTENDED, result, 0}, // whitespace in regex ignored |
| {'m', PCRE_DOTALL, PCRE_MULTILINE, result}, // ^aaa\n$^bbb\n$ | {"m", "M", PCRE_DOTALL, PCRE_MULTILINE, result, 0}, // ^aaa\n$^bbb\n$ |
| {'g', 0, true, result+1}, // many rows | {"g", "G", 0, 1, result+1, 0}, // many rows |
| {0}, | {"'", 0, 0, 0, 0, MF_NEED_PRE_POST_MATCH}, |
| {"n", "N", 0, 0, 0, MF_JUST_COUNT_MATCHES}, | |
| {0, 0, 0, 0, 0, 0} | |
| }; | }; |
| result[0]=PCRE_EXTRA | PCRE_DOTALL; | result[0]=PCRE_EXTRA | PCRE_DOTALL | PCRE_DOLLAR_ENDONLY; |
| result[1]=0; | result[1]=0; |
| if(options) | if(options && !options->is_empty()) |
| for(Regex_option *o=regex_option; o->key; o++) | for(Regex_option *o=regex_option; o->keyL; o++) |
| if( | if( |
| strchr(options, o->key) || | options->pos(o->keyL)!=STRING_NOT_FOUND |
| strchr(options, toupper(o->key))) { | || (o->keyU && options->pos(o->keyU)!=STRING_NOT_FOUND) |
| *(o->result)&=~o->clear; | ){ |
| *(o->result)|=o->set; | if(o->flag){ |
| (*match_features) |= o->flag; | |
| } else { | |
| *o->result &= ~o->clear; | |
| *o->result |= o->set; | |
| } | |
| } | } |
| } | } |
| /// @todo maybe need speedup: some option to remove pre/match/post string generation | Table* String::match(Charset& source_charset, |
| bool String::match(const unsigned char *pcre_tables, | const String& regexp, |
| const String *aorigin, | const String* options, |
| const String& regexp, | Row_action row_action, void *info, |
| const String *options, | int& matches_count) const { |
| Table **table, | if(regexp.is_empty()) |
| Row_action row_action, void *info) const { | throw Exception(0, |
| 0, | |
| if(!regexp.size()) | |
| THROW(0, 0, | |
| aorigin, | |
| "regexp is empty"); | "regexp is empty"); |
| const char *pattern=regexp.cstr(UL_AS_IS); | |
| const char *errptr; | const char* pattern=regexp.cstr(String::L_UNSPECIFIED); // fix any tainted with L_REGEX |
| const char* errptr; | |
| int erroffset; | int erroffset; |
| int option_bits[2]; regex_options(options?options->cstr():0, option_bits); | int option_bits[2]={0}; |
| int match_features=0; | |
| regex_options(options, option_bits, &match_features); | |
| bool need_pre_post_match=(match_features & MF_NEED_PRE_POST_MATCH) != 0; | |
| bool just_count_matches=(match_features & MF_JUST_COUNT_MATCHES) != 0; | |
| bool global=option_bits[1]!=0; | |
| pcre *code=pcre_compile(pattern, option_bits[0], | pcre *code=pcre_compile(pattern, option_bits[0], |
| &errptr, &erroffset, | &errptr, &erroffset, |
| pcre_tables); | source_charset.pcre_tables); |
| if(!code) | if(!code) |
| THROW(0, 0, | throw Exception(0, |
| &regexp.mid(erroffset, regexp.size()), | &regexp.mid(erroffset, regexp.length()), |
| "regular expression syntax error - %s", errptr); | "regular expression syntax error - %s", errptr); |
| int info_substrings=pcre_info(code, 0, 0); | int subpatterns=pcre_info(code, 0, 0); |
| if(info_substrings<0) { | if(subpatterns<0) { |
| (*pcre_free)(code); | pcre_free(code); |
| THROW(0, 0, | throw Exception(0, |
| aorigin, | &regexp, |
| "pcre_info error (%d)", | "pcre_info error (%d)", |
| info_substrings); | subpatterns); |
| } | } |
| int startoffset=0; | const char* subject=cstr(); |
| const char *subject=cstr(UL_AS_IS); | size_t subject_length=strlen(subject); |
| int length=strlen(subject); | const int oveclength=(1/*match*/+MAX_MATCH_GROUPS)*3; |
| int ovecsize; | int ovector[oveclength]; |
| int *ovector=(int *)malloc(sizeof(int)* | |
| (ovecsize=(1/*match*/+info_substrings)*3)); | // create table |
| Table::Action_options table_options; | |
| { // create table | Table& table=*new Table(string_match_table_template, table_options); |
| Array& columns=*NEW Array(pool()); | |
| columns+=string_pre_match_name; | |
| columns+=string_match_name; | |
| columns+=string_post_match_name; | |
| for(int i=1; i<=info_substrings; i++) { | |
| char *column=(char *)malloc(MAX_NUMBER); | |
| snprintf(column, MAX_NUMBER, "%d", i); | |
| columns+=NEW String(pool(), column); // .i column name | |
| } | |
| *table=NEW Table(pool(), aorigin, &columns); | |
| } | |
| int exec_option_bits=0; | int exec_option_bits=0; |
| int prestart=0; | |
| int poststart=0; | |
| int postfinish=length(); | |
| while(true) { | while(true) { |
| int exec_substrings=pcre_exec(code, 0, | int exec_substrings=pcre_exec(code, 0, |
| subject, length, startoffset, | subject, subject_length, prestart, |
| exec_option_bits, ovector, ovecsize); | exec_option_bits, ovector, oveclength); |
| if(exec_substrings==PCRE_ERROR_NOMATCH) { | if(exec_substrings==PCRE_ERROR_NOMATCH) { |
| (*pcre_free)(code); | pcre_free(code); |
| (*row_action)(**table, 0/*last time, no row*/, 0, 0, info); | row_action(table, 0/*last time, no raw*/, 0, 0, poststart, postfinish, info); |
| return option_bits[1]!=0; // global=true+table, not global=false | // if(global || subpatterns) |
| // return &table; // global or with subpatterns=true+result | |
| // else { | |
| // just_matched=false; return 0; // not global=no result | |
| // } | |
| return just_count_matches ? 0 : &table; | |
| } | } |
| if(exec_substrings<0) { | if(exec_substrings<0) { |
| (*pcre_free)(code); | pcre_free(code); |
| THROW(0, 0, | throw Exception(0, |
| aorigin, | &regexp, |
| "regular expression execute error (%d)", | "regular expression execute error (%d)", |
| exec_substrings); | exec_substrings); |
| } | } |
| Array& row=*NEW Array(pool()); | int prefinish=ovector[0]; |
| row+=&mid(0, ovector[0]); // .prematch column value | poststart=ovector[1]; |
| row+=&mid(ovector[0], ovector[1]); // .match | ArrayString* row=new ArrayString; |
| row+=&mid(ovector[1], size()); // .postmatch | if(need_pre_post_match) { |
| *row+=&mid(0, prefinish); // .prematch column value | |
| *row+=&mid(prefinish, poststart); // .match | |
| *row+=&mid(poststart, postfinish); // .postmatch | |
| } else { | |
| *row+=&Empty; // .prematch column value | |
| *row+=&Empty; // .match | |
| *row+=&Empty; // .postmatch | |
| } | |
| for(int i=1; i<exec_substrings; i++) { | for(int i=1; i<exec_substrings; i++) { |
| // -1:-1 case handled peacefully by mid() itself | // -1:-1 case handled peacefully by mid() itself |
| row+=&mid(ovector[i*2+0], ovector[i*2+1]); // .i column value | *row+=&mid(ovector[i*2+0], ovector[i*2+1]); // .i column value |
| } | } |
| (*row_action)(**table, &row, startoffset, ovector[0], info); | matches_count++; |
| row_action(table, row, prestart, prefinish, poststart, postfinish, info); | |
| if(!option_bits[1] || !(startoffset=ovector[1])) { // not global | going to hang | if(!global || prestart==poststart) { // not global | going to hang |
| (*pcre_free)(code); | pcre_free(code); |
| (*row_action)(**table, 0/*last time, no row*/, 0, 0, info); | row_action(table, 0/*last time, no row*/, 0, 0, poststart, postfinish, info); |
| return true; | return just_count_matches ? 0 : &table; |
| // return &table; | |
| } | } |
| prestart=poststart; | |
| /* | /* |
| if(option_bits[0] & PCRE_MULTILINE) | if(option_bits[0] & PCRE_MULTILINE) |
| Line 584 bool String::match(const unsigned char * | Line 515 bool String::match(const unsigned char * |
| } | } |
| } | } |
| String& String::change_case(Pool& pool, const unsigned char *tables, | String& String::change_case(Charset& source_charset, Change_case_kind kind) const { |
| Change_case_kind kind) const { | String& result=*new String(); |
| String& result=*new(pool) String(pool); | if(is_empty()) |
| return result; | |
| const unsigned char *a; | |
| const unsigned char *b; | char* new_cstr=cstrm(); |
| switch(kind) { | size_t new_cstr_len=length(); |
| case CC_UPPER: | if(source_charset.isUTF8()) { |
| a=tables+lcc_offset; | switch(kind) { |
| b=tables+fcc_offset; | case CC_UPPER: |
| break; | change_case_UTF8((const XMLByte*)new_cstr, new_cstr_len, (XMLByte*)new_cstr, new_cstr_len, UTF8CaseToUpper); |
| case CC_LOWER: | break; |
| a=tables+lcc_offset; | case CC_LOWER: |
| b=0; | change_case_UTF8((const XMLByte*)new_cstr, new_cstr_len, (XMLByte*)new_cstr, new_cstr_len, UTF8CaseToLower); |
| break; | break; |
| default: | default: |
| PTHROW(0, 0, | assert(!"unknown change case kind"); |
| this, | break; // never |
| "unknown change case kind #%d", | } |
| static_cast<int>(kind)); // never | |
| a=b=0; // calm, compiler | } else { |
| break; // never | const unsigned char *tables=source_charset.pcre_tables; |
| } | |
| const Chunk *chunk=&head; | |
| do { | |
| const Chunk::Row *row=chunk->rows; | |
| for(size_t i=0; i<chunk->count; i++, row++) { | |
| if(row==append_here) | |
| goto break2; | |
| char *new_cstr=(char *)pool.malloc(row->item.size); | |
| char *dest=new_cstr; | |
| const char *src=row->item.ptr; | |
| for(int size=row->item.size; size--; src++) { | |
| unsigned char c=a[(unsigned char)*src]; | |
| if(b) | |
| c=b[c]; | |
| *dest++=(char)c; | const unsigned char *a; |
| } | const unsigned char *b; |
| switch(kind) { | |
| result.APPEND(new_cstr, row->item.size, | case CC_UPPER: |
| row->item.lang, | a=tables+lcc_offset; |
| row->item.origin.file, row->item.origin.line); | b=tables+fcc_offset; |
| } | break; |
| chunk=row->link; | case CC_LOWER: |
| } while(chunk); | a=tables+lcc_offset; |
| break2: | b=0; |
| break; | |
| default: | |
| assert(!"unknown change case kind"); | |
| a=b=0; // calm, compiler | |
| break; // never | |
| } | |
| char *dest=new_cstr; | |
| unsigned char index; | |
| for(const char* current=new_cstr; (index=(unsigned char)*current); current++) { | |
| unsigned char c=a[index]; | |
| if(b) | |
| c=b[c]; | |
| *dest++=(char)c; | |
| } | |
| } | |
| result.langs=langs; | |
| result.body=new_cstr; | |
| return result; | return result; |
| } | } |
| double String::as_double() { | const String& String::replace(const Dictionary& dict) const { |
| double result; | String& result=*new String(); |
| const char *cstr=this->cstr(); | const char* old_cstr=cstr(); |
| char *error_pos=0; | const char* prematch_begin=old_cstr; |
| // 0xABC | |
| if(cstr[0]=='0' && (cstr[1]=='x' || cstr[1]=='X')) | const char* current=old_cstr; |
| result=(double)(unsigned long)strtol(cstr, &error_pos, 0); | while(*current) { |
| else | if(Dictionary::Subst subst=dict.first_that_begins(current)) { |
| result=strtod(cstr, &error_pos); | // prematch |
| if(size_t prematch_length=current-prematch_begin) { | |
| result.langs.append(result.body, langs, prematch_begin-old_cstr, prematch_length); | |
| result.body.append_strdup_know_length(prematch_begin, prematch_length); | |
| } | |
| // match | |
| // skip 'a' in 'current'; move prematch_begin | |
| current+=subst.from_length; prematch_begin=current; | |
| if(error_pos && *error_pos) | if(const String* b=subst.to) // are there any b? |
| THROW(0, 0, | result<<*b; |
| this, | } else // simply advance |
| "invalid number (double)"); | current++; |
| } | |
| // postmatch | |
| if(size_t postmatch_length=current-prematch_begin) { | |
| result.langs.append(result.body, langs, prematch_begin-old_cstr, postmatch_length); | |
| result.body.append_strdup_know_length(prematch_begin, postmatch_length); | |
| } | |
| ASSERT_STRING_INVARIANT(result); | |
| return result; | return result; |
| } | } |
| int String::as_int() { | |
| int result; | static int serialize_body_char(char c, char** cur) { |
| const char *cstr=this->cstr(); | *((*cur)++)=c; |
| char *error_pos=0; | return 0; // 0=continue |
| // 0xABC | }; |
| if(cstr[0]=='0' && (cstr[1]=='x' || cstr[1]=='X')) | static int serialize_body_piece(const char* s, char** cur) { |
| result=(int)(unsigned long)strtol(cstr, &error_pos, 0); | size_t length=strlen(s); |
| memcpy(*cur, s, length); *cur+=length; | |
| return 0; // 0=continue | |
| }; | |
| static int serialize_lang_piece(char alang, size_t asize, char** cur) { | |
| // lang | |
| **cur=alang; (*cur)++; | |
| // length [WARNING: not cast, addresses must be %4=0 on sparc] | |
| memcpy(*cur, &asize, sizeof(asize)); *cur+=sizeof(asize); | |
| return 0; // 0=continue | |
| } | |
| String::Cm String::serialize(size_t prolog_length) const { | |
| size_t fragments_count=langs.count(); | |
| size_t body_length=body.length(); | |
| size_t buf_length= | |
| prolog_length //1 | |
| +sizeof(size_t) //2 | |
| +body_length //3 | |
| +1 // 4 for zero terminator used in deserialize | |
| +sizeof(size_t) //5 | |
| +fragments_count*(sizeof(char)+sizeof(size_t)); //6 | |
| String::Cm result(new(PointerFreeGC) char[buf_length], buf_length); | |
| // 1: prolog | |
| char *cur=result.str+prolog_length; | |
| // 2: chars.count [WARNING: not cast, addresses must be %4=0 on sparc] | |
| memcpy(cur, &body_length, sizeof(body_length)); cur+=sizeof(body_length); | |
| // 3: letters | |
| body.for_each(serialize_body_char, serialize_body_piece, &cur); | |
| // 4: zero terminator | |
| *cur++=0; | |
| // 5: langs.count [WARNING: not cast, addresses must be %4=0 on sparc] | |
| memcpy(cur, &fragments_count, sizeof(fragments_count)); cur+=sizeof(fragments_count); | |
| // 6: lang info | |
| langs.for_each(body, serialize_lang_piece, &cur); | |
| return result; | |
| } | |
| bool String::deserialize(size_t prolog_size, void *buf, size_t buf_size) { | |
| size_t in_buf=buf_size; | |
| if(in_buf<=prolog_size) | |
| return false; | |
| in_buf-=prolog_size; | |
| // 1: prolog | |
| const char* cur=(const char* )buf+prolog_size; | |
| // 2: chars.count | |
| size_t body_length; | |
| if(in_buf<sizeof(body_length)) // body.length don't fit? | |
| return false; | |
| // [WARNING: not cast, addresses must be %4=0 on sparc] | |
| memcpy(&body_length, cur, sizeof(body_length)); cur+=sizeof(body_length); | |
| in_buf-=sizeof(body_length); | |
| if(in_buf<body_length+1) // letters+terminator don't fit? | |
| return false; | |
| // 4: zero terminator | |
| if(cur[body_length] != 0) // in place? | |
| return false; | |
| // 3: letters | |
| body=String::Body(cur, body_length); | |
| cur+=body_length+1; | |
| in_buf-=body_length+1; | |
| // 5: langs.count | |
| size_t fragments_count; | |
| if(in_buf<sizeof(fragments_count)) // langs.count don't fit? | |
| return false; | |
| // [WARNING: not cast, addresses must be %4=0 on sparc] | |
| memcpy(&fragments_count, cur, sizeof(fragments_count)); cur+=sizeof(fragments_count); | |
| in_buf-=sizeof(fragments_count); | |
| if(fragments_count) { | |
| // 6: lang info | |
| size_t total_length=0; | |
| for(size_t f=0; f<fragments_count; f++) { | |
| char lang; | |
| size_t fragment_length; | |
| size_t piece_length=sizeof(lang)+sizeof(fragment_length); | |
| if(in_buf<piece_length) // lang+length | |
| return false; | |
| // lang | |
| lang=*cur++; | |
| // length [WARNING: not cast, addresses must be %4=0 on sparc] | |
| memcpy(&fragment_length, cur, sizeof(fragment_length)); cur+=sizeof(fragment_length); | |
| size_t combined_length=total_length+fragment_length; | |
| if(combined_length>body_length) | |
| return false; // file curruption | |
| // uchar needed to prevent propagating 0x80 bit to upper bytes | |
| langs.append(total_length, (String::Language)(uchar)lang, fragment_length); | |
| total_length=combined_length; | |
| in_buf-=piece_length; | |
| } | |
| if(total_length!=body_length) // length(all language fragments) vs length(letters) | |
| return false; | |
| } | |
| if(in_buf!=0) // some strange extra bytes | |
| return false; | |
| ASSERT_STRING_INVARIANT(*this); | |
| return true; | |
| } | |
| const char* String::Body::v() const { | |
| return CORD_to_const_char_star(body); | |
| } | |
| void String::Body::dump() const { | |
| CORD_dump(body); | |
| } | |
| const char* String::Languages::v() const { | |
| if(opt.is_not_just_lang) | |
| return CORD_to_const_char_star(langs); | |
| else | |
| return (const char*)&langs; | |
| } | |
| void String::Languages::dump() const { | |
| if(opt.is_not_just_lang) | |
| CORD_dump(langs); | |
| else | else |
| result=(int)strtol(cstr, &error_pos, 0); | puts((const char*)&langs); |
| } | |
| const char* String::v() const { | |
| const uint LIMIT_VIEW=20; | |
| char* buf=(char*)malloc(MAX_STRING); | |
| const char*body_view=body.v(); | |
| const char*langs_view=langs.v(); | |
| snprintf(buf, MAX_STRING, | |
| "%d:%.*s%s} " | |
| "{%d:%s", | |
| langs.count(), LIMIT_VIEW, langs_view, strlen(langs_view)>LIMIT_VIEW?"...":"", | |
| strlen(body_view), body_view | |
| ); | |
| return buf; | |
| } | |
| void String::dump() const { | |
| body.dump(); | |
| langs.dump(); | |
| } | |
| const String& String::trim(String::Trim_kind kind, const char* chars) const { | |
| if(!length()) | |
| return *this; | |
| size_t substr_begin, substr_length; | |
| Body new_body=body.trim(kind, chars, &substr_begin, &substr_length); | |
| if(new_body==body) // we received unchanged pointer, do likewise | |
| return *this; | |
| // new_body differs from body, adjust langs along | |
| if(error_pos && *error_pos) | String& result=*new String; |
| THROW(0, 0, | if(!new_body) // body.trim produced empty result |
| this, | return result; |
| "invalid number (int)"); | // body.trim produced nonempty result |
| // first: their langs | |
| result.langs.append(result.body, langs, substr_begin, substr_length); | |
| // next: letters themselves | |
| result.body=new_body; | |
| ASSERT_STRING_INVARIANT(result); | |
| return result; | return result; |
| } | } |