|
|
| version 1.71, 2001/04/05 13:27:14 | version 1.158, 2002/04/22 14:25:41 |
|---|---|
| Line 1 | Line 1 |
| /** @file | /** @file |
| Parser: string class. @see untaint.C. | Parser: string class. @see untaint.C. |
| Copyright (c) 2001 ArtLebedev Group (http://www.artlebedev.com) | Copyright (c) 2001, 2002 ArtLebedev Group (http://www.artlebedev.com) |
| Author: Alexandr Petrosian <paf@design.ru> (http://paf.design.ru) | |
| Author: Alexander Petrosyan <paf@design.ru> (http://design.ru/paf) | |
| $Id$ | $Id$ |
| */ | */ |
| #include "pa_config_includes.h" | |
| #include <locale.h> | |
| #include "pcre.h" | #include "pcre.h" |
| #include "pa_pool.h" | #include "pa_pool.h" |
| Line 22 | Line 17 |
| #include "pa_array.h" | #include "pa_array.h" |
| #include "pa_globals.h" | #include "pa_globals.h" |
| #include "pa_table.h" | #include "pa_table.h" |
| #include "pa_threads.h" | #include "pa_dictionary.h" |
| #include "pa_charset.h" | |
| //#include "pa_sapi.h" | #define DEBUG_STRING_APPENDS_VS_EXPANDS |
| // String | |
| String::String(Pool& apool, const char *src, bool tasize_ted) : | #ifdef DEBUG_STRING_APPENDS_VS_EXPANDS |
| ulong string_piece_appends=0; | |
| #endif | |
| String::String(Pool& apool, const char *src, size_t src_size, bool tainted) : | |
| Pooled(apool) { | Pooled(apool) { |
| last_chunk=&head; | last_chunk=&head.chunk; |
| head.count=CR_PREALLOCATED_COUNT; | head.chunk.count=CR_PREALLOCATED_COUNT; |
| append_here=head.rows; | append_here=head.chunk.rows; |
| head.preallocated_link=0; | |
| link_row=&head.rows[head.count]; | |
| fused_rows=fsize=0; | |
| if(src) | if(src) |
| if(tasize_ted) | if(tainted) |
| APPEND_TAINTED(src, 0, 0, 0); | APPEND_TAINTED(src, src_size, 0, 0); |
| else | else |
| APPEND_CONST(src); | APPEND_CLEAN(src, src_size, 0, 0); |
| } | } |
| void String::expand() { | String::String(const String& src) : |
| size_t new_chunk_count=last_chunk->count+last_chunk->count*CR_GROW_PERCENT/100; | Pooled(src.pool()) { |
| last_chunk=static_cast<Chunk *>( | last_chunk=&head.chunk; |
| malloc(sizeof(size_t)+sizeof(Chunk::Row)*new_chunk_count+sizeof(Chunk *))); | head.chunk.count=CR_PREALLOCATED_COUNT; |
| last_chunk->count=new_chunk_count; | append_here=head.chunk.rows; |
| link_row->link=last_chunk; | |
| append_here=last_chunk->rows; | |
| link_row=&last_chunk->rows[last_chunk->count]; | |
| link_row->link=0; | |
| } | |
| String::String(const String& src) : Pooled(src.pool()) { | append(src, UL_UNSPECIFIED); |
| head.count=CR_PREALLOCATED_COUNT; | } |
| size_t src_used_rows=src.fused_rows; | |
| if(src_used_rows<=head.count) { | |
| // all new rows fit into preallocated area | |
| size_t curr_chunk_rows=head.count; | |
| memcpy(head.rows, src.head.rows, sizeof(Chunk::Row)*src_used_rows); | |
| append_here=&head.rows[src_used_rows]; | |
| link_row=&head.rows[curr_chunk_rows]; | |
| } else { | |
| // warning: | |
| // heavily relies on the fact | |
| // "preallocated area is the same for all strings" | |
| // | |
| // info: | |
| // allocating only enough mem to fit src string rows | |
| // next append would allocate a new chunk | |
| // | |
| // new rows don't fit into preallocated area: splitting into two chunks | |
| // preallocated chunk src to constructing head | |
| memcpy(head.rows, src.head.rows, sizeof(Chunk::Row)*head.count); | |
| // remaining rows into new_chunk | |
| size_t curr_chunk_rows=src_used_rows-head.count; | |
| Chunk *new_chunk=static_cast<Chunk *>( | |
| malloc(sizeof(size_t)+sizeof(Chunk::Row)*curr_chunk_rows+sizeof(Chunk *))); | |
| new_chunk->count=curr_chunk_rows; | |
| head.preallocated_link=new_chunk; | |
| append_here=link_row=&new_chunk->rows[new_chunk->count]; | |
| Chunk *old_chunk=src.head.preallocated_link; | |
| Chunk::Row *new_rows=new_chunk->rows; | |
| size_t rows_left_to_copy=new_chunk->count; | |
| while(true) { | |
| size_t old_count=old_chunk->count; | |
| Chunk *next_chunk=old_chunk->rows[old_count].link; | |
| if(next_chunk) { | |
| // not last source chunk | |
| // taking it all | |
| memcpy(new_rows, old_chunk->rows, sizeof(Chunk::Row)*old_count); | |
| new_rows+=old_count; | |
| rows_left_to_copy-=old_count; | |
| old_chunk=next_chunk; | size_t String::size() const { |
| } else { | size_t result=0; |
| // the last source chunk | STRING_FOREACH_ROW( |
| // taking only those rows of chunk that _left_to_copy | result+=row->item.size; |
| memcpy(new_rows, old_chunk->rows, sizeof(Chunk::Row)*rows_left_to_copy); | ); |
| break; | return result; |
| } | |
| } | |
| } | |
| link_row->link=0; | |
| fused_rows=src_used_rows; | |
| fsize=src.fsize; | |
| } | } |
| String& String::append(const String& src, Untaint_lang lang, bool forced) { | /// @todo not very optimal |
| const Chunk *chunk=&src.head; | uint String::used_rows() const { |
| do { | uint result=0; |
| const Chunk::Row *row=chunk->rows; | STRING_FOREACH_ROW( |
| for(size_t i=0; i<chunk->count; i++, row++) { | result++; |
| if(row==src.append_here) | ); |
| goto break2; | return result; |
| } | |
| APPEND(row->item.ptr, row->item.size, | void String::expand() { |
| (lang!=UL_PASS_APPENDED && (row->item.lang==UL_TAINTED || forced))?lang:row->item.lang, | uint new_chunk_count=last_chunk->count+CR_GROW_COUNT; |
| row->item.origin.file, row->item.origin.line); | if(new_chunk_count>max_integral(Chunk::count_type)) |
| } | new_chunk_count=max_integral(Chunk::count_type); |
| chunk=row->link; | |
| } while(chunk); | Chunk *new_chunk=static_cast<Chunk *>(malloc( |
| break2: | sizeof(Chunk)// count+interpadding(?)+rows[CR_PREALLOCATED_COUNT]+tailpadding(??) |
| return *this; | -sizeof(Chunk::rows_type) // PREALLOCATED rows |
| +sizeof(Chunk::Row)*new_chunk_count // needed rows | |
| +sizeof(Chunk *) // link size | |
| , 10)); | |
| new_chunk->rows[new_chunk->count=new_chunk_count].link=0; | |
| last_chunk->rows[last_chunk->count].link=new_chunk; | |
| last_chunk=new_chunk; | |
| append_here=last_chunk->rows; | |
| } | } |
| String& String::real_append(STRING_APPEND_PARAMS) { | String& String::real_append(STRING_APPEND_PARAMS) { |
| if(!last_chunk) // growth stopped [we're appended as string to somebody] | |
| throw Exception(0, | |
| this, | |
| "string growth stopped (append cstr)"); | |
| if(!src) | if(!src) |
| return *this; | return *this; |
| if(!size) | if(!size) |
| Line 138 String& String::real_append(STRING_APPEN | Line 96 String& String::real_append(STRING_APPEN |
| if(!size) | if(!size) |
| return *this; | return *this; |
| #ifdef DEBUG_STRING_APPENDS_VS_EXPANDS | |
| string_piece_appends++; | |
| #endif | |
| // manually unrolled to avoid extra check | |
| while(size>max_integral(Chunk::Row::item_size_type)) { | |
| if(chunk_is_full()) | |
| expand(); | |
| append_here->item.ptr=src; | |
| append_here->item.size=max_integral(Chunk::Row::item_size_type); | |
| append_here->item.lang=lang; | |
| #ifndef NO_STRING_ORIGIN | |
| append_here->item.origin.file=file; | |
| append_here->item.origin.line=line; | |
| #endif | |
| append_here++; | |
| src+=max_integral(Chunk::Row::item_size_type); | |
| size-=max_integral(Chunk::Row::item_size_type); | |
| } | |
| if(chunk_is_full()) | if(chunk_is_full()) |
| expand(); | expand(); |
| append_here->item.ptr=src; | append_here->item.ptr=src; |
| fsize+=append_here->item.size=size; | append_here->item.size=size; |
| append_here->item.lang=lang; | append_here->item.lang=lang; |
| #ifndef NO_STRING_ORIGIN | #ifndef NO_STRING_ORIGIN |
| append_here->item.origin.file=file; | append_here->item.origin.file=file; |
| append_here->item.origin.line=line; | append_here->item.origin.line=line; |
| #endif | #endif |
| append_here++; fused_rows++; | append_here++; |
| return *this; | return *this; |
| } | } |
| uint String::hash_code() const { | char String::first_char() const { |
| uint result=0; | if(is_empty()) |
| throw Exception(0, | |
| this, | |
| "getting first char of empty string"); | |
| const Chunk *chunk=&head; | return *head.chunk.rows[0].item.ptr; |
| do { | } |
| const Chunk::Row *row=chunk->rows; | |
| for(size_t i=0; i<chunk->count; i++) { | |
| if(row==append_here) | |
| goto break2; | |
| uint String::hash_code() const { | |
| uint result=0; | |
| STRING_FOREACH_ROW( | |
| result=Hash::generic_code(result, row->item.ptr, row->item.size); | result=Hash::generic_code(result, row->item.ptr, row->item.size); |
| row++; | ); |
| } | |
| chunk=row->link; | |
| } while(chunk); | |
| break2: | |
| return result; | return result; |
| } | } |
| Line 176 break2: | Line 154 break2: |
| int String::cmp(int& partial, const String& src, | int String::cmp(int& partial, const String& src, |
| size_t this_offset, Untaint_lang lang) const { | size_t this_offset, Untaint_lang lang) const { |
| partial=-1; | partial=-1; |
| this_offset=min(this_offset, size()-1); | size_t a_size=size(); |
| this_offset=min(this_offset, a_size-1); | |
| const Chunk *a_chunk=&head; | const Chunk *a_chunk=&head.chunk; |
| const Chunk *b_chunk=&src.head; | const Chunk *b_chunk=&src.head.chunk; |
| const Chunk::Row *a_row=a_chunk->rows; | const Chunk::Row *a_row=a_chunk->rows; |
| const Chunk::Row *b_row=b_chunk->rows; | const Chunk::Row *b_row=b_chunk->rows; |
| size_t a_offset=this_offset; | size_t a_offset=this_offset; |
| size_t b_offset=0; | size_t b_offset=0; |
| Chunk::Row *a_end=append_here; | Chunk::Row *a_end=append_here; |
| Chunk::Row *b_end=src.append_here; | Chunk::Row *b_end=src.append_here; |
| size_t a_countdown=a_chunk->count; | uint a_countdown=a_chunk->count; |
| size_t b_countdown=b_chunk->count; | uint b_countdown=b_chunk->count; |
| bool a_break=false; | int result; |
| bool b_break=false; | |
| size_t result; | |
| size_t pos=0; | size_t pos=0; |
| while(true) { | |
| a_break=a_row==a_end; | |
| b_break=b_row==b_end; | |
| if(a_break || b_break) | |
| break; | |
| bool a_break=a_size==0; | |
| bool b_break=src.is_empty(); | |
| if(!(a_break || b_break)) while(true) { | |
| if(pos+a_row->item.size > this_offset) { | if(pos+a_row->item.size > this_offset) { |
| if(lang!=UL_UNSPECIFIED && a_row->item.lang!=lang) | if(lang!=UL_UNSPECIFIED && a_row->item.lang>lang) |
| return -1; // wrong lang -- bail out | return -1; // wrong lang -- bail out |
| int size_diff= | int size_diff= |
| Line 230 int String::cmp(int& partial, const Stri | Line 205 int String::cmp(int& partial, const Stri |
| pos+=a_row->item.size; | pos+=a_row->item.size; |
| a_row++; a_countdown--; a_offset=0; | a_row++; a_countdown--; a_offset=0; |
| } | } |
| if(b_break=b_row==b_end) { | |
| a_break=a_row==a_end; | |
| break; | |
| } | |
| if(!b_countdown) { | if(!b_countdown) { |
| b_chunk=b_row->link; | b_chunk=b_row->link; |
| b_row=b_chunk->rows; | b_row=b_chunk->rows; |
| Line 242 int String::cmp(int& partial, const Stri | Line 220 int String::cmp(int& partial, const Stri |
| a_row++; a_countdown--; | a_row++; a_countdown--; |
| } | } |
| if(a_break=a_row==a_end) { | |
| b_break=b_row==b_end; | |
| break; | |
| } | |
| if(!a_countdown) { | if(!a_countdown) { |
| a_chunk=a_row->link; | a_chunk=a_row->link; |
| a_row=a_chunk->rows; | a_row=a_chunk->rows; |
| Line 261 int String::cmp(int& partial, const Stri | Line 243 int String::cmp(int& partial, const Stri |
| int String::cmp(int& partial, const char* b_ptr, size_t src_size, | int String::cmp(int& partial, const char* b_ptr, size_t src_size, |
| size_t this_offset, Untaint_lang lang) const { | size_t this_offset, Untaint_lang lang) const { |
| partial=-1; | partial=-1; |
| size_t a_size=size(); | |
| size_t b_size=src_size?src_size:b_ptr?strlen(b_ptr):0; | size_t b_size=src_size?src_size:b_ptr?strlen(b_ptr):0; |
| this_offset=min(this_offset, size()-1); | this_offset=min(this_offset, a_size-1); |
| const Chunk *a_chunk=&head; | const Chunk *a_chunk=&head.chunk; |
| const Chunk::Row *a_row=a_chunk->rows; | const Chunk::Row *a_row=a_chunk->rows; |
| size_t a_offset=this_offset; | size_t a_offset=this_offset; |
| size_t b_offset=0; | size_t b_offset=0; |
| Chunk::Row *a_end=append_here; | Chunk::Row *a_end=append_here; |
| size_t a_countdown=a_chunk->count; | uint a_countdown=a_chunk->count; |
| bool a_break=false; | |
| bool b_break=false; | |
| size_t pos=0; | size_t pos=0; |
| while(true) { | |
| a_break=a_row==a_end; | |
| if(a_break || b_break) | |
| break; | |
| bool a_break=a_size==0; | |
| bool b_break=b_size==0; | |
| if(!(a_break || b_break)) while(true) { | |
| if(pos+a_row->item.size > this_offset) { | if(pos+a_row->item.size > this_offset) { |
| if(lang!=UL_UNSPECIFIED && a_row->item.lang!=lang) | if(lang!=UL_UNSPECIFIED && a_row->item.lang>lang) |
| return -1; // wrong lang -- bail out | return -1; // wrong lang -- bail out |
| int size_diff= | int size_diff= |
| Line 287 int String::cmp(int& partial, const char | Line 267 int String::cmp(int& partial, const char |
| (b_size-b_offset); | (b_size-b_offset); |
| if(size_diff==0) { // a has same size as b | if(size_diff==0) { // a has same size as b |
| if(size_t result=memcmp(a_row->item.ptr+a_offset, b_ptr+b_offset, | if(int result=memcmp(a_row->item.ptr+a_offset, b_ptr+b_offset, |
| a_row->item.size-a_offset)!=0) | a_row->item.size-a_offset)!=0) |
| return result; | return result; |
| pos+=a_row->item.size; | pos+=a_row->item.size; |
| a_row++; a_countdown--; a_offset=0; | a_row++; a_countdown--; a_offset=0; |
| b_break=true; | b_break=true; |
| } else if (size_diff>0) { // a longer | } else if (size_diff>0) { // a longer |
| if(size_t result=memcmp(a_row->item.ptr+a_offset, b_ptr+b_offset, | if(int result=memcmp(a_row->item.ptr+a_offset, b_ptr+b_offset, |
| b_size-b_offset)!=0) | b_size-b_offset)!=0) |
| return result; | return result; |
| a_offset+=b_size-b_offset; | a_offset+=b_size-b_offset; |
| b_break=true; | b_break=true; |
| } else { // b longer | } else { // b longer |
| if(size_t result=memcmp(a_row->item.ptr+a_offset, b_ptr+b_offset, | if(int result=memcmp(a_row->item.ptr+a_offset, b_ptr+b_offset, |
| a_row->item.size-a_offset)!=0) | a_row->item.size-a_offset)!=0) |
| return result; | return result; |
| b_offset+=a_row->item.size-a_offset; | b_offset+=a_row->item.size-a_offset; |
| Line 313 int String::cmp(int& partial, const char | Line 293 int String::cmp(int& partial, const char |
| a_row++; a_countdown--; | a_row++; a_countdown--; |
| } | } |
| a_break=a_row==a_end; | |
| if(a_break || b_break) | |
| break; | |
| if(!a_countdown) { | if(!a_countdown) { |
| a_chunk=a_row->link; | a_chunk=a_row->link; |
| a_row=a_chunk->rows; | a_row=a_chunk->rows; |
| Line 330 int String::cmp(int& partial, const char | Line 313 int String::cmp(int& partial, const char |
| #ifndef NO_STRING_ORIGIN | #ifndef NO_STRING_ORIGIN |
| const Origin& String::origin() const { | const Origin& String::origin() const { |
| if(!fused_rows) | if(is_empty()) { |
| THROW(0, 0, | static const Origin empty_origin={"empty string"}; |
| 0, | return empty_origin; |
| "String::origin() of empty string called"); | } |
| // determining origin by last appended piece | // determining origin by first piece or last appended piece |
| // because first one frequently constant. | // because any of them can be constant=without origin: |
| // ex: ^load[/file] "document_root" + "/file" | // ex: ^load[/file] "document_root" + "/file" |
| return append_here[-1].item.origin; | // when last piece is constant, |
| // ex: parser_root_auto_path{dynamic} / auto.p{const} | |
| // using first piece | |
| Origin& first_origin=head.chunk.rows[0].item.origin; | |
| return first_origin.file ? first_origin : append_here[-1].item.origin; | |
| } | } |
| #endif | #endif |
| String& String::mid(size_t start, size_t finish) const { | String& String::mid(size_t start, size_t finish) const { |
| start=max(0, start); | String& result=*NEW String(pool()); |
| start=max(size_t(0), start); | |
| finish=min(size(), finish); | finish=min(size(), finish); |
| if(start==finish) | if(start==finish) |
| return *empty_string; | return result; |
| String& result=*NEW String(pool()); | |
| size_t pos=0; | size_t pos=0; |
| const Chunk *chunk=&head; | STRING_FOREACH_ROW( |
| do { | size_t item_finish=pos+row->item.size; |
| const Chunk::Row *row=chunk->rows; | if(item_finish > start) { // started now or already? |
| for(size_t i=0; i<chunk->count; pos+=row->item.size, i++, row++) { | bool started=result.is_empty(); // started now? |
| if(row==append_here) | bool finished=finish <= item_finish; // finished now? |
| size_t offset=started?start-pos:0; | |
| size_t size=finished?finish-pos:row->item.size; | |
| result.APPEND( | |
| row->item.ptr+offset, size-offset, | |
| row->item.lang, | |
| row->item.origin.file, row->item.origin.line); | |
| if(finished) | |
| goto break2; | goto break2; |
| size_t item_finish=pos+row->item.size; | |
| if(item_finish > start) { // started now or already? | |
| bool started=result.size()==0; // started now? | |
| bool finished=finish <= item_finish; // finished now? | |
| size_t offset=started?start-pos:0; | |
| size_t size=finished?finish-pos:row->item.size; | |
| result.APPEND( | |
| row->item.ptr+offset, size-offset, | |
| row->item.lang, | |
| row->item.origin.file, row->item.origin.line); | |
| if(finished) | |
| goto break2; | |
| } | |
| } | } |
| chunk=row->link; | pos+=row->item.size; |
| } while(chunk); | ); |
| break2: | break2: |
| // SAPI::log(pool(), "piece of '%s' from %d to %d is '%s'", | // SAPI::log(pool(), "piece of '%s' from %d to %d is '%s'", |
| //cstr(), start, finish, result.cstr()); | //cstr(), start, finish, result.cstr()); |
| Line 381 break2: | Line 361 break2: |
| } | } |
| int String::pos(const String& substr, | int String::pos(const String& substr, |
| size_t result, Untaint_lang lang) const { | int result, Untaint_lang lang) const { |
| for(; result<size(); result++) { | size_t self_size=size(); |
| for(; size_t(result)<self_size; result++) { | |
| int partial; cmp(partial, substr, result, lang); | int partial; cmp(partial, substr, result, lang); |
| if( | if( |
| partial==0 || // full match | partial==0 || // full match |
| Line 394 int String::pos(const String& substr, | Line 375 int String::pos(const String& substr, |
| } | } |
| int String::pos(const char *substr, size_t substr_size, | int String::pos(const char *substr, size_t substr_size, |
| size_t result, Untaint_lang lang) const { | int result, Untaint_lang lang) const { |
| for(; result<size(); result++) { | size_t self_size=size(); |
| for(; size_t(result)<self_size; result++) { | |
| int partial; cmp(partial, substr, substr_size, result, lang); | int partial; cmp(partial, substr, substr_size, result, lang); |
| if( | if( |
| partial==0 || // full match | partial==0 || // full match |
| Line 410 void String::split(Array& result, | Line 392 void String::split(Array& result, |
| size_t* pos_after_ref, | size_t* pos_after_ref, |
| const char *delim, size_t delim_size, | const char *delim, size_t delim_size, |
| Untaint_lang lang, int limit) const { | Untaint_lang lang, int limit) const { |
| size_t self_size=size(); | |
| if(delim_size) { | if(delim_size) { |
| size_t pos_after=pos_after_ref?*pos_after_ref:0; | size_t pos_after=pos_after_ref?*pos_after_ref:0; |
| int pos_before; | int pos_before; |
| Line 419 void String::split(Array& result, | Line 402 void String::split(Array& result, |
| pos_after=pos_before+delim_size; | pos_after=pos_before+delim_size; |
| } | } |
| // last piece | // last piece |
| if(pos_after<size() && limit) { | if(pos_after<self_size && limit) { |
| result+=&mid(pos_after, size()); | result+=&mid(pos_after, self_size); |
| pos_after=size(); | pos_after=self_size; |
| } | } |
| if(pos_after_ref) | if(pos_after_ref) |
| *pos_after_ref=pos_after; | *pos_after_ref=pos_after; |
| } else { // empty delim | } else { // empty delim |
| result+=this; | result+=this; |
| if(pos_after_ref) | if(pos_after_ref) |
| *pos_after_ref+=size(); | *pos_after_ref+=self_size; |
| } | } |
| } | } |
| Line 436 void String::split(Array& result, | Line 419 void String::split(Array& result, |
| size_t* pos_after_ref, | size_t* pos_after_ref, |
| const String& delim, Untaint_lang lang, | const String& delim, Untaint_lang lang, |
| int limit) const { | int limit) const { |
| if(delim.size()) { | if(!delim.is_empty()) { |
| size_t pos_after=pos_after_ref?*pos_after_ref:0; | size_t pos_after=pos_after_ref?*pos_after_ref:0; |
| int pos_before; | int pos_before; |
| // while we have 'delim'... | // while we have 'delim'... |
| Line 458 void String::split(Array& result, | Line 441 void String::split(Array& result, |
| } | } |
| } | } |
| /// @test really @b test: s x m [tested: i & g ] | static void regex_options(const String *options, int *result, bool& need_pre_post_match){ |
| static void regex_options(char *options, int *result){ | |
| struct Regex_option { | struct Regex_option { |
| char key; | const char *keyL; |
| const char *keyU; | |
| int clear, set; | int clear, set; |
| int *result; | int *result; |
| bool *flag; | |
| } regex_option[]={ | } regex_option[]={ |
| {'i', 0, PCRE_CASELESS, result}, // a=A | {"i", "I", 0, PCRE_CASELESS, result}, // a=A |
| {'s', 0, PCRE_DOTALL, result}, // \n\n$ | {"s", "S", 0, PCRE_DOTALL, result}, // \n\n$ [default] |
| {'x', 0, PCRE_EXTENDED, result}, // whitespace in regex ignored | {"x", "U", 0, PCRE_EXTENDED, result}, // whitespace in regex ignored |
| {'m', PCRE_DOTALL, PCRE_MULTILINE, result}, // ^aaa\n$^bbb\n$ | {"m", "M", PCRE_DOTALL, PCRE_MULTILINE, result}, // ^aaa\n$^bbb\n$ |
| {'g', 0, true, result+1}, // many rows | {"g", "G", 0, true, result+1}, // many rows |
| {0}, | {"'", 0, 0, 0, 0, &need_pre_post_match}, |
| {0} | |
| }; | }; |
| result[0]=PCRE_EXTRA | PCRE_DOTALL; | result[0]=PCRE_EXTRA | PCRE_DOTALL; |
| result[1]=0; | result[1]=0; |
| if(options) | if(options) |
| for(Regex_option *o=regex_option; o->key; o++) | for(Regex_option *o=regex_option; o->keyL; o++) |
| if( | if(options->pos(o->keyL)>=0 |
| strchr(options, o->key) || | || (o->keyU && options->pos(o->keyU)>=0)) { |
| strchr(options, toupper(o->key))) { | if(o->flag) |
| *(o->result)&=~o->clear; | *o->flag=true; |
| *(o->result)|=o->set; | else { // result |
| *o->result &= ~o->clear; | |
| *o->result |= o->set; | |
| } | |
| } | } |
| } | } |
| /** | /// @todo make replacement Table stacked |
| returns true if fills table. | bool String::match( |
| table format is defined and fixed[can be used by others]: | const String *aorigin, |
| @verbatim | |
| pre-match/match/post-match/1/2/3/... | |
| @endverbatim | |
| @test setlocale param to auto.p | |
| */ | |
| bool String::match(const String *aorigin, | |
| const String& regexp, | const String& regexp, |
| const String *options, | const String *options, |
| Table **table, | Table **table, |
| Row_action row_action, void *info) const { | Row_action row_action, void *info, |
| bool *was_global) const { | |
| static const unsigned char *tables=0; { SYNCHRONIZED(true); | if(regexp.is_empty()) |
| if(!tables) { | throw Exception(0, |
| setlocale(LC_CTYPE, "ru"); | aorigin, |
| tables=pcre_maketables(); | "regexp is empty"); |
| } | |
| } | const char *pattern=regexp.cstr(); |
| const char *pattern=regexp.cstr(UL_AS_IS); | |
| const char *errptr; | const char *errptr; |
| int erroffset; | int erroffset; |
| int option_bits[2]; regex_options(options?options->cstr():0, option_bits); | bool need_pre_post_match=false; |
| int option_bits[2]; regex_options(options, option_bits, need_pre_post_match); | |
| if(was_global) | |
| *was_global=option_bits[1]!=0; | |
| pcre *code=pcre_compile(pattern, option_bits[0], | pcre *code=pcre_compile(pattern, option_bits[0], |
| &errptr, &erroffset, | &errptr, &erroffset, |
| tables); | pool().get_source_charset().pcre_tables); |
| if(!code) | if(!code) |
| THROW(0, 0, | throw Exception(0, |
| &regexp.mid(erroffset, regexp.size()), | &regexp.mid(erroffset, regexp.size()), |
| "match error - %s", errptr); | "regular expression syntax error - %s", errptr); |
| int info_substrings=pcre_info(code, 0, 0); | int info_substrings=pcre_info(code, 0, 0); |
| if(info_substrings<0) { | if(info_substrings<0) { |
| (*pcre_free)(code); | pcre_free(code); |
| THROW(0, 0, | throw Exception(0, |
| aorigin, | aorigin, |
| "pcre_info error #%d", | "pcre_info error (%d)", |
| info_substrings); | info_substrings); |
| } | } |
| int startoffset=0; | const char *subject=cstr(); |
| const char *subject=cstr(UL_AS_IS); | |
| int length=strlen(subject); | int length=strlen(subject); |
| int ovecsize; | const int ovecsize=(1/*match*/+MAX_STRING_MATCH_TABLE_COLUMNS)*3; |
| int *ovector=(int *)malloc(sizeof(int)* | int ovector[ovecsize]; |
| (ovecsize=(1/*match*/+info_substrings)*3)); | |
| // create table | |
| { // create table | *table=NEW Table(pool(), *string_match_table_template); |
| Array& columns=*NEW Array(pool()); | |
| columns+=string_pre_match_name; | |
| columns+=string_match_name; | |
| columns+=string_post_match_name; | |
| for(int i=1; i<=info_substrings; i++) { | |
| char *column=(char *)malloc(MAX_NUMBER); | |
| snprintf(column, MAX_NUMBER, "%d", i); | |
| columns+=NEW String(pool(), column); // .i column name | |
| } | |
| *table=NEW Table(pool(), aorigin, &columns); | |
| } | |
| int exec_option_bits=0; | int exec_option_bits=0; |
| int prestart=0; | |
| int poststart=0; | |
| int postfinish=size(); | |
| while(true) { | while(true) { |
| int exec_substrings=pcre_exec(code, 0, | int exec_substrings=pcre_exec(code, 0, |
| subject, length, startoffset, | subject, length, prestart, |
| exec_option_bits, ovector, ovecsize); | exec_option_bits, ovector, ovecsize); |
| if(exec_substrings==PCRE_ERROR_NOMATCH) { | if(exec_substrings==PCRE_ERROR_NOMATCH) { |
| (*pcre_free)(code); | pcre_free(code); |
| (*row_action)(**table, 0/*last time, no row*/, 0, 0, info); | row_action(**table, 0/*last time, no row*/, 0, 0, poststart, postfinish, info); |
| return option_bits[1]!=0; // global=true+table, not global=false | return option_bits[1]!=0; // global=true+table, not global=false |
| } | } |
| if(exec_substrings<0) { | if(exec_substrings<0) { |
| (*pcre_free)(code); | pcre_free(code); |
| THROW(0, 0, | throw Exception(0, |
| aorigin, | aorigin, |
| "pcre_exec error #%d", | "regular expression execute error (%d)", |
| exec_substrings); | exec_substrings); |
| } | } |
| int prefinish=ovector[0]; | |
| poststart=ovector[1]; | |
| Array& row=*NEW Array(pool()); | Array& row=*NEW Array(pool()); |
| row+=&mid(0, ovector[0]); // .pre-match column value | row+=need_pre_post_match?&mid(0, prefinish):0; // .prematch column value |
| row+=&mid(ovector[0], ovector[1]); // .match | row+=need_pre_post_match?&mid(prefinish, poststart):0; // .match |
| row+=&mid(ovector[1], size()); // .post-match | row+=need_pre_post_match?&mid(poststart, postfinish):0; // .postmatch |
| for(int i=1; i<exec_substrings; i++) { | for(int i=1; i<exec_substrings; i++) { |
| // -1:-1 case handled peacefully by mid() itself | // -1:-1 case handled peacefully by mid() itself |
| row+=&mid(ovector[i*2+0], ovector[i*2+1]); // .i column value | row+=&mid(ovector[i*2+0], ovector[i*2+1]); // .i column value |
| } | } |
| (*row_action)(**table, &row, startoffset, ovector[0], info); | row_action(**table, &row, prestart, prefinish, poststart, postfinish, info); |
| if(!option_bits[1] || !(startoffset=ovector[1])) { // not global | going to hang | if(!option_bits[1] || prestart==poststart) { // not global | going to hang |
| (*pcre_free)(code); | pcre_free(code); |
| (*row_action)(**table, 0/*last time, no row*/, 0, 0, info); | row_action(**table, 0/*last time, no row*/, 0, 0, poststart, postfinish, info); |
| return true; | return true; |
| } | } |
| prestart=poststart; | |
| /* | /* |
| if(option_bits[0] & PCRE_MULTILINE) | if(option_bits[0] & PCRE_MULTILINE) |
| Line 591 bool String::match(const String *aorigin | Line 570 bool String::match(const String *aorigin |
| */ | */ |
| } | } |
| } | } |
| String& String::change_case(Pool& pool, | |
| Change_case_kind kind) const { | |
| const unsigned char *tables=pool.get_source_charset().pcre_tables; | |
| String& result=*new(pool) String(pool); | |
| const unsigned char *a; | |
| const unsigned char *b; | |
| switch(kind) { | |
| case CC_UPPER: | |
| a=tables+lcc_offset; | |
| b=tables+fcc_offset; | |
| break; | |
| case CC_LOWER: | |
| a=tables+lcc_offset; | |
| b=0; | |
| break; | |
| default: | |
| throw Exception(0, | |
| this, | |
| "unknown change case kind #%d", | |
| static_cast<int>(kind)); // never | |
| a=b=0; // calm, compiler | |
| break; // never | |
| } | |
| STRING_FOREACH_ROW( | |
| char *new_cstr=(char *)pool.malloc(row->item.size, 12); | |
| char *dest=new_cstr; | |
| const char *src=row->item.ptr; | |
| for(int size=row->item.size; size--; src++) { | |
| unsigned char c=a[(unsigned char)*src]; | |
| if(b) | |
| c=b[c]; | |
| *dest++=(char)c; | |
| } | |
| result.APPEND(new_cstr, row->item.size, | |
| row->item.lang, | |
| row->item.origin.file, row->item.origin.line); | |
| ); | |
| return result; | |
| } | |
| /// @test if no dict words were found in some piece, append the piece itself, not its duplicate | |
| String& String::replace(Pool& pool, Dictionary& dict) const { | |
| // return reconstruct(pool).replace_in_reconstructed(pool, dict); | |
| String& result=*new(pool) String(pool); | |
| STRING_FOREACH_ROW( | |
| const char *src=row->item.ptr; | |
| size_t src_size=row->item.size; | |
| char *new_cstr=(char *)pool.malloc((size_t)ceil(src_size*dict.max_ratio()), 14); | |
| char *dest=new_cstr; | |
| while(src_size) { | |
| // there is a row where first column starts 'src' | |
| if(Table::Item *item=dict.first_that_starts(src, src_size)) { | |
| // get a=>b values | |
| const String& a=*static_cast<Array *>(item)->get_string(0); | |
| const String& b=*static_cast<Array *>(item)->get_string(1); | |
| // skip 'a' in 'src' && reduce work size | |
| src+=a.size(); src_size-=a.size(); | |
| // write 'b' to 'dest' && skip 'b' in 'dest' | |
| b.store_to(dest); dest+=b.size(); | |
| } else { | |
| // write a char to b && reduce work size | |
| *dest++=*src++; src_size--; | |
| } | |
| } | |
| result.APPEND(new_cstr, dest-new_cstr, row->item.lang, | |
| row->item.origin.file, row->item.origin.line); | |
| ); | |
| return result; | |
| } | |
| String& String::join_chains(Pool& pool, char** acstr) const { | |
| char *lcstr=cstr(); | |
| const char *current=lcstr; | |
| String& result=*new(pool) String(pool); | |
| STRING_FOREACH_ROW( | |
| IFNDEF_NO_STRING_ORIGIN( | |
| const char *joined_origin_file=row->item.origin.file; | |
| const size_t joined_origin_line=row->item.origin.line; | |
| ); | |
| uchar joined_lang=row->item.lang; | |
| const char *joined_ptr=current; | |
| // calc size | |
| size_t joined_size=0; | |
| STRING_PREPARED_FOREACH_ROW(*this, | |
| if(row->item.lang==joined_lang) | |
| joined_size+=row->item.size; | |
| else | |
| break; // before non-ours | |
| ); | |
| current+=joined_size; | |
| // pointers are after joined piece | |
| // & one step back, see STRING_FOREACH_ROW | |
| --row; ++countdown; | |
| result.APPEND(joined_ptr, joined_size, joined_lang, | |
| joined_origin_file, joined_origin_line); | |
| ); | |
| if(acstr) | |
| *acstr=lcstr; | |
| return result; | |
| } | |
| double String::as_double() const { | |
| double result; | |
| const char *cstr; | |
| char buf[MAX_NUMBER]; | |
| if(head.chunk.rows+1==append_here) { | |
| int size=min(head.chunk.rows[0].item.size, MAX_NUMBER-1); | |
| memcpy(buf, head.chunk.rows[0].item.ptr, size); | |
| buf[size]=0; | |
| cstr=buf; | |
| } else | |
| cstr=this->cstr(); | |
| char *error_pos; | |
| // 0xABC | |
| if(cstr[0]=='0') | |
| if(cstr[1]=='x' || cstr[1]=='X') | |
| result=(double)(unsigned long)strtol(cstr, &error_pos, 0); | |
| else | |
| result=(double)strtod(cstr+1/*skip leading 0*/, &error_pos); | |
| else | |
| result=(double)strtod(cstr, &error_pos); | |
| if(*error_pos/*not EOS*/) | |
| throw Exception("number.format", | |
| this, | |
| "invalid number (double)"); | |
| return result; | |
| } | |
| int String::as_int() const { | |
| int result; | |
| const char *cstr; | |
| char buf[MAX_NUMBER]; | |
| if(head.chunk.rows+1==append_here) { | |
| int size=min(head.chunk.rows[0].item.size, MAX_NUMBER-1); | |
| memcpy(buf, head.chunk.rows[0].item.ptr, size); | |
| buf[size]=0; | |
| cstr=buf; | |
| } else | |
| cstr=this->cstr(); | |
| char *error_pos; | |
| // 0xABC | |
| if(cstr[0]=='0') | |
| if(cstr[1]=='x' || cstr[1]=='X') | |
| result=(int)(unsigned long)strtol(cstr, &error_pos, 0); | |
| else | |
| result=(int)strtol(cstr+1/*skip leading 0*/, &error_pos, 0); | |
| else | |
| result=(int)strtol(cstr, &error_pos, 0); | |
| if(*error_pos/*not EOS*/) | |
| throw Exception("number.format", | |
| this, | |
| "invalid number (int)"); | |
| return result; | |
| } | |
| inline void ushort2uchars(ushort word, uchar& byte1, uchar& byte2) { | |
| byte1=word&0xFF; | |
| byte2=word>>8; | |
| } | |
| inline ushort uchars2ushort(uchar byte1, uchar byte2) { | |
| return (byte2<<8) | byte1; | |
| } | |
| /* @todo maybe network byte order is worth spending some effort on? | |
| not bothering with network byte order: | |
| there are no plans to move the resulting file across platforms | |
| for now | |
| */ | |
| void String::serialize(size_t prolog_size, void *& buf, size_t& buf_size) const { | |
| buf_size= | |
| prolog_size | |
| +used_rows()*(sizeof(uchar)+sizeof(ushort)) | |
| +size(); | |
| buf=malloc(buf_size,15); | |
| char *cur=(char *)buf+prolog_size; | |
| STRING_FOREACH_ROW( | |
| // lang | |
| memcpy(cur, &row->item.lang, sizeof(uchar)); | |
| cur+=sizeof(uchar); | |
| // size | |
| uchar byte1; uchar byte2; | |
| ushort2uchars(row->item.size, byte1, byte2); | |
| memcpy(cur, &byte1, sizeof(uchar)); cur+=sizeof(uchar); | |
| memcpy(cur, &byte2, sizeof(uchar)); cur+=sizeof(uchar); | |
| // bytes | |
| memcpy(cur, row->item.ptr, row->item.size); | |
| cur+=row->item.size; | |
| ); | |
| } | |
| bool String::deserialize(size_t prolog_size, void *buf, size_t buf_size, const char *file) { | |
| if(buf_size<=prolog_size) | |
| return false; | |
| char *cur=(char *)buf+prolog_size; | |
| buf_size-=prolog_size; | |
| while(buf_size) { | |
| if(sizeof(uchar)+sizeof(ushort)>buf_size) // lang+size | |
| return false; | |
| uchar lang=*(uchar *)(cur); | |
| ushort size=uchars2ushort( | |
| *(uchar*)(cur+sizeof(uchar)*1), | |
| *(uchar*)(cur+sizeof(uchar)*2) | |
| ); | |
| size_t piece_size=sizeof(uchar)+sizeof(ushort)+size; | |
| if(piece_size>buf_size) // buffer overrun, can be on incomplete cache files | |
| return false; | |
| const char *ptr=(const char*)(cur+sizeof(uchar)*3); | |
| APPEND(ptr, size, lang, file, 0); | |
| cur+=piece_size; | |
| buf_size-=piece_size; | |
| } | |
| return true; | |
| } |