Annotation of parser3/src/types/pa_vregex.C, revision 1.21
1.1 misha 1: /** @file
2: Parser: @b regex class.
3:
1.21 ! moko 4: Copyright (c) 2001-2020 Art. Lebedev Studio (http://www.artlebedev.com)
1.1 misha 5: Author: Alexandr Petrosian <paf@design.ru> (http://paf.design.ru)
6: */
7:
8: #include "pa_vregex.h"
1.2 misha 9: #include "pa_vint.h"
1.8 misha 10: #include "pa_vstring.h"
11:
1.21 ! moko 12: volatile const char * IDENT_PA_VREGEX_C="$Id: pa_vregex.C,v 1.20 2018/01/19 00:27:43 moko Exp $" IDENT_PA_VREGEX_H;
1.8 misha 13:
14: // defines
15:
16: #define REGEX_PATTERN_NAME "pattern"
17: #define REGEX_OPTIONS_NAME "options"
1.1 misha 18:
19:
1.11 moko 20: const char* get_pcre_exec_error_text(int exec_result){
1.1 misha 21: switch(exec_result){
22: case PCRE_ERROR_BADUTF8:
23: case PCRE_ERROR_BADUTF8_OFFSET:
24: return "UTF-8 validation failed during pcre_exec (%d).";
25: break;
26: default:
27: return "execution error (%d)";
28: }
29: }
30:
31:
1.15 moko 32: Value& VRegex::as_expr_result() {
1.1 misha 33: return *new VInt(as_int());
34: }
35:
36: void VRegex::regex_options(const String* options, int* result){
37: struct Regex_option {
38: const char* key;
39: const char* keyAlt;
40: int clear;
41: int set;
42: int *result;
43: } regex_option[]={
44: {"i", "I", 0, PCRE_CASELESS, result}, // a=A
45: {"s", "S", 0, PCRE_DOTALL, result}, // ^\n\n$ [default]
46: {"m", "M", PCRE_DOTALL, PCRE_MULTILINE, result}, // ^aaa\n$^bbb\n$
47: {"x", 0, 0, PCRE_EXTENDED, result}, // whitespace in regex ignored
48: {"U", 0, 0, PCRE_UNGREEDY, result}, // ungreedy patterns (greedy by default)
49: {"g", "G", 0, MF_GLOBAL_SEARCH, result+1}, // many rows
50: {"'", 0, 0, MF_NEED_PRE_POST_MATCH, result+1},
51: {"n", 0, 0, MF_JUST_COUNT_MATCHES, result+1},
52: {0, 0, 0, 0, 0}
53: };
54: result[0]=PCRE_EXTRA /* backslash+non-special char causes error */
55: | PCRE_DOTALL /* dot matches all chars including newline char */
56: | PCRE_DOLLAR_ENDONLY /* dollar matches only end of string, but not newline chars */;
57: result[1]=0;
58:
1.11 moko 59: if(options && !options->is_empty()){
1.12 misha 60: size_t valid_options=0;
1.11 moko 61: for(Regex_option *o=regex_option; o->key; o++)
1.1 misha 62: if(
63: options->pos(o->key)!=STRING_NOT_FOUND
64: || (o->keyAlt && options->pos(o->keyAlt)!=STRING_NOT_FOUND)
65: ){
66: *o->result &= ~o->clear;
67: *o->result |= o->set;
1.11 moko 68: valid_options++;
1.1 misha 69: }
1.11 moko 70: if(options->length()!=valid_options)
71: throw Exception(PARSER_RUNTIME, 0, CALLED_WITH_INVALID_OPTION);
72: }
1.1 misha 73: }
74:
75:
76: void VRegex::set(Charset& acharset, const String* aregex, const String* aoptions){
77: if(aregex->is_empty())
1.20 moko 78: throw Exception(PARSER_RUNTIME, 0, "regexp is empty");
1.1 misha 79:
80: fcharset=&acharset;
1.5 misha 81:
1.7 misha 82: fpattern=aregex->untaint_cstr(String::L_REGEX);
1.1 misha 83:
1.20 moko 84: foptions_cstr=aoptions ? aoptions->cstr() : 0;
1.8 misha 85:
1.1 misha 86: regex_options(aoptions, foptions);
87: }
88:
89:
1.20 moko 90: void VRegex::set(VRegex& avregex){
91: fcharset=avregex.fcharset;
92:
93: fpattern=avregex.fpattern;
94:
95: foptions_cstr=avregex.foptions_cstr;
96:
97: regex_options(foptions_cstr ? new String(foptions_cstr) : 0, foptions);
98: }
99:
100:
1.1 misha 101: void VRegex::compile(){
102: const char* err_ptr;
103: int err_offset;
104: int options=foptions[0];
105:
106: // @todo (for UTF-8): check string & pattern and use PCRE_NO_UTF8_CHECK option
107: if(fcharset->isUTF8())
1.13 misha 108: options |= (PCRE_UTF8 | PCRE_UCP);
1.1 misha 109:
110: fcode=pcre_compile(fpattern, options,
111: &err_ptr, &err_offset,
112: fcharset->pcre_tables);
113:
114: if(!fcode){
115: throw Exception(PCRE_EXCEPTION_TYPE,
1.4 misha 116: new String(fpattern+err_offset, String::L_TAINTED),
1.1 misha 117: "regular expression syntax error - %s", err_ptr);
118: }
119:
120: }
121:
122:
123: size_t VRegex::full_info(int type){
124: size_t result;
125: int fullinfo_result=pcre_fullinfo(fcode, fextra, type, &result);
126: if(fullinfo_result<0){
127: throw Exception(PCRE_EXCEPTION_TYPE,
1.4 misha 128: new String(fpattern, String::L_TAINTED),
1.1 misha 129: "pcre_full_info error (%d)", fullinfo_result);
130: }
131:
132: return result;
1.17 moko 133: }
1.1 misha 134:
135:
136: size_t VRegex::get_info_size(){
137: return full_info(PCRE_INFO_SIZE);
138: }
139:
140:
141: size_t VRegex::get_study_size(){
142: return full_info(PCRE_INFO_STUDYSIZE);
143: }
144:
145: void VRegex::study(){
146: if(fstudied)
147: return;
148:
149: const char* err_ptr;
150: fextra=pcre_study(fcode, 0/*options*/, &err_ptr);
151:
152: if(err_ptr){
153: throw Exception(PCRE_EXCEPTION_TYPE,
1.4 misha 154: new String(fpattern, String::L_TAINTED),
1.1 misha 155: "pcre_study error: %s", err_ptr);
156: }
157:
158: fstudied=true;
159: }
160:
161:
162: int VRegex::exec(const char* string, size_t string_len, int* ovector, int ovector_size, int prestart){
163: int result=pcre_exec(fcode, fextra,
164: string, string_len, prestart,
1.10 misha 165: prestart>0 ? PCRE_NO_UTF8_CHECK : 0, ovector, ovector_size);
1.1 misha 166:
167: if(result<0 && result!=PCRE_ERROR_NOMATCH){
168: throw Exception(PCRE_EXCEPTION_TYPE,
1.4 misha 169: new String(fpattern, String::L_TAINTED),
1.1 misha 170: get_pcre_exec_error_text(result), result);
171: }
172:
173: return result;
174: }
175:
176:
1.8 misha 177: Value* VRegex::get_element(const String& aname) {
178: if(aname == REGEX_PATTERN_NAME)
179: return new VString(*new String(fpattern, String::L_TAINTED));
180:
181: if(aname == REGEX_OPTIONS_NAME)
182: return new VString(*new String(foptions_cstr, String::L_TAINTED));
183:
1.16 misha 184: // method (if any)
1.8 misha 185: if(Value* result=VStateless_object::get_element(aname))
186: return result;
187:
188: throw Exception(PARSER_RUNTIME,
189: &aname,
190: "reading of invalid field");
191: }
E-mail: