Annotation of parser3/src/types/pa_vregex.C, revision 1.26
1.1 misha 1: /** @file
2: Parser: @b regex class.
3:
1.25 moko 4: Copyright (c) 2001-2024 Art. Lebedev Studio (http://www.artlebedev.com)
1.22 moko 5: Authors: Konstantin Morshnev <moko@design.ru>, Alexandr Petrosian <paf@design.ru>
1.1 misha 6: */
7:
8: #include "pa_vregex.h"
1.2 misha 9: #include "pa_vint.h"
1.8 misha 10: #include "pa_vstring.h"
11:
// Version-identification string for this file: the expanded CVS Id keyword text followed by
// IDENT_PA_VREGEX_H (presumably the matching identification literal from pa_vregex.h -- confirm).
1.26 ! moko 12: volatile const char * IDENT_PA_VREGEX_C="$Id: pa_vregex.C,v 1.25 2024/11/04 03:53:26 moko Exp $" IDENT_PA_VREGEX_H;
1.8 misha 13:
14: // defines
15:
// Element names that VRegex::get_element() compares the requested name against.
16: #define REGEX_PATTERN_NAME "pattern"
17: #define REGEX_OPTIONS_NAME "options"
1.1 misha 18:
/**
	Chooses the printf-style message template for a match result code.

	@param exec_result result code of the match call (VRegex::exec passes its
		negative, non-NOMATCH result here)
	@return a string literal containing exactly one %d placeholder: the
		UTF-8 validation message for the UTF-8 related codes tested below,
		the generic "execution error" message for everything else
*/
1.11 moko 19: const char* get_pcre_exec_error_text(int exec_result){
// Note on precedence: && binds tighter than ||, so under HAVE_PCRE2 the condition reads
// BADUTF8_OFFSET || (exec_result within [PCRE2_ERROR_UTF8_ERR21 .. PCRE2_ERROR_UTF8_ERR1]).
1.24 moko 20: if(exec_result == PCRE_ERROR_BADUTF8_OFFSET ||
21: #ifdef HAVE_PCRE2
22: exec_result <= PCRE2_ERROR_UTF8_ERR1 /* -3 */ && exec_result >= PCRE2_ERROR_UTF8_ERR21 /* -21 */
23: #else
24: exec_result == PCRE_ERROR_BADUTF8
25: #endif
26: )
27: return "UTF-8 validation failed during pcre_exec (%d).";
// any other code gets the generic template
28: return "execution error (%d)";
1.1 misha 29: }
30:
31:
/**
	Value of this regex when used in an expression: a newly allocated VInt
	holding as_int().
	NOTE(review): the VInt is created with plain new and returned by reference;
	who releases it is not visible here (presumably a garbage collector -- confirm).
*/
1.15 moko 32: Value& VRegex::as_expr_result() {
1.1 misha 33: return *new VInt(as_int());
34: }
35:
/**
	Translates an option string into two flag words.

	@param options option letters, may be null or empty (defaults are then kept)
	@param result array of at least two ints, both overwritten:
		result[0] receives the PCRE compile flags,
		result[1] receives the MF_* match-feature flags
	@throws Exception(PARSER_RUNTIME, CALLED_WITH_INVALID_OPTION) when the length
		of @a options differs from the number of table entries found in it
*/
36: void VRegex::regex_options(const String* options, int* result){
// One table row per supported option: the letter (and optional alternative spelling),
// the bits to clear, the bits to set, and which of the two flag words they apply to.
37: struct Regex_option {
38: const char* key;
39: const char* keyAlt;
40: int clear;
41: int set;
42: int *result;
43: } regex_option[]={
44: {"i", "I", 0, PCRE_CASELESS, result}, // a=A
45: {"s", "S", 0, PCRE_DOTALL, result}, // ^\n\n$ [default]
46: {"m", "M", PCRE_DOTALL, PCRE_MULTILINE, result}, // ^aaa\n$^bbb\n$
47: {"x", 0, 0, PCRE_EXTENDED, result}, // whitespace in regex ignored
48: {"U", 0, 0, PCRE_UNGREEDY, result}, // ungreedy patterns (greedy by default)
// the remaining rows target result[1] (match-feature flags) rather than the PCRE flags
49: {"g", "G", 0, MF_GLOBAL_SEARCH, result+1}, // many rows
50: {"'", 0, 0, MF_NEED_PRE_POST_MATCH, result+1},
51: {"n", 0, 0, MF_JUST_COUNT_MATCHES, result+1},
// all-zero row terminates the table (the loop below stops at key==0)
52: {0, 0, 0, 0, 0}
53: };
// defaults, applied even when no option string is given
54: result[0]=PCRE_EXTRA /* backslash+non-special char causes error */
55: | PCRE_DOTALL /* dot matches all chars including newline char */
56: | PCRE_DOLLAR_ENDONLY /* dollar matches only end of string, but not newline chars */;
57: result[1]=0;
58:
1.11 moko 59: if(options && !options->is_empty()){
// number of table rows whose key (or alternative key) occurs in the option string
1.12 misha 60: size_t valid_options=0;
1.11 moko 61: for(Regex_option *o=regex_option; o->key; o++)
1.1 misha 62: if(
63: options->pos(o->key)!=STRING_NOT_FOUND
64: || (o->keyAlt && options->pos(o->keyAlt)!=STRING_NOT_FOUND)
65: ){
// clear first, then set: e.g. "m" drops the default PCRE_DOTALL before adding PCRE_MULTILINE
66: *o->result &= ~o->clear;
67: *o->result |= o->set;
1.11 moko 68: valid_options++;
1.1 misha 69: }
// Each row is counted at most once, so a string with an unknown letter, a repeated letter,
// or both spellings of one option is longer than the count and is rejected here
// (assuming length() counts the option characters -- confirm).
1.11 moko 70: if(options->length()!=valid_options)
71: throw Exception(PARSER_RUNTIME, 0, CALLED_WITH_INVALID_OPTION);
72: }
1.1 misha 73: }
74:
75:
/**
	Initializes this regex from a pattern and an option string.

	@param acharset charset remembered for later compilation (its address is stored)
	@param aregex pattern; dereferenced without a null check, must not be empty
	@param aoptions option letters or null
	@throws Exception(PARSER_RUNTIME) "regexp is empty" for an empty pattern;
		also whatever regex_options() throws for an invalid option string
*/
76: void VRegex::set(Charset& acharset, const String* aregex, const String* aoptions){
77: if(aregex->is_empty())
1.20 moko 78: throw Exception(PARSER_RUNTIME, 0, "regexp is empty");
1.1 misha 79:
80: fcharset=&acharset;
1.5 misha 81:
// pattern text is obtained through untaint_cstr() with the L_REGEX language
1.7 misha 82: fpattern=aregex->untaint_cstr(String::L_REGEX);
1.1 misha 83:
// keep the raw option text (0 when no options were given); get_element() and set(VRegex&) read it
1.20 moko 84: foptions_cstr=aoptions ? aoptions->cstr() : 0;
1.8 misha 85:
// fill foptions[0] / foptions[1] from the option letters
1.1 misha 86: regex_options(aoptions, foptions);
87: }
88:
89:
/**
	Initializes this regex from another one: copies its charset pointer,
	pattern pointer and option text, then recomputes the flag words from
	that option text. The compiled code is not copied here.
*/
1.20 moko 90: void VRegex::set(VRegex& avregex){
91: fcharset=avregex.fcharset;
92:
93: fpattern=avregex.fpattern;
94:
95: foptions_cstr=avregex.foptions_cstr;
96:
// regex_options() takes a String*, so the stored C string is wrapped in a new String;
// a null option text is passed through as null (defaults only)
97: regex_options(foptions_cstr ? new String(foptions_cstr) : 0, foptions);
98: }
99:
100:
/**
	Compiles fpattern into fcode using foptions[0] and the character tables
	of fcharset.

	@throws Exception(PCRE_EXCEPTION_TYPE) "regular expression syntax error - ..."
		when the library returns no compiled code; the exception subject is the
		pattern text starting at the reported error offset
*/
1.1 misha 101: void VRegex::compile(){
102: const char* err_ptr;
103: int options=foptions[0];
104:
105: // @todo (for UTF-8): check string & pattern and use PCRE_NO_UTF8_CHECK option
// UTF-8 charsets additionally get the UTF-8 and UCP flags
106: if(fcharset->isUTF8())
1.13 misha 107: options |= (PCRE_UTF8 | PCRE_UCP);
1.1 misha 108:
1.24 moko 109: #ifdef HAVE_PCRE2
110: int err;
111: size_t err_offset;
// receives the error message text on failure
112: PCRE2_UCHAR buffer[120];
113:
// compile context is created on first use and kept in fcmp_ctxt
114: if(!fcmp_ctxt)
115: fcmp_ctxt=pcre2_compile_context_create(fgen_ctxt);
116:
117: pcre2_set_character_tables(fcmp_ctxt, fcharset->pcre_tables);
118:
119: fcode=pcre2_compile((PCRE2_SPTR)fpattern, PCRE2_ZERO_TERMINATED, options,
120: &err, &err_offset,
121: fcmp_ctxt);
122:
123: if(!fcode){
// PCRE2 reports a numeric code; turn it into text so both builds share the throw below.
// err_ptr then points into the local buffer, which is still alive at the throw.
124: pcre2_get_error_message(err, buffer, sizeof(buffer));
125: err_ptr=(const char*)buffer;
126: }
127: #else
128: int err_offset;
1.1 misha 129: fcode=pcre_compile(fpattern, options,
130: &err_ptr, &err_offset,
131: fcharset->pcre_tables);
1.24 moko 132: #endif
1.1 misha 133: if(!fcode){
134: throw Exception(PCRE_EXCEPTION_TYPE,
// subject: pattern tail from the error offset; if that offset lands on the terminating NUL
// the whole pattern is used instead
1.24 moko 135: new String(fpattern + (fpattern[err_offset] ? err_offset : 0), String::L_TAINTED),
1.1 misha 136: "regular expression syntax error - %s", err_ptr);
137: }
138:
139: }
140:
141:
/**
	Queries one property of the compiled pattern through pcre_fullinfo().

	@param type which property to query (a PCRE_INFO_* selector)
	@return the value the library stored into a size_t
	@throws Exception(PCRE_EXCEPTION_TYPE) "pcre_full_info error (N)" when the
		call returns a negative code

	NOTE(review): the output slot is a size_t, so presumably only selectors
	whose value is size_t-sized should be passed -- confirm.
*/
142: size_t VRegex::full_info(int type){
// left uninitialized; written by pcre_fullinfo()
143: size_t result;
144: int fullinfo_result=pcre_fullinfo(fcode, fextra, type, &result);
145: if(fullinfo_result<0){
146: throw Exception(PCRE_EXCEPTION_TYPE,
1.4 misha 147: new String(fpattern, String::L_TAINTED),
1.1 misha 148: "pcre_full_info error (%d)", fullinfo_result);
149: }
150:
151: return result;
1.17 moko 152: }
1.1 misha 153:
154:
/// Returns the PCRE_INFO_SIZE property of the compiled pattern (see full_info(), which may throw).
155: size_t VRegex::get_info_size(){
156: return full_info(PCRE_INFO_SIZE);
157: }
158:
159:
/// Returns the PCRE_INFO_STUDYSIZE property in the non-PCRE2 build; always 0 when built with HAVE_PCRE2.
160: size_t VRegex::get_study_size(){
1.24 moko 161: #ifdef HAVE_PCRE2
162: return 0;
163: #else
1.1 misha 164: return full_info(PCRE_INFO_STUDYSIZE);
1.24 moko 165: #endif
1.1 misha 166: }
167:
1.24 moko 168:
/**
	Runs pcre_study() on the compiled pattern once and stores the result in fextra.
	With HAVE_PCRE2 the whole body is compiled out and the call does nothing.

	@throws Exception(PCRE_EXCEPTION_TYPE) "pcre_study error: ..." when
		pcre_study() reports an error text
*/
1.1 misha 169: void VRegex::study(){
1.24 moko 170: #ifndef HAVE_PCRE2
// already studied: nothing to do
1.1 misha 171: if(fstudied)
172: return;
173:
174: const char* err_ptr;
175: fextra=pcre_study(fcode, 0/*options*/, &err_ptr);
176:
// failure is detected through the error text pointer, not through fextra
177: if(err_ptr){
178: throw Exception(PCRE_EXCEPTION_TYPE,
1.4 misha 179: new String(fpattern, String::L_TAINTED),
1.1 misha 180: "pcre_study error: %s", err_ptr);
181: }
182:
183: fstudied=true;
1.24 moko 184: #endif
1.1 misha 185: }
186:
187:
/**
	Matches the compiled pattern against a subject string.

	@param string subject text
	@param string_len length of the subject
	@param ovector caller's array that receives match offsets as ints
	@param ovector_size number of ints available in @a ovector
	@param prestart offset in the subject at which matching starts
	@return the library's result code: PCRE_ERROR_NOMATCH or a non-negative
		value (in the PCRE2 build, positive values are clamped to ovector_size/3)
	@throws Exception(PCRE_EXCEPTION_TYPE) for any other negative result, with
		the message chosen by get_pcre_exec_error_text()
*/
188: int VRegex::exec(const char* string, size_t string_len, int* ovector, int ovector_size, int prestart){
1.24 moko 189: #ifdef HAVE_PCRE2
// match context and match data are created on first use and kept in the object
190: if(!fmatch_ctxt)
191: fmatch_ctxt=pcre2_match_context_create(fgen_ctxt);
192:
193: if(!fmatch_data)
194: fmatch_data=pcre2_match_data_create_from_pattern(fcode, fgen_ctxt);
195:
// UTF validity check is skipped whenever matching starts past offset 0
// (presumably because an earlier call from offset 0 already validated the subject -- confirm)
196: int result=pcre2_match(fcode,
197: (PCRE2_SPTR)string, string_len, prestart,
198: prestart>0 ? PCRE2_NO_UTF_CHECK : 0, fmatch_data, fmatch_ctxt);
199: #else
1.1 misha 200: int result=pcre_exec(fcode, fextra,
201: string, string_len, prestart,
1.10 misha 202: prestart>0 ? PCRE_NO_UTF8_CHECK : 0, ovector, ovector_size);
1.24 moko 203: #endif
204:
// "no match" is a normal outcome and is returned to the caller; other negatives are errors
1.1 misha 205: if(result<0 && result!=PCRE_ERROR_NOMATCH){
206: throw Exception(PCRE_EXCEPTION_TYPE,
1.4 misha 207: new String(fpattern, String::L_TAINTED),
1.1 misha 208: get_pcre_exec_error_text(result), result);
209: }
210:
1.24 moko 211: #ifdef HAVE_PCRE2
// In this build ovector was not handed to the library, so the offsets are copied out of
// the match data here: result pairs, i.e. result*2 values, each converted from size_t to int.
212: if(result>0){
213: result=min(result, ovector_size/3);
214: size_t* groups=pcre2_get_ovector_pointer(fmatch_data);
215: for(int i=0; i<result*2; i++){
216: ovector[i]=groups[i];
217: }
218: }
219: #endif
1.1 misha 220: return result;
221: }
222:
223:
/**
	Looks up an element of the regex object by name.

	@param aname requested element name
	@return a new VString with the pattern text for "pattern", a new VString
		with the option text for "options", otherwise whatever
		VStateless_object::get_element() finds; if that is null, the result of
		bark() with a "field not found" message
*/
1.8 misha 224: Value* VRegex::get_element(const String& aname) {
225: if(aname == REGEX_PATTERN_NAME)
1.26 ! moko 226: return new VString(fpattern);
1.8 misha 227:
// NOTE(review): foptions_cstr is 0 when set() was called without options; whether
// VString accepts a null C string is not visible here -- confirm.
228: if(aname == REGEX_OPTIONS_NAME)
1.26 ! moko 229: return new VString(foptions_cstr);
1.8 misha 230:
1.16 misha 231: // method (if any)
1.8 misha 232: if(Value* result=VStateless_object::get_element(aname))
233: return result;
234:
// nothing matched: hand the name to bark() (presumably reports the error -- confirm)
1.23 moko 235: return bark("%s field not found", &aname);
1.8 misha 236: }
E-mail: