diff options
Diffstat (limited to 'modules/regex/regex.cpp')
-rw-r--r-- | modules/regex/regex.cpp | 1562 |
1 files changed, 276 insertions, 1286 deletions
diff --git a/modules/regex/regex.cpp b/modules/regex/regex.cpp index c728657d6b..8afd01e20b 100644 --- a/modules/regex/regex.cpp +++ b/modules/regex/regex.cpp @@ -3,7 +3,7 @@ /*************************************************************************/ /* This file is part of: */ /* GODOT ENGINE */ -/* http://www.godotengine.org */ +/* https://godotengine.org */ /*************************************************************************/ /* Copyright (c) 2007-2017 Juan Linietsky, Ariel Manzur. */ /* Copyright (c) 2014-2017 Godot Engine contributors (cf. AUTHORS.md) */ @@ -29,1479 +29,469 @@ /*************************************************************************/ #include "regex.h" -#include <wchar.h> -#include <wctype.h> - -static int RegEx_hex2int(const CharType c) { - if ('0' <= c && c <= '9') - return int(c - '0'); - else if ('a' <= c && c <= 'f') - return int(c - 'a') + 10; - else if ('A' <= c && c <= 'F') - return int(c - 'A') + 10; - return -1; -} - -struct RegExSearch { - - Ref<RegExMatch> match; - const CharType *str; - int end; - int eof; - - // For standard quantifier behaviour, test_parent is used to check the - // rest of the pattern. If the pattern matches, to prevent the parent - // from testing again, the complete flag is used as a shortcut out. - bool complete; - - // With lookahead, the position needs to rewind to its starting position - // when test_parent is used. Due to functional programming, this state - // has to be kept as a parameter. - Vector<int> lookahead_pos; - - CharType at(int p_pos) { - return str[p_pos]; - } - - RegExSearch(Ref<RegExMatch> &p_match, int p_end, int p_lookahead) - : match(p_match) { - - str = p_match->string.c_str(); - end = p_end; - eof = p_match->string.length(); - complete = false; - lookahead_pos.resize(p_lookahead); - } -}; - -struct RegExNode { - - RegExNode *next; - RegExNode *previous; - RegExNode *parent; - bool quantifiable; - int length; - - RegExNode() { - - next = NULL; - previous = NULL; - parent = NULL; - quantifiable = false; - length = -1; - } - - virtual ~RegExNode() { - - if (next) - memdelete(next); - } +#include "core/os/memory.h" - // For avoiding RTTI - virtual bool is_look_behind() { return false; } - - virtual int test(RegExSearch &s, int pos) const { - - return next ? next->test(s, pos) : -1; - } - - virtual int test_parent(RegExSearch &s, int pos) const { - - if (next) - pos = next->test(s, pos); - - if (pos >= 0) { - s.complete = true; - if (parent) - pos = parent->test_parent(s, pos); - } - - if (pos < 0) - s.complete = false; - - return pos; - } - - void increment_length(int amount, bool subtract = false) { - - if (amount >= 0 && length >= 0) { - if (!subtract) - length += amount; - else - length -= amount; - } else { - length = -1; - } +extern "C" { +#include <pcre2.h> +} - if (parent) - parent->increment_length(amount, subtract); - } -}; +static void *_regex_malloc(PCRE2_SIZE size, void *user) { -struct RegExNodeChar : public RegExNode { + return memalloc(size); +} - CharType ch; +static void _regex_free(void *ptr, void *user) { - RegExNodeChar(CharType p_char) { + memfree(ptr); +} - length = 1; - quantifiable = true; - ch = p_char; - } +int RegExMatch::_find(const Variant &p_name) const { - virtual int test(RegExSearch &s, int pos) const { + if (p_name.is_num()) { - if (s.end <= pos || 0 > pos || s.at(pos) != ch) + int i = (int)p_name; + if (i >= data.size()) return -1; + return i; - return next ? next->test(s, pos + 1) : pos + 1; - } + } else if (p_name.get_type() == Variant::STRING) { - static CharType parse_escape(const CharType *&c) { - - int point = 0; - switch (c[1]) { - case 'x': - for (int i = 2; i <= 3; ++i) { - int res = RegEx_hex2int(c[i]); - if (res == -1) - return '\0'; - point = (point << 4) + res; - } - c = &c[3]; - return CharType(point); - case 'u': - for (int i = 2; i <= 5; ++i) { - int res = RegEx_hex2int(c[i]); - if (res == -1) - return '\0'; - point = (point << 4) + res; - } - c = &c[5]; - return CharType(point); - case '0': ++c; return '\0'; - case 'a': ++c; return '\a'; - case 'e': ++c; return '\e'; - case 'f': ++c; return '\f'; - case 'n': ++c; return '\n'; - case 'r': ++c; return '\r'; - case 't': ++c; return '\t'; - case 'v': ++c; return '\v'; - case 'b': ++c; return '\b'; - default: break; - } - return (++c)[0]; + const Map<String, int>::Element *found = names.find((String)p_name); + if (found) + return found->value(); } -}; -struct RegExNodeRange : public RegExNode { + return -1; +} - CharType start; - CharType end; +String RegExMatch::get_subject() const { - RegExNodeRange(CharType p_start, CharType p_end) { + return subject; +} - length = 1; - quantifiable = true; - start = p_start; - end = p_end; - } +int RegExMatch::get_group_count() const { - virtual int test(RegExSearch &s, int pos) const { + if (data.size() == 0) + return 0; + return data.size() - 1; +} - if (s.end <= pos || 0 > pos) - return -1; +Dictionary RegExMatch::get_names() const { - CharType c = s.at(pos); - if (c < start || end < c) - return -1; + Dictionary result; - return next ? next->test(s, pos + 1) : pos + 1; + for (const Map<String, int>::Element *i = names.front(); i != NULL; i = i->next()) { + result[i->key()] = i->value(); } -}; - -struct RegExNodeShorthand : public RegExNode { - CharType repr; + return result; +} - RegExNodeShorthand(CharType p_repr) { +Array RegExMatch::get_strings() const { - length = 1; - quantifiable = true; - repr = p_repr; - } + Array result; - virtual int test(RegExSearch &s, int pos) const { + int size = data.size(); - if (s.end <= pos || 0 > pos) - return -1; + for (int i = 0; i < size; i++) { - bool found = false; - bool invert = false; - CharType c = s.at(pos); - switch (repr) { - case '.': - found = true; - break; - case 'W': - invert = true; - case 'w': - found = (c == '_' || iswalnum(c) != 0); - break; - case 'D': - invert = true; - case 'd': - found = ('0' <= c && c <= '9'); - break; - case 'S': - invert = true; - case 's': - found = (iswspace(c) != 0); - break; - default: - break; - } + int start = data[i].start; - if (found == invert) - return -1; - - return next ? next->test(s, pos + 1) : pos + 1; - } -}; - -struct RegExNodeClass : public RegExNode { - - enum Type { - Type_none, - Type_alnum, - Type_alpha, - Type_ascii, - Type_blank, - Type_cntrl, - Type_digit, - Type_graph, - Type_lower, - Type_print, - Type_punct, - Type_space, - Type_upper, - Type_xdigit, - Type_word - }; - - Type type; - - bool test_class(CharType c) const { - - static Vector<CharType> REGEX_NODE_SPACE = String(" \t\r\n\f"); - static Vector<CharType> REGEX_NODE_PUNCT = String("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"); - - switch (type) { - case Type_alnum: - if ('0' <= c && c <= '9') return true; - if ('a' <= c && c <= 'z') return true; - if ('A' <= c && c <= 'Z') return true; - return false; - case Type_alpha: - if ('a' <= c && c <= 'z') return true; - if ('A' <= c && c <= 'Z') return true; - return false; - case Type_ascii: - return (0x00 <= c && c <= 0x7F); - case Type_blank: - return (c == ' ' || c == '\t'); - case Type_cntrl: - return ((0x00 <= c && c <= 0x1F) || c == 0x7F); - case Type_digit: - return ('0' <= c && c <= '9'); - case Type_graph: - return (0x20 < c && c < 0x7F); - case Type_lower: - return ('a' <= c && c <= 'z'); - case Type_print: - return (0x20 < c && c < 0x7f); - case Type_punct: - return (REGEX_NODE_PUNCT.find(c) >= 0); - case Type_space: - return (REGEX_NODE_SPACE.find(c) >= 0); - case Type_upper: - return ('A' <= c && c <= 'Z'); - case Type_xdigit: - if ('0' <= c && c <= '9') return true; - if ('a' <= c && c <= 'f') return true; - if ('A' <= c && c <= 'F') return true; - return false; - case Type_word: - if ('0' <= c && c <= '9') return true; - if ('a' <= c && c <= 'z') return true; - if ('A' <= c && c <= 'Z') return true; - return (c == '_'); - default: - return false; + if (start == -1) { + result.append(String()); + continue; } - return false; - } - - RegExNodeClass(Type p_type) { - - length = 1; - quantifiable = true; - type = p_type; - } - - virtual int test(RegExSearch &s, int pos) const { - - if (s.end <= pos || 0 > pos) - return -1; - - if (!test_class(s.at(pos))) - return -1; - - return next ? next->test(s, pos + 1) : pos + 1; - } - -#define REGEX_CMP_CLASS(POS, NAME) \ - if (cmp_class(POS, #NAME)) return Type_##NAME - - static Type parse_type(const CharType *&p_pos) { - - REGEX_CMP_CLASS(p_pos, alnum); - REGEX_CMP_CLASS(p_pos, alpha); - REGEX_CMP_CLASS(p_pos, ascii); - REGEX_CMP_CLASS(p_pos, blank); - REGEX_CMP_CLASS(p_pos, cntrl); - REGEX_CMP_CLASS(p_pos, digit); - REGEX_CMP_CLASS(p_pos, graph); - REGEX_CMP_CLASS(p_pos, lower); - REGEX_CMP_CLASS(p_pos, print); - REGEX_CMP_CLASS(p_pos, punct); - REGEX_CMP_CLASS(p_pos, space); - REGEX_CMP_CLASS(p_pos, upper); - REGEX_CMP_CLASS(p_pos, xdigit); - REGEX_CMP_CLASS(p_pos, word); - return Type_none; - } - - static bool cmp_class(const CharType *&p_pos, const char *p_text) { - - unsigned int i = 0; - for (i = 0; p_text[i] != '\0'; ++i) - if (p_pos[i] != p_text[i]) - return false; - - if (p_pos[i++] != ':' || p_pos[i] != ']') - return false; - - p_pos = &p_pos[i]; - return true; - } -}; - -struct RegExNodeAnchorStart : public RegExNode { - RegExNodeAnchorStart() { + int length = data[i].end - start; - length = 0; + result.append(subject.substr(start, length)); } - virtual int test(RegExSearch &s, int pos) const { + return result; +} - if (pos != 0) - return -1; +String RegExMatch::get_string(const Variant &p_name) const { - return next ? next->test(s, pos) : pos; - } -}; + int id = _find(p_name); -struct RegExNodeAnchorEnd : public RegExNode { + if (id < 0) + return String(); - RegExNodeAnchorEnd() { + int start = data[id].start; - length = 0; - } + if (start == -1) + return String(); - virtual int test(RegExSearch &s, int pos) const { + int length = data[id].end - start; - if (pos != s.eof) - return -1; + return subject.substr(start, length); +} - return next ? next->test(s, pos) : pos; - } -}; +int RegExMatch::get_start(const Variant &p_name) const { -struct RegExNodeWordBoundary : public RegExNode { + int id = _find(p_name); - bool inverse; + if (id < 0) + return -1; - RegExNodeWordBoundary(bool p_inverse) { + return data[id].start; +} - length = 0; - inverse = p_inverse; - } +int RegExMatch::get_end(const Variant &p_name) const { - virtual int test(RegExSearch &s, int pos) const { + int id = _find(p_name); - bool left = false; - bool right = false; + if (id < 0) + return -1; - if (pos != 0) { - CharType c = s.at(pos - 1); - if (c == '_' || iswalnum(c)) - left = true; - } + return data[id].end; +} - if (pos != s.eof) { - CharType c = s.at(pos); - if (c == '_' || iswalnum(c)) - right = true; - } +void RegExMatch::_bind_methods() { - if ((left == right) != inverse) - return -1; + ClassDB::bind_method(D_METHOD("get_subject"), &RegExMatch::get_subject); + ClassDB::bind_method(D_METHOD("get_group_count"), &RegExMatch::get_group_count); + ClassDB::bind_method(D_METHOD("get_names"), &RegExMatch::get_names); + ClassDB::bind_method(D_METHOD("get_strings"), &RegExMatch::get_strings); + ClassDB::bind_method(D_METHOD("get_string", "name"), &RegExMatch::get_string, DEFVAL(0)); + ClassDB::bind_method(D_METHOD("get_start", "name"), &RegExMatch::get_start, DEFVAL(0)); + ClassDB::bind_method(D_METHOD("get_end", "name"), &RegExMatch::get_end, DEFVAL(0)); +} - return next ? next->test(s, pos) : pos; - } -}; +void RegEx::_pattern_info(uint32_t what, void *where) const { -struct RegExNodeQuantifier : public RegExNode { + if (sizeof(CharType) == 2) { - int min; - int max; - bool greedy; - RegExNode *child; + pcre2_pattern_info_16((pcre2_code_16 *)code, what, where); - RegExNodeQuantifier(int p_min, int p_max) { + } else { - min = p_min; - max = p_max; - greedy = true; - child = NULL; + pcre2_pattern_info_32((pcre2_code_32 *)code, what, where); } +} - ~RegExNodeQuantifier() { - - if (child) - memdelete(child); - } +void RegEx::clear() { - virtual int test(RegExSearch &s, int pos) const { + if (sizeof(CharType) == 2) { - return test_step(s, pos, 0, pos); - } + if (code) + pcre2_code_free_16((pcre2_code_16 *)code); - virtual int test_parent(RegExSearch &s, int pos) const { + } else { - s.complete = false; - return pos; + if (code) + pcre2_code_free_32((pcre2_code_32 *)code); } +} - int test_step(RegExSearch &s, int pos, int level, int start) const { - - if (pos > s.end) - return -1; - - if (!greedy && level > min) { - int res = next ? next->test(s, pos) : pos; - if (s.complete) - return res; - - if (res >= 0 && parent->test_parent(s, res) >= 0) - return res; - } - - if (max >= 0 && level > max) - return -1; +Error RegEx::compile(const String &p_pattern) { - int res = pos; - if (level >= 1) { - if (level > min + 1 && pos == start) - return -1; + pattern = p_pattern; + clear(); - res = child->test(s, pos); - if (s.complete) - return res; - } + int err; + PCRE2_SIZE offset; + uint32_t flags = PCRE2_DUPNAMES; - if (res >= 0) { + if (sizeof(CharType) == 2) { - int res_step = test_step(s, res, level + 1, start); - if (res_step >= 0) - return res_step; + pcre2_general_context_16 *gctx = (pcre2_general_context_16 *)general_ctx; + pcre2_compile_context_16 *cctx = pcre2_compile_context_create_16(gctx); + PCRE2_SPTR16 p = (PCRE2_SPTR16)pattern.c_str(); - if (greedy && level >= min) { - if (next) - res = next->test(s, res); - if (s.complete) - return res; + code = pcre2_compile_16(p, pattern.length(), flags, &err, &offset, cctx); - if (res >= 0 && parent->test_parent(s, res) >= 0) - return res; - } + if (!code) { + PCRE2_UCHAR16 buf[256]; + pcre2_get_error_message_16(err, buf, 256); + String message = String::num(offset) + ": " + String((const CharType *)buf); + ERR_PRINT(message.utf8()); + return FAILED; } - return -1; - } -}; - -struct RegExNodeBackReference : public RegExNode { - int id; + } else { - RegExNodeBackReference(int p_id) { - - length = -1; - quantifiable = true; - id = p_id; - } + pcre2_general_context_32 *gctx = (pcre2_general_context_32 *)general_ctx; + pcre2_compile_context_32 *cctx = pcre2_compile_context_create_32(gctx); + PCRE2_SPTR32 p = (PCRE2_SPTR32)pattern.c_str(); - virtual int test(RegExSearch &s, int pos) const { + code = pcre2_compile_32(p, pattern.length(), flags, &err, &offset, cctx); - RegExMatch::Group &ref = s.match->captures[id]; - for (int i = 0; i < ref.length; ++i) { - - if (pos + i >= s.end) - return -1; - - if (s.at(ref.start + i) != s.at(pos + i)) - return -1; + if (!code) { + PCRE2_UCHAR32 buf[256]; + pcre2_get_error_message_32(err, buf, 256); + String message = String::num(offset) + ": " + String((const CharType *)buf); + ERR_PRINT(message.utf8()); + return FAILED; } - return next ? next->test(s, pos + ref.length) : pos + ref.length; } -}; - -struct RegExNodeGroup : public RegExNode { - - bool inverse; - bool reset_pos; - Vector<RegExNode *> childset; - RegExNode *back; - - RegExNodeGroup() { - - length = 0; - quantifiable = true; - inverse = false; - reset_pos = false; - back = NULL; - } - - virtual ~RegExNodeGroup() { - - for (int i = 0; i < childset.size(); ++i) - memdelete(childset[i]); - } - - virtual void test_success(RegExSearch &s, int pos) const { + return OK; +} - return; - } +Ref<RegExMatch> RegEx::search(const String &p_subject, int p_offset, int p_end) const { - virtual int test(RegExSearch &s, int pos) const { + ERR_FAIL_COND_V(!is_valid(), NULL); - for (int i = 0; i < childset.size(); ++i) { + Ref<RegExMatch> result = memnew(RegExMatch); - s.complete = false; + int length = p_subject.length(); + if (p_end >= 0 && p_end < length) + length = p_end; - int res = childset[i]->test(s, pos); + if (sizeof(CharType) == 2) { - if (inverse) { - s.complete = false; - if (res < 0) - res = pos + 1; - else - return -1; + pcre2_code_16 *c = (pcre2_code_16 *)code; + pcre2_general_context_16 *gctx = (pcre2_general_context_16 *)general_ctx; + pcre2_match_context_16 *mctx = pcre2_match_context_create_16(gctx); + PCRE2_SPTR16 s = (PCRE2_SPTR16)p_subject.c_str(); - if (i + 1 < childset.size()) - continue; - } + pcre2_match_data_16 *match = pcre2_match_data_create_from_pattern_16(c, gctx); - if (s.complete) - return res; + int res = pcre2_match_16(c, s, length, p_offset, 0, match, mctx); - if (res >= 0) { - if (reset_pos) - res = pos; - this->test_success(s, res); - return next ? next->test(s, res) : res; - } + if (res < 0) { + pcre2_match_data_free_16(match); + return NULL; } - return -1; - } - - void add_child(RegExNode *node) { - node->parent = this; - node->previous = back; + uint32_t size = pcre2_get_ovector_count_16(match); + PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_16(match); - if (back) - back->next = node; - else - childset.push_back(node); + result->data.resize(size); - increment_length(node->length); + for (uint32_t i = 0; i < size; i++) { - back = node; - } - - void add_childset() { - - if (childset.size() > 0) - length = -1; - back = NULL; - } - - RegExNode *swap_back(RegExNode *node) { - - RegExNode *old = back; - - if (old) { - if (!old->previous) - childset.remove(childset.size() - 1); - back = old->previous; - increment_length(old->length, true); + result->data[i].start = ovector[i * 2]; + result->data[i].end = ovector[i * 2 + 1]; } - add_child(node); + pcre2_match_data_free_16(match); + pcre2_match_context_free_16(mctx); - return old; - } -}; + } else { -struct RegExNodeCapturing : public RegExNodeGroup { + pcre2_code_32 *c = (pcre2_code_32 *)code; + pcre2_general_context_32 *gctx = (pcre2_general_context_32 *)general_ctx; + pcre2_match_context_32 *mctx = pcre2_match_context_create_32(gctx); + PCRE2_SPTR32 s = (PCRE2_SPTR32)p_subject.c_str(); - int id; + pcre2_match_data_32 *match = pcre2_match_data_create_from_pattern_32(c, gctx); - RegExNodeCapturing(int p_id = 0) { + int res = pcre2_match_32(c, s, length, p_offset, 0, match, mctx); - id = p_id; - } - - virtual void test_success(RegExSearch &s, int pos) const { - - RegExMatch::Group &ref = s.match->captures[id]; - ref.length = pos - ref.start; - } - - virtual int test(RegExSearch &s, int pos) const { - - RegExMatch::Group &ref = s.match->captures[id]; - int old_start = ref.start; - ref.start = pos; - - int res = RegExNodeGroup::test(s, pos); - - if (res < 0) - ref.start = old_start; - return res; - } - - virtual int test_parent(RegExSearch &s, int pos) const { - - RegExMatch::Group &ref = s.match->captures[id]; - ref.length = pos - ref.start; - return RegExNode::test_parent(s, pos); - } - - static Variant parse_name(const CharType *&c, bool p_allow_numeric) { - - if (c[1] == '0') { - return -1; - } else if ('1' <= c[1] && c[1] <= '9') { - if (!p_allow_numeric) - return -1; - int res = (++c)[0] - '0'; - while ('0' <= c[1] && c[1] <= '9') - res = res * 10 + int((++c)[0] - '0'); - if ((++c)[0] != '>') - return -1; - return res; - } else if (iswalnum(c[1])) { - String res(++c, 1); - while (iswalnum(c[1])) - res += String(++c, 1); - if ((++c)[0] != '>') - return -1; - return res; + if (res < 0) { + pcre2_match_data_free_32(match); + return NULL; } - return -1; - } -}; - -struct RegExNodeLookAhead : public RegExNodeGroup { - - int id; - RegExNodeLookAhead(bool p_inverse, int p_id = 0) { + uint32_t size = pcre2_get_ovector_count_32(match); + PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_32(match); - quantifiable = false; - inverse = p_inverse; - reset_pos = true; - id = p_id; - } - - virtual int test(RegExSearch &s, int pos) const { + result->data.resize(size); - s.lookahead_pos[id] = pos; - return RegExNodeGroup::test(s, pos); - } + for (uint32_t i = 0; i < size; i++) { - virtual int test_parent(RegExSearch &s, int pos) const { + result->data[i].start = ovector[i * 2]; + result->data[i].end = ovector[i * 2 + 1]; + } - return RegExNode::test_parent(s, s.lookahead_pos[id]); + pcre2_match_data_free_32(match); + pcre2_match_context_free_32(mctx); } -}; -struct RegExNodeLookBehind : public RegExNodeGroup { + result->subject = p_subject; - RegExNodeLookBehind(bool p_inverse, int p_id = 0) { + uint32_t count; + const CharType *table; + uint32_t entry_size; - quantifiable = false; - inverse = p_inverse; - reset_pos = true; - } + _pattern_info(PCRE2_INFO_NAMECOUNT, &count); + _pattern_info(PCRE2_INFO_NAMETABLE, &table); + _pattern_info(PCRE2_INFO_NAMEENTRYSIZE, &entry_size); - virtual bool is_look_behind() { return true; } + for (int i = 0; i < count; i++) { - virtual int test(RegExSearch &s, int pos) const { + CharType id = table[i * entry_size]; + if (result->data[id].start == -1) + continue; + String name = &table[i * entry_size + 1]; + if (result->names.has(name)) + continue; - if (pos < length) - return -1; - return RegExNodeGroup::test(s, pos - length); + result->names.insert(name, id); } -}; -struct RegExNodeBracket : public RegExNode { + return result; +} - bool inverse; - Vector<RegExNode *> children; +String RegEx::sub(const String &p_subject, const String &p_replacement, bool p_all, int p_offset, int p_end) const { - RegExNodeBracket() { + ERR_FAIL_COND_V(!is_valid(), String()); - length = 1; - quantifiable = true; - inverse = false; - } + String output; + output.resize(p_subject.length()); - virtual ~RegExNodeBracket() { + uint32_t flags = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH; + if (p_all) + flags |= PCRE2_SUBSTITUTE_GLOBAL; - for (int i = 0; i < children.size(); ++i) - memdelete(children[i]); - } + PCRE2_SIZE olength = output.length(); - virtual int test(RegExSearch &s, int pos) const { + PCRE2_SIZE length = p_subject.length(); + if (p_end >= 0 && p_end < length) + length = p_end; - for (int i = 0; i < children.size(); ++i) { + if (sizeof(CharType) == 2) { - int res = children[i]->test(s, pos); + pcre2_code_16 *c = (pcre2_code_16 *)code; + pcre2_general_context_16 *gctx = (pcre2_general_context_16 *)general_ctx; + pcre2_match_context_16 *mctx = pcre2_match_context_create_16(gctx); + PCRE2_SPTR16 s = (PCRE2_SPTR16)p_subject.c_str(); + PCRE2_SPTR16 r = (PCRE2_SPTR16)p_replacement.c_str(); + PCRE2_UCHAR16 *o = (PCRE2_UCHAR16 *)output.c_str(); - if (inverse) { - if (res < 0) - res = pos + 1; - else - return -1; + pcre2_match_data_16 *match = pcre2_match_data_create_from_pattern_16(c, gctx); - if (i + 1 < children.size()) - continue; - } + int res = pcre2_substitute_16(c, s, length, p_offset, flags, match, mctx, r, p_replacement.length(), o, &olength); - if (res >= 0) - return next ? next->test(s, res) : res; + if (res == PCRE2_ERROR_NOMEMORY) { + output.resize(olength); + o = (PCRE2_UCHAR16 *)output.c_str(); + res = pcre2_substitute_16(c, s, length, p_offset, flags, match, mctx, r, p_replacement.length(), o, &olength); } - return -1; - } - void add_child(RegExNode *node) { - - node->parent = this; - children.push_back(node); - } + pcre2_match_data_free_16(match); + pcre2_match_context_free_16(mctx); - void pop_back() { - - memdelete(children[children.size() - 1]); - children.remove(children.size() - 1); - } -}; - -#define REGEX_EXPAND_FAIL(MSG) \ - { \ - ERR_PRINT(MSG); \ - return String(); \ - } - -String RegExMatch::expand(const String &p_template) const { - - String res; - for (const CharType *c = p_template.c_str(); *c != '\0'; ++c) { - if (c[0] == '\\') { - if (('1' <= c[1] && c[1] <= '9') || (c[1] == 'g' && c[2] == '{')) { - - int ref = 0; - bool unclosed = false; - - if (c[1] == 'g') { - unclosed = true; - c = &c[2]; - } - - while ('0' <= c[1] && c[1] <= '9') { - ref = ref * 10 + int(c[1] - '0'); - ++c; - } - - if (unclosed) { - if (c[1] != '}') - REGEX_EXPAND_FAIL("unclosed backreference '{'"); - ++c; - } - - res += get_string(ref); - - } else if (c[1] == 'g' && c[2] == '<') { - - const CharType *d = &c[2]; + if (res < 0) + return String(); - Variant name = RegExNodeCapturing::parse_name(d, true); - if (name == Variant(-1)) - REGEX_EXPAND_FAIL("unrecognised character for group name"); + } else { - c = d; + pcre2_code_32 *c = (pcre2_code_32 *)code; + pcre2_general_context_32 *gctx = (pcre2_general_context_32 *)general_ctx; + pcre2_match_context_32 *mctx = pcre2_match_context_create_32(gctx); + PCRE2_SPTR32 s = (PCRE2_SPTR32)p_subject.c_str(); + PCRE2_SPTR32 r = (PCRE2_SPTR32)p_replacement.c_str(); + PCRE2_UCHAR32 *o = (PCRE2_UCHAR32 *)output.c_str(); - res += get_string(name); + pcre2_match_data_32 *match = pcre2_match_data_create_from_pattern_32(c, gctx); - } else { + int res = pcre2_substitute_32(c, s, length, p_offset, flags, match, mctx, r, p_replacement.length(), o, &olength); - const CharType *d = c; - CharType ch = RegExNodeChar::parse_escape(d); - if (c == d) - REGEX_EXPAND_FAIL("invalid escape token"); - res += String(&ch, 1); - c = d; - } - } else { - res += String(c, 1); + if (res == PCRE2_ERROR_NOMEMORY) { + output.resize(olength); + o = (PCRE2_UCHAR32 *)output.c_str(); + res = pcre2_substitute_32(c, s, length, p_offset, flags, match, mctx, r, p_replacement.length(), o, &olength); } - } - return res; -} -int RegExMatch::get_group_count() const { + pcre2_match_data_free_32(match); + pcre2_match_context_free_32(mctx); - int count = 0; - for (int i = 1; i < captures.size(); ++i) - if (captures[i].name.get_type() == Variant::INT) - ++count; - return count; -} - -Array RegExMatch::get_group_array() const { - - Array res; - for (int i = 1; i < captures.size(); ++i) { - const RegExMatch::Group &capture = captures[i]; - if (capture.name.get_type() != Variant::INT) - continue; - - if (capture.start >= 0) - res.push_back(string.substr(capture.start, capture.length)); - else - res.push_back(String()); - } - return res; -} - -Array RegExMatch::get_names() const { - - Array res; - for (int i = 1; i < captures.size(); ++i) - if (captures[i].name.get_type() == Variant::STRING) - res.push_back(captures[i].name); - return res; -} - -Dictionary RegExMatch::get_name_dict() const { - - Dictionary res; - for (int i = 1; i < captures.size(); ++i) { - const RegExMatch::Group &capture = captures[i]; - if (capture.name.get_type() != Variant::STRING) - continue; - - if (capture.start >= 0) - res[capture.name] = string.substr(capture.start, capture.length); - else - res[capture.name] = String(); - } - return res; -} - -String RegExMatch::get_string(const Variant &p_name) const { - - for (int i = 0; i < captures.size(); ++i) { - - const RegExMatch::Group &capture = captures[i]; - - if (capture.name != p_name) - continue; - - if (capture.start == -1) + if (res < 0) return String(); - - return string.substr(capture.start, capture.length); } - return String(); -} - -int RegExMatch::get_start(const Variant &p_name) const { - for (int i = 0; i < captures.size(); ++i) - if (captures[i].name == p_name) - return captures[i].start; - return -1; + return output; } -int RegExMatch::get_end(const Variant &p_name) const { +bool RegEx::is_valid() const { - for (int i = 0; i < captures.size(); ++i) - if (captures[i].name == p_name) - return captures[i].start + captures[i].length; - return -1; + return (code != NULL); } -RegExMatch::RegExMatch() { -} +String RegEx::get_pattern() const { -static bool RegEx_is_shorthand(CharType ch) { - - switch (ch) { - case 'w': - case 'W': - case 'd': - case 'D': - case 's': - case 'S': - return true; - default: - break; - } - return false; + return pattern; } -#define REGEX_COMPILE_FAIL(MSG) \ - { \ - ERR_PRINT(MSG); \ - clear(); \ - return FAILED; \ - } +int RegEx::get_group_count() const { -Error RegEx::compile(const String &p_pattern) { + ERR_FAIL_COND_V(!is_valid(), 0); - ERR_FAIL_COND_V(p_pattern.length() == 0, FAILED); + uint32_t count; - if (pattern == p_pattern && root) - return OK; + _pattern_info(PCRE2_INFO_CAPTURECOUNT, &count); - clear(); - pattern = p_pattern; - group_names.push_back(0); - RegExNodeGroup *root_group = memnew(RegExNodeCapturing(0)); - root = root_group; - Vector<RegExNodeGroup *> stack; - stack.push_back(root_group); - int lookahead_level = 0; - int numeric_groups = 0; - const int numeric_max = 9; - - for (const CharType *c = p_pattern.c_str(); *c != '\0'; ++c) { - - switch (c[0]) { - case '(': - if (c[1] == '?') { - - RegExNodeGroup *group = NULL; - switch (c[2]) { - case ':': - c = &c[2]; - group = memnew(RegExNodeGroup()); - break; - case '!': - case '=': - group = memnew(RegExNodeLookAhead((c[2] == '!'), lookahead_level++)); - if (lookahead_depth < lookahead_level) - lookahead_depth = lookahead_level; - c = &c[2]; - break; - case '<': - if (c[3] == '!' || c[3] == '=') { - group = memnew(RegExNodeLookBehind((c[3] == '!'), lookahead_level++)); - c = &c[3]; - } - break; - case 'P': - if (c[3] == '<') { - const CharType *d = &c[3]; - Variant name = RegExNodeCapturing::parse_name(d, false); - if (name == Variant(-1)) - REGEX_COMPILE_FAIL("unrecognised character for group name"); - group = memnew(RegExNodeCapturing(group_names.size())); - group_names.push_back(name); - c = d; - } - default: - break; - } - if (!group) - REGEX_COMPILE_FAIL("unrecognised qualifier for group"); - stack[0]->add_child(group); - stack.insert(0, group); - - } else if (numeric_groups < numeric_max) { - - RegExNodeCapturing *group = memnew(RegExNodeCapturing(group_names.size())); - group_names.push_back(++numeric_groups); - stack[0]->add_child(group); - stack.insert(0, group); - - } else { - - RegExNodeGroup *group = memnew(RegExNodeGroup()); - stack[0]->add_child(group); - stack.insert(0, group); - } - break; - case ')': - if (stack.size() == 1) - REGEX_COMPILE_FAIL("unexpected ')'"); - stack.remove(0); - break; - case '\\': - if (('1' <= c[1] && c[1] <= '9') || (c[1] == 'g' && c[2] == '{')) { - - int ref = 0; - bool unclosed = false; - - if (c[1] == 'g') { - unclosed = true; - c = &c[2]; - } - - while ('0' <= c[1] && c[1] <= '9') { - ref = ref * 10 + int(c[1] - '0'); - ++c; - } - - if (unclosed) { - if (c[1] != '}') - REGEX_COMPILE_FAIL("unclosed backreference '{'"); - ++c; - } - - if (ref > numeric_groups || ref <= 0) - REGEX_COMPILE_FAIL("backreference not found"); - - for (int i = 0; i < stack.size(); ++i) - if (stack[i]->is_look_behind()) - REGEX_COMPILE_FAIL("backreferences inside lookbehind not supported"); - - for (int i = 0; i < group_names.size(); ++i) { - if (group_names[i].get_type() == Variant::INT && int(group_names[i]) == ref) { - ref = group_names[i]; - break; - } - } - - stack[0]->add_child(memnew(RegExNodeBackReference(ref))); - } - if (c[1] == 'g' && c[2] == '<') { - - const CharType *d = &c[2]; - - Variant name = RegExNodeCapturing::parse_name(d, true); - if (name == Variant(-1)) - REGEX_COMPILE_FAIL("unrecognised character for group name"); - - c = d; - - for (int i = 0; i < stack.size(); ++i) - if (stack[i]->is_look_behind()) - REGEX_COMPILE_FAIL("backreferences inside lookbehind not supported"); - - int ref = -1; - - for (int i = 0; i < group_names.size(); ++i) { - if (group_names[i].get_type() == Variant::INT && int(group_names[i]) == ref) { - ref = group_names[i]; - break; - } - } - - if (ref == -1) - REGEX_COMPILE_FAIL("backreference not found"); - - stack[0]->add_child(memnew(RegExNodeBackReference(ref))); - - } else if (c[1] == 'b' || c[1] == 'B') { - - stack[0]->add_child(memnew(RegExNodeWordBoundary(*(++c) == 'B'))); - - } else if (RegEx_is_shorthand(c[1])) { - - stack[0]->add_child(memnew(RegExNodeShorthand(*(++c)))); - - } else { - - const CharType *d = c; - CharType ch = RegExNodeChar::parse_escape(d); - if (c == d) - REGEX_COMPILE_FAIL("invalid escape token"); - stack[0]->add_child(memnew(RegExNodeChar(ch))); - c = d; - } - break; - case '[': { - RegExNodeBracket *bracket = memnew(RegExNodeBracket()); - stack[0]->add_child(bracket); - if (c[1] == '^') { - bracket->inverse = true; - ++c; - } - bool first_child = true; - CharType previous_child; - bool previous_child_single = false; - while (true) { - ++c; - if (!first_child && c[0] == ']') { - - break; - - } else if (c[0] == '\0') { - - REGEX_COMPILE_FAIL("unclosed bracket expression '['"); - - } else if (c[0] == '\\') { - - if (RegEx_is_shorthand(c[1])) { - bracket->add_child(memnew(RegExNodeShorthand(*(++c)))); - } else { - const CharType *d = c; - CharType ch = RegExNodeChar::parse_escape(d); - if (c == d) - REGEX_COMPILE_FAIL("invalid escape token"); - bracket->add_child(memnew(RegExNodeChar(ch))); - c = d; - previous_child = ch; - previous_child_single = true; - } - - } else if (c[0] == ']' && c[1] == ':') { - - const CharType *d = &c[2]; - RegExNodeClass::Type type = RegExNodeClass::parse_type(d); - if (type != RegExNodeClass::Type_none) { - - c = d; - previous_child_single = false; - - } else { - - bracket->add_child(memnew(RegExNodeChar('['))); - previous_child = '['; - previous_child_single = true; - } - } else if (previous_child_single && c[0] == '-') { - - if (c[1] != '\0' && c[1] != ']') { - - CharType next; - - if (c[1] == '\\') { - const CharType *d = ++c; - next = RegExNodeChar::parse_escape(d); - if (c == d) - REGEX_COMPILE_FAIL("invalid escape token"); - } else { - next = *(++c); - } - - if (next < previous_child) - REGEX_COMPILE_FAIL("text range out of order"); - - bracket->pop_back(); - bracket->add_child(memnew(RegExNodeRange(previous_child, next))); - previous_child_single = false; - } else { - - bracket->add_child(memnew(RegExNodeChar('-'))); - previous_child = '-'; - previous_child_single = true; - } - } else { - - bracket->add_child(memnew(RegExNodeChar(c[0]))); - previous_child = c[0]; - previous_child_single = true; - } - first_child = false; - } - } break; - case '|': - for (int i = 0; i < stack.size(); ++i) - if (stack[i]->is_look_behind()) - REGEX_COMPILE_FAIL("alternations inside lookbehind not supported"); - stack[0]->add_childset(); - break; - case '^': - stack[0]->add_child(memnew(RegExNodeAnchorStart())); - break; - case '$': - stack[0]->add_child(memnew(RegExNodeAnchorEnd())); - break; - case '.': - stack[0]->add_child(memnew(RegExNodeShorthand('.'))); - break; - case '?': - case '*': - case '+': - case '{': { - int min_val = 0; - int max_val = -1; - bool valid = true; - const CharType *d = c; - bool max_set = true; - switch (c[0]) { - case '?': - min_val = 0; - max_val = 1; - break; - case '*': - min_val = 0; - max_val = -1; - break; - case '+': - min_val = 1; - max_val = -1; - break; - case '{': - max_set = false; - while (valid) { - ++d; - if (d[0] == '}') { - break; - } else if (d[0] == ',') { - max_set = true; - } else if ('0' <= d[0] && d[0] <= '9') { - if (max_set) { - if (max_val < 0) - max_val = int(d[0] - '0'); - else - max_val = max_val * 10 + int(d[0] - '0'); - } else { - min_val = min_val * 10 + int(d[0] - '0'); - } - } else { - valid = false; - } - } - break; - default: - break; - } - - if (!max_set) - max_val = min_val; - - if (valid) { - - c = d; - - if (stack[0]->back == NULL || !stack[0]->back->quantifiable) - REGEX_COMPILE_FAIL("element not quantifiable"); - - if (min_val != max_val) - for (int i = 0; i < stack.size(); ++i) - if (stack[i]->is_look_behind()) - REGEX_COMPILE_FAIL("variable length quantifiers inside lookbehind not supported"); - - RegExNodeQuantifier *quant = memnew(RegExNodeQuantifier(min_val, max_val)); - quant->child = stack[0]->swap_back(quant); - quant->child->previous = NULL; - quant->child->parent = quant; - - if (min_val == max_val && quant->child->length >= 0) - quant->length = max_val * quant->child->length; - - if (c[1] == '?') { - quant->greedy = false; - ++c; - } - break; - } - } - default: - stack[0]->add_child(memnew(RegExNodeChar(c[0]))); - break; - } - } - if (stack.size() > 1) - REGEX_COMPILE_FAIL("unclosed group '('"); - return OK; + return count; } -Ref<RegExMatch> RegEx::search(const String &p_text, int p_start, int p_end) const { - - ERR_FAIL_COND_V(!is_valid(), NULL); - ERR_FAIL_COND_V(p_start < 0, NULL); - ERR_FAIL_COND_V(p_start >= p_text.length(), NULL); - ERR_FAIL_COND_V(p_end > p_text.length(), NULL); - ERR_FAIL_COND_V(p_end != -1 && p_end < p_start, NULL); +Array RegEx::get_names() const { - Ref<RegExMatch> res = memnew(RegExMatch()); + Array result; - for (int i = 0; i < group_names.size(); ++i) { - RegExMatch::Group group; - group.name = group_names[i]; - res->captures.push_back(group); - } + ERR_FAIL_COND_V(!is_valid(), result); - res->string = p_text; + uint32_t count; + const CharType *table; + uint32_t entry_size; - if (p_end == -1) - p_end = p_text.length(); + _pattern_info(PCRE2_INFO_NAMECOUNT, &count); + _pattern_info(PCRE2_INFO_NAMETABLE, &table); + _pattern_info(PCRE2_INFO_NAMEENTRYSIZE, &entry_size); - RegExSearch s(res, p_end, lookahead_depth); + for (int i = 0; i < count; i++) { - for (int i = p_start; i <= s.end; ++i) { - for (int c = 0; c < group_names.size(); ++c) { - res->captures[c].start = -1; - res->captures[c].length = 0; + String name = &table[i * entry_size + 1]; + if (result.find(name) < 0) { + result.append(name); } - if (root->test(s, i) >= 0) - break; } - if (res->captures[0].start >= 0) - return res; - return NULL; + return result; } -String RegEx::sub(const String &p_text, const String &p_replacement, bool p_all, int p_start, int p_end) const { - - ERR_FAIL_COND_V(!is_valid(), p_text); - ERR_FAIL_COND_V(p_start < 0, p_text); - ERR_FAIL_COND_V(p_start >= p_text.length(), p_text); - ERR_FAIL_COND_V(p_end > p_text.length(), p_text); - ERR_FAIL_COND_V(p_end != -1 && p_end < p_start, p_text); - - String text = p_text; - int start = p_start; - - if (p_end == -1) - p_end = p_text.length(); - - while (start < text.length() && (p_all || start == p_start)) { - - Ref<RegExMatch> m = search(text, start, p_end); - - RegExMatch::Group &s = m->captures[0]; - - if (s.start < 0) - break; - - String res = text.substr(0, s.start) + m->expand(p_replacement); - - start = res.length(); +RegEx::RegEx() { - if (s.length == 0) - ++start; + if (sizeof(CharType) == 2) { - int sub_end = s.start + s.length; - if (sub_end < text.length()) - res += text.substr(sub_end, text.length() - sub_end); + general_ctx = pcre2_general_context_create_16(&_regex_malloc, &_regex_free, NULL); - p_end += res.length() - text.length(); + } else { - text = res; + general_ctx = pcre2_general_context_create_32(&_regex_malloc, &_regex_free, NULL); } - return text; + code = NULL; } -void RegEx::clear() { - - if (root) - memdelete(root); - - root = NULL; - group_names.clear(); - lookahead_depth = 0; -} - -bool RegEx::is_valid() const { - - return (root != NULL); -} - -String RegEx::get_pattern() const { - - return pattern; -} - -int RegEx::get_group_count() const { - - int count = 0; - for (int i = 1; i < group_names.size(); ++i) - if (group_names[i].get_type() == Variant::INT) - ++count; - return count; -} - -Array RegEx::get_names() const { +RegEx::RegEx(const String &p_pattern) { - Array res; - for (int i = 1; i < group_names.size(); ++i) - if (group_names[i].get_type() == Variant::STRING) - res.push_back(group_names[i]); - return res; -} + if (sizeof(CharType) == 2) { -RegEx::RegEx() { + general_ctx = pcre2_general_context_create_16(&_regex_malloc, &_regex_free, NULL); - root = NULL; - lookahead_depth = 0; -} + } else { -RegEx::RegEx(const String &p_pattern) { - - root = NULL; + general_ctx = pcre2_general_context_create_32(&_regex_malloc, &_regex_free, NULL); + } + code = NULL; compile(p_pattern); } RegEx::~RegEx() { - if (root) - memdelete(root); -} + if (sizeof(CharType) == 2) { -void RegExMatch::_bind_methods() { + if (code) + pcre2_code_free_16((pcre2_code_16 *)code); + pcre2_general_context_free_16((pcre2_general_context_16 *)general_ctx); - ClassDB::bind_method(D_METHOD("expand", "template"), &RegExMatch::expand); - ClassDB::bind_method(D_METHOD("get_group_count"), &RegExMatch::get_group_count); - ClassDB::bind_method(D_METHOD("get_group_array"), &RegExMatch::get_group_array); - ClassDB::bind_method(D_METHOD("get_names"), &RegExMatch::get_names); - ClassDB::bind_method(D_METHOD("get_name_dict"), &RegExMatch::get_name_dict); - ClassDB::bind_method(D_METHOD("get_string", "name"), &RegExMatch::get_string, DEFVAL(0)); - ClassDB::bind_method(D_METHOD("get_start", "name"), &RegExMatch::get_start, DEFVAL(0)); - ClassDB::bind_method(D_METHOD("get_end", "name"), &RegExMatch::get_end, DEFVAL(0)); + } else { + + if (code) + pcre2_code_free_32((pcre2_code_32 *)code); + pcre2_general_context_free_32((pcre2_general_context_32 *)general_ctx); + } } void RegEx::_bind_methods() { ClassDB::bind_method(D_METHOD("clear"), &RegEx::clear); ClassDB::bind_method(D_METHOD("compile", "pattern"), &RegEx::compile); - ClassDB::bind_method(D_METHOD("search", "text", "start", "end"), &RegEx::search, DEFVAL(0), DEFVAL(-1)); - ClassDB::bind_method(D_METHOD("sub", "text", "replacement", "all", "start", "end"), &RegEx::sub, DEFVAL(false), DEFVAL(0), DEFVAL(-1)); + ClassDB::bind_method(D_METHOD("search", "subject", "offset", "end"), &RegEx::search, DEFVAL(0), DEFVAL(-1)); + ClassDB::bind_method(D_METHOD("sub", "subject", "replacement", "all", "offset", "end"), &RegEx::sub, DEFVAL(false), DEFVAL(0), DEFVAL(-1)); ClassDB::bind_method(D_METHOD("is_valid"), &RegEx::is_valid); ClassDB::bind_method(D_METHOD("get_pattern"), &RegEx::get_pattern); ClassDB::bind_method(D_METHOD("get_group_count"), &RegEx::get_group_count); ClassDB::bind_method(D_METHOD("get_names"), &RegEx::get_names); - - ADD_PROPERTY(PropertyInfo(Variant::STRING, "pattern"), "compile", "get_pattern"); } |