diff options
Diffstat (limited to 'modules/regex/regex.cpp')
-rw-r--r-- | modules/regex/regex.cpp | 1502 |
1 files changed, 1502 insertions, 0 deletions
diff --git a/modules/regex/regex.cpp b/modules/regex/regex.cpp new file mode 100644 index 0000000000..a0f4b4934c --- /dev/null +++ b/modules/regex/regex.cpp @@ -0,0 +1,1502 @@ +/*************************************************************************/ +/* regex.cpp */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* http://www.godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2016 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/*************************************************************************/ + +#include "regex.h" +#include <wctype.h> +#include <wchar.h> + +static int RegEx_hex2int(const CharType c) +{ + if ('0' <= c && c <= '9') + return int(c - '0'); + else if ('a' <= c && c <= 'f') + return int(c - 'a') + 10; + else if ('A' <= c && c <= 'F') + return int(c - 'A') + 10; + return -1; +} + +struct RegExSearch { + + Ref<RegExMatch> match; + const CharType* str; + int end; + int eof; + + // For standard quantifier behaviour, test_parent is used to check the + // rest of the pattern. If the pattern matches, to prevent the parent + // from testing again, the complete flag is used as a shortcut out. + bool complete; + + // With lookahead, the position needs to rewind to its starting position + // when test_parent is used. Due to functional programming, this state + // has to be kept as a parameter. + Vector<int> lookahead_pos; + + CharType at(int p_pos) { + return str[p_pos]; + } + + RegExSearch(Ref<RegExMatch>& p_match, int p_end, int p_lookahead) : match(p_match) { + + str = p_match->string.c_str(); + end = p_end; + eof = p_match->string.length(); + complete = false; + lookahead_pos.resize(p_lookahead); + } + +}; + +struct RegExNode { + + RegExNode* next; + RegExNode* previous; + RegExNode* parent; + bool quantifiable; + int length; + + RegExNode() { + + next = NULL; + previous = NULL; + parent = NULL; + quantifiable = false; + length = -1; + } + + virtual ~RegExNode() { + + if (next) + memdelete(next); + } + + virtual int test(RegExSearch& s, int pos) const { + + return next ? next->test(s, pos) : -1; + } + + virtual int test_parent(RegExSearch& s, int pos) const { + + if (next) + pos = next->test(s, pos); + + if (pos >= 0) { + s.complete = true; + if (parent) + pos = parent->test_parent(s, pos); + } + + if (pos < 0) + s.complete = false; + + return pos; + } + + void increment_length(int amount, bool subtract = false) { + + if (amount >= 0 && length >= 0) { + if (!subtract) + length += amount; + else + length -= amount; + } else { + length = -1; + } + + if (parent) + parent->increment_length(amount, subtract); + + } + +}; + +struct RegExNodeChar : public RegExNode { + + CharType ch; + + RegExNodeChar(CharType p_char) { + + length = 1; + quantifiable = true; + ch = p_char; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (s.end <= pos || 0 > pos || s.at(pos) != ch) + return -1; + + return next ? next->test(s, pos + 1) : pos + 1; + } + + static CharType parse_escape(const CharType*& c) { + + int point = 0; + switch (c[1]) { + case 'x': + for (int i = 2; i <= 3; ++i) { + int res = RegEx_hex2int(c[i]); + if (res == -1) + return '\0'; + point = (point << 4) + res; + } + c = &c[3]; + return CharType(point); + case 'u': + for (int i = 2; i <= 5; ++i) { + int res = RegEx_hex2int(c[i]); + if (res == -1) + return '\0'; + point = (point << 4) + res; + } + c = &c[5]; + return CharType(point); + case '0': ++c; return '\0'; + case 'a': ++c; return '\a'; + case 'e': ++c; return '\e'; + case 'f': ++c; return '\f'; + case 'n': ++c; return '\n'; + case 'r': ++c; return '\r'; + case 't': ++c; return '\t'; + case 'v': ++c; return '\v'; + case 'b': ++c; return '\b'; + default: break; + } + return (++c)[0]; + } +}; + +struct RegExNodeRange : public RegExNode { + + CharType start; + CharType end; + + RegExNodeRange(CharType p_start, CharType p_end) { + + length = 1; + quantifiable = true; + start = p_start; + end = p_end; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (s.end <= pos || 0 > pos) + return -1; + + CharType c = s.at(pos); + if (c < start || end < c) + return -1; + + return next ? next->test(s, pos + 1) : pos + 1; + } +}; + +struct RegExNodeShorthand : public RegExNode { + + CharType repr; + + RegExNodeShorthand(CharType p_repr) { + + length = 1; + quantifiable = true; + repr = p_repr; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (s.end <= pos || 0 > pos) + return -1; + + bool found = false; + bool invert = false; + CharType c = s.at(pos); + switch (repr) { + case '.': + found = true; + break; + case 'W': + invert = true; + case 'w': + found = (c == '_' || iswalnum(c) != 0); + break; + case 'D': + invert = true; + case 'd': + found = ('0' <= c && c <= '9'); + break; + case 'S': + invert = true; + case 's': + found = (iswspace(c) != 0); + break; + default: + break; + } + + if (found == invert) + return -1; + + return next ? next->test(s, pos + 1) : pos + 1; + } +}; + +struct RegExNodeClass : public RegExNode { + + enum Type { + Type_none, + Type_alnum, + Type_alpha, + Type_ascii, + Type_blank, + Type_cntrl, + Type_digit, + Type_graph, + Type_lower, + Type_print, + Type_punct, + Type_space, + Type_upper, + Type_xdigit, + Type_word + }; + + Type type; + + bool test_class(CharType c) const { + + static Vector<CharType> REGEX_NODE_SPACE = String(" \t\r\n\f"); + static Vector<CharType> REGEX_NODE_PUNCT = String("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"); + + switch (type) { + case Type_alnum: + if ('0' <= c && c <= '9') return true; + if ('a' <= c && c <= 'z') return true; + if ('A' <= c && c <= 'Z') return true; + return false; + case Type_alpha: + if ('a' <= c && c <= 'z') return true; + if ('A' <= c && c <= 'Z') return true; + return false; + case Type_ascii: + return (0x00 <= c && c <= 0x7F); + case Type_blank: + return (c == ' ' || c == '\t'); + case Type_cntrl: + return ((0x00 <= c && c <= 0x1F) || c == 0x7F); + case Type_digit: + return ('0' <= c && c <= '9'); + case Type_graph: + return (0x20 < c && c < 0x7F); + case Type_lower: + return ('a' <= c && c <= 'z'); + case Type_print: + return (0x1F < c && c < 0x1F); + case Type_punct: + return (REGEX_NODE_PUNCT.find(c) >= 0); + case Type_space: + return (REGEX_NODE_SPACE.find(c) >= 0); + case Type_upper: + return ('A' <= c && c <= 'Z'); + case Type_xdigit: + if ('0' <= c && c <= '9') return true; + if ('a' <= c && c <= 'f') return true; + if ('A' <= c && c <= 'F') return true; + return false; + case Type_word: + if ('0' <= c && c <= '9') return true; + if ('a' <= c && c <= 'z') return true; + if ('A' <= c && c <= 'Z') return true; + return (c == '_'); + default: + return false; + } + return false; + } + + RegExNodeClass(Type p_type) { + + length = 1; + quantifiable = true; + type = p_type; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (s.end <= pos || 0 > pos) + return -1; + + if (!test_class(s.at(pos))) + return -1; + + return next ? next->test(s, pos + 1) : pos + 1; + } + +#define REGEX_CMP_CLASS(POS, NAME) if (cmp_class(POS, #NAME)) return Type_ ## NAME + + static Type parse_type(const CharType*& p_pos) { + + REGEX_CMP_CLASS(p_pos, alnum); + REGEX_CMP_CLASS(p_pos, alpha); + REGEX_CMP_CLASS(p_pos, ascii); + REGEX_CMP_CLASS(p_pos, blank); + REGEX_CMP_CLASS(p_pos, cntrl); + REGEX_CMP_CLASS(p_pos, digit); + REGEX_CMP_CLASS(p_pos, graph); + REGEX_CMP_CLASS(p_pos, lower); + REGEX_CMP_CLASS(p_pos, print); + REGEX_CMP_CLASS(p_pos, punct); + REGEX_CMP_CLASS(p_pos, space); + REGEX_CMP_CLASS(p_pos, upper); + REGEX_CMP_CLASS(p_pos, xdigit); + REGEX_CMP_CLASS(p_pos, word); + return Type_none; + } + + static bool cmp_class(const CharType*& p_pos, const char* p_text) { + + unsigned int i = 0; + for (i = 0; p_text[i] != '\0'; ++i) + if (p_pos[i] != p_text[i]) + return false; + + if (p_pos[i++] != ':' || p_pos[i] != ']') + return false; + + p_pos = &p_pos[i]; + return true; + } +}; + +struct RegExNodeAnchorStart : public RegExNode { + + RegExNodeAnchorStart() { + + length = 0; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (pos != 0) + return -1; + + return next ? next->test(s, pos) : pos; + } +}; + +struct RegExNodeAnchorEnd : public RegExNode { + + RegExNodeAnchorEnd() { + + length = 0; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (pos != s.eof) + return -1; + + return next ? next->test(s, pos) : pos; + } +}; + +struct RegExNodeWordBoundary : public RegExNode { + + bool inverse; + + RegExNodeWordBoundary(bool p_inverse) { + + length = 0; + inverse = p_inverse; + } + + virtual int test(RegExSearch& s, int pos) const { + + bool left = false; + bool right = false; + + if (pos != 0) { + CharType c = s.at(pos - 1); + if (c == '_' || iswalnum(c)) + left = true; + } + + if (pos != s.eof) { + CharType c = s.at(pos); + if (c == '_' || iswalnum(c)) + right = true; + } + + if ((left == right) != inverse) + return -1; + + return next ? next->test(s, pos) : pos; + } +}; + +struct RegExNodeQuantifier : public RegExNode { + + int min; + int max; + bool greedy; + RegExNode* child; + + RegExNodeQuantifier(int p_min, int p_max) { + + min = p_min; + max = p_max; + greedy = true; + child = NULL; + } + + ~RegExNodeQuantifier() { + + if (child) + memdelete(child); + } + + virtual int test(RegExSearch& s, int pos) const { + + return test_step(s, pos, 0, pos); + } + + virtual int test_parent(RegExSearch& s, int pos) const { + + s.complete = false; + return pos; + } + + int test_step(RegExSearch& s, int pos, int level, int start) const { + + if (pos > s.end) + return -1; + + if (!greedy && level > min) { + int res = next ? next->test(s, pos) : pos; + if (s.complete) + return res; + + if (res >= 0 && parent->test_parent(s, res) >= 0) + return res; + } + + if (max >= 0 && level > max) + return -1; + + int res = pos; + if (level >= 1) { + if (level > min + 1 && pos == start) + return -1; + + res = child->test(s, pos); + if (s.complete) + return res; + } + + if (res >= 0) { + + int res_step = test_step(s, res, level + 1, start); + if (res_step >= 0) + return res_step; + + if (greedy && level >= min) { + if (next) + res = next->test(s, res); + if (s.complete) + return res; + + if (res >= 0 && parent->test_parent(s, res) >= 0) + return res; + } + } + return -1; + } +}; + +struct RegExNodeBackReference : public RegExNode { + + int id; + + RegExNodeBackReference(int p_id) { + + length = -1; + quantifiable = true; + id = p_id; + } + + virtual int test(RegExSearch& s, int pos) const { + + RegExMatch::Group& ref = s.match->captures[id]; + for (int i = 0; i < ref.length; ++i) { + + if (pos + i >= s.end) + return -1; + + if (s.at(ref.start + i) != s.at(pos + i)) + return -1; + } + return next ? next->test(s, pos + ref.length) : pos + ref.length; + } +}; + + +struct RegExNodeGroup : public RegExNode { + + bool inverse; + bool reset_pos; + Vector<RegExNode*> childset; + RegExNode* back; + + RegExNodeGroup() { + + length = 0; + quantifiable = true; + inverse = false; + reset_pos = false; + back = NULL; + } + + virtual ~RegExNodeGroup() { + + for (int i = 0; i < childset.size(); ++i) + memdelete(childset[i]); + } + + virtual int test(RegExSearch& s, int pos) const { + + for (int i = 0; i < childset.size(); ++i) { + + s.complete = false; + + int res = childset[i]->test(s, pos); + + if (s.complete) + return res; + + if (inverse) { + if (res < 0) + res = pos + 1; + else + return -1; + + if (i + 1 < childset.size()) + continue; + } + + if (res >= 0) { + if (reset_pos) + res = pos; + return next ? next->test(s, res) : res; + } + } + return -1; + } + + void add_child(RegExNode* node) { + + node->parent = this; + node->previous = back; + + if (back) + back->next = node; + else + childset.push_back(node); + + increment_length(node->length); + + back = node; + } + + void add_childset() { + + if (childset.size() > 0) + length = -1; + back = NULL; + } + + RegExNode* swap_back(RegExNode* node) { + + RegExNode* old = back; + + if (old) { + if (!old->previous) + childset.remove(childset.size() - 1); + back = old->previous; + increment_length(old->length, true); + } + + add_child(node); + + return old; + } +}; + +struct RegExNodeCapturing : public RegExNodeGroup { + + int id; + + RegExNodeCapturing(int p_id = 0) { + + id = p_id; + } + + virtual int test(RegExSearch& s, int pos) const { + + RegExMatch::Group& ref = s.match->captures[id]; + int old_start = ref.start; + ref.start = pos; + + int res = RegExNodeGroup::test(s, pos); + + if (res >= 0) { + if (!s.complete) + ref.length = res - pos; + } else { + ref.start = old_start; + } + + return res; + } + + virtual int test_parent(RegExSearch& s, int pos) const { + + RegExMatch::Group& ref = s.match->captures[id]; + ref.length = pos - ref.start; + return RegExNode::test_parent(s, pos); + } + + static Variant parse_name(const CharType*& c, bool p_allow_numeric) { + + if (c[1] == '0') { + return -1; + } else if ('1' <= c[1] && c[1] <= '9') { + if (!p_allow_numeric) + return -1; + int res = (++c)[0] - '0'; + while ('0' <= c[1] && c[1] <= '9') + res = res * 10 + int((++c)[0] - '0'); + if ((++c)[0] != '>') + return -1; + return res; + } else if (iswalnum(c[1])) { + String res(++c, 1); + while (iswalnum(c[1])) + res += String(++c, 1); + if ((++c)[0] != '>') + return -1; + return res; + } + return -1; + } +}; + +struct RegExNodeLookAhead : public RegExNodeGroup { + + int id; + + RegExNodeLookAhead(bool p_inverse, int p_id = 0) { + + quantifiable = false; + inverse = p_inverse; + reset_pos = true; + id = p_id; + } + + virtual int test(RegExSearch& s, int pos) const { + + s.lookahead_pos[id] = pos; + return RegExNodeGroup::test(s, pos); + } + + virtual int test_parent(RegExSearch& s, int pos) const { + + return RegExNode::test_parent(s, s.lookahead_pos[id]); + } +}; + +struct RegExNodeLookBehind : public RegExNodeGroup { + + RegExNodeLookBehind(bool p_inverse, int p_id = 0) { + + quantifiable = false; + inverse = p_inverse; + reset_pos = true; + } + + virtual int test(RegExSearch& s, int pos) const { + + if (pos < length) + return -1; + return RegExNodeGroup::test(s, pos - length); + } +}; + +struct RegExNodeBracket : public RegExNode { + + bool inverse; + Vector<RegExNode*> children; + + RegExNodeBracket() { + + length = 1; + quantifiable = true; + inverse = false; + } + + virtual ~RegExNodeBracket() { + + for (int i = 0; i < children.size(); ++i) + memdelete(children[i]); + } + + virtual int test(RegExSearch& s, int pos) const { + + for (int i = 0; i < children.size(); ++i) { + + int res = children[i]->test(s, pos); + + if (inverse) { + if (res < 0) + res = pos + 1; + else + return -1; + + if (i + 1 < children.size()) + continue; + } + + if (res >= 0) + return next ? next->test(s, res) : res; + } + return -1; + } + + void add_child(RegExNode* node) { + + node->parent = this; + children.push_back(node); + } + + void pop_back() { + + memdelete(children[children.size() - 1]); + children.remove(children.size() - 1); + } +}; + +#define REGEX_EXPAND_FAIL(MSG)\ +{\ + ERR_PRINT(MSG);\ + return String();\ +} + +String RegExMatch::expand(const String& p_template) const { + + String res; + for (const CharType* c = p_template.c_str(); *c != '\0'; ++c) { + if (c[0] == '\\') { + if (('1' <= c[1] && c[1] <= '9') || (c[1] == 'g' && c[2] == '{')) { + + int ref = 0; + bool unclosed = false; + + if (c[1] == 'g') { + unclosed = true; + c = &c[2]; + } + + while ('0' <= c[1] && c[1] <= '9') { + ref = ref * 10 + int(c[1] - '0'); + ++c; + } + + if (unclosed) { + if (c[1] != '}') + REGEX_EXPAND_FAIL("unclosed backreference '{'"); + ++c; + } + + res += get_string(ref); + + } else if (c[1] =='g' && c[2] == '<') { + + const CharType* d = &c[2]; + + Variant name = RegExNodeCapturing::parse_name(d, true); + if (name == Variant(-1)) + REGEX_EXPAND_FAIL("unrecognised character for group name"); + + c = d; + + res += get_string(name); + + } else { + + const CharType* d = c; + CharType ch = RegExNodeChar::parse_escape(d); + if (c == d) + REGEX_EXPAND_FAIL("invalid escape token"); + res += String(&ch, 1); + c = d; + } + } else { + res += String(c, 1); + } + } + return res; +} + +int RegExMatch::get_group_count() const { + + int count = 0; + for (int i = 1; i < captures.size(); ++i) + if (captures[i].name.get_type() == Variant::INT) + ++count; + return count; +} + +Array RegExMatch::get_group_array() const { + + Array res; + for (int i = 1; i < captures.size(); ++i) { + const RegExMatch::Group& capture = captures[i]; + if (capture.name.get_type() != Variant::INT) + continue; + + if (capture.start >= 0) + res.push_back(string.substr(capture.start, capture.length)); + else + res.push_back(String()); + } + return res; +} + +Array RegExMatch::get_names() const { + + Array res; + for (int i = 1; i < captures.size(); ++i) + if (captures[i].name.get_type() == Variant::STRING) + res.push_back(captures[i].name); + return res; +} + +Dictionary RegExMatch::get_name_dict() const { + + Dictionary res; + for (int i = 1; i < captures.size(); ++i) { + const RegExMatch::Group& capture = captures[i]; + if (capture.name.get_type() != Variant::STRING) + continue; + + if (capture.start >= 0) + res[capture.name] = string.substr(capture.start, capture.length); + else + res[capture.name] = String(); + } + return res; +} + +String RegExMatch::get_string(const Variant& p_name) const { + + for (int i = 0; i < captures.size(); ++i) { + + const RegExMatch::Group& capture = captures[i]; + + if (capture.name != p_name) + continue; + + if (capture.start == -1) + return String(); + + return string.substr(capture.start, capture.length); + } + return String(); +} + +int RegExMatch::get_start(const Variant& p_name) const { + + for (int i = 0; i < captures.size(); ++i) + if (captures[i].name == p_name) + return captures[i].start; + return -1; +} + +int RegExMatch::get_end(const Variant& p_name) const { + + for (int i = 0; i < captures.size(); ++i) + if (captures[i].name == p_name) + return captures[i].start + captures[i].length; + return -1; +} + +RegExMatch::RegExMatch() { + +} + +static bool RegEx_is_shorthand(CharType ch) { + + switch (ch) { + case 'w': + case 'W': + case 'd': + case 'D': + case 's': + case 'S': + return true; + default: + break; + } + return false; +} + +#define REGEX_COMPILE_FAIL(MSG)\ +{\ + ERR_PRINT(MSG);\ + clear();\ + return FAILED;\ +} + +Error RegEx::compile(const String& p_pattern) { + + ERR_FAIL_COND_V(p_pattern.length() == 0, FAILED); + + if (pattern == p_pattern && root) + return OK; + + clear(); + pattern = p_pattern; + group_names.push_back(0); + RegExNodeGroup* root_group = memnew(RegExNodeCapturing(0)); + root = root_group; + Vector<RegExNodeGroup*> stack; + stack.push_back(root_group); + int lookahead_level = 0; + int numeric_groups = 0; + const int numeric_max = 9; + + for (const CharType* c = p_pattern.c_str(); *c != '\0'; ++c) { + + switch (c[0]) { + case '(': + if (c[1] == '?') { + + RegExNodeGroup* group = NULL; + switch (c[2]) { + case ':': + c = &c[2]; + group = memnew(RegExNodeGroup()); + break; + case '!': + case '=': + group = memnew(RegExNodeLookAhead((c[2] == '!'), lookahead_level++)); + if (lookahead_depth < lookahead_level) + lookahead_depth = lookahead_level; + c = &c[2]; + break; + case '<': + if (c[3] == '!' || c[3] == '=') { + group = memnew(RegExNodeLookBehind((c[3] == '!'), lookahead_level++)); + c = &c[3]; + } + break; + case 'P': + if (c[3] == '<') { + const CharType* d = &c[3]; + Variant name = RegExNodeCapturing::parse_name(d, false); + if (name == Variant(-1)) + REGEX_COMPILE_FAIL("unrecognised character for group name"); + group = memnew(RegExNodeCapturing(group_names.size())); + group_names.push_back(name); + c = d; + } + default: + break; + } + if (!group) + REGEX_COMPILE_FAIL("unrecognised qualifier for group"); + stack[0]->add_child(group); + stack.insert(0, group); + + } else if (numeric_groups < numeric_max) { + + RegExNodeCapturing* group = memnew(RegExNodeCapturing(group_names.size())); + group_names.push_back(++numeric_groups); + stack[0]->add_child(group); + stack.insert(0, group); + + } else { + + RegExNodeGroup* group = memnew(RegExNodeGroup()); + stack[0]->add_child(group); + stack.insert(0, group); + } + break; + case ')': + if (stack.size() == 1) + REGEX_COMPILE_FAIL("unexpected ')'"); + stack.remove(0); + break; + case '\\': + if (('1' <= c[1] && c[1] <= '9') || (c[1] == 'g' && c[2] == '{')) { + + int ref = 0; + bool unclosed = false; + + if (c[1] == 'g') { + unclosed = true; + c = &c[2]; + } + + while ('0' <= c[1] && c[1] <= '9') { + ref = ref * 10 + int(c[1] - '0'); + ++c; + } + + if (unclosed) { + if (c[1] != '}') + REGEX_COMPILE_FAIL("unclosed backreference '{'"); + ++c; + } + + if (ref > numeric_groups || ref <= 0) + REGEX_COMPILE_FAIL("backreference not found"); + + for (int i = 0; i < stack.size(); ++i) + if (dynamic_cast<RegExNodeLookBehind*>(stack[i])) + REGEX_COMPILE_FAIL("backreferences inside lookbehind not supported"); + + for (int i = 0; i < group_names.size(); ++i) { + if (group_names[i].get_type() == Variant::INT && int(group_names[i]) == ref) { + ref = group_names[i]; + break; + } + } + + stack[0]->add_child(memnew(RegExNodeBackReference(ref))); + + } if (c[1] =='g' && c[2] == '<') { + + const CharType* d = &c[2]; + + Variant name = RegExNodeCapturing::parse_name(d, true); + if (name == Variant(-1)) + REGEX_COMPILE_FAIL("unrecognised character for group name"); + + c = d; + + for (int i = 0; i < stack.size(); ++i) + if (dynamic_cast<RegExNodeLookBehind*>(stack[i])) + REGEX_COMPILE_FAIL("backreferences inside lookbehind not supported"); + + int ref = -1; + + for (int i = 0; i < group_names.size(); ++i) { + if (group_names[i].get_type() == Variant::INT && int(group_names[i]) == ref) { + ref = group_names[i]; + break; + } + } + + if (ref == -1) + REGEX_COMPILE_FAIL("backreference not found"); + + stack[0]->add_child(memnew(RegExNodeBackReference(ref))); + + } else if (c[1] == 'b' || c[1] == 'B') { + + stack[0]->add_child(memnew(RegExNodeWordBoundary(*(++c) == 'B'))); + + } else if (RegEx_is_shorthand(c[1])) { + + stack[0]->add_child(memnew(RegExNodeShorthand(*(++c)))); + + } else { + + const CharType* d = c; + CharType ch = RegExNodeChar::parse_escape(d); + if (c == d) + REGEX_COMPILE_FAIL("invalid escape token"); + stack[0]->add_child(memnew(RegExNodeChar(ch))); + c = d; + + } + break; + case '[': + { + RegExNodeBracket* bracket = memnew(RegExNodeBracket()); + stack[0]->add_child(bracket); + if (c[1] == '^') { + bracket->inverse = true; + ++c; + } + bool first_child = true; + CharType previous_child; + bool previous_child_single = false; + while (true) { + ++c; + if (!first_child && c[0] == ']') { + + break; + + } else if (c[0] == '\0') { + + REGEX_COMPILE_FAIL("unclosed bracket expression '['"); + + } else if (c[0] == '\\') { + + if (RegEx_is_shorthand(c[1])) { + bracket->add_child(memnew(RegExNodeShorthand(*(++c)))); + } else { + const CharType* d = c; + CharType ch = RegExNodeChar::parse_escape(d); + if (c == d) + REGEX_COMPILE_FAIL("invalid escape token"); + bracket->add_child(memnew(RegExNodeChar(ch))); + c = d; + previous_child = ch; + previous_child_single = true; + } + + } else if (c[0] == ']' && c[1] == ':') { + + const CharType* d = &c[2]; + RegExNodeClass::Type type = RegExNodeClass::parse_type(d); + if (type != RegExNodeClass::Type_none) { + + c = d; + previous_child_single = false; + + } else { + + bracket->add_child(memnew(RegExNodeChar('['))); + previous_child = '['; + previous_child_single = true; + } + } else if (previous_child_single && c[0] == '-') { + + if (c[1] != '\0' && c[1] != ']') { + + CharType next; + + if (c[1] == '\\') { + const CharType* d = ++c; + next = RegExNodeChar::parse_escape(d); + if (c == d) + REGEX_COMPILE_FAIL("invalid escape token"); + } else { + next = *(++c); + } + + if (next < previous_child) + REGEX_COMPILE_FAIL("text range out of order"); + + bracket->pop_back(); + bracket->add_child(memnew(RegExNodeRange(previous_child, next))); + previous_child_single = false; + } else { + + bracket->add_child(memnew(RegExNodeChar('-'))); + previous_child = '-'; + previous_child_single = true; + } + } else { + + bracket->add_child(memnew(RegExNodeChar(c[0]))); + previous_child = c[0]; + previous_child_single = true; + } + first_child = false; + } + } + break; + case '|': + for (int i = 0; i < stack.size(); ++i) + if (dynamic_cast<RegExNodeLookBehind*>(stack[i])) + REGEX_COMPILE_FAIL("alternations inside lookbehind not supported"); + stack[0]->add_childset(); + break; + case '^': + stack[0]->add_child(memnew(RegExNodeAnchorStart())); + break; + case '$': + stack[0]->add_child(memnew(RegExNodeAnchorEnd())); + break; + case '.': + stack[0]->add_child(memnew(RegExNodeShorthand('.'))); + break; + case '?': + case '*': + case '+': + case '{': + { + int min_val = 0; + int max_val = -1; + bool valid = true; + const CharType* d = c; + bool max_set = true; + switch (c[0]) { + case '?': + min_val = 0; + max_val = 1; + break; + case '*': + min_val = 0; + max_val = -1; + break; + case '+': + min_val = 1; + max_val = -1; + break; + case '{': + max_set = false; + while (valid) { + ++d; + if (d[0] == '}') { + break; + } else if (d[0] == ',') { + max_set = true; + } else if ('0' <= d[0] && d[0] <= '9') { + if (max_set) { + if (max_val < 0) + max_val = int(d[0] - '0'); + else + max_val = max_val * 10 + int(d[0] - '0'); + } else { + min_val = min_val * 10 + int(d[0] - '0'); + } + } else { + valid = false; + } + } + break; + default: + break; + } + + if (!max_set) + max_val = min_val; + + if (valid) { + + c = d; + + if (stack[0]->back == NULL || !stack[0]->back->quantifiable) + REGEX_COMPILE_FAIL("element not quantifiable"); + + if (min_val != max_val) + for (int i = 0; i < stack.size(); ++i) + if (dynamic_cast<RegExNodeLookBehind*>(stack[i])) + REGEX_COMPILE_FAIL("variable length quantifiers inside lookbehind not supported"); + + RegExNodeQuantifier* quant = memnew(RegExNodeQuantifier(min_val, max_val)); + quant->child = stack[0]->swap_back(quant); + quant->child->previous = NULL; + quant->child->parent = quant; + + if (min_val == max_val && quant->child->length >= 0) + quant->length = max_val * quant->child->length; + + if (c[1] == '?') { + quant->greedy = false; + ++c; + } + break; + } + } + default: + stack[0]->add_child(memnew(RegExNodeChar(c[0]))); + break; + } + } + if (stack.size() > 1) + REGEX_COMPILE_FAIL("unclosed group '('"); + return OK; +} + +Ref<RegExMatch> RegEx::search(const String& p_text, int p_start, int p_end) const { + + ERR_FAIL_COND_V(!is_valid(), NULL); + ERR_FAIL_COND_V(p_start < 0, NULL); + ERR_FAIL_COND_V(p_start >= p_text.length(), NULL); + ERR_FAIL_COND_V(p_end > p_text.length(), NULL); + ERR_FAIL_COND_V(p_end != -1 && p_end < p_start, NULL); + + Ref<RegExMatch> res = memnew(RegExMatch()); + + for (int i = 0; i < group_names.size(); ++i) { + RegExMatch::Group group; + group.name = group_names[i]; + res->captures.push_back(group); + } + + res->string = p_text; + + if (p_end == -1) + p_end = p_text.length(); + + RegExSearch s(res, p_end, lookahead_depth); + + for (int i = p_start; i <= s.end; ++i) { + for (int c = 0; c < group_names.size(); ++c) { + res->captures[c].start = -1; + res->captures[c].length = 0; + } + if (root->test(s, i) >= 0) + break; + } + + if (res->captures[0].start >= 0) + return res; + return NULL; +} + +String RegEx::sub(const String& p_text, const String& p_replacement, bool p_all, int p_start, int p_end) const { + + ERR_FAIL_COND_V(!is_valid(), p_text); + ERR_FAIL_COND_V(p_start < 0, p_text); + ERR_FAIL_COND_V(p_start >= p_text.length(), p_text); + ERR_FAIL_COND_V(p_end > p_text.length(), p_text); + ERR_FAIL_COND_V(p_end != -1 && p_end < p_start, p_text); + + String text = p_text; + int start = p_start; + + if (p_end == -1) + p_end = p_text.length(); + + while (start < text.length() && (p_all || start == p_start)) { + + Ref<RegExMatch> m = search(text, start, p_end); + + RegExMatch::Group& s = m->captures[0]; + + if (s.start < 0) + break; + + String res = text.substr(0, s.start) + m->expand(p_replacement); + + start = res.length(); + + if (s.length == 0) + ++start; + + int sub_end = s.start + s.length; + if (sub_end < text.length()) + res += text.substr(sub_end, text.length() - sub_end); + + p_end += res.length() - text.length(); + + text = res; + } + return text; +} + +void RegEx::clear() { + + if (root) + memdelete(root); + + root = NULL; + group_names.clear(); + lookahead_depth = 0; +} + +bool RegEx::is_valid() const { + + return (root != NULL); +} + +String RegEx::get_pattern() const { + + return pattern; +} + +int RegEx::get_group_count() const { + + int count = 0; + for (int i = 1; i < group_names.size(); ++i) + if (group_names[i].get_type() == Variant::INT) + ++count; + return count; +} + +Array RegEx::get_names() const { + + Array res; + for (int i = 1; i < group_names.size(); ++i) + if (group_names[i].get_type() == Variant::STRING) + res.push_back(group_names[i]); + return res; +} + +RegEx::RegEx() { + + root = NULL; + lookahead_depth = 0; +} + +RegEx::RegEx(const String& p_pattern) { + + root = NULL; + compile(p_pattern); +} + +RegEx::~RegEx() { + + if (root) + memdelete(root); +} + +void RegExMatch::_bind_methods() { + + ObjectTypeDB::bind_method(_MD("expand","template"),&RegExMatch::expand); + ObjectTypeDB::bind_method(_MD("get_group_count"),&RegExMatch::get_group_count); + ObjectTypeDB::bind_method(_MD("get_group_array"),&RegExMatch::get_group_array); + ObjectTypeDB::bind_method(_MD("get_names"),&RegExMatch::get_names); + ObjectTypeDB::bind_method(_MD("get_name_dict"),&RegExMatch::get_name_dict); + ObjectTypeDB::bind_method(_MD("get_string","name"),&RegExMatch::get_string, DEFVAL(0)); + ObjectTypeDB::bind_method(_MD("get_start","name"),&RegExMatch::get_start, DEFVAL(0)); + ObjectTypeDB::bind_method(_MD("get_end","name"),&RegExMatch::get_end, DEFVAL(0)); +} + +void RegEx::_bind_methods() { + + ObjectTypeDB::bind_method(_MD("clear"),&RegEx::clear); + ObjectTypeDB::bind_method(_MD("compile","pattern"),&RegEx::compile); + ObjectTypeDB::bind_method(_MD("search","text","start","end"),&RegEx::search, DEFVAL(0), DEFVAL(-1)); + ObjectTypeDB::bind_method(_MD("sub","text","replacement","all","start","end"),&RegEx::sub, DEFVAL(false), DEFVAL(0), DEFVAL(-1)); + ObjectTypeDB::bind_method(_MD("is_valid"),&RegEx::is_valid); + ObjectTypeDB::bind_method(_MD("get_pattern"),&RegEx::get_pattern); + ObjectTypeDB::bind_method(_MD("get_group_count"),&RegEx::get_group_count); + ObjectTypeDB::bind_method(_MD("get_names"),&RegEx::get_names); + + ADD_PROPERTY(PropertyInfo(Variant::STRING, "pattern"), _SCS("compile"), _SCS("get_pattern")); +} + |