diff options
author | Juan Linietsky <reduzio@gmail.com> | 2015-11-18 19:13:57 -0300 |
---|---|---|
committer | Juan Linietsky <reduzio@gmail.com> | 2015-11-18 19:13:57 -0300 |
commit | 474822b6db6c96aba2684aac7039ee03b605630c (patch) | |
tree | 5e20a94cbb60cf3288d30077a014ba4a1a60643c /drivers | |
parent | 6aa95e77f3bae35fb136eb00e271362c1707058e (diff) | |
parent | f0d246a7bce9f8f22e9886d474f1758e0c20d8cb (diff) |
Merge pull request #2642 from leezh/nrex-update
Updated nrex with new features
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/nrex/README.md | 57 | ||||
-rw-r--r-- | drivers/nrex/nrex.cpp | 790 | ||||
-rw-r--r-- | drivers/nrex/nrex.hpp | 10 | ||||
-rw-r--r-- | drivers/nrex/regex.cpp | 10 | ||||
-rw-r--r-- | drivers/nrex/regex.h | 2 |
5 files changed, 681 insertions, 188 deletions
diff --git a/drivers/nrex/README.md b/drivers/nrex/README.md index f150a5d76f..951b301c1e 100644 --- a/drivers/nrex/README.md +++ b/drivers/nrex/README.md @@ -18,47 +18,42 @@ More details about its use is documented in `nrex.hpp` Currently supported features: * Capturing `()` and non-capturing `(?:)` groups - * Any character `.` + * Any character `.` (includes newlines) * Shorthand caracter classes `\w\W\s\S\d\D` - * User-defined character classes such as `[A-Za-z]` + * POSIX character classes such as `[[:alnum:]]` + * Bracket expressions such as `[A-Za-z]` * Simple quantifiers `?`, `*` and `+` * Range quantifiers `{0,1}` * Lazy (non-greedy) quantifiers `*?` * Begining `^` and end `$` anchors + * Word boundaries `\b` * Alternation `|` - * Backreferences `\1` to `\99` - -To do list: + * ASCII `\xFF` code points * Unicode `\uFFFF` code points + * Positive `(?=)` and negative `(?!)` lookahead + * Positive `(?<=)` and negative `(?<!)` lookbehind (fixed length and no alternations) + * Backreferences `\1` to `\9` (with option to expand to `\99`) ## License Copyright (c) 2015, Zher Huei Lee All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - 1. Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - 3. Neither the name of the copyright holder nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS -IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED -TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A -PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED -TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +This software is provided 'as-is', without any express or implied +warranty. In no event will the authors be held liable for any damages +arising from the use of this software. + +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it +freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would + be appreciated but is not required. + + 2. Altered source versions must be plainly marked as such, and must not + be misrepresented as being the original software. + + 3. This notice may not be removed or altered from any source + distribution. diff --git a/drivers/nrex/nrex.cpp b/drivers/nrex/nrex.cpp index 696d46240e..104e07f887 100644 --- a/drivers/nrex/nrex.cpp +++ b/drivers/nrex/nrex.cpp @@ -29,11 +29,13 @@ #include <wctype.h> #include <wchar.h> #define NREX_ISALPHANUM iswalnum +#define NREX_ISSPACE iswspace #define NREX_STRLEN wcslen #else #include <ctype.h> #include <string.h> #define NREX_ISALPHANUM isalnum +#define NREX_ISSPACE isspace #define NREX_STRLEN strlen #endif @@ -116,34 +118,72 @@ class nrex_array } }; -static nrex_char nrex_unescape(nrex_char repr) +static int nrex_parse_hex(nrex_char c) { - switch (repr) + if ('0' <= c && c <= '9') + { + return int(c - '0'); + } + else if ('a' <= c && c <= 'f') + { + return int(c - 'a') + 10; + } + else if ('A' <= c && c <= 'F') { - case '^': return '^'; - case '$': return '$'; - case '(': return '('; - case ')': return ')'; - case '\\': return '\\'; - case '.': return '.'; - case '+': return '+'; - case '*': return '*'; - case '?': return '?'; - case '-': return '-'; - case 'a': return '\a'; - case 'e': return '\e'; - case 'f': return '\f'; - case 'n': return '\n'; - case 'r': return '\r'; - case 't': return '\t'; - case 'v': return '\v'; + return int(c - 'A') + 10; } - return 0; + return -1; +} + +static nrex_char nrex_unescape(const nrex_char*& c) +{ + switch (c[1]) + { + case '0': ++c; return '\0'; + case 'a': ++c; return '\a'; + case 'e': ++c; return '\e'; + case 'f': ++c; return '\f'; + case 'n': ++c; return '\n'; + case 'r': ++c; return '\r'; + case 't': ++c; return '\t'; + case 'v': ++c; return '\v'; + case 'b': ++c; return '\b'; + case 'x': + { + int point = 0; + for (int i = 2; i <= 3; ++i) + { + int res = nrex_parse_hex(c[i]); + if (res == -1) + { + return '\0'; + } + point = (point << 4) + res; + } + c = &c[3]; + return nrex_char(point); + } + case 'u': + { + int point = 0; + for (int i = 2; i <= 5; ++i) + { + int res = nrex_parse_hex(c[i]); + if (res == -1) + { + return '\0'; + } + point = (point << 4) + res; + } + c = &c[5]; + return nrex_char(point); + } + } + return (++c)[0]; } struct nrex_search { - public: const nrex_char* str; nrex_result* captures; int end; @@ -168,12 +208,14 @@ struct nrex_node nrex_node* previous; nrex_node* parent; bool quantifiable; + int length; nrex_node(bool quantify = false) : next(NULL) , previous(NULL) , parent(NULL) , quantifiable(quantify) + , length(-1) { } @@ -206,21 +248,57 @@ struct nrex_node } return pos; } + + void increment_length(int amount, bool subtract = false) + { + if (amount >= 0 && length >= 0) + { + if (!subtract) + { + length += amount; + } + else + { + length -= amount; + } + } + else + { + length = -1; + } + if (parent) + { + parent->increment_length(amount, subtract); + } + } }; struct nrex_node_group : public nrex_node { - int capturing; + static const int NonCapture = -1; + static const int Bracket = -2; + static const int LookAhead = -3; + static const int LookBehind = -4; + + int mode; bool negate; nrex_array<nrex_node*> childset; nrex_node* back; - nrex_node_group(int capturing) + nrex_node_group(int mode) : nrex_node(true) - , capturing(capturing) + , mode(mode) , negate(false) , back(NULL) { + if (mode != Bracket) + { + length = 0; + } + else + { + length = 1; + } } virtual ~nrex_node_group() @@ -234,14 +312,19 @@ struct nrex_node_group : public nrex_node int test(nrex_search* s, int pos) const { - if (capturing >= 0) + if (mode >= 0) { - s->captures[capturing].start = pos; + s->captures[mode].start = pos; } for (unsigned int i = 0; i < childset.size(); ++i) { s->complete = false; - int res = childset[i]->test(s, pos); + int offset = 0; + if (mode == LookBehind) + { + offset = length; + } + int res = childset[i]->test(s, pos - offset); if (s->complete) { return res; @@ -256,12 +339,20 @@ struct nrex_node_group : public nrex_node { return -1; } + if (i + 1 < childset.size()) + { + continue; + } } if (res >= 0) { - if (capturing >= 0) + if (mode >= 0) { - s->captures[capturing].length = res - pos; + s->captures[mode].length = res - pos; + } + else if (mode == LookAhead || mode == LookBehind) + { + res = pos; } return next ? next->test(s, res) : res; } @@ -271,15 +362,19 @@ struct nrex_node_group : public nrex_node virtual int test_parent(nrex_search* s, int pos) const { - if (capturing >= 0) + if (mode >= 0) { - s->captures[capturing].length = pos - s->captures[capturing].start; + s->captures[mode].length = pos - s->captures[mode].start; } return nrex_node::test_parent(s, pos); } void add_childset() { + if (childset.size() > 0 && mode != Bracket) + { + length = -1; + } back = NULL; } @@ -287,7 +382,7 @@ struct nrex_node_group : public nrex_node { node->parent = this; node->previous = back; - if (back) + if (back && mode != Bracket) { back->next = node; } @@ -295,6 +390,10 @@ struct nrex_node_group : public nrex_node { childset.push(node); } + if (mode != Bracket) + { + increment_length(node->length); + } back = node; } @@ -310,10 +409,32 @@ struct nrex_node_group : public nrex_node { childset.pop(); } + if (mode != Bracket) + { + increment_length(old->length, true); + } back = old->previous; add_child(node); return old; } + + void pop_back() + { + if (back) + { + nrex_node* old = back; + if (!old->previous) + { + childset.pop(); + } + if (mode != Bracket) + { + increment_length(old->length, true); + } + back = old->previous; + NREX_DELETE(old); + } + } }; struct nrex_node_char : public nrex_node @@ -324,6 +445,7 @@ struct nrex_node_char : public nrex_node : nrex_node(true) , ch(c) { + length = 1; } int test(nrex_search* s, int pos) const @@ -346,6 +468,7 @@ struct nrex_node_range : public nrex_node , start(s) , end(e) { + length = 1; } int test(nrex_search* s, int pos) const @@ -363,20 +486,219 @@ struct nrex_node_range : public nrex_node } }; -static bool nrex_is_whitespace(nrex_char repr) +enum nrex_class_type { - switch (repr) + nrex_class_none, + nrex_class_alnum, + nrex_class_alpha, + nrex_class_blank, + nrex_class_cntrl, + nrex_class_digit, + nrex_class_graph, + nrex_class_lower, + nrex_class_print, + nrex_class_punct, + nrex_class_space, + nrex_class_upper, + nrex_class_xdigit, + nrex_class_word +}; + +static bool nrex_compare_class(const nrex_char** pos, const char* text) +{ + unsigned int i = 0; + for (i = 0; text[i] != '\0'; ++i) { - case ' ': - case '\t': - case '\r': - case '\n': - case '\f': - return true; + if ((*pos)[i] != text[i]) + { + return false; + } } - return false; + if ((*pos)[i++] != ':' || (*pos)[i] != ']') + { + return false; + } + *pos = &(*pos)[i]; + return true; } +#define NREX_COMPARE_CLASS(POS, NAME) if (nrex_compare_class(POS, #NAME)) return nrex_class_ ## NAME + +static nrex_class_type nrex_parse_class(const nrex_char** pos) +{ + NREX_COMPARE_CLASS(pos, alnum); + NREX_COMPARE_CLASS(pos, alpha); + NREX_COMPARE_CLASS(pos, blank); + NREX_COMPARE_CLASS(pos, cntrl); + NREX_COMPARE_CLASS(pos, digit); + NREX_COMPARE_CLASS(pos, graph); + NREX_COMPARE_CLASS(pos, lower); + NREX_COMPARE_CLASS(pos, print); + NREX_COMPARE_CLASS(pos, punct); + NREX_COMPARE_CLASS(pos, space); + NREX_COMPARE_CLASS(pos, upper); + NREX_COMPARE_CLASS(pos, xdigit); + NREX_COMPARE_CLASS(pos, word); + return nrex_class_none; +} + +struct nrex_node_class : public nrex_node +{ + nrex_class_type type; + + nrex_node_class(nrex_class_type t) + : nrex_node(true) + , type(t) + { + length = 1; + } + + int test(nrex_search* s, int pos) const + { + if (s->end == pos) + { + return -1; + } + if (!test_class(s->at(pos))) + { + return -1; + } + return next ? next->test(s, pos + 1) : pos + 1; + } + + bool test_class(nrex_char c) const + { + if ((0 <= c && c <= 0x1F) || c == 0x7F) + { + if (type == nrex_class_cntrl) + { + return true; + } + } + else if (c < 0x7F) + { + if (type == nrex_class_print) + { + return true; + } + else if (type == nrex_class_graph && c != ' ') + { + return true; + } + else if ('0' <= c && c <= '9') + { + switch (type) + { + case nrex_class_alnum: + case nrex_class_digit: + case nrex_class_xdigit: + case nrex_class_word: + return true; + default: + break; + } + } + else if ('A' <= c && c <= 'Z') + { + switch (type) + { + case nrex_class_alnum: + case nrex_class_alpha: + case nrex_class_upper: + case nrex_class_word: + return true; + case nrex_class_xdigit: + if (c <= 'F') + { + return true; + } + default: + break; + } + } + else if ('a' <= c && c <= 'z') + { + switch (type) + { + case nrex_class_alnum: + case nrex_class_alpha: + case nrex_class_lower: + case nrex_class_word: + return true; + case nrex_class_xdigit: + if (c <= 'f') + { + return true; + } + default: + break; + } + } + } + switch (c) + { + case ' ': + case '\t': + if (type == nrex_class_blank) + { + return true; + } + case '\r': + case '\n': + case '\f': + if (type == nrex_class_space) + { + return true; + } + break; + case '_': + if (type == nrex_class_word) + { + return true; + } + case ']': + case '[': + case '!': + case '"': + case '#': + case '$': + case '%': + case '&': + case '\'': + case '(': + case ')': + case '*': + case '+': + case ',': + case '.': + case '/': + case ':': + case ';': + case '<': + case '=': + case '>': + case '?': + case '@': + case '\\': + case '^': + case '`': + case '{': + case '|': + case '}': + case '~': + case '-': + if (type == nrex_class_punct) + { + return true; + } + break; + default: + break; + } + return false; + } +}; + static bool nrex_is_shorthand(nrex_char repr) { switch (repr) @@ -400,6 +722,7 @@ struct nrex_node_shorthand : public nrex_node : nrex_node(true) , repr(c) { + length = 1; } int test(nrex_search* s, int pos) const @@ -435,7 +758,7 @@ struct nrex_node_shorthand : public nrex_node case 'S': invert = true; case 's': - if (nrex_is_whitespace(c)) + if (NREX_ISSPACE(c)) { found = true; } @@ -469,10 +792,10 @@ struct nrex_node_quantifier : public nrex_node bool greedy; nrex_node* child; - nrex_node_quantifier() + nrex_node_quantifier(int min, int max) : nrex_node() - , min(0) - , max(0) + , min(min) + , max(max) , greedy(true) , child(NULL) { @@ -488,17 +811,49 @@ struct nrex_node_quantifier : public nrex_node int test(nrex_search* s, int pos) const { - nrex_array<int> backtrack; - backtrack.push(pos); - while (backtrack.top() <= s->end) + return test_step(s, pos, 1); + } + + int test_step(nrex_search* s, int pos, int level) const + { + if (max == 0) { - if (max >= 1 && backtrack.size() > (unsigned int)max) + return pos; + } + if ((max >= 1 && level > max) || pos > s->end) + { + return -1; + } + if (!greedy && level > min) + { + int res = pos; + if (next) { - break; + res = next->test(s, res); + } + if (s->complete) + { + return res; } - if (!greedy && (unsigned int)min < backtrack.size()) + if (res >= 0 && parent->test_parent(s, res) >= 0) + { + return res; + } + } + int res = child->test(s, pos); + if (s->complete) + { + return res; + } + if (res >= 0) + { + int res_step = test_step(s, res, level + 1); + if (res_step >= 0) + { + return res_step; + } + else if (greedy && level >= min) { - int res = backtrack.top(); if (next) { res = next->test(s, res); @@ -512,33 +867,6 @@ struct nrex_node_quantifier : public nrex_node return res; } } - int res = child->test(s, backtrack.top()); - if (s->complete) - { - return res; - } - if (res < 0 || res == backtrack.top()) - { - break; - } - backtrack.push(res); - } - while (greedy && (unsigned int) min < backtrack.size()) - { - int res = backtrack.top(); - if (next) - { - res = next->test(s, res); - } - if (res >= 0 && parent->test_parent(s, res) >= 0) - { - return res; - } - if (s->complete) - { - return res; - } - backtrack.pop(); } return -1; } @@ -552,6 +880,7 @@ struct nrex_node_anchor : public nrex_node : nrex_node() , end(end) { + length = 0; } int test(nrex_search* s, int pos) const @@ -568,6 +897,45 @@ struct nrex_node_anchor : public nrex_node } }; +struct nrex_node_word_boundary : public nrex_node +{ + bool inverse; + + nrex_node_word_boundary(bool inverse) + : nrex_node() + , inverse(inverse) + { + length = 0; + } + + int test(nrex_search* s, int pos) const + { + bool left = false; + bool right = false; + if (pos != 0) + { + nrex_char c = s->at(pos - 1); + if (c == '_' || NREX_ISALPHANUM(c)) + { + left = true; + } + } + if (pos != s->end) + { + nrex_char c = s->at(pos); + if (c == '_' || NREX_ISALPHANUM(c)) + { + right = true; + } + } + if ((left != right) == inverse) + { + return -1; + } + return next ? next->test(s, pos) : pos; + } +}; + struct nrex_node_backreference : public nrex_node { int ref; @@ -576,6 +944,7 @@ struct nrex_node_backreference : public nrex_node : nrex_node(true) , ref(ref) { + length = -1; } int test(nrex_search* s, int pos) const @@ -596,6 +965,18 @@ struct nrex_node_backreference : public nrex_node } }; +bool nrex_has_lookbehind(nrex_array<nrex_node_group*>& stack) +{ + for (unsigned int i = 0; i < stack.size(); i++) + { + if (stack[i]->mode == nrex_node_group::LookBehind) + { + return true; + } + } + return false; +} + nrex::nrex() : _capturing(0) , _root(NULL) @@ -630,7 +1011,7 @@ int nrex::capture_size() const return _capturing + 1; } -bool nrex::compile(const nrex_char* pattern) +bool nrex::compile(const nrex_char* pattern, bool extended) { reset(); nrex_node_group* root = NREX_NEW(nrex_node_group(_capturing)); @@ -647,16 +1028,32 @@ bool nrex::compile(const nrex_char* pattern) if (c[2] == ':') { c = &c[2]; - nrex_node_group* group = NREX_NEW(nrex_node_group(-1)); + nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::NonCapture)); + stack.top()->add_child(group); + stack.push(group); + } + else if (c[2] == '!' || c[2] == '=') + { + c = &c[2]; + nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::LookAhead)); + group->negate = (c[0] == '!'); + stack.top()->add_child(group); + stack.push(group); + } + else if (c[2] == '<' && (c[3] == '!' || c[3] == '=')) + { + c = &c[3]; + nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::LookBehind)); + group->negate = (c[0] == '!'); stack.top()->add_child(group); stack.push(group); } else { - NREX_COMPILE_ERROR("unrecognised qualifier for parenthesis"); + NREX_COMPILE_ERROR("unrecognised qualifier for group"); } } - else if (_capturing < 99) + else if ((!extended && _capturing < 9) || (extended && _capturing < 99)) { nrex_node_group* group = NREX_NEW(nrex_node_group(++_capturing)); stack.top()->add_child(group); @@ -664,7 +1061,7 @@ bool nrex::compile(const nrex_char* pattern) } else { - nrex_node_group* group = NREX_NEW(nrex_node_group(-1)); + nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::NonCapture)); stack.top()->add_child(group); stack.push(group); } @@ -682,152 +1079,233 @@ bool nrex::compile(const nrex_char* pattern) } else if (c[0] == '[') { - nrex_node_group* group = NREX_NEW(nrex_node_group(-1)); + nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::Bracket)); stack.top()->add_child(group); if (c[1] == '^') { group->negate = true; ++c; } + bool first_child = true; + nrex_char previous_child; + bool previous_child_single = false; while (true) { group->add_childset(); ++c; if (c[0] == '\0') { - NREX_COMPILE_ERROR("unclosed character class '[]'"); + NREX_COMPILE_ERROR("unclosed bracket expression '['"); } - if (c[0] == ']') + if (c[0] == '[' && c[1] == ':') + { + const nrex_char* d = &c[2]; + nrex_class_type cls = nrex_parse_class(&d); + if (cls != nrex_class_none) + { + c = d; + group->add_child(NREX_NEW(nrex_node_class(cls))); + previous_child_single = false; + } + else + { + group->add_child(NREX_NEW(nrex_node_char('['))); + previous_child = '['; + previous_child_single = true; + } + } + else if (c[0] == ']' && !first_child) { break; } else if (c[0] == '\\') { - nrex_char unescaped = nrex_unescape(c[1]); - if (unescaped) - { - group->add_child(NREX_NEW(nrex_node_char(unescaped))); - ++c; - } - else if (nrex_is_shorthand(c[1])) + if (nrex_is_shorthand(c[1])) { group->add_child(NREX_NEW(nrex_node_shorthand(c[1]))); ++c; + previous_child_single = false; } else { - NREX_COMPILE_ERROR("escape token not recognised"); + const nrex_char* d = c; + nrex_char unescaped = nrex_unescape(d); + if (c == d) + { + NREX_COMPILE_ERROR("invalid escape token"); + } + group->add_child(NREX_NEW(nrex_node_char(unescaped))); + c = d; + previous_child = unescaped; + previous_child_single = true; } } - else + else if (previous_child_single && c[0] == '-') { - if (c[1] == '-' && c[2] != '\0') + bool is_range = false; + nrex_char next; + if (c[1] != '\0' && c[1] != ']') { - bool range = false; - if ('A' <= c[0] && c[0] <= 'Z' && 'A' <= c[2] && c[2] <= 'Z') + if (c[1] == '\\') { - range = true; + const nrex_char* d = ++c; + next = nrex_unescape(d); + if (c == d) + { + NREX_COMPILE_ERROR("invalid escape token in range"); + } } - if ('a' <= c[0] && c[0] <= 'z' && 'a' <= c[2] && c[2] <= 'z') - { - range = true; - } - if ('0' <= c[0] && c[0] <= '9' && '0' <= c[2] && c[2] <= '9') + else { - range = true; + next = c[1]; + ++c; } - if (range) + is_range = true; + } + if (is_range) + { + if (next < previous_child) { - group->add_child(NREX_NEW(nrex_node_range(c[0], c[2]))); - c = &c[2]; - continue; + NREX_COMPILE_ERROR("text range out of order"); } + group->pop_back(); + group->add_child(NREX_NEW(nrex_node_range(previous_child, next))); + previous_child_single = false; + } + else + { + group->add_child(NREX_NEW(nrex_node_char(c[0]))); + previous_child = c[0]; + previous_child_single = true; } + } + else + { group->add_child(NREX_NEW(nrex_node_char(c[0]))); + previous_child = c[0]; + previous_child_single = true; } - + first_child = false; } } else if (nrex_is_quantifier(c[0])) { - nrex_node_quantifier* quant = NREX_NEW(nrex_node_quantifier); - quant->child = stack.top()->swap_back(quant); - if (quant->child == NULL || !quant->child->quantifiable) + if (stack.top()->back == NULL || !stack.top()->back->quantifiable) { + if (c[0] == '{') + { + stack.top()->add_child(NREX_NEW(nrex_node_char('{'))); + continue; + } NREX_COMPILE_ERROR("element not quantifiable"); } - quant->child->previous = NULL; - quant->child->next = NULL; - quant->child->parent = quant; + int min = 0; + int max = -1; + bool valid_quantifier = true; if (c[0] == '?') { - quant->min = 0; - quant->max = 1; + min = 0; + max = 1; } else if (c[0] == '+') { - quant->min = 1; - quant->max = -1; + min = 1; + max = -1; } else if (c[0] == '*') { - quant->min = 0; - quant->max = -1; + min = 0; + max = -1; } else if (c[0] == '{') { bool max_set = false; - quant->min = 0; - quant->max = -1; + const nrex_char* d = c; while (true) { - ++c; - if (c[0] == '\0') + ++d; + if (d[0] == '\0') { - NREX_COMPILE_ERROR("unclosed range quantifier '{}'"); + valid_quantifier = false; + break; } - else if (c[0] == '}') + else if (d[0] == '}') { break; } - else if (c[0] == ',') + else if (d[0] == ',') { max_set = true; continue; } - else if (c[0] < '0' || '9' < c[0]) + else if (d[0] < '0' || '9' < d[0]) { - NREX_COMPILE_ERROR("expected numeric digits, ',' or '}'"); + valid_quantifier = false; + break; } if (max_set) { - if (quant->max < 0) + if (max < 0) { - quant->max = int(c[0] - '0'); + max = int(d[0] - '0'); } else { - quant->max = quant->max * 10 + int(c[0] - '0'); + max = max * 10 + int(d[0] - '0'); } } else { - quant->min = quant->min * 10 + int(c[0] - '0'); + min = min * 10 + int(d[0] - '0'); } } if (!max_set) { - quant->max = quant->min; + max = min; + } + if (valid_quantifier) + { + c = d; } } - if (c[1] == '?') + if (valid_quantifier) { - quant->greedy = false; - ++c; + nrex_node_quantifier* quant = NREX_NEW(nrex_node_quantifier(min, max)); + if (min == max) + { + if (stack.top()->back->length >= 0) + { + quant->length = max * stack.top()->back->length; + } + } + else + { + if (nrex_has_lookbehind(stack)) + { + NREX_COMPILE_ERROR("variable length quantifiers inside lookbehind not supported"); + } + } + quant->child = stack.top()->swap_back(quant); + quant->child->previous = NULL; + quant->child->next = NULL; + quant->child->parent = quant; + if (c[1] == '?') + { + quant->greedy = false; + ++c; + } + } + else + { + stack.top()->add_child(NREX_NEW(nrex_node_char(c[0]))); } } else if (c[0] == '|') { + if (nrex_has_lookbehind(stack)) + { + NREX_COMPILE_ERROR("alternations inside lookbehind not supported"); + } stack.top()->add_childset(); } else if (c[0] == '^' || c[0] == '$') @@ -840,13 +1318,7 @@ bool nrex::compile(const nrex_char* pattern) } else if (c[0] == '\\') { - nrex_char unescaped = nrex_unescape(c[1]); - if (unescaped) - { - stack.top()->add_child(NREX_NEW(nrex_node_char(unescaped))); - ++c; - } - else if (nrex_is_shorthand(c[1])) + if (nrex_is_shorthand(c[1])) { stack.top()->add_child(NREX_NEW(nrex_node_shorthand(c[1]))); ++c; @@ -854,7 +1326,7 @@ bool nrex::compile(const nrex_char* pattern) else if ('1' <= c[1] && c[1] <= '9') { int ref = 0; - if ('0' <= c[2] && c[2] <= '9') + if (extended && '0' <= c[2] && c[2] <= '9') { ref = int(c[1] - '0') * 10 + int(c[2] - '0'); c = &c[2]; @@ -868,11 +1340,27 @@ bool nrex::compile(const nrex_char* pattern) { NREX_COMPILE_ERROR("backreference to non-existent capture"); } + if (nrex_has_lookbehind(stack)) + { + NREX_COMPILE_ERROR("backreferences inside lookbehind not supported"); + } stack.top()->add_child(NREX_NEW(nrex_node_backreference(ref))); } + else if (c[1] == 'b' || c[1] == 'B') + { + stack.top()->add_child(NREX_NEW(nrex_node_word_boundary(c[1] == 'B'))); + ++c; + } else { - NREX_COMPILE_ERROR("escape token not recognised"); + const nrex_char* d = c; + nrex_char unescaped = nrex_unescape(d); + if (c == d) + { + NREX_COMPILE_ERROR("invalid escape token"); + } + stack.top()->add_child(NREX_NEW(nrex_node_char(unescaped))); + c = d; } } else @@ -880,6 +1368,10 @@ bool nrex::compile(const nrex_char* pattern) stack.top()->add_child(NREX_NEW(nrex_node_char(c[0]))); } } + if (stack.size() > 1) + { + NREX_COMPILE_ERROR("unclosed group '('"); + } return true; } diff --git a/drivers/nrex/nrex.hpp b/drivers/nrex/nrex.hpp index 2a6aa08e1d..e26a61c39a 100644 --- a/drivers/nrex/nrex.hpp +++ b/drivers/nrex/nrex.hpp @@ -79,7 +79,8 @@ class nrex * This is used to provide the array size of the captures needed for * nrex::match() to work. The size is actually the number of capture * groups + one for the matching of the entire pattern. The result is - * always capped at 100. + * always capped at 10 or 100, depending on the extend option given in + * nrex::compile() (default 10). * * \return The number of captures */ @@ -95,10 +96,13 @@ class nrex * runtime error nrex_compile_error if it encounters a problem when * parsing the pattern. * - * \param The regex pattern + * \param pattern The regex pattern + * \param extended If true, raises the limit on number of capture + * groups and back-references to 99. Otherwise limited + * to 9. Defaults to false. * \return True if the pattern was succesfully compiled */ - bool compile(const nrex_char* pattern); + bool compile(const nrex_char* pattern, bool extended = false); /*! * \brief Uses the pattern to search through the provided string diff --git a/drivers/nrex/regex.cpp b/drivers/nrex/regex.cpp index 0a813c3490..246384b10a 100644 --- a/drivers/nrex/regex.cpp +++ b/drivers/nrex/regex.cpp @@ -15,7 +15,7 @@ void RegEx::_bind_methods() { - ObjectTypeDB::bind_method(_MD("compile","pattern"),&RegEx::compile); + ObjectTypeDB::bind_method(_MD("compile","pattern", "expanded"),&RegEx::compile, DEFVAL(true)); ObjectTypeDB::bind_method(_MD("find","text","start","end"),&RegEx::find, DEFVAL(0), DEFVAL(-1)); ObjectTypeDB::bind_method(_MD("clear"),&RegEx::clear); ObjectTypeDB::bind_method(_MD("is_valid"),&RegEx::is_valid); @@ -54,7 +54,9 @@ bool RegEx::is_valid() const { }; int RegEx::get_capture_count() const { - + + ERR_FAIL_COND_V( !exp.valid(), 0 ); + return exp.capture_size(); } @@ -66,11 +68,11 @@ String RegEx::get_capture(int capture) const { } -Error RegEx::compile(const String& p_pattern) { +Error RegEx::compile(const String& p_pattern, bool expanded) { clear(); - exp.compile(p_pattern.c_str()); + exp.compile(p_pattern.c_str(), expanded); ERR_FAIL_COND_V( !exp.valid(), FAILED ); diff --git a/drivers/nrex/regex.h b/drivers/nrex/regex.h index 0626029705..be52da8149 100644 --- a/drivers/nrex/regex.h +++ b/drivers/nrex/regex.h @@ -36,7 +36,7 @@ public: bool is_valid() const; int get_capture_count() const; String get_capture(int capture) const; - Error compile(const String& p_pattern); + Error compile(const String& p_pattern, bool expanded = false); int find(const String& p_text, int p_start = 0, int p_end = -1) const; RegEx(); |