summaryrefslogtreecommitdiff
path: root/drivers
diff options
context:
space:
mode:
authorJuan Linietsky <reduzio@gmail.com>2015-11-18 19:13:57 -0300
committerJuan Linietsky <reduzio@gmail.com>2015-11-18 19:13:57 -0300
commit474822b6db6c96aba2684aac7039ee03b605630c (patch)
tree5e20a94cbb60cf3288d30077a014ba4a1a60643c /drivers
parent6aa95e77f3bae35fb136eb00e271362c1707058e (diff)
parentf0d246a7bce9f8f22e9886d474f1758e0c20d8cb (diff)
Merge pull request #2642 from leezh/nrex-update
Updated nrex with new features
Diffstat (limited to 'drivers')
-rw-r--r--drivers/nrex/README.md57
-rw-r--r--drivers/nrex/nrex.cpp790
-rw-r--r--drivers/nrex/nrex.hpp10
-rw-r--r--drivers/nrex/regex.cpp10
-rw-r--r--drivers/nrex/regex.h2
5 files changed, 681 insertions, 188 deletions
diff --git a/drivers/nrex/README.md b/drivers/nrex/README.md
index f150a5d76f..951b301c1e 100644
--- a/drivers/nrex/README.md
+++ b/drivers/nrex/README.md
@@ -18,47 +18,42 @@ More details about its use is documented in `nrex.hpp`
Currently supported features:
* Capturing `()` and non-capturing `(?:)` groups
- * Any character `.`
+ * Any character `.` (includes newlines)
* Shorthand caracter classes `\w\W\s\S\d\D`
- * User-defined character classes such as `[A-Za-z]`
+ * POSIX character classes such as `[[:alnum:]]`
+ * Bracket expressions such as `[A-Za-z]`
* Simple quantifiers `?`, `*` and `+`
* Range quantifiers `{0,1}`
* Lazy (non-greedy) quantifiers `*?`
* Begining `^` and end `$` anchors
+ * Word boundaries `\b`
* Alternation `|`
- * Backreferences `\1` to `\99`
-
-To do list:
+ * ASCII `\xFF` code points
* Unicode `\uFFFF` code points
+ * Positive `(?=)` and negative `(?!)` lookahead
+ * Positive `(?<=)` and negative `(?<!)` lookbehind (fixed length and no alternations)
+ * Backreferences `\1` to `\9` (with option to expand to `\99`)
## License
Copyright (c) 2015, Zher Huei Lee
All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- 1. Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- 2. Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- 3. Neither the name of the copyright holder nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
-IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
-TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would
+ be appreciated but is not required.
+
+ 2. Altered source versions must be plainly marked as such, and must not
+ be misrepresented as being the original software.
+
+ 3. This notice may not be removed or altered from any source
+ distribution.
diff --git a/drivers/nrex/nrex.cpp b/drivers/nrex/nrex.cpp
index 696d46240e..104e07f887 100644
--- a/drivers/nrex/nrex.cpp
+++ b/drivers/nrex/nrex.cpp
@@ -29,11 +29,13 @@
#include <wctype.h>
#include <wchar.h>
#define NREX_ISALPHANUM iswalnum
+#define NREX_ISSPACE iswspace
#define NREX_STRLEN wcslen
#else
#include <ctype.h>
#include <string.h>
#define NREX_ISALPHANUM isalnum
+#define NREX_ISSPACE isspace
#define NREX_STRLEN strlen
#endif
@@ -116,34 +118,72 @@ class nrex_array
}
};
-static nrex_char nrex_unescape(nrex_char repr)
+static int nrex_parse_hex(nrex_char c)
{
- switch (repr)
+ if ('0' <= c && c <= '9')
+ {
+ return int(c - '0');
+ }
+ else if ('a' <= c && c <= 'f')
+ {
+ return int(c - 'a') + 10;
+ }
+ else if ('A' <= c && c <= 'F')
{
- case '^': return '^';
- case '$': return '$';
- case '(': return '(';
- case ')': return ')';
- case '\\': return '\\';
- case '.': return '.';
- case '+': return '+';
- case '*': return '*';
- case '?': return '?';
- case '-': return '-';
- case 'a': return '\a';
- case 'e': return '\e';
- case 'f': return '\f';
- case 'n': return '\n';
- case 'r': return '\r';
- case 't': return '\t';
- case 'v': return '\v';
+ return int(c - 'A') + 10;
}
- return 0;
+ return -1;
+}
+
+static nrex_char nrex_unescape(const nrex_char*& c)
+{
+ switch (c[1])
+ {
+ case '0': ++c; return '\0';
+ case 'a': ++c; return '\a';
+ case 'e': ++c; return '\e';
+ case 'f': ++c; return '\f';
+ case 'n': ++c; return '\n';
+ case 'r': ++c; return '\r';
+ case 't': ++c; return '\t';
+ case 'v': ++c; return '\v';
+ case 'b': ++c; return '\b';
+ case 'x':
+ {
+ int point = 0;
+ for (int i = 2; i <= 3; ++i)
+ {
+ int res = nrex_parse_hex(c[i]);
+ if (res == -1)
+ {
+ return '\0';
+ }
+ point = (point << 4) + res;
+ }
+ c = &c[3];
+ return nrex_char(point);
+ }
+ case 'u':
+ {
+ int point = 0;
+ for (int i = 2; i <= 5; ++i)
+ {
+ int res = nrex_parse_hex(c[i]);
+ if (res == -1)
+ {
+ return '\0';
+ }
+ point = (point << 4) + res;
+ }
+ c = &c[5];
+ return nrex_char(point);
+ }
+ }
+ return (++c)[0];
}
struct nrex_search
{
- public:
const nrex_char* str;
nrex_result* captures;
int end;
@@ -168,12 +208,14 @@ struct nrex_node
nrex_node* previous;
nrex_node* parent;
bool quantifiable;
+ int length;
nrex_node(bool quantify = false)
: next(NULL)
, previous(NULL)
, parent(NULL)
, quantifiable(quantify)
+ , length(-1)
{
}
@@ -206,21 +248,57 @@ struct nrex_node
}
return pos;
}
+
+ void increment_length(int amount, bool subtract = false)
+ {
+ if (amount >= 0 && length >= 0)
+ {
+ if (!subtract)
+ {
+ length += amount;
+ }
+ else
+ {
+ length -= amount;
+ }
+ }
+ else
+ {
+ length = -1;
+ }
+ if (parent)
+ {
+ parent->increment_length(amount, subtract);
+ }
+ }
};
struct nrex_node_group : public nrex_node
{
- int capturing;
+ static const int NonCapture = -1;
+ static const int Bracket = -2;
+ static const int LookAhead = -3;
+ static const int LookBehind = -4;
+
+ int mode;
bool negate;
nrex_array<nrex_node*> childset;
nrex_node* back;
- nrex_node_group(int capturing)
+ nrex_node_group(int mode)
: nrex_node(true)
- , capturing(capturing)
+ , mode(mode)
, negate(false)
, back(NULL)
{
+ if (mode != Bracket)
+ {
+ length = 0;
+ }
+ else
+ {
+ length = 1;
+ }
}
virtual ~nrex_node_group()
@@ -234,14 +312,19 @@ struct nrex_node_group : public nrex_node
int test(nrex_search* s, int pos) const
{
- if (capturing >= 0)
+ if (mode >= 0)
{
- s->captures[capturing].start = pos;
+ s->captures[mode].start = pos;
}
for (unsigned int i = 0; i < childset.size(); ++i)
{
s->complete = false;
- int res = childset[i]->test(s, pos);
+ int offset = 0;
+ if (mode == LookBehind)
+ {
+ offset = length;
+ }
+ int res = childset[i]->test(s, pos - offset);
if (s->complete)
{
return res;
@@ -256,12 +339,20 @@ struct nrex_node_group : public nrex_node
{
return -1;
}
+ if (i + 1 < childset.size())
+ {
+ continue;
+ }
}
if (res >= 0)
{
- if (capturing >= 0)
+ if (mode >= 0)
{
- s->captures[capturing].length = res - pos;
+ s->captures[mode].length = res - pos;
+ }
+ else if (mode == LookAhead || mode == LookBehind)
+ {
+ res = pos;
}
return next ? next->test(s, res) : res;
}
@@ -271,15 +362,19 @@ struct nrex_node_group : public nrex_node
virtual int test_parent(nrex_search* s, int pos) const
{
- if (capturing >= 0)
+ if (mode >= 0)
{
- s->captures[capturing].length = pos - s->captures[capturing].start;
+ s->captures[mode].length = pos - s->captures[mode].start;
}
return nrex_node::test_parent(s, pos);
}
void add_childset()
{
+ if (childset.size() > 0 && mode != Bracket)
+ {
+ length = -1;
+ }
back = NULL;
}
@@ -287,7 +382,7 @@ struct nrex_node_group : public nrex_node
{
node->parent = this;
node->previous = back;
- if (back)
+ if (back && mode != Bracket)
{
back->next = node;
}
@@ -295,6 +390,10 @@ struct nrex_node_group : public nrex_node
{
childset.push(node);
}
+ if (mode != Bracket)
+ {
+ increment_length(node->length);
+ }
back = node;
}
@@ -310,10 +409,32 @@ struct nrex_node_group : public nrex_node
{
childset.pop();
}
+ if (mode != Bracket)
+ {
+ increment_length(old->length, true);
+ }
back = old->previous;
add_child(node);
return old;
}
+
+ void pop_back()
+ {
+ if (back)
+ {
+ nrex_node* old = back;
+ if (!old->previous)
+ {
+ childset.pop();
+ }
+ if (mode != Bracket)
+ {
+ increment_length(old->length, true);
+ }
+ back = old->previous;
+ NREX_DELETE(old);
+ }
+ }
};
struct nrex_node_char : public nrex_node
@@ -324,6 +445,7 @@ struct nrex_node_char : public nrex_node
: nrex_node(true)
, ch(c)
{
+ length = 1;
}
int test(nrex_search* s, int pos) const
@@ -346,6 +468,7 @@ struct nrex_node_range : public nrex_node
, start(s)
, end(e)
{
+ length = 1;
}
int test(nrex_search* s, int pos) const
@@ -363,20 +486,219 @@ struct nrex_node_range : public nrex_node
}
};
-static bool nrex_is_whitespace(nrex_char repr)
+enum nrex_class_type
{
- switch (repr)
+ nrex_class_none,
+ nrex_class_alnum,
+ nrex_class_alpha,
+ nrex_class_blank,
+ nrex_class_cntrl,
+ nrex_class_digit,
+ nrex_class_graph,
+ nrex_class_lower,
+ nrex_class_print,
+ nrex_class_punct,
+ nrex_class_space,
+ nrex_class_upper,
+ nrex_class_xdigit,
+ nrex_class_word
+};
+
+static bool nrex_compare_class(const nrex_char** pos, const char* text)
+{
+ unsigned int i = 0;
+ for (i = 0; text[i] != '\0'; ++i)
{
- case ' ':
- case '\t':
- case '\r':
- case '\n':
- case '\f':
- return true;
+ if ((*pos)[i] != text[i])
+ {
+ return false;
+ }
}
- return false;
+ if ((*pos)[i++] != ':' || (*pos)[i] != ']')
+ {
+ return false;
+ }
+ *pos = &(*pos)[i];
+ return true;
}
+#define NREX_COMPARE_CLASS(POS, NAME) if (nrex_compare_class(POS, #NAME)) return nrex_class_ ## NAME
+
+static nrex_class_type nrex_parse_class(const nrex_char** pos)
+{
+ NREX_COMPARE_CLASS(pos, alnum);
+ NREX_COMPARE_CLASS(pos, alpha);
+ NREX_COMPARE_CLASS(pos, blank);
+ NREX_COMPARE_CLASS(pos, cntrl);
+ NREX_COMPARE_CLASS(pos, digit);
+ NREX_COMPARE_CLASS(pos, graph);
+ NREX_COMPARE_CLASS(pos, lower);
+ NREX_COMPARE_CLASS(pos, print);
+ NREX_COMPARE_CLASS(pos, punct);
+ NREX_COMPARE_CLASS(pos, space);
+ NREX_COMPARE_CLASS(pos, upper);
+ NREX_COMPARE_CLASS(pos, xdigit);
+ NREX_COMPARE_CLASS(pos, word);
+ return nrex_class_none;
+}
+
+struct nrex_node_class : public nrex_node
+{
+ nrex_class_type type;
+
+ nrex_node_class(nrex_class_type t)
+ : nrex_node(true)
+ , type(t)
+ {
+ length = 1;
+ }
+
+ int test(nrex_search* s, int pos) const
+ {
+ if (s->end == pos)
+ {
+ return -1;
+ }
+ if (!test_class(s->at(pos)))
+ {
+ return -1;
+ }
+ return next ? next->test(s, pos + 1) : pos + 1;
+ }
+
+ bool test_class(nrex_char c) const
+ {
+ if ((0 <= c && c <= 0x1F) || c == 0x7F)
+ {
+ if (type == nrex_class_cntrl)
+ {
+ return true;
+ }
+ }
+ else if (c < 0x7F)
+ {
+ if (type == nrex_class_print)
+ {
+ return true;
+ }
+ else if (type == nrex_class_graph && c != ' ')
+ {
+ return true;
+ }
+ else if ('0' <= c && c <= '9')
+ {
+ switch (type)
+ {
+ case nrex_class_alnum:
+ case nrex_class_digit:
+ case nrex_class_xdigit:
+ case nrex_class_word:
+ return true;
+ default:
+ break;
+ }
+ }
+ else if ('A' <= c && c <= 'Z')
+ {
+ switch (type)
+ {
+ case nrex_class_alnum:
+ case nrex_class_alpha:
+ case nrex_class_upper:
+ case nrex_class_word:
+ return true;
+ case nrex_class_xdigit:
+ if (c <= 'F')
+ {
+ return true;
+ }
+ default:
+ break;
+ }
+ }
+ else if ('a' <= c && c <= 'z')
+ {
+ switch (type)
+ {
+ case nrex_class_alnum:
+ case nrex_class_alpha:
+ case nrex_class_lower:
+ case nrex_class_word:
+ return true;
+ case nrex_class_xdigit:
+ if (c <= 'f')
+ {
+ return true;
+ }
+ default:
+ break;
+ }
+ }
+ }
+ switch (c)
+ {
+ case ' ':
+ case '\t':
+ if (type == nrex_class_blank)
+ {
+ return true;
+ }
+ case '\r':
+ case '\n':
+ case '\f':
+ if (type == nrex_class_space)
+ {
+ return true;
+ }
+ break;
+ case '_':
+ if (type == nrex_class_word)
+ {
+ return true;
+ }
+ case ']':
+ case '[':
+ case '!':
+ case '"':
+ case '#':
+ case '$':
+ case '%':
+ case '&':
+ case '\'':
+ case '(':
+ case ')':
+ case '*':
+ case '+':
+ case ',':
+ case '.':
+ case '/':
+ case ':':
+ case ';':
+ case '<':
+ case '=':
+ case '>':
+ case '?':
+ case '@':
+ case '\\':
+ case '^':
+ case '`':
+ case '{':
+ case '|':
+ case '}':
+ case '~':
+ case '-':
+ if (type == nrex_class_punct)
+ {
+ return true;
+ }
+ break;
+ default:
+ break;
+ }
+ return false;
+ }
+};
+
static bool nrex_is_shorthand(nrex_char repr)
{
switch (repr)
@@ -400,6 +722,7 @@ struct nrex_node_shorthand : public nrex_node
: nrex_node(true)
, repr(c)
{
+ length = 1;
}
int test(nrex_search* s, int pos) const
@@ -435,7 +758,7 @@ struct nrex_node_shorthand : public nrex_node
case 'S':
invert = true;
case 's':
- if (nrex_is_whitespace(c))
+ if (NREX_ISSPACE(c))
{
found = true;
}
@@ -469,10 +792,10 @@ struct nrex_node_quantifier : public nrex_node
bool greedy;
nrex_node* child;
- nrex_node_quantifier()
+ nrex_node_quantifier(int min, int max)
: nrex_node()
- , min(0)
- , max(0)
+ , min(min)
+ , max(max)
, greedy(true)
, child(NULL)
{
@@ -488,17 +811,49 @@ struct nrex_node_quantifier : public nrex_node
int test(nrex_search* s, int pos) const
{
- nrex_array<int> backtrack;
- backtrack.push(pos);
- while (backtrack.top() <= s->end)
+ return test_step(s, pos, 1);
+ }
+
+ int test_step(nrex_search* s, int pos, int level) const
+ {
+ if (max == 0)
{
- if (max >= 1 && backtrack.size() > (unsigned int)max)
+ return pos;
+ }
+ if ((max >= 1 && level > max) || pos > s->end)
+ {
+ return -1;
+ }
+ if (!greedy && level > min)
+ {
+ int res = pos;
+ if (next)
{
- break;
+ res = next->test(s, res);
+ }
+ if (s->complete)
+ {
+ return res;
}
- if (!greedy && (unsigned int)min < backtrack.size())
+ if (res >= 0 && parent->test_parent(s, res) >= 0)
+ {
+ return res;
+ }
+ }
+ int res = child->test(s, pos);
+ if (s->complete)
+ {
+ return res;
+ }
+ if (res >= 0)
+ {
+ int res_step = test_step(s, res, level + 1);
+ if (res_step >= 0)
+ {
+ return res_step;
+ }
+ else if (greedy && level >= min)
{
- int res = backtrack.top();
if (next)
{
res = next->test(s, res);
@@ -512,33 +867,6 @@ struct nrex_node_quantifier : public nrex_node
return res;
}
}
- int res = child->test(s, backtrack.top());
- if (s->complete)
- {
- return res;
- }
- if (res < 0 || res == backtrack.top())
- {
- break;
- }
- backtrack.push(res);
- }
- while (greedy && (unsigned int) min < backtrack.size())
- {
- int res = backtrack.top();
- if (next)
- {
- res = next->test(s, res);
- }
- if (res >= 0 && parent->test_parent(s, res) >= 0)
- {
- return res;
- }
- if (s->complete)
- {
- return res;
- }
- backtrack.pop();
}
return -1;
}
@@ -552,6 +880,7 @@ struct nrex_node_anchor : public nrex_node
: nrex_node()
, end(end)
{
+ length = 0;
}
int test(nrex_search* s, int pos) const
@@ -568,6 +897,45 @@ struct nrex_node_anchor : public nrex_node
}
};
+struct nrex_node_word_boundary : public nrex_node
+{
+ bool inverse;
+
+ nrex_node_word_boundary(bool inverse)
+ : nrex_node()
+ , inverse(inverse)
+ {
+ length = 0;
+ }
+
+ int test(nrex_search* s, int pos) const
+ {
+ bool left = false;
+ bool right = false;
+ if (pos != 0)
+ {
+ nrex_char c = s->at(pos - 1);
+ if (c == '_' || NREX_ISALPHANUM(c))
+ {
+ left = true;
+ }
+ }
+ if (pos != s->end)
+ {
+ nrex_char c = s->at(pos);
+ if (c == '_' || NREX_ISALPHANUM(c))
+ {
+ right = true;
+ }
+ }
+ if ((left != right) == inverse)
+ {
+ return -1;
+ }
+ return next ? next->test(s, pos) : pos;
+ }
+};
+
struct nrex_node_backreference : public nrex_node
{
int ref;
@@ -576,6 +944,7 @@ struct nrex_node_backreference : public nrex_node
: nrex_node(true)
, ref(ref)
{
+ length = -1;
}
int test(nrex_search* s, int pos) const
@@ -596,6 +965,18 @@ struct nrex_node_backreference : public nrex_node
}
};
+bool nrex_has_lookbehind(nrex_array<nrex_node_group*>& stack)
+{
+ for (unsigned int i = 0; i < stack.size(); i++)
+ {
+ if (stack[i]->mode == nrex_node_group::LookBehind)
+ {
+ return true;
+ }
+ }
+ return false;
+}
+
nrex::nrex()
: _capturing(0)
, _root(NULL)
@@ -630,7 +1011,7 @@ int nrex::capture_size() const
return _capturing + 1;
}
-bool nrex::compile(const nrex_char* pattern)
+bool nrex::compile(const nrex_char* pattern, bool extended)
{
reset();
nrex_node_group* root = NREX_NEW(nrex_node_group(_capturing));
@@ -647,16 +1028,32 @@ bool nrex::compile(const nrex_char* pattern)
if (c[2] == ':')
{
c = &c[2];
- nrex_node_group* group = NREX_NEW(nrex_node_group(-1));
+ nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::NonCapture));
+ stack.top()->add_child(group);
+ stack.push(group);
+ }
+ else if (c[2] == '!' || c[2] == '=')
+ {
+ c = &c[2];
+ nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::LookAhead));
+ group->negate = (c[0] == '!');
+ stack.top()->add_child(group);
+ stack.push(group);
+ }
+ else if (c[2] == '<' && (c[3] == '!' || c[3] == '='))
+ {
+ c = &c[3];
+ nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::LookBehind));
+ group->negate = (c[0] == '!');
stack.top()->add_child(group);
stack.push(group);
}
else
{
- NREX_COMPILE_ERROR("unrecognised qualifier for parenthesis");
+ NREX_COMPILE_ERROR("unrecognised qualifier for group");
}
}
- else if (_capturing < 99)
+ else if ((!extended && _capturing < 9) || (extended && _capturing < 99))
{
nrex_node_group* group = NREX_NEW(nrex_node_group(++_capturing));
stack.top()->add_child(group);
@@ -664,7 +1061,7 @@ bool nrex::compile(const nrex_char* pattern)
}
else
{
- nrex_node_group* group = NREX_NEW(nrex_node_group(-1));
+ nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::NonCapture));
stack.top()->add_child(group);
stack.push(group);
}
@@ -682,152 +1079,233 @@ bool nrex::compile(const nrex_char* pattern)
}
else if (c[0] == '[')
{
- nrex_node_group* group = NREX_NEW(nrex_node_group(-1));
+ nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::Bracket));
stack.top()->add_child(group);
if (c[1] == '^')
{
group->negate = true;
++c;
}
+ bool first_child = true;
+ nrex_char previous_child;
+ bool previous_child_single = false;
while (true)
{
group->add_childset();
++c;
if (c[0] == '\0')
{
- NREX_COMPILE_ERROR("unclosed character class '[]'");
+ NREX_COMPILE_ERROR("unclosed bracket expression '['");
}
- if (c[0] == ']')
+ if (c[0] == '[' && c[1] == ':')
+ {
+ const nrex_char* d = &c[2];
+ nrex_class_type cls = nrex_parse_class(&d);
+ if (cls != nrex_class_none)
+ {
+ c = d;
+ group->add_child(NREX_NEW(nrex_node_class(cls)));
+ previous_child_single = false;
+ }
+ else
+ {
+ group->add_child(NREX_NEW(nrex_node_char('[')));
+ previous_child = '[';
+ previous_child_single = true;
+ }
+ }
+ else if (c[0] == ']' && !first_child)
{
break;
}
else if (c[0] == '\\')
{
- nrex_char unescaped = nrex_unescape(c[1]);
- if (unescaped)
- {
- group->add_child(NREX_NEW(nrex_node_char(unescaped)));
- ++c;
- }
- else if (nrex_is_shorthand(c[1]))
+ if (nrex_is_shorthand(c[1]))
{
group->add_child(NREX_NEW(nrex_node_shorthand(c[1])));
++c;
+ previous_child_single = false;
}
else
{
- NREX_COMPILE_ERROR("escape token not recognised");
+ const nrex_char* d = c;
+ nrex_char unescaped = nrex_unescape(d);
+ if (c == d)
+ {
+ NREX_COMPILE_ERROR("invalid escape token");
+ }
+ group->add_child(NREX_NEW(nrex_node_char(unescaped)));
+ c = d;
+ previous_child = unescaped;
+ previous_child_single = true;
}
}
- else
+ else if (previous_child_single && c[0] == '-')
{
- if (c[1] == '-' && c[2] != '\0')
+ bool is_range = false;
+ nrex_char next;
+ if (c[1] != '\0' && c[1] != ']')
{
- bool range = false;
- if ('A' <= c[0] && c[0] <= 'Z' && 'A' <= c[2] && c[2] <= 'Z')
+ if (c[1] == '\\')
{
- range = true;
+ const nrex_char* d = ++c;
+ next = nrex_unescape(d);
+ if (c == d)
+ {
+ NREX_COMPILE_ERROR("invalid escape token in range");
+ }
}
- if ('a' <= c[0] && c[0] <= 'z' && 'a' <= c[2] && c[2] <= 'z')
- {
- range = true;
- }
- if ('0' <= c[0] && c[0] <= '9' && '0' <= c[2] && c[2] <= '9')
+ else
{
- range = true;
+ next = c[1];
+ ++c;
}
- if (range)
+ is_range = true;
+ }
+ if (is_range)
+ {
+ if (next < previous_child)
{
- group->add_child(NREX_NEW(nrex_node_range(c[0], c[2])));
- c = &c[2];
- continue;
+ NREX_COMPILE_ERROR("text range out of order");
}
+ group->pop_back();
+ group->add_child(NREX_NEW(nrex_node_range(previous_child, next)));
+ previous_child_single = false;
+ }
+ else
+ {
+ group->add_child(NREX_NEW(nrex_node_char(c[0])));
+ previous_child = c[0];
+ previous_child_single = true;
}
+ }
+ else
+ {
group->add_child(NREX_NEW(nrex_node_char(c[0])));
+ previous_child = c[0];
+ previous_child_single = true;
}
-
+ first_child = false;
}
}
else if (nrex_is_quantifier(c[0]))
{
- nrex_node_quantifier* quant = NREX_NEW(nrex_node_quantifier);
- quant->child = stack.top()->swap_back(quant);
- if (quant->child == NULL || !quant->child->quantifiable)
+ if (stack.top()->back == NULL || !stack.top()->back->quantifiable)
{
+ if (c[0] == '{')
+ {
+ stack.top()->add_child(NREX_NEW(nrex_node_char('{')));
+ continue;
+ }
NREX_COMPILE_ERROR("element not quantifiable");
}
- quant->child->previous = NULL;
- quant->child->next = NULL;
- quant->child->parent = quant;
+ int min = 0;
+ int max = -1;
+ bool valid_quantifier = true;
if (c[0] == '?')
{
- quant->min = 0;
- quant->max = 1;
+ min = 0;
+ max = 1;
}
else if (c[0] == '+')
{
- quant->min = 1;
- quant->max = -1;
+ min = 1;
+ max = -1;
}
else if (c[0] == '*')
{
- quant->min = 0;
- quant->max = -1;
+ min = 0;
+ max = -1;
}
else if (c[0] == '{')
{
bool max_set = false;
- quant->min = 0;
- quant->max = -1;
+ const nrex_char* d = c;
while (true)
{
- ++c;
- if (c[0] == '\0')
+ ++d;
+ if (d[0] == '\0')
{
- NREX_COMPILE_ERROR("unclosed range quantifier '{}'");
+ valid_quantifier = false;
+ break;
}
- else if (c[0] == '}')
+ else if (d[0] == '}')
{
break;
}
- else if (c[0] == ',')
+ else if (d[0] == ',')
{
max_set = true;
continue;
}
- else if (c[0] < '0' || '9' < c[0])
+ else if (d[0] < '0' || '9' < d[0])
{
- NREX_COMPILE_ERROR("expected numeric digits, ',' or '}'");
+ valid_quantifier = false;
+ break;
}
if (max_set)
{
- if (quant->max < 0)
+ if (max < 0)
{
- quant->max = int(c[0] - '0');
+ max = int(d[0] - '0');
}
else
{
- quant->max = quant->max * 10 + int(c[0] - '0');
+ max = max * 10 + int(d[0] - '0');
}
}
else
{
- quant->min = quant->min * 10 + int(c[0] - '0');
+ min = min * 10 + int(d[0] - '0');
}
}
if (!max_set)
{
- quant->max = quant->min;
+ max = min;
+ }
+ if (valid_quantifier)
+ {
+ c = d;
}
}
- if (c[1] == '?')
+ if (valid_quantifier)
{
- quant->greedy = false;
- ++c;
+ nrex_node_quantifier* quant = NREX_NEW(nrex_node_quantifier(min, max));
+ if (min == max)
+ {
+ if (stack.top()->back->length >= 0)
+ {
+ quant->length = max * stack.top()->back->length;
+ }
+ }
+ else
+ {
+ if (nrex_has_lookbehind(stack))
+ {
+ NREX_COMPILE_ERROR("variable length quantifiers inside lookbehind not supported");
+ }
+ }
+ quant->child = stack.top()->swap_back(quant);
+ quant->child->previous = NULL;
+ quant->child->next = NULL;
+ quant->child->parent = quant;
+ if (c[1] == '?')
+ {
+ quant->greedy = false;
+ ++c;
+ }
+ }
+ else
+ {
+ stack.top()->add_child(NREX_NEW(nrex_node_char(c[0])));
}
}
else if (c[0] == '|')
{
+ if (nrex_has_lookbehind(stack))
+ {
+ NREX_COMPILE_ERROR("alternations inside lookbehind not supported");
+ }
stack.top()->add_childset();
}
else if (c[0] == '^' || c[0] == '$')
@@ -840,13 +1318,7 @@ bool nrex::compile(const nrex_char* pattern)
}
else if (c[0] == '\\')
{
- nrex_char unescaped = nrex_unescape(c[1]);
- if (unescaped)
- {
- stack.top()->add_child(NREX_NEW(nrex_node_char(unescaped)));
- ++c;
- }
- else if (nrex_is_shorthand(c[1]))
+ if (nrex_is_shorthand(c[1]))
{
stack.top()->add_child(NREX_NEW(nrex_node_shorthand(c[1])));
++c;
@@ -854,7 +1326,7 @@ bool nrex::compile(const nrex_char* pattern)
else if ('1' <= c[1] && c[1] <= '9')
{
int ref = 0;
- if ('0' <= c[2] && c[2] <= '9')
+ if (extended && '0' <= c[2] && c[2] <= '9')
{
ref = int(c[1] - '0') * 10 + int(c[2] - '0');
c = &c[2];
@@ -868,11 +1340,27 @@ bool nrex::compile(const nrex_char* pattern)
{
NREX_COMPILE_ERROR("backreference to non-existent capture");
}
+ if (nrex_has_lookbehind(stack))
+ {
+ NREX_COMPILE_ERROR("backreferences inside lookbehind not supported");
+ }
stack.top()->add_child(NREX_NEW(nrex_node_backreference(ref)));
}
+ else if (c[1] == 'b' || c[1] == 'B')
+ {
+ stack.top()->add_child(NREX_NEW(nrex_node_word_boundary(c[1] == 'B')));
+ ++c;
+ }
else
{
- NREX_COMPILE_ERROR("escape token not recognised");
+ const nrex_char* d = c;
+ nrex_char unescaped = nrex_unescape(d);
+ if (c == d)
+ {
+ NREX_COMPILE_ERROR("invalid escape token");
+ }
+ stack.top()->add_child(NREX_NEW(nrex_node_char(unescaped)));
+ c = d;
}
}
else
@@ -880,6 +1368,10 @@ bool nrex::compile(const nrex_char* pattern)
stack.top()->add_child(NREX_NEW(nrex_node_char(c[0])));
}
}
+ if (stack.size() > 1)
+ {
+ NREX_COMPILE_ERROR("unclosed group '('");
+ }
return true;
}
diff --git a/drivers/nrex/nrex.hpp b/drivers/nrex/nrex.hpp
index 2a6aa08e1d..e26a61c39a 100644
--- a/drivers/nrex/nrex.hpp
+++ b/drivers/nrex/nrex.hpp
@@ -79,7 +79,8 @@ class nrex
* This is used to provide the array size of the captures needed for
* nrex::match() to work. The size is actually the number of capture
* groups + one for the matching of the entire pattern. The result is
- * always capped at 100.
+ * always capped at 10 or 100, depending on the extend option given in
+ * nrex::compile() (default 10).
*
* \return The number of captures
*/
@@ -95,10 +96,13 @@ class nrex
* runtime error nrex_compile_error if it encounters a problem when
* parsing the pattern.
*
- * \param The regex pattern
+ * \param pattern The regex pattern
+ * \param extended If true, raises the limit on number of capture
+ * groups and back-references to 99. Otherwise limited
+ * to 9. Defaults to false.
* \return True if the pattern was succesfully compiled
*/
- bool compile(const nrex_char* pattern);
+ bool compile(const nrex_char* pattern, bool extended = false);
/*!
* \brief Uses the pattern to search through the provided string
diff --git a/drivers/nrex/regex.cpp b/drivers/nrex/regex.cpp
index 0a813c3490..246384b10a 100644
--- a/drivers/nrex/regex.cpp
+++ b/drivers/nrex/regex.cpp
@@ -15,7 +15,7 @@
void RegEx::_bind_methods() {
- ObjectTypeDB::bind_method(_MD("compile","pattern"),&RegEx::compile);
+ ObjectTypeDB::bind_method(_MD("compile","pattern", "expanded"),&RegEx::compile, DEFVAL(true));
ObjectTypeDB::bind_method(_MD("find","text","start","end"),&RegEx::find, DEFVAL(0), DEFVAL(-1));
ObjectTypeDB::bind_method(_MD("clear"),&RegEx::clear);
ObjectTypeDB::bind_method(_MD("is_valid"),&RegEx::is_valid);
@@ -54,7 +54,9 @@ bool RegEx::is_valid() const {
};
int RegEx::get_capture_count() const {
-
+
+ ERR_FAIL_COND_V( !exp.valid(), 0 );
+
return exp.capture_size();
}
@@ -66,11 +68,11 @@ String RegEx::get_capture(int capture) const {
}
-Error RegEx::compile(const String& p_pattern) {
+Error RegEx::compile(const String& p_pattern, bool expanded) {
clear();
- exp.compile(p_pattern.c_str());
+ exp.compile(p_pattern.c_str(), expanded);
ERR_FAIL_COND_V( !exp.valid(), FAILED );
diff --git a/drivers/nrex/regex.h b/drivers/nrex/regex.h
index 0626029705..be52da8149 100644
--- a/drivers/nrex/regex.h
+++ b/drivers/nrex/regex.h
@@ -36,7 +36,7 @@ public:
bool is_valid() const;
int get_capture_count() const;
String get_capture(int capture) const;
- Error compile(const String& p_pattern);
+ Error compile(const String& p_pattern, bool expanded = false);
int find(const String& p_text, int p_start = 0, int p_end = -1) const;
RegEx();