diff options
Diffstat (limited to 'drivers/nrex/nrex.cpp')
-rw-r--r-- | drivers/nrex/nrex.cpp | 1402 |
1 files changed, 1402 insertions, 0 deletions
diff --git a/drivers/nrex/nrex.cpp b/drivers/nrex/nrex.cpp new file mode 100644 index 0000000000..104e07f887 --- /dev/null +++ b/drivers/nrex/nrex.cpp @@ -0,0 +1,1402 @@ +// NREX: Node RegEx +// +// Copyright (c) 2015, Zher Huei Lee +// All rights reserved. +// +// This software is provided 'as-is', without any express or implied +// warranty. In no event will the authors be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. The origin of this software must not be misrepresented; you must not +// claim that you wrote the original software. If you use this software +// in a product, an acknowledgment in the product documentation would +// be appreciated but is not required. +// +// 2. Altered source versions must be plainly marked as such, and must not +// be misrepresented as being the original software. +// +// 3. This notice may not be removed or altered from any source +// distribution. +// + +#include "nrex.hpp" + +#ifdef NREX_UNICODE +#include <wctype.h> +#include <wchar.h> +#define NREX_ISALPHANUM iswalnum +#define NREX_ISSPACE iswspace +#define NREX_STRLEN wcslen +#else +#include <ctype.h> +#include <string.h> +#define NREX_ISALPHANUM isalnum +#define NREX_ISSPACE isspace +#define NREX_STRLEN strlen +#endif + +#ifdef NREX_THROW_ERROR +#define NREX_COMPILE_ERROR(M) throw nrex_compile_error(M) +#else +#define NREX_COMPILE_ERROR(M) reset(); return false +#endif + +#ifndef NREX_NEW +#define NREX_NEW(X) new X +#define NREX_NEW_ARRAY(X, N) new X[N] +#define NREX_DELETE(X) delete X +#define NREX_DELETE_ARRAY(X) delete[] X +#endif + +template<typename T> +class nrex_array +{ + private: + T* _data; + unsigned int _reserved; + unsigned int _size; + public: + nrex_array() + : _data(NREX_NEW_ARRAY(T, 2)) + , _reserved(2) + , _size(0) + { + } + + ~nrex_array() + { + NREX_DELETE_ARRAY(_data); + } + + unsigned int size() const + { + return _size; + } + + void reserve(unsigned int size) + { + T* old = _data; + _data = NREX_NEW_ARRAY(T, size); + _reserved = size; + for (unsigned int i = 0; i < _size; ++i) + { + _data[i] = old[i]; + } + NREX_DELETE_ARRAY(old); + } + + void push(T item) + { + if (_size == _reserved) + { + reserve(_reserved * 2); + } + _data[_size] = item; + _size++; + } + + T& top() + { + return _data[_size - 1]; + } + + const T& operator[] (unsigned int i) const + { + return _data[i]; + } + + void pop() + { + if (_size > 0) + { + --_size; + } + } +}; + +static int nrex_parse_hex(nrex_char c) +{ + if ('0' <= c && c <= '9') + { + return int(c - '0'); + } + else if ('a' <= c && c <= 'f') + { + return int(c - 'a') + 10; + } + else if ('A' <= c && c <= 'F') + { + return int(c - 'A') + 10; + } + return -1; +} + +static nrex_char nrex_unescape(const nrex_char*& c) +{ + switch (c[1]) + { + case '0': ++c; return '\0'; + case 'a': ++c; return '\a'; + case 'e': ++c; return '\e'; + case 'f': ++c; return '\f'; + case 'n': ++c; return '\n'; + case 'r': ++c; return '\r'; + case 't': ++c; return '\t'; + case 'v': ++c; return '\v'; + case 'b': ++c; return '\b'; + case 'x': + { + int point = 0; + for (int i = 2; i <= 3; ++i) + { + int res = nrex_parse_hex(c[i]); + if (res == -1) + { + return '\0'; + } + point = (point << 4) + res; + } + c = &c[3]; + return nrex_char(point); + } + case 'u': + { + int point = 0; + for (int i = 2; i <= 5; ++i) + { + int res = nrex_parse_hex(c[i]); + if (res == -1) + { + return '\0'; + } + point = (point << 4) + res; + } + c = &c[5]; + return nrex_char(point); + } + } + return (++c)[0]; +} + +struct nrex_search +{ + const nrex_char* str; + nrex_result* captures; + int end; + bool complete; + + nrex_char at(int pos) + { + return str[pos]; + } + + nrex_search(const nrex_char* str, nrex_result* captures) + : str(str) + , captures(captures) + , end(0) + { + } +}; + +struct nrex_node +{ + nrex_node* next; + nrex_node* previous; + nrex_node* parent; + bool quantifiable; + int length; + + nrex_node(bool quantify = false) + : next(NULL) + , previous(NULL) + , parent(NULL) + , quantifiable(quantify) + , length(-1) + { + } + + virtual ~nrex_node() + { + if (next) + { + NREX_DELETE(next); + } + } + + virtual int test(nrex_search* s, int pos) const + { + return next ? next->test(s, pos) : -1; + } + + virtual int test_parent(nrex_search* s, int pos) const + { + if (next) + { + pos = next->test(s, pos); + } + if (parent && pos >= 0) + { + pos = parent->test_parent(s, pos); + } + if (pos >= 0) + { + s->complete = true; + } + return pos; + } + + void increment_length(int amount, bool subtract = false) + { + if (amount >= 0 && length >= 0) + { + if (!subtract) + { + length += amount; + } + else + { + length -= amount; + } + } + else + { + length = -1; + } + if (parent) + { + parent->increment_length(amount, subtract); + } + } +}; + +struct nrex_node_group : public nrex_node +{ + static const int NonCapture = -1; + static const int Bracket = -2; + static const int LookAhead = -3; + static const int LookBehind = -4; + + int mode; + bool negate; + nrex_array<nrex_node*> childset; + nrex_node* back; + + nrex_node_group(int mode) + : nrex_node(true) + , mode(mode) + , negate(false) + , back(NULL) + { + if (mode != Bracket) + { + length = 0; + } + else + { + length = 1; + } + } + + virtual ~nrex_node_group() + { + for (unsigned int i = 0; i < childset.size(); ++i) + { + NREX_DELETE(childset[i]); + } + + } + + int test(nrex_search* s, int pos) const + { + if (mode >= 0) + { + s->captures[mode].start = pos; + } + for (unsigned int i = 0; i < childset.size(); ++i) + { + s->complete = false; + int offset = 0; + if (mode == LookBehind) + { + offset = length; + } + int res = childset[i]->test(s, pos - offset); + if (s->complete) + { + return res; + } + if (negate) + { + if (res < 0) + { + res = pos + 1; + } + else + { + return -1; + } + if (i + 1 < childset.size()) + { + continue; + } + } + if (res >= 0) + { + if (mode >= 0) + { + s->captures[mode].length = res - pos; + } + else if (mode == LookAhead || mode == LookBehind) + { + res = pos; + } + return next ? next->test(s, res) : res; + } + } + return -1; + } + + virtual int test_parent(nrex_search* s, int pos) const + { + if (mode >= 0) + { + s->captures[mode].length = pos - s->captures[mode].start; + } + return nrex_node::test_parent(s, pos); + } + + void add_childset() + { + if (childset.size() > 0 && mode != Bracket) + { + length = -1; + } + back = NULL; + } + + void add_child(nrex_node* node) + { + node->parent = this; + node->previous = back; + if (back && mode != Bracket) + { + back->next = node; + } + else + { + childset.push(node); + } + if (mode != Bracket) + { + increment_length(node->length); + } + back = node; + } + + nrex_node* swap_back(nrex_node* node) + { + if (!back) + { + add_child(node); + return NULL; + } + nrex_node* old = back; + if (!old->previous) + { + childset.pop(); + } + if (mode != Bracket) + { + increment_length(old->length, true); + } + back = old->previous; + add_child(node); + return old; + } + + void pop_back() + { + if (back) + { + nrex_node* old = back; + if (!old->previous) + { + childset.pop(); + } + if (mode != Bracket) + { + increment_length(old->length, true); + } + back = old->previous; + NREX_DELETE(old); + } + } +}; + +struct nrex_node_char : public nrex_node +{ + nrex_char ch; + + nrex_node_char(nrex_char c) + : nrex_node(true) + , ch(c) + { + length = 1; + } + + int test(nrex_search* s, int pos) const + { + if (s->end == pos || s->at(pos) != ch) + { + return -1; + } + return next ? next->test(s, pos + 1) : pos + 1; + } +}; + +struct nrex_node_range : public nrex_node +{ + nrex_char start; + nrex_char end; + + nrex_node_range(nrex_char s, nrex_char e) + : nrex_node(true) + , start(s) + , end(e) + { + length = 1; + } + + int test(nrex_search* s, int pos) const + { + if (s->end == pos) + { + return -1; + } + nrex_char c = s->at(pos); + if (c < start || end < c) + { + return -1; + } + return next ? next->test(s, pos + 1) : pos + 1; + } +}; + +enum nrex_class_type +{ + nrex_class_none, + nrex_class_alnum, + nrex_class_alpha, + nrex_class_blank, + nrex_class_cntrl, + nrex_class_digit, + nrex_class_graph, + nrex_class_lower, + nrex_class_print, + nrex_class_punct, + nrex_class_space, + nrex_class_upper, + nrex_class_xdigit, + nrex_class_word +}; + +static bool nrex_compare_class(const nrex_char** pos, const char* text) +{ + unsigned int i = 0; + for (i = 0; text[i] != '\0'; ++i) + { + if ((*pos)[i] != text[i]) + { + return false; + } + } + if ((*pos)[i++] != ':' || (*pos)[i] != ']') + { + return false; + } + *pos = &(*pos)[i]; + return true; +} + +#define NREX_COMPARE_CLASS(POS, NAME) if (nrex_compare_class(POS, #NAME)) return nrex_class_ ## NAME + +static nrex_class_type nrex_parse_class(const nrex_char** pos) +{ + NREX_COMPARE_CLASS(pos, alnum); + NREX_COMPARE_CLASS(pos, alpha); + NREX_COMPARE_CLASS(pos, blank); + NREX_COMPARE_CLASS(pos, cntrl); + NREX_COMPARE_CLASS(pos, digit); + NREX_COMPARE_CLASS(pos, graph); + NREX_COMPARE_CLASS(pos, lower); + NREX_COMPARE_CLASS(pos, print); + NREX_COMPARE_CLASS(pos, punct); + NREX_COMPARE_CLASS(pos, space); + NREX_COMPARE_CLASS(pos, upper); + NREX_COMPARE_CLASS(pos, xdigit); + NREX_COMPARE_CLASS(pos, word); + return nrex_class_none; +} + +struct nrex_node_class : public nrex_node +{ + nrex_class_type type; + + nrex_node_class(nrex_class_type t) + : nrex_node(true) + , type(t) + { + length = 1; + } + + int test(nrex_search* s, int pos) const + { + if (s->end == pos) + { + return -1; + } + if (!test_class(s->at(pos))) + { + return -1; + } + return next ? next->test(s, pos + 1) : pos + 1; + } + + bool test_class(nrex_char c) const + { + if ((0 <= c && c <= 0x1F) || c == 0x7F) + { + if (type == nrex_class_cntrl) + { + return true; + } + } + else if (c < 0x7F) + { + if (type == nrex_class_print) + { + return true; + } + else if (type == nrex_class_graph && c != ' ') + { + return true; + } + else if ('0' <= c && c <= '9') + { + switch (type) + { + case nrex_class_alnum: + case nrex_class_digit: + case nrex_class_xdigit: + case nrex_class_word: + return true; + default: + break; + } + } + else if ('A' <= c && c <= 'Z') + { + switch (type) + { + case nrex_class_alnum: + case nrex_class_alpha: + case nrex_class_upper: + case nrex_class_word: + return true; + case nrex_class_xdigit: + if (c <= 'F') + { + return true; + } + default: + break; + } + } + else if ('a' <= c && c <= 'z') + { + switch (type) + { + case nrex_class_alnum: + case nrex_class_alpha: + case nrex_class_lower: + case nrex_class_word: + return true; + case nrex_class_xdigit: + if (c <= 'f') + { + return true; + } + default: + break; + } + } + } + switch (c) + { + case ' ': + case '\t': + if (type == nrex_class_blank) + { + return true; + } + case '\r': + case '\n': + case '\f': + if (type == nrex_class_space) + { + return true; + } + break; + case '_': + if (type == nrex_class_word) + { + return true; + } + case ']': + case '[': + case '!': + case '"': + case '#': + case '$': + case '%': + case '&': + case '\'': + case '(': + case ')': + case '*': + case '+': + case ',': + case '.': + case '/': + case ':': + case ';': + case '<': + case '=': + case '>': + case '?': + case '@': + case '\\': + case '^': + case '`': + case '{': + case '|': + case '}': + case '~': + case '-': + if (type == nrex_class_punct) + { + return true; + } + break; + default: + break; + } + return false; + } +}; + +static bool nrex_is_shorthand(nrex_char repr) +{ + switch (repr) + { + case 'W': + case 'w': + case 'D': + case 'd': + case 'S': + case 's': + return true; + } + return false; +} + +struct nrex_node_shorthand : public nrex_node +{ + nrex_char repr; + + nrex_node_shorthand(nrex_char c) + : nrex_node(true) + , repr(c) + { + length = 1; + } + + int test(nrex_search* s, int pos) const + { + if (s->end == pos) + { + return -1; + } + bool found = false; + bool invert = false; + nrex_char c = s->at(pos); + switch (repr) + { + case '.': + found = true; + break; + case 'W': + invert = true; + case 'w': + if (c == '_' || NREX_ISALPHANUM(c)) + { + found = true; + } + break; + case 'D': + invert = true; + case 'd': + if ('0' <= c && c <= '9') + { + found = true; + } + break; + case 'S': + invert = true; + case 's': + if (NREX_ISSPACE(c)) + { + found = true; + } + break; + } + if (found == invert) + { + return -1; + } + return next ? next->test(s, pos + 1) : pos + 1; + } +}; + +static bool nrex_is_quantifier(nrex_char repr) +{ + switch (repr) + { + case '?': + case '*': + case '+': + case '{': + return true; + } + return false; +} + +struct nrex_node_quantifier : public nrex_node +{ + int min; + int max; + bool greedy; + nrex_node* child; + + nrex_node_quantifier(int min, int max) + : nrex_node() + , min(min) + , max(max) + , greedy(true) + , child(NULL) + { + } + + virtual ~nrex_node_quantifier() + { + if (child) + { + NREX_DELETE(child); + } + } + + int test(nrex_search* s, int pos) const + { + return test_step(s, pos, 1); + } + + int test_step(nrex_search* s, int pos, int level) const + { + if (max == 0) + { + return pos; + } + if ((max >= 1 && level > max) || pos > s->end) + { + return -1; + } + if (!greedy && level > min) + { + int res = pos; + if (next) + { + res = next->test(s, res); + } + if (s->complete) + { + return res; + } + if (res >= 0 && parent->test_parent(s, res) >= 0) + { + return res; + } + } + int res = child->test(s, pos); + if (s->complete) + { + return res; + } + if (res >= 0) + { + int res_step = test_step(s, res, level + 1); + if (res_step >= 0) + { + return res_step; + } + else if (greedy && level >= min) + { + if (next) + { + res = next->test(s, res); + } + if (s->complete) + { + return res; + } + if (res >= 0 && parent->test_parent(s, res) >= 0) + { + return res; + } + } + } + return -1; + } +}; + +struct nrex_node_anchor : public nrex_node +{ + bool end; + + nrex_node_anchor(bool end) + : nrex_node() + , end(end) + { + length = 0; + } + + int test(nrex_search* s, int pos) const + { + if (!end && pos != 0) + { + return -1; + } + else if (end && pos != s->end) + { + return -1; + } + return next ? next->test(s, pos) : pos; + } +}; + +struct nrex_node_word_boundary : public nrex_node +{ + bool inverse; + + nrex_node_word_boundary(bool inverse) + : nrex_node() + , inverse(inverse) + { + length = 0; + } + + int test(nrex_search* s, int pos) const + { + bool left = false; + bool right = false; + if (pos != 0) + { + nrex_char c = s->at(pos - 1); + if (c == '_' || NREX_ISALPHANUM(c)) + { + left = true; + } + } + if (pos != s->end) + { + nrex_char c = s->at(pos); + if (c == '_' || NREX_ISALPHANUM(c)) + { + right = true; + } + } + if ((left != right) == inverse) + { + return -1; + } + return next ? next->test(s, pos) : pos; + } +}; + +struct nrex_node_backreference : public nrex_node +{ + int ref; + + nrex_node_backreference(int ref) + : nrex_node(true) + , ref(ref) + { + length = -1; + } + + int test(nrex_search* s, int pos) const + { + nrex_result& r = s->captures[ref]; + for (int i = 0; i < r.length; ++i) + { + if (pos + i >= s->end) + { + return -1; + } + if (s->at(r.start + i) != s->at(pos + i)) + { + return -1; + } + } + return next ? next->test(s, pos + r.length) : pos + r.length; + } +}; + +bool nrex_has_lookbehind(nrex_array<nrex_node_group*>& stack) +{ + for (unsigned int i = 0; i < stack.size(); i++) + { + if (stack[i]->mode == nrex_node_group::LookBehind) + { + return true; + } + } + return false; +} + +nrex::nrex() + : _capturing(0) + , _root(NULL) +{ +} + +nrex::~nrex() +{ + if (_root) + { + NREX_DELETE(_root); + } +} + +bool nrex::valid() const +{ + return (_root != NULL); +} + +void nrex::reset() +{ + _capturing = 0; + if (_root) + { + NREX_DELETE(_root); + } + _root = NULL; +} + +int nrex::capture_size() const +{ + return _capturing + 1; +} + +bool nrex::compile(const nrex_char* pattern, bool extended) +{ + reset(); + nrex_node_group* root = NREX_NEW(nrex_node_group(_capturing)); + nrex_array<nrex_node_group*> stack; + stack.push(root); + _root = root; + + for (const nrex_char* c = pattern; c[0] != '\0'; ++c) + { + if (c[0] == '(') + { + if (c[1] == '?') + { + if (c[2] == ':') + { + c = &c[2]; + nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::NonCapture)); + stack.top()->add_child(group); + stack.push(group); + } + else if (c[2] == '!' || c[2] == '=') + { + c = &c[2]; + nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::LookAhead)); + group->negate = (c[0] == '!'); + stack.top()->add_child(group); + stack.push(group); + } + else if (c[2] == '<' && (c[3] == '!' || c[3] == '=')) + { + c = &c[3]; + nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::LookBehind)); + group->negate = (c[0] == '!'); + stack.top()->add_child(group); + stack.push(group); + } + else + { + NREX_COMPILE_ERROR("unrecognised qualifier for group"); + } + } + else if ((!extended && _capturing < 9) || (extended && _capturing < 99)) + { + nrex_node_group* group = NREX_NEW(nrex_node_group(++_capturing)); + stack.top()->add_child(group); + stack.push(group); + } + else + { + nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::NonCapture)); + stack.top()->add_child(group); + stack.push(group); + } + } + else if (c[0] == ')') + { + if (stack.size() > 1) + { + stack.pop(); + } + else + { + NREX_COMPILE_ERROR("unexpected ')'"); + } + } + else if (c[0] == '[') + { + nrex_node_group* group = NREX_NEW(nrex_node_group(nrex_node_group::Bracket)); + stack.top()->add_child(group); + if (c[1] == '^') + { + group->negate = true; + ++c; + } + bool first_child = true; + nrex_char previous_child; + bool previous_child_single = false; + while (true) + { + group->add_childset(); + ++c; + if (c[0] == '\0') + { + NREX_COMPILE_ERROR("unclosed bracket expression '['"); + } + if (c[0] == '[' && c[1] == ':') + { + const nrex_char* d = &c[2]; + nrex_class_type cls = nrex_parse_class(&d); + if (cls != nrex_class_none) + { + c = d; + group->add_child(NREX_NEW(nrex_node_class(cls))); + previous_child_single = false; + } + else + { + group->add_child(NREX_NEW(nrex_node_char('['))); + previous_child = '['; + previous_child_single = true; + } + } + else if (c[0] == ']' && !first_child) + { + break; + } + else if (c[0] == '\\') + { + if (nrex_is_shorthand(c[1])) + { + group->add_child(NREX_NEW(nrex_node_shorthand(c[1]))); + ++c; + previous_child_single = false; + } + else + { + const nrex_char* d = c; + nrex_char unescaped = nrex_unescape(d); + if (c == d) + { + NREX_COMPILE_ERROR("invalid escape token"); + } + group->add_child(NREX_NEW(nrex_node_char(unescaped))); + c = d; + previous_child = unescaped; + previous_child_single = true; + } + } + else if (previous_child_single && c[0] == '-') + { + bool is_range = false; + nrex_char next; + if (c[1] != '\0' && c[1] != ']') + { + if (c[1] == '\\') + { + const nrex_char* d = ++c; + next = nrex_unescape(d); + if (c == d) + { + NREX_COMPILE_ERROR("invalid escape token in range"); + } + } + else + { + next = c[1]; + ++c; + } + is_range = true; + } + if (is_range) + { + if (next < previous_child) + { + NREX_COMPILE_ERROR("text range out of order"); + } + group->pop_back(); + group->add_child(NREX_NEW(nrex_node_range(previous_child, next))); + previous_child_single = false; + } + else + { + group->add_child(NREX_NEW(nrex_node_char(c[0]))); + previous_child = c[0]; + previous_child_single = true; + } + } + else + { + group->add_child(NREX_NEW(nrex_node_char(c[0]))); + previous_child = c[0]; + previous_child_single = true; + } + first_child = false; + } + } + else if (nrex_is_quantifier(c[0])) + { + if (stack.top()->back == NULL || !stack.top()->back->quantifiable) + { + if (c[0] == '{') + { + stack.top()->add_child(NREX_NEW(nrex_node_char('{'))); + continue; + } + NREX_COMPILE_ERROR("element not quantifiable"); + } + int min = 0; + int max = -1; + bool valid_quantifier = true; + if (c[0] == '?') + { + min = 0; + max = 1; + } + else if (c[0] == '+') + { + min = 1; + max = -1; + } + else if (c[0] == '*') + { + min = 0; + max = -1; + } + else if (c[0] == '{') + { + bool max_set = false; + const nrex_char* d = c; + while (true) + { + ++d; + if (d[0] == '\0') + { + valid_quantifier = false; + break; + } + else if (d[0] == '}') + { + break; + } + else if (d[0] == ',') + { + max_set = true; + continue; + } + else if (d[0] < '0' || '9' < d[0]) + { + valid_quantifier = false; + break; + } + if (max_set) + { + if (max < 0) + { + max = int(d[0] - '0'); + } + else + { + max = max * 10 + int(d[0] - '0'); + } + } + else + { + min = min * 10 + int(d[0] - '0'); + } + } + if (!max_set) + { + max = min; + } + if (valid_quantifier) + { + c = d; + } + } + if (valid_quantifier) + { + nrex_node_quantifier* quant = NREX_NEW(nrex_node_quantifier(min, max)); + if (min == max) + { + if (stack.top()->back->length >= 0) + { + quant->length = max * stack.top()->back->length; + } + } + else + { + if (nrex_has_lookbehind(stack)) + { + NREX_COMPILE_ERROR("variable length quantifiers inside lookbehind not supported"); + } + } + quant->child = stack.top()->swap_back(quant); + quant->child->previous = NULL; + quant->child->next = NULL; + quant->child->parent = quant; + if (c[1] == '?') + { + quant->greedy = false; + ++c; + } + } + else + { + stack.top()->add_child(NREX_NEW(nrex_node_char(c[0]))); + } + } + else if (c[0] == '|') + { + if (nrex_has_lookbehind(stack)) + { + NREX_COMPILE_ERROR("alternations inside lookbehind not supported"); + } + stack.top()->add_childset(); + } + else if (c[0] == '^' || c[0] == '$') + { + stack.top()->add_child(NREX_NEW(nrex_node_anchor((c[0] == '$')))); + } + else if (c[0] == '.') + { + stack.top()->add_child(NREX_NEW(nrex_node_shorthand('.'))); + } + else if (c[0] == '\\') + { + if (nrex_is_shorthand(c[1])) + { + stack.top()->add_child(NREX_NEW(nrex_node_shorthand(c[1]))); + ++c; + } + else if ('1' <= c[1] && c[1] <= '9') + { + int ref = 0; + if (extended && '0' <= c[2] && c[2] <= '9') + { + ref = int(c[1] - '0') * 10 + int(c[2] - '0'); + c = &c[2]; + } + else + { + ref = int(c[1] - '0'); + ++c; + } + if (ref > _capturing) + { + NREX_COMPILE_ERROR("backreference to non-existent capture"); + } + if (nrex_has_lookbehind(stack)) + { + NREX_COMPILE_ERROR("backreferences inside lookbehind not supported"); + } + stack.top()->add_child(NREX_NEW(nrex_node_backreference(ref))); + } + else if (c[1] == 'b' || c[1] == 'B') + { + stack.top()->add_child(NREX_NEW(nrex_node_word_boundary(c[1] == 'B'))); + ++c; + } + else + { + const nrex_char* d = c; + nrex_char unescaped = nrex_unescape(d); + if (c == d) + { + NREX_COMPILE_ERROR("invalid escape token"); + } + stack.top()->add_child(NREX_NEW(nrex_node_char(unescaped))); + c = d; + } + } + else + { + stack.top()->add_child(NREX_NEW(nrex_node_char(c[0]))); + } + } + if (stack.size() > 1) + { + NREX_COMPILE_ERROR("unclosed group '('"); + } + return true; +} + +bool nrex::match(const nrex_char* str, nrex_result* captures, int offset, int end) const +{ + nrex_search s(str, captures); + if (end >= offset) + { + s.end = end; + } + else + { + s.end = NREX_STRLEN(str); + } + for (int i = offset; i < s.end; ++i) + { + for (int c = 0; c <= _capturing; ++c) + { + captures[c].start = 0; + captures[c].length = 0; + } + if (_root->test(&s, i) >= 0) + { + return true; + } + } + return false; +} |