summaryrefslogtreecommitdiff
path: root/thirdparty/pcre2/src/pcre2_dfa_match.c
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/pcre2/src/pcre2_dfa_match.c')
-rw-r--r--thirdparty/pcre2/src/pcre2_dfa_match.c271
1 files changed, 230 insertions, 41 deletions
diff --git a/thirdparty/pcre2/src/pcre2_dfa_match.c b/thirdparty/pcre2/src/pcre2_dfa_match.c
index c6184ff5e9..9b43237da7 100644
--- a/thirdparty/pcre2/src/pcre2_dfa_match.c
+++ b/thirdparty/pcre2/src/pcre2_dfa_match.c
@@ -181,7 +181,8 @@ static const uint8_t coptable[] = {
0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
- 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
+ 0, 0, /* COMMIT, COMMIT_ARG */
+ 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
};
@@ -254,7 +255,8 @@ static const uint8_t poptable[] = {
0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
- 0, 0, 0, 0, /* COMMIT, FAIL, ACCEPT, ASSERT_ACCEPT */
+ 0, 0, /* COMMIT, COMMIT_ARG */
+ 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
0, 0, 0 /* CLOSE, SKIPZERO, DEFINE */
};
@@ -292,6 +294,35 @@ typedef struct stateblock {
#define INTS_PER_STATEBLOCK (int)(sizeof(stateblock)/sizeof(int))
+/* Before version 10.32 the recursive calls of internal_dfa_match() were passed
+local working space and output vectors that were created on the stack. This has
+caused issues for some patterns, especially in small-stack environments such as
+Windows. A new scheme is now in use which sets up a vector on the stack, but if
+this is too small, heap memory is used, up to the heap_limit. The main
+parameters are all numbers of ints because the workspace is a vector of ints.
+
+The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
+defined in pcre2_internal.h so as to be available to pcre2test when it is
+finding the minimum heap requirement for a match. */
+
+#define OVEC_UNIT (sizeof(PCRE2_SIZE)/sizeof(int))
+
+#define RWS_BASE_SIZE (DFA_START_RWS_SIZE/sizeof(int)) /* Stack vector */
+#define RWS_RSIZE 1000 /* Work size for recursion */
+#define RWS_OVEC_RSIZE (1000*OVEC_UNIT) /* Ovector for recursion */
+#define RWS_OVEC_OSIZE (2*OVEC_UNIT) /* Ovector in other cases */
+
+/* This structure is at the start of each workspace block. */
+
+typedef struct RWS_anchor {
+ struct RWS_anchor *next;
+ unsigned int size; /* Number of ints */
+ unsigned int free; /* Number of ints */
+} RWS_anchor;
+
+#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))
+
+
/*************************************************
* Process a callout *
@@ -354,6 +385,61 @@ return (mb->callout)(cb, mb->callout_data);
/*************************************************
+* Expand local workspace memory *
+*************************************************/
+
+/* This function is called when internal_dfa_match() is about to be called
+recursively and there is insufficient working space left in the current
+workspace block. If there's an existing next block, use it; otherwise get a new
+block unless the heap limit is reached.
+
+Arguments:
+ rwsptr pointer to block pointer (updated)
+ ovecsize space needed for an ovector
+ mb the match block
+
+Returns: 0 rwsptr has been updated
+ !0 an error code
+*/
+
+static int
+more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
+{
+RWS_anchor *rws = *rwsptr;
+RWS_anchor *new;
+
+if (rws->next != NULL)
+ {
+ new = rws->next;
+ }
+
+/* All sizes are in units of sizeof(int), except for mb->heaplimit, which is in
+kibibytes. */
+
+else
+ {
+ unsigned int newsize = rws->size * 2;
+ unsigned int heapleft = (unsigned int)
+ (((1024/sizeof(int))*mb->heap_limit - mb->heap_used));
+ if (newsize > heapleft) newsize = heapleft;
+ if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
+ return PCRE2_ERROR_HEAPLIMIT;
+ new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
+ if (new == NULL) return PCRE2_ERROR_NOMEMORY;
+ mb->heap_used += newsize;
+ new->next = NULL;
+ new->size = newsize;
+ rws->next = new;
+ }
+
+new->free = new->size - RWS_ANCHOR_SIZE;
+*rwsptr = new;
+return 0;
+}
+
+
+
+/*************************************************
* Match a Regular Expression - DFA engine *
*************************************************/
@@ -431,7 +517,8 @@ internal_dfa_match(
uint32_t offsetcount,
int *workspace,
int wscount,
- uint32_t rlevel)
+ uint32_t rlevel,
+ int *RWS)
{
stateblock *active_states, *new_states, *temp_states;
stateblock *next_active_state, *next_new_state;
@@ -788,7 +875,7 @@ for (;;)
else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
match_count = 0;
count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
- if (count > 0) memmove(offsets + 2, offsets,
+ if (count > 0) (void)memmove(offsets + 2, offsets,
(size_t)count * sizeof(PCRE2_SIZE));
if (offsetcount >= 2)
{
@@ -2587,10 +2674,22 @@ for (;;)
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
{
- PCRE2_SPTR endasscode = code + GET(code, 1);
- PCRE2_SIZE local_offsets[2];
int rc;
- int local_workspace[1000];
+ int *local_workspace;
+ PCRE2_SIZE *local_offsets;
+ PCRE2_SPTR endasscode = code + GET(code, 1);
+ RWS_anchor *rws = (RWS_anchor *)RWS;
+
+ if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
+ {
+ rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
+ if (rc != 0) return rc;
+ RWS = (int *)rws;
+ }
+
+ local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
+ local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
+ rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
@@ -2600,10 +2699,13 @@ for (;;)
ptr, /* where we currently are */
(PCRE2_SIZE)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
- sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
+ RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
local_workspace, /* workspace vector */
- sizeof(local_workspace)/sizeof(int), /* size of same */
- rlevel); /* function recursion level */
+ RWS_RSIZE, /* size of same */
+ rlevel, /* function recursion level */
+ RWS); /* recursion workspace */
+
+ rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
@@ -2615,8 +2717,6 @@ for (;;)
case OP_COND:
case OP_SCOND:
{
- PCRE2_SIZE local_offsets[1000];
- int local_workspace[1000];
int codelink = (int)GET(code, 1);
PCRE2_UCHAR condcode;
@@ -2673,8 +2773,22 @@ for (;;)
else
{
int rc;
+ int *local_workspace;
+ PCRE2_SIZE *local_offsets;
PCRE2_SPTR asscode = code + LINK_SIZE + 1;
PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
+ RWS_anchor *rws = (RWS_anchor *)RWS;
+
+ if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
+ {
+ rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
+ if (rc != 0) return rc;
+ RWS = (int *)rws;
+ }
+
+ local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
+ local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
+ rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
@@ -2684,10 +2798,13 @@ for (;;)
ptr, /* where we currently are */
(PCRE2_SIZE)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
- sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
+ RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
local_workspace, /* workspace vector */
- sizeof(local_workspace)/sizeof(int), /* size of same */
- rlevel); /* function recursion level */
+ RWS_RSIZE, /* size of same */
+ rlevel, /* function recursion level */
+ RWS); /* recursion workspace */
+
+ rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
if ((rc >= 0) ==
@@ -2702,13 +2819,25 @@ for (;;)
/*-----------------------------------------------------------------*/
case OP_RECURSE:
{
+ int rc;
+ int *local_workspace;
+ PCRE2_SIZE *local_offsets;
+ RWS_anchor *rws = (RWS_anchor *)RWS;
dfa_recursion_info *ri;
- PCRE2_SIZE local_offsets[1000];
- int local_workspace[1000];
PCRE2_SPTR callpat = start_code + GET(code, 1);
uint32_t recno = (callpat == mb->start_code)? 0 :
GET2(callpat, 1 + LINK_SIZE);
- int rc;
+
+ if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
+ {
+ rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
+ if (rc != 0) return rc;
+ RWS = (int *)rws;
+ }
+
+ local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
+ local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
+ rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
/* Check for repeating a recursion without advancing the subject
pointer. This should catch convoluted mutual recursions. (Some simple
@@ -2732,11 +2861,13 @@ for (;;)
ptr, /* where we currently are */
(PCRE2_SIZE)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
- sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
+ RWS_OVEC_RSIZE/OVEC_UNIT, /* size of same */
local_workspace, /* workspace vector */
- sizeof(local_workspace)/sizeof(int), /* size of same */
- rlevel); /* function recursion level */
+ RWS_RSIZE, /* size of same */
+ rlevel, /* function recursion level */
+ RWS); /* recursion workspace */
+ rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
mb->recursive = new_recursive.prevrec; /* Done this recursion */
/* Ran out of internal offsets */
@@ -2782,10 +2913,25 @@ for (;;)
case OP_SCBRAPOS:
case OP_BRAPOSZERO:
{
+ int rc;
+ int *local_workspace;
+ PCRE2_SIZE *local_offsets;
PCRE2_SIZE charcount, matched_count;
PCRE2_SPTR local_ptr = ptr;
+ RWS_anchor *rws = (RWS_anchor *)RWS;
BOOL allow_zero;
+ if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
+ {
+ rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
+ if (rc != 0) return rc;
+ RWS = (int *)rws;
+ }
+
+ local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
+ local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
+ rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
+
if (codevalue == OP_BRAPOSZERO)
{
allow_zero = TRUE;
@@ -2798,19 +2944,17 @@ for (;;)
for (matched_count = 0;; matched_count++)
{
- PCRE2_SIZE local_offsets[2];
- int local_workspace[1000];
-
- int rc = internal_dfa_match(
+ rc = internal_dfa_match(
mb, /* fixed match data */
code, /* this subexpression's code */
local_ptr, /* where we currently are */
(PCRE2_SIZE)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
- sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
+ RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
local_workspace, /* workspace vector */
- sizeof(local_workspace)/sizeof(int), /* size of same */
- rlevel); /* function recursion level */
+ RWS_RSIZE, /* size of same */
+ rlevel, /* function recursion level */
+ RWS); /* recursion workspace */
/* Failed to match */
@@ -2827,6 +2971,8 @@ for (;;)
local_ptr += charcount; /* Advance temporary position ptr */
}
+ rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
+
/* At this point we have matched the subpattern matched_count
times, and local_ptr is pointing to the character after the end of the
last match. */
@@ -2869,19 +3015,35 @@ for (;;)
/*-----------------------------------------------------------------*/
case OP_ONCE:
{
- PCRE2_SIZE local_offsets[2];
- int local_workspace[1000];
+ int rc;
+ int *local_workspace;
+ PCRE2_SIZE *local_offsets;
+ RWS_anchor *rws = (RWS_anchor *)RWS;
- int rc = internal_dfa_match(
+ if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
+ {
+ rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
+ if (rc != 0) return rc;
+ RWS = (int *)rws;
+ }
+
+ local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
+ local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
+ rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
+
+ rc = internal_dfa_match(
mb, /* fixed match data */
code, /* this subexpression's code */
ptr, /* where we currently are */
(PCRE2_SIZE)(ptr - start_subject), /* start offset */
local_offsets, /* offset vector */
- sizeof(local_offsets)/sizeof(PCRE2_SIZE), /* size of same */
+ RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
local_workspace, /* workspace vector */
- sizeof(local_workspace)/sizeof(int), /* size of same */
- rlevel); /* function recursion level */
+ RWS_RSIZE, /* size of same */
+ rlevel, /* function recursion level */
+ RWS); /* recursion workspace */
+
+ rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
if (rc >= 0)
{
@@ -3063,6 +3225,7 @@ pcre2_dfa_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length,
PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
{
+int rc;
const pcre2_real_code *re = (const pcre2_real_code *)code;
PCRE2_SPTR start_match;
@@ -3071,9 +3234,9 @@ PCRE2_SPTR bumpalong_limit;
PCRE2_SPTR req_cu_ptr;
BOOL utf, anchored, startline, firstline;
-
BOOL has_first_cu = FALSE;
BOOL has_req_cu = FALSE;
+
PCRE2_UCHAR first_cu = 0;
PCRE2_UCHAR first_cu2 = 0;
PCRE2_UCHAR req_cu = 0;
@@ -3088,6 +3251,17 @@ pcre2_callout_block cb;
dfa_match_block actual_match_block;
dfa_match_block *mb = &actual_match_block;
+/* Set up a starting block of memory for use during recursive calls to
+internal_dfa_match(). By putting this on the stack, it minimizes resource use
+in the case when it is not needed. If this is too small, more memory is
+obtained from the heap. At the start of each block is an anchor structure.*/
+
+int base_recursion_workspace[RWS_BASE_SIZE];
+RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
+rws->next = NULL;
+rws->size = RWS_BASE_SIZE;
+rws->free = RWS_BASE_SIZE - RWS_ANCHOR_SIZE;
+
/* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated
subject string. */
@@ -3184,6 +3358,7 @@ if (mcontext == NULL)
mb->memctl = re->memctl;
mb->match_limit = PRIV(default_match_context).match_limit;
mb->match_limit_depth = PRIV(default_match_context).depth_limit;
+ mb->heap_limit = PRIV(default_match_context).heap_limit;
}
else
{
@@ -3198,6 +3373,7 @@ else
mb->memctl = mcontext->memctl;
mb->match_limit = mcontext->match_limit;
mb->match_limit_depth = mcontext->depth_limit;
+ mb->heap_limit = mcontext->heap_limit;
}
if (mb->match_limit > re->limit_match)
@@ -3206,6 +3382,9 @@ if (mb->match_limit > re->limit_match)
if (mb->match_limit_depth > re->limit_depth)
mb->match_limit_depth = re->limit_depth;
+if (mb->heap_limit > re->limit_heap)
+ mb->heap_limit = re->limit_heap;
+
mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
re->name_count * re->name_entry_size;
mb->tables = re->tables;
@@ -3215,6 +3394,7 @@ mb->start_offset = start_offset;
mb->moptions = options;
mb->poptions = re->overall_options;
mb->match_call_count = 0;
+mb->heap_used = 0;
/* Process the \R and newline settings. */
@@ -3351,8 +3531,6 @@ a match. */
for (;;)
{
- int rc;
-
/* ----------------- Start of match optimizations ---------------- */
/* There are some optimizations that avoid running the match if a known
@@ -3544,7 +3722,7 @@ for (;;)
in characters, we treat it as code units to avoid spending too much time
in this optimization. */
- if (end_subject - start_match < re->minlength) return PCRE2_ERROR_NOMATCH;
+ if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
/* If req_cu is set, we know that that code unit must appear in the
subject for the match to succeed. If the first code unit is set, req_cu
@@ -3621,7 +3799,8 @@ for (;;)
(uint32_t)match_data->oveccount * 2, /* actual size of same */
workspace, /* workspace vector */
(int)wscount, /* size of same */
- 0); /* function recurse level */
+ 0, /* function recurse level */
+ base_recursion_workspace); /* initial workspace for recursion */
/* Anything other than "no match" means we are done, always; otherwise, carry
on only if not anchored. */
@@ -3637,7 +3816,7 @@ for (;;)
match_data->rightchar = (PCRE2_SIZE)( mb->last_used_ptr - subject);
match_data->startchar = (PCRE2_SIZE)(start_match - subject);
match_data->rc = rc;
- return rc;
+ goto EXIT;
}
/* Advance to the next subject character unless we are at the end of a line
@@ -3668,8 +3847,18 @@ for (;;)
} /* "Bumpalong" loop */
+NOMATCH_EXIT:
+rc = PCRE2_ERROR_NOMATCH;
+
+EXIT:
+while (rws->next != NULL)
+ {
+ RWS_anchor *next = rws->next;
+ rws->next = next->next;
+ mb->memctl.free(next, mb->memctl.memory_data);
+ }
-return PCRE2_ERROR_NOMATCH;
+return rc;
}
/* End of pcre2_dfa_match.c */