summaryrefslogtreecommitdiff
path: root/thirdparty/pcre2/src/pcre2_dfa_match.c
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/pcre2/src/pcre2_dfa_match.c')
-rw-r--r--thirdparty/pcre2/src/pcre2_dfa_match.c105
1 files changed, 68 insertions, 37 deletions
diff --git a/thirdparty/pcre2/src/pcre2_dfa_match.c b/thirdparty/pcre2/src/pcre2_dfa_match.c
index 7d8ffe8a3e..060dc7669a 100644
--- a/thirdparty/pcre2/src/pcre2_dfa_match.c
+++ b/thirdparty/pcre2/src/pcre2_dfa_match.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2016-2019 University of Cambridge
+ New API code Copyright (c) 2016-2021 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -548,6 +548,7 @@ PCRE2_SPTR start_code = mb->start_code;
#ifdef SUPPORT_UNICODE
BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
+BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
#else
BOOL utf = FALSE;
#endif
@@ -2190,7 +2191,7 @@ for (;;)
if (clen == 0) break;
#ifdef SUPPORT_UNICODE
- if (utf)
+ if (utf_or_ucp)
{
if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
{
@@ -2204,7 +2205,7 @@ for (;;)
}
else
#endif /* SUPPORT_UNICODE */
- /* Not UTF mode */
+ /* Not UTF or UCP mode */
{
if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
{ ADD_NEW(state_offset + 2, 0); }
@@ -2339,7 +2340,7 @@ for (;;)
{
uint32_t otherd;
#ifdef SUPPORT_UNICODE
- if (utf && d >= 128)
+ if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@@ -2374,7 +2375,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
- if (utf && d >= 128)
+ if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@@ -2417,7 +2418,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
- if (utf && d >= 128)
+ if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@@ -2458,7 +2459,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
- if (utf && d >= 128)
+ if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@@ -2491,7 +2492,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
- if (utf && d >= 128)
+ if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@@ -2531,7 +2532,7 @@ for (;;)
if (caseless)
{
#ifdef SUPPORT_UNICODE
- if (utf && d >= 128)
+ if (utf_or_ucp && d >= 128)
otherd = UCD_OTHERCASE(d);
else
#endif /* SUPPORT_UNICODE */
@@ -3255,8 +3256,8 @@ BOOL has_first_cu = FALSE;
BOOL has_req_cu = FALSE;
#if PCRE2_CODE_UNIT_WIDTH == 8
-BOOL memchr_not_found_first_cu = FALSE;
-BOOL memchr_not_found_first_cu2 = FALSE;
+PCRE2_SPTR memchr_found_first_cu = NULL;
+PCRE2_SPTR memchr_found_first_cu2 = NULL;
#endif
PCRE2_UCHAR first_cu = 0;
@@ -3526,10 +3527,15 @@ if ((re->flags & PCRE2_FIRSTSET) != 0)
if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
{
first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
- if (utf && first_cu > 127)
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
+ first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
+#else
+ if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
#endif
+#endif /* SUPPORT_UNICODE */
}
}
else
@@ -3545,9 +3551,15 @@ if ((re->flags & PCRE2_LASTSET) != 0)
if ((re->flags & PCRE2_LASTCASELESS) != 0)
{
req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
-#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8
- if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
+#ifdef SUPPORT_UNICODE
+#if PCRE2_CODE_UNIT_WIDTH == 8
+ if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
+ req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
+#else
+ if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
+ req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
#endif
+#endif /* SUPPORT_UNICODE */
}
}
@@ -3636,13 +3648,7 @@ for (;;)
}
}
- /* Not anchored. Advance to a unique first code unit if there is one. In
- 8-bit mode, the use of memchr() gives a big speed up, even though we have
- to call it twice in caseless mode, in order to find the earliest occurrence
- of the character in either of its cases. If a call to memchr() that
- searches the rest of the subject fails to find one case, remember that in
- order not to keep on repeating the search. This can make a huge difference
- when the strings are very long and only one case is present. */
+ /* Not anchored. Advance to a unique first code unit if there is one. */
else
{
@@ -3650,43 +3656,68 @@ for (;;)
{
if (first_cu != first_cu2) /* Caseless */
{
+ /* In 16-bit and 32_bit modes we have to do our own search, so can
+ look for both cases at once. */
+
#if PCRE2_CODE_UNIT_WIDTH != 8
PCRE2_UCHAR smc;
while (start_match < end_subject &&
(smc = UCHAR21TEST(start_match)) != first_cu &&
- smc != first_cu2)
+ smc != first_cu2)
start_match++;
+#else
+ /* In 8-bit mode, the use of memchr() gives a big speed up, even
+ though we have to call it twice in order to find the earliest
+ occurrence of the code unit in either of its cases. Caching is used
+ to remember the positions of previously found code units. This can
+ make a huge difference when the strings are very long and only one
+ case is actually present. */
-#else /* 8-bit code units */
PCRE2_SPTR pp1 = NULL;
PCRE2_SPTR pp2 = NULL;
- PCRE2_SIZE cu2size = end_subject - start_match;
+ PCRE2_SIZE searchlength = end_subject - start_match;
- if (!memchr_not_found_first_cu)
+ /* If we haven't got a previously found position for first_cu, or if
+ the current starting position is later, we need to do a search. If
+ the code unit is not found, set it to the end. */
+
+ if (memchr_found_first_cu == NULL ||
+ start_match > memchr_found_first_cu)
{
- pp1 = memchr(start_match, first_cu, end_subject - start_match);
- if (pp1 == NULL) memchr_not_found_first_cu = TRUE;
- else cu2size = pp1 - start_match;
+ pp1 = memchr(start_match, first_cu, searchlength);
+ memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
}
- /* If pp1 is not NULL, we have arranged to search only as far as pp1,
- to see if the other case is earlier, so we can set "not found" only
- when both searches have returned NULL. */
+ /* If the start is before a previously found position, use the
+ previous position, or NULL if a previous search failed. */
+
+ else pp1 = (memchr_found_first_cu == end_subject)? NULL :
+ memchr_found_first_cu;
- if (!memchr_not_found_first_cu2)
+ /* Do the same thing for the other case. */
+
+ if (memchr_found_first_cu2 == NULL ||
+ start_match > memchr_found_first_cu2)
{
- pp2 = memchr(start_match, first_cu2, cu2size);
- memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL);
+ pp2 = memchr(start_match, first_cu2, searchlength);
+ memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
}
+ else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
+ memchr_found_first_cu2;
+
+ /* Set the start to the end of the subject if neither case was found.
+ Otherwise, use the earlier found point. */
+
if (pp1 == NULL)
start_match = (pp2 == NULL)? end_subject : pp2;
else
start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
-#endif
+
+#endif /* 8-bit handling */
}
- /* The caseful case */
+ /* The caseful case is much simpler. */
else
{