diff options
Diffstat (limited to 'thirdparty/pcre2/src/pcre2_dfa_match.c')
-rw-r--r-- | thirdparty/pcre2/src/pcre2_dfa_match.c | 105 |
1 files changed, 68 insertions, 37 deletions
diff --git a/thirdparty/pcre2/src/pcre2_dfa_match.c b/thirdparty/pcre2/src/pcre2_dfa_match.c index 7d8ffe8a3e..060dc7669a 100644 --- a/thirdparty/pcre2/src/pcre2_dfa_match.c +++ b/thirdparty/pcre2/src/pcre2_dfa_match.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2019 University of Cambridge + New API code Copyright (c) 2016-2021 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -548,6 +548,7 @@ PCRE2_SPTR start_code = mb->start_code; #ifdef SUPPORT_UNICODE BOOL utf = (mb->poptions & PCRE2_UTF) != 0; +BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0; #else BOOL utf = FALSE; #endif @@ -2190,7 +2191,7 @@ for (;;) if (clen == 0) break; #ifdef SUPPORT_UNICODE - if (utf) + if (utf_or_ucp) { if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else { @@ -2204,7 +2205,7 @@ for (;;) } else #endif /* SUPPORT_UNICODE */ - /* Not UTF mode */ + /* Not UTF or UCP mode */ { if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d)) { ADD_NEW(state_offset + 2, 0); } @@ -2339,7 +2340,7 @@ for (;;) { uint32_t otherd; #ifdef SUPPORT_UNICODE - if (utf && d >= 128) + if (utf_or_ucp && d >= 128) otherd = UCD_OTHERCASE(d); else #endif /* SUPPORT_UNICODE */ @@ -2374,7 +2375,7 @@ for (;;) if (caseless) { #ifdef SUPPORT_UNICODE - if (utf && d >= 128) + if (utf_or_ucp && d >= 128) otherd = UCD_OTHERCASE(d); else #endif /* SUPPORT_UNICODE */ @@ -2417,7 +2418,7 @@ for (;;) if (caseless) { #ifdef SUPPORT_UNICODE - if (utf && d >= 128) + if (utf_or_ucp && d >= 128) otherd = UCD_OTHERCASE(d); else #endif /* SUPPORT_UNICODE */ @@ -2458,7 +2459,7 @@ for (;;) if (caseless) { #ifdef SUPPORT_UNICODE - if (utf && d >= 128) + if (utf_or_ucp && d >= 128) otherd = UCD_OTHERCASE(d); else #endif /* SUPPORT_UNICODE */ @@ -2491,7 +2492,7 @@ for (;;) if (caseless) { #ifdef SUPPORT_UNICODE - if (utf && d >= 128) + if (utf_or_ucp && d >= 128) otherd = UCD_OTHERCASE(d); else #endif /* SUPPORT_UNICODE */ @@ -2531,7 +2532,7 @@ for (;;) if (caseless) { #ifdef SUPPORT_UNICODE - if (utf && d >= 128) + if (utf_or_ucp && d >= 128) otherd = UCD_OTHERCASE(d); else #endif /* SUPPORT_UNICODE */ @@ -3255,8 +3256,8 @@ BOOL has_first_cu = FALSE; BOOL has_req_cu = FALSE; #if PCRE2_CODE_UNIT_WIDTH == 8 -BOOL memchr_not_found_first_cu = FALSE; -BOOL memchr_not_found_first_cu2 = FALSE; +PCRE2_SPTR memchr_found_first_cu = NULL; +PCRE2_SPTR memchr_found_first_cu2 = NULL; #endif PCRE2_UCHAR first_cu = 0; @@ -3526,10 +3527,15 @@ if ((re->flags & PCRE2_FIRSTSET) != 0) if ((re->flags & PCRE2_FIRSTCASELESS) != 0) { first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 - if (utf && first_cu > 127) +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0) + first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu); +#else + if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0)) first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu); #endif +#endif /* SUPPORT_UNICODE */ } } else @@ -3545,9 +3551,15 @@ if ((re->flags & PCRE2_LASTSET) != 0) if ((re->flags & PCRE2_LASTCASELESS) != 0) { req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 - if (utf && req_cu > 127) req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu); +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0) + req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu); +#else + if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0)) + req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu); #endif +#endif /* SUPPORT_UNICODE */ } } @@ -3636,13 +3648,7 @@ for (;;) } } - /* Not anchored. Advance to a unique first code unit if there is one. In - 8-bit mode, the use of memchr() gives a big speed up, even though we have - to call it twice in caseless mode, in order to find the earliest occurrence - of the character in either of its cases. If a call to memchr() that - searches the rest of the subject fails to find one case, remember that in - order not to keep on repeating the search. This can make a huge difference - when the strings are very long and only one case is present. */ + /* Not anchored. Advance to a unique first code unit if there is one. */ else { @@ -3650,43 +3656,68 @@ for (;;) { if (first_cu != first_cu2) /* Caseless */ { + /* In 16-bit and 32_bit modes we have to do our own search, so can + look for both cases at once. */ + #if PCRE2_CODE_UNIT_WIDTH != 8 PCRE2_UCHAR smc; while (start_match < end_subject && (smc = UCHAR21TEST(start_match)) != first_cu && - smc != first_cu2) + smc != first_cu2) start_match++; +#else + /* In 8-bit mode, the use of memchr() gives a big speed up, even + though we have to call it twice in order to find the earliest + occurrence of the code unit in either of its cases. Caching is used + to remember the positions of previously found code units. This can + make a huge difference when the strings are very long and only one + case is actually present. */ -#else /* 8-bit code units */ PCRE2_SPTR pp1 = NULL; PCRE2_SPTR pp2 = NULL; - PCRE2_SIZE cu2size = end_subject - start_match; + PCRE2_SIZE searchlength = end_subject - start_match; - if (!memchr_not_found_first_cu) + /* If we haven't got a previously found position for first_cu, or if + the current starting position is later, we need to do a search. If + the code unit is not found, set it to the end. */ + + if (memchr_found_first_cu == NULL || + start_match > memchr_found_first_cu) { - pp1 = memchr(start_match, first_cu, end_subject - start_match); - if (pp1 == NULL) memchr_not_found_first_cu = TRUE; - else cu2size = pp1 - start_match; + pp1 = memchr(start_match, first_cu, searchlength); + memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1; } - /* If pp1 is not NULL, we have arranged to search only as far as pp1, - to see if the other case is earlier, so we can set "not found" only - when both searches have returned NULL. */ + /* If the start is before a previously found position, use the + previous position, or NULL if a previous search failed. */ + + else pp1 = (memchr_found_first_cu == end_subject)? NULL : + memchr_found_first_cu; - if (!memchr_not_found_first_cu2) + /* Do the same thing for the other case. */ + + if (memchr_found_first_cu2 == NULL || + start_match > memchr_found_first_cu2) { - pp2 = memchr(start_match, first_cu2, cu2size); - memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL); + pp2 = memchr(start_match, first_cu2, searchlength); + memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2; } + else pp2 = (memchr_found_first_cu2 == end_subject)? NULL : + memchr_found_first_cu2; + + /* Set the start to the end of the subject if neither case was found. + Otherwise, use the earlier found point. */ + if (pp1 == NULL) start_match = (pp2 == NULL)? end_subject : pp2; else start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2; -#endif + +#endif /* 8-bit handling */ } - /* The caseful case */ + /* The caseful case is much simpler. */ else { |