diff options
Diffstat (limited to 'thirdparty/pcre2/src/pcre2_match.c')
-rw-r--r-- | thirdparty/pcre2/src/pcre2_match.c | 237 |
1 files changed, 174 insertions, 63 deletions
diff --git a/thirdparty/pcre2/src/pcre2_match.c b/thirdparty/pcre2/src/pcre2_match.c index 48e7b9dbb2..f28cdbb47a 100644 --- a/thirdparty/pcre2/src/pcre2_match.c +++ b/thirdparty/pcre2/src/pcre2_match.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2015-2019 University of Cambridge + New API code Copyright (c) 2015-2021 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -381,8 +381,12 @@ length = Fovector[offset+1] - Fovector[offset]; if (caseless) { #if defined SUPPORT_UNICODE - if ((mb->poptions & PCRE2_UTF) != 0) + BOOL utf = (mb->poptions & PCRE2_UTF) != 0; + + if (utf || (mb->poptions & PCRE2_UCP) != 0) { + PCRE2_SPTR endptr = p + length; + /* Match characters up to the end of the reference. NOTE: the number of code units matched may differ, because in UTF-8 there are some characters whose upper and lower case codes have different numbers of bytes. For @@ -390,16 +394,25 @@ if (caseless) bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a sequence of two of the latter. It is important, therefore, to check the length along the reference, not along the subject (earlier code did this - wrong). */ + wrong). UCP uses Unicode properties but without UTF encoding. 
*/ - PCRE2_SPTR endptr = p + length; while (p < endptr) { uint32_t c, d; const ucd_record *ur; if (eptr >= mb->end_subject) return 1; /* Partial match */ - GETCHARINC(c, eptr); - GETCHARINC(d, p); + + if (utf) + { + GETCHARINC(c, eptr); + GETCHARINC(d, p); + } + else + { + c = *eptr++; + d = *p++; + } + ur = GET_UCD(d); if (c != d && c != (uint32_t)((int)d + ur->other_case)) { @@ -415,7 +428,7 @@ if (caseless) else #endif - /* Not in UTF mode */ + /* Not in UTF or UCP mode */ { for (; length > 0; length--) { @@ -432,7 +445,8 @@ if (caseless) } /* In the caseful case, we can just compare the code units, whether or not we -are in UTF mode. When partial matching, we have to do this unit-by-unit. */ +are in UTF and/or UCP mode. When partial matching, we have to do this unit by +unit. */ else { @@ -574,8 +588,8 @@ match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, PCRE2_SIZE *ovector, heapframe *F; /* Current frame pointer */ heapframe *N = NULL; /* Temporary frame pointers */ heapframe *P = NULL; -heapframe *assert_accept_frame; /* For passing back the frame with captures */ -PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */ +heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */ +PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */ /* Local variables that do not need to be preserved over calls to RRMATCH(). 
*/ @@ -598,12 +612,13 @@ BOOL condition; /* Used in conditional groups */ BOOL cur_is_word; /* Used in "word" tests */ BOOL prev_is_word; /* Used in "word" tests */ -/* UTF flag */ +/* UTF and UCP flags */ #ifdef SUPPORT_UNICODE BOOL utf = (mb->poptions & PCRE2_UTF) != 0; +BOOL ucp = (mb->poptions & PCRE2_UCP) != 0; #else -BOOL utf = FALSE; +BOOL utf = FALSE; /* Required for convenience even when no Unicode support */ #endif /* This is the length of the last part of a backtracking frame that must be @@ -803,10 +818,12 @@ fprintf(stderr, "++ op=%d\n", *Fecode); /* N is now the frame of the recursion; the previous frame is at the OP_RECURSE position. Go back there, copying the current subject position - and mark, and move on past the OP_RECURSE. */ + and mark, and the start_match position (\K might have changed it), and + then move on past the OP_RECURSE. */ P->eptr = Feptr; P->mark = Fmark; + P->start_match = Fstart_match; F = P; Fecode += 1 + LINK_SIZE; continue; @@ -928,6 +945,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); } else #endif + /* Not UTF mode */ { if (mb->end_subject - Feptr < 1) @@ -987,10 +1005,30 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); } } + + /* If UCP is set without UTF we must do the same as above, but with one + character per code unit. */ + + else if (ucp) + { + uint32_t cc = UCHAR21(Feptr); + fc = Fecode[1]; + if (fc < 128) + { + if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH); + } + else + { + if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); + } + Feptr++; + Fecode += 2; + } + else #endif /* SUPPORT_UNICODE */ - /* Not UTF mode; use the table for characters < 256. */ + /* Not UTF or UCP mode; use the table for characters < 256. 
*/ { if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1]) != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH); @@ -1010,6 +1048,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } + #ifdef SUPPORT_UNICODE if (utf) { @@ -1026,15 +1065,42 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (ch > 127) ch = UCD_OTHERCASE(ch); else - ch = TABLE_GET(ch, mb->fcc, ch); + ch = (mb->fcc)[ch]; + if (ch == fc) RRETURN(MATCH_NOMATCH); + } + } + + /* UCP without UTF is as above, but with one character per code unit. */ + + else if (ucp) + { + uint32_t ch; + fc = UCHAR21INC(Feptr); + ch = Fecode[1]; + Fecode += 2; + + if (ch == fc) + { + RRETURN(MATCH_NOMATCH); /* Caseful match */ + } + else if (Fop == OP_NOTI) /* If caseless */ + { + if (ch > 127) + ch = UCD_OTHERCASE(ch); + else + ch = (mb->fcc)[ch]; if (ch == fc) RRETURN(MATCH_NOMATCH); } } + else #endif /* SUPPORT_UNICODE */ + + /* Neither UTF nor UCP is set */ + { uint32_t ch = Fecode[1]; - fc = *Feptr++; + fc = UCHAR21INC(Feptr); if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc)) RRETURN(MATCH_NOMATCH); Fecode += 2; @@ -1244,7 +1310,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); #endif /* SUPPORT_UNICODE */ /* When not in UTF mode, load a single-code-unit character. Then proceed as - above. */ + above, using Unicode casing if either UTF or UCP is set. */ Lc = *Fecode++; @@ -1253,11 +1319,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (Fop >= OP_STARI) { #if PCRE2_CODE_UNIT_WIDTH == 8 - /* Lc must be < 128 in UTF-8 mode. */ +#ifdef SUPPORT_UNICODE + if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc); + else +#endif /* SUPPORT_UNICODE */ + /* Lc will be < 128 in UTF-8 mode. 
*/ Loc = mb->fcc[Lc]; #else /* 16-bit & 32-bit */ #ifdef SUPPORT_UNICODE - if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc); + if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc); else #endif /* SUPPORT_UNICODE */ Loc = TABLE_GET(Lc, mb->fcc, Lc); @@ -1490,7 +1560,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (Fop >= OP_NOTSTARI) /* Caseless */ { #ifdef SUPPORT_UNICODE - if (utf && Lc > 127) + if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc); else #endif /* SUPPORT_UNICODE */ @@ -6045,11 +6115,10 @@ BOOL firstline; BOOL has_first_cu = FALSE; BOOL has_req_cu = FALSE; BOOL startline; -BOOL utf; #if PCRE2_CODE_UNIT_WIDTH == 8 -BOOL memchr_not_found_first_cu = FALSE; -BOOL memchr_not_found_first_cu2 = FALSE; +PCRE2_SPTR memchr_found_first_cu; +PCRE2_SPTR memchr_found_first_cu2; #endif PCRE2_UCHAR first_cu = 0; @@ -6069,13 +6138,19 @@ PCRE2_SPTR match_partial; BOOL use_jit; #endif +/* This flag is needed even when Unicode is not supported for convenience +(it is used by the IS_NEWLINE macro). */ + +BOOL utf = FALSE; + #ifdef SUPPORT_UNICODE +BOOL ucp = FALSE; BOOL allow_invalid; uint32_t fragment_options = 0; #ifdef SUPPORT_JIT BOOL jit_checked_utf = FALSE; #endif -#endif +#endif /* SUPPORT_UNICODE */ PCRE2_SIZE frame_size; @@ -6091,7 +6166,8 @@ proves to be too small, it is replaced by a larger one on the heap. To get a vector of the size required that is aligned for pointers, allocate it as a vector of pointers. */ -PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)]; +PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)] + PCRE2_KEEP_UNINITIALIZED; mb->stack_frames = (heapframe *)stack_frames_vector; /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated @@ -6147,12 +6223,13 @@ use_jit = (re->executable_jit != NULL && (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0); #endif -/* Initialize UTF parameters. */ +/* Initialize UTF/UCP parameters. 
*/ -utf = (re->overall_options & PCRE2_UTF) != 0; #ifdef SUPPORT_UNICODE +utf = (re->overall_options & PCRE2_UTF) != 0; allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0; -#endif +ucp = (re->overall_options & PCRE2_UCP) != 0; +#endif /* SUPPORT_UNICODE */ /* Convert the partial matching flags into an integer. */ @@ -6589,9 +6666,13 @@ if ((re->flags & PCRE2_FIRSTSET) != 0) if ((re->flags & PCRE2_FIRSTCASELESS) != 0) { first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 - if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu); +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu); +#else + if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu); #endif +#endif /* SUPPORT_UNICODE */ } } else @@ -6607,9 +6688,13 @@ if ((re->flags & PCRE2_LASTSET) != 0) if ((re->flags & PCRE2_LASTCASELESS) != 0) { req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 - if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu); +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu); +#else + if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu); #endif +#endif /* SUPPORT_UNICODE */ } } @@ -6626,6 +6711,11 @@ FRAGMENT_RESTART: start_partial = match_partial = NULL; mb->hitend = FALSE; +#if PCRE2_CODE_UNIT_WIDTH == 8 +memchr_found_first_cu = NULL; +memchr_found_first_cu2 = NULL; +#endif + for(;;) { PCRE2_SPTR new_start_match; @@ -6692,13 +6782,7 @@ for(;;) } } - /* Not anchored. Advance to a unique first code unit if there is one. In - 8-bit mode, the use of memchr() gives a big speed up, even though we have - to call it twice in caseless mode, in order to find the earliest occurrence - of the character in either of its cases. 
If a call to memchr() that - searches the rest of the subject fails to find one case, remember that in - order not to keep on repeating the search. This can make a huge difference - when the strings are very long and only one case is present. */ + /* Not anchored. Advance to a unique first code unit if there is one. */ else { @@ -6706,43 +6790,68 @@ for(;;) { if (first_cu != first_cu2) /* Caseless */ { + /* In 16-bit and 32-bit modes we have to do our own search, so can + look for both cases at once. */ + #if PCRE2_CODE_UNIT_WIDTH != 8 PCRE2_UCHAR smc; while (start_match < end_subject && (smc = UCHAR21TEST(start_match)) != first_cu && - smc != first_cu2) + smc != first_cu2) start_match++; +#else + /* In 8-bit mode, the use of memchr() gives a big speed up, even + though we have to call it twice in order to find the earliest + occurrence of the code unit in either of its cases. Caching is used + to remember the positions of previously found code units. This can + make a huge difference when the strings are very long and only one + case is actually present. */ -#else /* 8-bit code units */ PCRE2_SPTR pp1 = NULL; PCRE2_SPTR pp2 = NULL; - PCRE2_SIZE cu2size = end_subject - start_match; + PCRE2_SIZE searchlength = end_subject - start_match; + + /* If we haven't got a previously found position for first_cu, or if + the current starting position is later, we need to do a search. If + the code unit is not found, set it to the end. */ - if (!memchr_not_found_first_cu) + if (memchr_found_first_cu == NULL || + start_match > memchr_found_first_cu) { - pp1 = memchr(start_match, first_cu, end_subject - start_match); - if (pp1 == NULL) memchr_not_found_first_cu = TRUE; - else cu2size = pp1 - start_match; + pp1 = memchr(start_match, first_cu, searchlength); + memchr_found_first_cu = (pp1 == NULL)? 
end_subject : pp1; } - /* If pp1 is not NULL, we have arranged to search only as far as pp1, - to see if the other case is earlier, so we can set "not found" only - when both searches have returned NULL. */ + /* If the start is before a previously found position, use the + previous position, or NULL if a previous search failed. */ + + else pp1 = (memchr_found_first_cu == end_subject)? NULL : + memchr_found_first_cu; + + /* Do the same thing for the other case. */ - if (!memchr_not_found_first_cu2) + if (memchr_found_first_cu2 == NULL || + start_match > memchr_found_first_cu2) { - pp2 = memchr(start_match, first_cu2, cu2size); - memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL); + pp2 = memchr(start_match, first_cu2, searchlength); + memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2; } + else pp2 = (memchr_found_first_cu2 == end_subject)? NULL : + memchr_found_first_cu2; + + /* Set the start to the end of the subject if neither case was found. + Otherwise, use the earlier found point. */ + if (pp1 == NULL) start_match = (pp2 == NULL)? end_subject : pp2; else start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2; -#endif + +#endif /* 8-bit handling */ } - /* The caseful case */ + /* The caseful case is much simpler. */ else { @@ -6756,15 +6865,16 @@ for(;;) #endif } - /* If we can't find the required code unit, having reached the true end - of the subject, break the bumpalong loop, to force a match failure, - except when doing partial matching, when we let the next cycle run at - the end of the subject. To see why, consider the pattern /(?<=abc)def/, - which partially matches "abc", even though the string does not contain - the starting character "d". If we have not reached the true end of the - subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified) - we also let the cycle run, because the matching string is legitimately - allowed to start with the first code unit of a newline. 
*/ + /* If we can't find the required first code unit, having reached the + true end of the subject, break the bumpalong loop, to force a match + failure, except when doing partial matching, when we let the next cycle + run at the end of the subject. To see why, consider the pattern + /(?<=abc)def/, which partially matches "abc", even though the string + does not contain the starting character "d". If we have not reached the + true end of the subject (PCRE2_FIRSTLINE caused end_subject to be + temporarily modified) we also let the cycle run, because the matching + string is legitimately allowed to start with the first code unit of a + newline. */ if (mb->partial == 0 && start_match >= mb->end_subject) { @@ -7103,6 +7213,7 @@ if (utf && end_subject != true_end_subject && starting code units in 8-bit and 16-bit modes. */ start_match = end_subject + 1; + #if PCRE2_CODE_UNIT_WIDTH != 32 while (start_match < true_end_subject && NOT_FIRSTCU(*start_match)) start_match++; |