diff options
Diffstat (limited to 'thirdparty/pcre2/src/pcre2_match.c')
-rw-r--r-- | thirdparty/pcre2/src/pcre2_match.c | 170 |
1 files changed, 130 insertions, 40 deletions
diff --git a/thirdparty/pcre2/src/pcre2_match.c b/thirdparty/pcre2/src/pcre2_match.c index 48e7b9dbb2..e3f78c2ca3 100644 --- a/thirdparty/pcre2/src/pcre2_match.c +++ b/thirdparty/pcre2/src/pcre2_match.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2015-2019 University of Cambridge + New API code Copyright (c) 2015-2020 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -381,8 +381,12 @@ length = Fovector[offset+1] - Fovector[offset]; if (caseless) { #if defined SUPPORT_UNICODE - if ((mb->poptions & PCRE2_UTF) != 0) + BOOL utf = (mb->poptions & PCRE2_UTF) != 0; + + if (utf || (mb->poptions & PCRE2_UCP) != 0) { + PCRE2_SPTR endptr = p + length; + /* Match characters up to the end of the reference. NOTE: the number of code units matched may differ, because in UTF-8 there are some characters whose upper and lower case codes have different numbers of bytes. For @@ -390,16 +394,25 @@ if (caseless) bytes in UTF-8); a sequence of 3 of the former uses 6 bytes, as does a sequence of two of the latter. It is important, therefore, to check the length along the reference, not along the subject (earlier code did this - wrong). */ + wrong). UCP without uses Unicode properties but without UTF encoding. */ - PCRE2_SPTR endptr = p + length; while (p < endptr) { uint32_t c, d; const ucd_record *ur; if (eptr >= mb->end_subject) return 1; /* Partial match */ - GETCHARINC(c, eptr); - GETCHARINC(d, p); + + if (utf) + { + GETCHARINC(c, eptr); + GETCHARINC(d, p); + } + else + { + c = *eptr++; + d = *p++; + } + ur = GET_UCD(d); if (c != d && c != (uint32_t)((int)d + ur->other_case)) { @@ -415,7 +428,7 @@ if (caseless) else #endif - /* Not in UTF mode */ + /* Not in UTF or UCP mode */ { for (; length > 0; length--) { @@ -432,7 +445,8 @@ if (caseless) } /* In the caseful case, we can just compare the code units, whether or not we -are in UTF mode. When partial matching, we have to do this unit-by-unit. */ +are in UTF and/or UCP mode. When partial matching, we have to do this unit by +unit. */ else { @@ -574,8 +588,8 @@ match(PCRE2_SPTR start_eptr, PCRE2_SPTR start_ecode, PCRE2_SIZE *ovector, heapframe *F; /* Current frame pointer */ heapframe *N = NULL; /* Temporary frame pointers */ heapframe *P = NULL; -heapframe *assert_accept_frame; /* For passing back the frame with captures */ -PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */ +heapframe *assert_accept_frame = NULL; /* For passing back a frame with captures */ +PCRE2_SIZE frame_copy_size; /* Amount to copy when creating a new frame */ /* Local variables that do not need to be preserved over calls to RRMATCH(). */ @@ -598,12 +612,13 @@ BOOL condition; /* Used in conditional groups */ BOOL cur_is_word; /* Used in "word" tests */ BOOL prev_is_word; /* Used in "word" tests */ -/* UTF flag */ +/* UTF and UCP flags */ #ifdef SUPPORT_UNICODE BOOL utf = (mb->poptions & PCRE2_UTF) != 0; +BOOL ucp = (mb->poptions & PCRE2_UCP) != 0; #else -BOOL utf = FALSE; +BOOL utf = FALSE; /* Required for convenience even when no Unicode support */ #endif /* This is the length of the last part of a backtracking frame that must be @@ -928,6 +943,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); } else #endif + /* Not UTF mode */ { if (mb->end_subject - Feptr < 1) @@ -987,10 +1003,30 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (dc != fc && dc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); } } + + /* If UCP is set without UTF we must do the same as above, but with one + character per code unit. */ + + else if (ucp) + { + uint32_t cc = UCHAR21(Feptr); + fc = Fecode[1]; + if (fc < 128) + { + if (mb->lcc[fc] != TABLE_GET(cc, mb->lcc, cc)) RRETURN(MATCH_NOMATCH); + } + else + { + if (cc != fc && cc != UCD_OTHERCASE(fc)) RRETURN(MATCH_NOMATCH); + } + Feptr++; + Fecode += 2; + } + else #endif /* SUPPORT_UNICODE */ - /* Not UTF mode; use the table for characters < 256. */ + /* Not UTF or UCP mode; use the table for characters < 256. */ { if (TABLE_GET(Fecode[1], mb->lcc, Fecode[1]) != TABLE_GET(*Feptr, mb->lcc, *Feptr)) RRETURN(MATCH_NOMATCH); @@ -1010,6 +1046,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); SCHECK_PARTIAL(); RRETURN(MATCH_NOMATCH); } + #ifdef SUPPORT_UNICODE if (utf) { @@ -1026,15 +1063,42 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (ch > 127) ch = UCD_OTHERCASE(ch); else - ch = TABLE_GET(ch, mb->fcc, ch); + ch = (mb->fcc)[ch]; + if (ch == fc) RRETURN(MATCH_NOMATCH); + } + } + + /* UCP without UTF is as above, but with one character per code unit. */ + + else if (ucp) + { + uint32_t ch; + fc = UCHAR21INC(Feptr); + ch = Fecode[1]; + Fecode += 2; + + if (ch == fc) + { + RRETURN(MATCH_NOMATCH); /* Caseful match */ + } + else if (Fop == OP_NOTI) /* If caseless */ + { + if (ch > 127) + ch = UCD_OTHERCASE(ch); + else + ch = (mb->fcc)[ch]; if (ch == fc) RRETURN(MATCH_NOMATCH); } } + else #endif /* SUPPORT_UNICODE */ + + /* Neither UTF nor UCP is set */ + { uint32_t ch = Fecode[1]; - fc = *Feptr++; + fc = UCHAR21INC(Feptr); if (ch == fc || (Fop == OP_NOTI && TABLE_GET(ch, mb->fcc, ch) == fc)) RRETURN(MATCH_NOMATCH); Fecode += 2; @@ -1244,7 +1308,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); #endif /* SUPPORT_UNICODE */ /* When not in UTF mode, load a single-code-unit character. Then proceed as - above. */ + above, using Unicode casing if either UTF or UCP is set. */ Lc = *Fecode++; @@ -1253,11 +1317,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (Fop >= OP_STARI) { #if PCRE2_CODE_UNIT_WIDTH == 8 - /* Lc must be < 128 in UTF-8 mode. */ +#ifdef SUPPORT_UNICODE + if (ucp && !utf && Lc > 127) Loc = UCD_OTHERCASE(Lc); + else +#endif /* SUPPORT_UNICODE */ + /* Lc will be < 128 in UTF-8 mode. */ Loc = mb->fcc[Lc]; #else /* 16-bit & 32-bit */ #ifdef SUPPORT_UNICODE - if (utf && Lc > 127) Loc = UCD_OTHERCASE(Lc); + if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc); else #endif /* SUPPORT_UNICODE */ Loc = TABLE_GET(Lc, mb->fcc, Lc); @@ -1490,7 +1558,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (Fop >= OP_NOTSTARI) /* Caseless */ { #ifdef SUPPORT_UNICODE - if (utf && Lc > 127) + if ((utf || ucp) && Lc > 127) Loc = UCD_OTHERCASE(Lc); else #endif /* SUPPORT_UNICODE */ @@ -6045,11 +6113,10 @@ BOOL firstline; BOOL has_first_cu = FALSE; BOOL has_req_cu = FALSE; BOOL startline; -BOOL utf; #if PCRE2_CODE_UNIT_WIDTH == 8 -BOOL memchr_not_found_first_cu = FALSE; -BOOL memchr_not_found_first_cu2 = FALSE; +BOOL memchr_not_found_first_cu; +BOOL memchr_not_found_first_cu2; #endif PCRE2_UCHAR first_cu = 0; @@ -6069,13 +6136,19 @@ PCRE2_SPTR match_partial; BOOL use_jit; #endif +/* This flag is needed even when Unicode is not supported for convenience +(it is used by the IS_NEWLINE macro). */ + +BOOL utf = FALSE; + #ifdef SUPPORT_UNICODE +BOOL ucp = FALSE; BOOL allow_invalid; uint32_t fragment_options = 0; #ifdef SUPPORT_JIT BOOL jit_checked_utf = FALSE; #endif -#endif +#endif /* SUPPORT_UNICODE */ PCRE2_SIZE frame_size; @@ -6091,7 +6164,8 @@ proves to be too small, it is replaced by a larger one on the heap. To get a vector of the size required that is aligned for pointers, allocate it as a vector of pointers. */ -PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)]; +PCRE2_SPTR stack_frames_vector[START_FRAMES_SIZE/sizeof(PCRE2_SPTR)] + PCRE2_KEEP_UNINITIALIZED; mb->stack_frames = (heapframe *)stack_frames_vector; /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated @@ -6147,12 +6221,13 @@ use_jit = (re->executable_jit != NULL && (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0); #endif -/* Initialize UTF parameters. */ +/* Initialize UTF/UCP parameters. */ -utf = (re->overall_options & PCRE2_UTF) != 0; #ifdef SUPPORT_UNICODE +utf = (re->overall_options & PCRE2_UTF) != 0; allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0; -#endif +ucp = (re->overall_options & PCRE2_UCP) != 0; +#endif /* SUPPORT_UNICODE */ /* Convert the partial matching flags into an integer. */ @@ -6589,9 +6664,13 @@ if ((re->flags & PCRE2_FIRSTSET) != 0) if ((re->flags & PCRE2_FIRSTCASELESS) != 0) { first_cu2 = TABLE_GET(first_cu, mb->fcc, first_cu); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 - if (utf && first_cu > 127) first_cu2 = UCD_OTHERCASE(first_cu); +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (first_cu > 127 && ucp && !utf) first_cu2 = UCD_OTHERCASE(first_cu); +#else + if (first_cu > 127 && (utf || ucp)) first_cu2 = UCD_OTHERCASE(first_cu); #endif +#endif /* SUPPORT_UNICODE */ } } else @@ -6607,9 +6686,13 @@ if ((re->flags & PCRE2_LASTSET) != 0) if ((re->flags & PCRE2_LASTCASELESS) != 0) { req_cu2 = TABLE_GET(req_cu, mb->fcc, req_cu); -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 - if (utf && req_cu > 127) req_cu2 = UCD_OTHERCASE(req_cu); +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 + if (req_cu > 127 && ucp && !utf) req_cu2 = UCD_OTHERCASE(req_cu); +#else + if (req_cu > 127 && (utf || ucp)) req_cu2 = UCD_OTHERCASE(req_cu); #endif +#endif /* SUPPORT_UNICODE */ } } @@ -6626,6 +6709,11 @@ FRAGMENT_RESTART: start_partial = match_partial = NULL; mb->hitend = FALSE; +#if PCRE2_CODE_UNIT_WIDTH == 8 +memchr_not_found_first_cu = FALSE; +memchr_not_found_first_cu2 = FALSE; +#endif + for(;;) { PCRE2_SPTR new_start_match; @@ -6756,15 +6844,16 @@ for(;;) #endif } - /* If we can't find the required code unit, having reached the true end - of the subject, break the bumpalong loop, to force a match failure, - except when doing partial matching, when we let the next cycle run at - the end of the subject. To see why, consider the pattern /(?<=abc)def/, - which partially matches "abc", even though the string does not contain - the starting character "d". If we have not reached the true end of the - subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified) - we also let the cycle run, because the matching string is legitimately - allowed to start with the first code unit of a newline. */ + /* If we can't find the required first code unit, having reached the + true end of the subject, break the bumpalong loop, to force a match + failure, except when doing partial matching, when we let the next cycle + run at the end of the subject. To see why, consider the pattern + /(?<=abc)def/, which partially matches "abc", even though the string + does not contain the starting character "d". If we have not reached the + true end of the subject (PCRE2_FIRSTLINE caused end_subject to be + temporarily modified) we also let the cycle run, because the matching + string is legitimately allowed to start with the first code unit of a + newline. */ if (mb->partial == 0 && start_match >= mb->end_subject) { @@ -7103,6 +7192,7 @@ if (utf && end_subject != true_end_subject && starting code units in 8-bit and 16-bit modes. */ start_match = end_subject + 1; + #if PCRE2_CODE_UNIT_WIDTH != 32 while (start_match < true_end_subject && NOT_FIRSTCU(*start_match)) start_match++; |