diff options
Diffstat (limited to 'thirdparty/pcre2/src/pcre2_substitute.c')
-rw-r--r-- | thirdparty/pcre2/src/pcre2_substitute.c | 195 |
1 files changed, 128 insertions, 67 deletions
diff --git a/thirdparty/pcre2/src/pcre2_substitute.c b/thirdparty/pcre2/src/pcre2_substitute.c index ec3dd66df9..981a106a9f 100644 --- a/thirdparty/pcre2/src/pcre2_substitute.c +++ b/thirdparty/pcre2/src/pcre2_substitute.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2019 University of Cambridge + New API code Copyright (c) 2016-2020 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -49,8 +49,9 @@ POSSIBILITY OF SUCH DAMAGE. #define SUBSTITUTE_OPTIONS \ (PCRE2_SUBSTITUTE_EXTENDED|PCRE2_SUBSTITUTE_GLOBAL| \ - PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_UNKNOWN_UNSET| \ - PCRE2_SUBSTITUTE_UNSET_EMPTY) + PCRE2_SUBSTITUTE_LITERAL|PCRE2_SUBSTITUTE_MATCHED| \ + PCRE2_SUBSTITUTE_OVERFLOW_LENGTH|PCRE2_SUBSTITUTE_REPLACEMENT_ONLY| \ + PCRE2_SUBSTITUTE_UNKNOWN_UNSET|PCRE2_SUBSTITUTE_UNSET_EMPTY) @@ -194,6 +195,7 @@ overflow, either give an error immediately, or keep on, accumulating the length. */ #define CHECKMEMCPY(from,length) \ + { \ if (!overflowed && lengthleft < length) \ { \ if ((suboptions & PCRE2_SUBSTITUTE_OVERFLOW_LENGTH) == 0) goto NOROOM; \ @@ -209,7 +211,8 @@ length. */ memcpy(buffer + buff_offset, from, CU2BYTES(length)); \ buff_offset += length; \ lengthleft -= length; \ - } + } \ + } /* Here's the function */ @@ -226,11 +229,14 @@ int forcecasereset = 0; uint32_t ovector_count; uint32_t goptions = 0; uint32_t suboptions; -BOOL match_data_created = FALSE; -BOOL literal = FALSE; +pcre2_match_data *internal_match_data = NULL; +BOOL escaped_literal = FALSE; BOOL overflowed = FALSE; +BOOL use_existing_match; +BOOL replacement_only; #ifdef SUPPORT_UNICODE BOOL utf = (code->overall_options & PCRE2_UTF) != 0; +BOOL ucp = (code->overall_options & PCRE2_UCP) != 0; #endif PCRE2_UCHAR temp[6]; PCRE2_SPTR ptr; @@ -248,23 +254,54 @@ lengthleft = buff_length = *blength; *blength = PCRE2_UNSET; ovecsave[0] = ovecsave[1] = ovecsave[2] = PCRE2_UNSET; -/* Partial matching is not valid. This must come after setting *blength to +/* Partial matching is not valid. This must come after setting *blength to PCRE2_UNSET, so as not to imply an offset in the replacement. */ if ((options & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) != 0) return PCRE2_ERROR_BADOPTION; -/* If no match data block is provided, create one. */ +/* Check for using a match that has already happened. Note that the subject +pointer in the match data may be NULL after a no-match. */ + +use_existing_match = ((options & PCRE2_SUBSTITUTE_MATCHED) != 0); +replacement_only = ((options & PCRE2_SUBSTITUTE_REPLACEMENT_ONLY) != 0); + +/* If starting from an existing match, there must be an externally provided +match data block. We create an internal match_data block in two cases: (a) an +external one is not supplied (and we are not starting from an existing match); +(b) an existing match is to be used for the first substitution. In the latter +case, we copy the existing match into the internal block. This ensures that no +changes are made to the existing match data block. */ if (match_data == NULL) { + pcre2_general_context *gcontext; + if (use_existing_match) return PCRE2_ERROR_NULL; + gcontext = (mcontext == NULL)? + (pcre2_general_context *)code : + (pcre2_general_context *)mcontext; + match_data = internal_match_data = + pcre2_match_data_create_from_pattern(code, gcontext); + if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; + } + +else if (use_existing_match) + { pcre2_general_context *gcontext = (mcontext == NULL)? (pcre2_general_context *)code : (pcre2_general_context *)mcontext; - match_data = pcre2_match_data_create_from_pattern(code, gcontext); - if (match_data == NULL) return PCRE2_ERROR_NOMEMORY; - match_data_created = TRUE; + int pairs = (code->top_bracket + 1 < match_data->oveccount)? + code->top_bracket + 1 : match_data->oveccount; + internal_match_data = pcre2_match_data_create(match_data->oveccount, + gcontext); + if (internal_match_data == NULL) return PCRE2_ERROR_NOMEMORY; + memcpy(internal_match_data, match_data, offsetof(pcre2_match_data, ovector) + + 2*pairs*sizeof(PCRE2_SIZE)); + match_data = internal_match_data; } + +/* Remember ovector details */ + ovector = pcre2_get_ovector_pointer(match_data); ovector_count = pcre2_get_ovector_count(match_data); @@ -286,7 +323,7 @@ repend = replacement + rlength; #ifdef SUPPORT_UNICODE if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) { - rc = PRIV(valid_utf)(replacement, rlength, &(match_data->rightchar)); + rc = PRIV(valid_utf)(replacement, rlength, &(match_data->startchar)); if (rc != 0) { match_data->leftchar = 0; @@ -300,7 +337,7 @@ if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) suboptions = options & SUBSTITUTE_OPTIONS; options &= ~SUBSTITUTE_OPTIONS; -/* Copy up to the start offset */ +/* Error if the start match offset is greater than the length of the subject. */ if (start_offset > length) { @@ -308,9 +345,13 @@ if (start_offset > length) rc = PCRE2_ERROR_BADOFFSET; goto EXIT; } -CHECKMEMCPY(subject, start_offset); -/* Loop for global substituting. */ +/* Copy up to the start offset, unless only the replacement is required. */ + +if (!replacement_only) CHECKMEMCPY(subject, start_offset); + +/* Loop for global substituting. If PCRE2_SUBSTITUTE_MATCHED is set, the first +match is taken from the match_data that was passed in. */ subs = 0; do @@ -318,7 +359,12 @@ do PCRE2_SPTR ptrstack[PTR_STACK_SIZE]; uint32_t ptrstackptr = 0; - rc = pcre2_match(code, subject, length, start_offset, options|goptions, + if (use_existing_match) + { + rc = match_data->rc; + use_existing_match = FALSE; + } + else rc = pcre2_match(code, subject, length, start_offset, options|goptions, match_data, mcontext); #ifdef SUPPORT_UNICODE @@ -364,44 +410,44 @@ do #endif } - /* Copy what we have advanced past, reset the special global options, and - continue to the next match. */ + /* Copy what we have advanced past (unless not required), reset the special + global options, and continue to the next match. */ fraglength = start_offset - save_start; - CHECKMEMCPY(subject + save_start, fraglength); + if (!replacement_only) CHECKMEMCPY(subject + save_start, fraglength); goptions = 0; continue; } /* Handle a successful match. Matches that use \K to end before they start or start before the current point in the subject are not supported. */ - + if (ovector[1] < ovector[0] || ovector[0] < start_offset) { rc = PCRE2_ERROR_BADSUBSPATTERN; goto EXIT; } - - /* Check for the same match as previous. This is legitimate after matching an + + /* Check for the same match as previous. This is legitimate after matching an empty string that starts after the initial match offset. We have tried again at the match point in case the pattern is one like /(?<=\G.)/ which can never match at its starting point, so running the match achieves the bumpalong. If we do get the same (null) match at the original match point, it isn't such a pattern, so we now do the empty string magic. In all other cases, a repeat match should never occur. */ - + if (ovecsave[0] == ovector[0] && ovecsave[1] == ovector[1]) - { - if (ovector[0] == ovector[1] && ovecsave[2] != start_offset) - { - goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; - ovecsave[2] = start_offset; - continue; /* Back to the top of the loop */ + { + if (ovector[0] == ovector[1] && ovecsave[2] != start_offset) + { + goptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; + ovecsave[2] = start_offset; + continue; /* Back to the top of the loop */ } rc = PCRE2_ERROR_INTERNAL_DUPMATCH; - goto EXIT; - } - + goto EXIT; + } + /* Count substitutions with a paranoid check for integer overflow; surely no real call to this function would ever hit this! */ @@ -412,21 +458,30 @@ do } subs++; - /* Copy the text leading up to the match, and remember where the insert - begins and how many ovector pairs are set. */ + /* Copy the text leading up to the match (unless not required), and remember + where the insert begins and how many ovector pairs are set. */ if (rc == 0) rc = ovector_count; fraglength = ovector[0] - start_offset; - CHECKMEMCPY(subject + start_offset, fraglength); + if (!replacement_only) CHECKMEMCPY(subject + start_offset, fraglength); scb.output_offsets[0] = buff_offset; scb.oveccount = rc; - /* Process the replacement string. Literal mode is set by \Q, but only in - extended mode when backslashes are being interpreted. In extended mode we - must handle nested substrings that are to be reprocessed. */ + /* Process the replacement string. If the entire replacement is literal, just + copy it with length check. */ ptr = replacement; - for (;;) + if ((suboptions & PCRE2_SUBSTITUTE_LITERAL) != 0) + { + CHECKMEMCPY(ptr, rlength); + } + + /* Within a non-literal replacement, which must be scanned character by + character, local literal mode can be set by \Q, but only in extended mode + when backslashes are being interpreted. In extended mode we must handle + nested substrings that are to be reprocessed. */ + + else for (;;) { uint32_t ch; unsigned int chlen; @@ -443,11 +498,11 @@ do /* Handle the next character */ - if (literal) + if (escaped_literal) { if (ptr[0] == CHAR_BACKSLASH && ptr < repend - 1 && ptr[1] == CHAR_E) { - literal = FALSE; + escaped_literal = FALSE; ptr += 2; continue; } @@ -704,7 +759,7 @@ do if (forcecase != 0) { #ifdef SUPPORT_UNICODE - if (utf) + if (utf || ucp) { uint32_t type = UCD_CHARTYPE(ch); if (PRIV(ucp_gentype)[type] == ucp_L && @@ -784,7 +839,7 @@ do continue; case ESC_Q: - literal = TRUE; + escaped_literal = TRUE; continue; case 0: /* Data character */ @@ -806,7 +861,7 @@ do if (forcecase != 0) { #ifdef SUPPORT_UNICODE - if (utf) + if (utf || ucp) { uint32_t type = UCD_CHARTYPE(ch); if (PRIV(ucp_gentype)[type] == ucp_L && @@ -835,53 +890,59 @@ do } /* End handling a literal code unit */ } /* End of loop for scanning the replacement. */ - /* The replacement has been copied to the output, or its size has been - remembered. Do the callout if there is one and we have done an actual + /* The replacement has been copied to the output, or its size has been + remembered. Do the callout if there is one and we have done an actual replacement. */ - + if (!overflowed && mcontext != NULL && mcontext->substitute_callout != NULL) { - scb.subscount = subs; + scb.subscount = subs; scb.output_offsets[1] = buff_offset; - rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data); + rc = mcontext->substitute_callout(&scb, mcontext->substitute_callout_data); - /* A non-zero return means cancel this substitution. Instead, copy the + /* A non-zero return means cancel this substitution. Instead, copy the matched string fragment. */ if (rc != 0) { PCRE2_SIZE newlength = scb.output_offsets[1] - scb.output_offsets[0]; PCRE2_SIZE oldlength = ovector[1] - ovector[0]; - + buff_offset -= newlength; lengthleft += newlength; - CHECKMEMCPY(subject + ovector[0], oldlength); - + if (!replacement_only) CHECKMEMCPY(subject + ovector[0], oldlength); + /* A negative return means do not do any more. */ - + if (rc < 0) suboptions &= (~PCRE2_SUBSTITUTE_GLOBAL); } - } - + } + /* Save the details of this match. See above for how this data is used. If we - matched an empty string, do the magic for global matches. Finally, update the - start offset to point to the rest of the subject string. */ - - ovecsave[0] = ovector[0]; - ovecsave[1] = ovector[1]; + matched an empty string, do the magic for global matches. Update the start + offset to point to the rest of the subject string. If we re-used an existing + match for the first match, switch to the internal match data block. */ + + ovecsave[0] = ovector[0]; + ovecsave[1] = ovector[1]; ovecsave[2] = start_offset; - + goptions = (ovector[0] != ovector[1] || ovector[0] > start_offset)? 0 : PCRE2_ANCHORED|PCRE2_NOTEMPTY_ATSTART; start_offset = ovector[1]; } while ((suboptions & PCRE2_SUBSTITUTE_GLOBAL) != 0); /* Repeat "do" loop */ -/* Copy the rest of the subject. */ +/* Copy the rest of the subject unless not required, and terminate the output +with a binary zero. */ + +if (!replacement_only) + { + fraglength = length - start_offset; + CHECKMEMCPY(subject + start_offset, fraglength); + } -fraglength = length - start_offset; -CHECKMEMCPY(subject + start_offset, fraglength); temp[0] = 0; -CHECKMEMCPY(temp , 1); +CHECKMEMCPY(temp, 1); /* If overflowed is set it means the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH is set, and matching has carried on after a full buffer, in order to compute the length @@ -903,7 +964,7 @@ else } EXIT: -if (match_data_created) pcre2_match_data_free(match_data); +if (internal_match_data != NULL) pcre2_match_data_free(internal_match_data); else match_data->rc = rc; return rc; |