diff options
Diffstat (limited to 'thirdparty/pcre2/src/pcre2_match.c')
-rw-r--r-- | thirdparty/pcre2/src/pcre2_match.c | 603 |
1 files changed, 456 insertions, 147 deletions
diff --git a/thirdparty/pcre2/src/pcre2_match.c b/thirdparty/pcre2/src/pcre2_match.c index 419561fd64..48e7b9dbb2 100644 --- a/thirdparty/pcre2/src/pcre2_match.c +++ b/thirdparty/pcre2/src/pcre2_match.c @@ -415,8 +415,7 @@ if (caseless) else #endif - /* Not in UTF mode */ - + /* Not in UTF mode */ { for (; length > 0; length--) { @@ -491,27 +490,32 @@ heap is used for a larger vector. *************************************************/ /* These macros pack up tests that are used for partial matching several times -in the code. We set the "hit end" flag if the pointer is at the end of the -subject and also past the earliest inspected character (i.e. something has been -matched, even if not part of the actual matched string). For hard partial -matching, we then return immediately. The second one is used when we already -know we are past the end of the subject. */ +in the code. The second one is used when we already know we are past the end of +the subject. We set the "hit end" flag if the pointer is at the end of the +subject and either (a) the pointer is past the earliest inspected character +(i.e. something has been matched, even if not part of the actual matched +string), or (b) the pattern contains a lookbehind. These are the conditions for +which adding more characters may allow the current match to continue. + +For hard partial matching, we immediately return a partial match. Otherwise, +carrying on means that a complete match on the current subject will be sought. +A partial match is returned only if no complete match can be found. */ #define CHECK_PARTIAL()\ - if (mb->partial != 0 && Feptr >= mb->end_subject && \ - Feptr > mb->start_used_ptr) \ + if (Feptr >= mb->end_subject) \ { \ - mb->hitend = TRUE; \ - if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \ + SCHECK_PARTIAL(); \ } #define SCHECK_PARTIAL()\ - if (mb->partial != 0 && Feptr > mb->start_used_ptr) \ + if (mb->partial != 0 && \ + (Feptr > mb->start_used_ptr || mb->allowemptypartial)) \ { \ mb->hitend = TRUE; \ if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \ } + /* These macros are used to implement backtracking. They simulate a recursive call to the match() function by means of a local vector of frames which remember the backtracking points. */ @@ -5127,6 +5131,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode); case OP_ASSERT: case OP_ASSERTBACK: + case OP_ASSERT_NA: + case OP_ASSERTBACK_NA: Lframe_type = GF_NOCAPTURE | Fop; for (;;) { @@ -5412,7 +5418,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); { while (number-- > 0) { - if (Feptr <= mb->start_subject) RRETURN(MATCH_NOMATCH); + if (Feptr <= mb->check_subject) RRETURN(MATCH_NOMATCH); Feptr--; BACKCHAR(Feptr); } @@ -5420,7 +5426,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); else #endif - /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ + /* No UTF-8 support, or not in UTF-8 mode: count is code unit count */ { if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH); @@ -5472,15 +5478,16 @@ fprintf(stderr, "++ op=%d\n", *Fecode); /* If we are at the end of an assertion that is a condition, return a match, discarding any intermediate backtracking points. Copy back the - captures into the frame before N so that they are set on return. Doing - this for all assertions, both positive and negative, seems to match what - Perl does. */ + mark setting and the captures into the frame before N so that they are + set on return. Doing this for all assertions, both positive and negative, + seems to match what Perl does. */ if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT) { memcpy((char *)P + offsetof(heapframe, ovector), Fovector, Foffset_top * sizeof(PCRE2_SIZE)); P->offset_top = Foffset_top; + P->mark = Fmark; Fback_frame = (char *)F - (char *)P; RRETURN(MATCH_MATCH); } @@ -5496,10 +5503,20 @@ fprintf(stderr, "++ op=%d\n", *Fecode); case OP_SCOND: break; - /* Positive assertions are like OP_ONCE, except that in addition the + /* Non-atomic positive assertions are like OP_BRA, except that the subject pointer must be put back to where it was at the start of the assertion. */ + case OP_ASSERT_NA: + case OP_ASSERTBACK_NA: + if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; + Feptr = P->eptr; + break; + + /* Atomic positive assertions are like OP_ONCE, except that in addition + the subject pointer must be put back to where it was at the start of the + assertion. */ + case OP_ASSERT: case OP_ASSERTBACK: if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; @@ -5640,7 +5657,11 @@ fprintf(stderr, "++ op=%d\n", *Fecode); case OP_EOD: if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH); - SCHECK_PARTIAL(); + if (mb->partial != 0) + { + mb->hitend = TRUE; + if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; + } Fecode++; break; @@ -5665,7 +5686,11 @@ fprintf(stderr, "++ op=%d\n", *Fecode); /* Either at end of string or \n before end. */ - SCHECK_PARTIAL(); + if (mb->partial != 0) + { + mb->hitend = TRUE; + if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; + } Fecode++; break; @@ -5743,7 +5768,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); case OP_NOT_WORD_BOUNDARY: case OP_WORD_BOUNDARY: - if (Feptr == mb->start_subject) prev_is_word = FALSE; else + if (Feptr == mb->check_subject) prev_is_word = FALSE; else { PCRE2_SPTR lastptr = Feptr - 1; #ifdef SUPPORT_UNICODE @@ -5946,6 +5971,7 @@ in rrc. */ #define LBL(val) case val: goto L_RM##val; RETURN_SWITCH: +if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; if (Frdepth == 0) return rrc; /* Exit from the top level */ F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */ mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */ @@ -5999,9 +6025,9 @@ Arguments: Returns: > 0 => success; value is the number of ovector pairs filled = 0 => success, but ovector is not big enough - -1 => failed to match (PCRE2_ERROR_NOMATCH) - -2 => partial match (PCRE2_ERROR_PARTIAL) - < -2 => some kind of unexpected problem + = -1 => failed to match (PCRE2_ERROR_NOMATCH) + = -2 => partial match (PCRE2_ERROR_PARTIAL) + < -2 => some kind of unexpected problem */ PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION @@ -6014,7 +6040,6 @@ int was_zero_terminated = 0; const uint8_t *start_bits = NULL; const pcre2_real_code *re = (const pcre2_real_code *)code; - BOOL anchored; BOOL firstline; BOOL has_first_cu = FALSE; @@ -6022,6 +6047,11 @@ BOOL has_req_cu = FALSE; BOOL startline; BOOL utf; +#if PCRE2_CODE_UNIT_WIDTH == 8 +BOOL memchr_not_found_first_cu = FALSE; +BOOL memchr_not_found_first_cu2 = FALSE; +#endif + PCRE2_UCHAR first_cu = 0; PCRE2_UCHAR first_cu2 = 0; PCRE2_UCHAR req_cu = 0; @@ -6029,10 +6059,23 @@ PCRE2_UCHAR req_cu2 = 0; PCRE2_SPTR bumpalong_limit; PCRE2_SPTR end_subject; +PCRE2_SPTR true_end_subject; PCRE2_SPTR start_match = subject + start_offset; PCRE2_SPTR req_cu_ptr = start_match - 1; -PCRE2_SPTR start_partial = NULL; -PCRE2_SPTR match_partial = NULL; +PCRE2_SPTR start_partial; +PCRE2_SPTR match_partial; + +#ifdef SUPPORT_JIT +BOOL use_jit; +#endif + +#ifdef SUPPORT_UNICODE +BOOL allow_invalid; +uint32_t fragment_options = 0; +#ifdef SUPPORT_JIT +BOOL jit_checked_utf = FALSE; +#endif +#endif PCRE2_SIZE frame_size; @@ -6059,7 +6102,7 @@ if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject); was_zero_terminated = 1; } -end_subject = subject + length; +true_end_subject = end_subject = subject + length; /* Plausibility checks */ @@ -6095,12 +6138,24 @@ options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1))); #undef FF #undef OO -/* These two settings are used in the code for checking a UTF string that -follows immediately afterwards. Other values in the mb block are used only -during interpretive processing, not when the JIT support is in use, so they are -set up later. */ +/* If the pattern was successfully studied with JIT support, we will run the +JIT executable instead of the rest of this function. Most options must be set +at compile time for the JIT code to be usable. */ + +#ifdef SUPPORT_JIT +use_jit = (re->executable_jit != NULL && + (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0); +#endif + +/* Initialize UTF parameters. */ utf = (re->overall_options & PCRE2_UTF) != 0; +#ifdef SUPPORT_UNICODE +allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0; +#endif + +/* Convert the partial matching flags into an integer. */ + mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 : ((options & PCRE2_PARTIAL_SOFT) != 0)? 1 : 0; @@ -6111,88 +6166,107 @@ if (mb->partial != 0 && ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0) return PCRE2_ERROR_BADOPTION; -/* Check a UTF string for validity if required. For 8-bit and 16-bit strings, -we must also check that a starting offset does not point into the middle of a -multiunit character. We check only the portion of the subject that is going to -be inspected during matching - from the offset minus the maximum back reference -to the given length. This saves time when a small part of a large subject is -being matched by the use of a starting offset. Note that the maximum lookbehind -is a number of characters, not code units. */ +/* It is an error to set an offset limit without setting the flag at compile +time. */ -#ifdef SUPPORT_UNICODE -if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) +if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET && + (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0) + return PCRE2_ERROR_BADOFFSETLIMIT; + +/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT, +free the memory that was obtained. Set the field to NULL for no match cases. */ + +if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0) { - PCRE2_SPTR check_subject = start_match; /* start_match includes offset */ + match_data->memctl.free((void *)match_data->subject, + match_data->memctl.memory_data); + match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT; + } +match_data->subject = NULL; + +/* Zero the error offset in case the first code unit is invalid UTF. */ + +match_data->startchar = 0; + + +/* ============================= JIT matching ============================== */ + +/* Prepare for JIT matching. Check a UTF string for validity unless no check is +requested or invalid UTF can be handled. We check only the portion of the +subject that might be be inspected during matching - from the offset minus the +maximum lookbehind to the given length. This saves time when a small part of a +large subject is being matched by the use of a starting offset. Note that the +maximum lookbehind is a number of characters, not code units. */ - if (start_offset > 0) +#ifdef SUPPORT_JIT +if (use_jit) + { +#ifdef SUPPORT_UNICODE + if (utf && (options & PCRE2_NO_UTF_CHECK) == 0 && !allow_invalid) { #if PCRE2_CODE_UNIT_WIDTH != 32 unsigned int i; +#endif + + /* For 8-bit and 16-bit UTF, check that the first code unit is a valid + character start. */ + +#if PCRE2_CODE_UNIT_WIDTH != 32 if (start_match < end_subject && NOT_FIRSTCU(*start_match)) - return PCRE2_ERROR_BADUTFOFFSET; - for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--) { - check_subject--; - while (check_subject > subject && + if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET; +#if PCRE2_CODE_UNIT_WIDTH == 8 + return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */ +#else + return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */ +#endif + } +#endif /* WIDTH != 32 */ + + /* Move back by the maximum lookbehind, just in case it happens at the very + start of matching. */ + +#if PCRE2_CODE_UNIT_WIDTH != 32 + for (i = re->max_lookbehind; i > 0 && start_match > subject; i--) + { + start_match--; + while (start_match > subject && #if PCRE2_CODE_UNIT_WIDTH == 8 - (*check_subject & 0xc0) == 0x80) + (*start_match & 0xc0) == 0x80) #else /* 16-bit */ - (*check_subject & 0xfc00) == 0xdc00) -#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ - check_subject--; + (*start_match & 0xfc00) == 0xdc00) +#endif + start_match--; } -#else +#else /* PCRE2_CODE_UNIT_WIDTH != 32 */ + /* In the 32-bit library, one code unit equals one character. However, we cannot just subtract the lookbehind and then compare pointers, because a very large lookbehind could create an invalid pointer. */ if (start_offset >= re->max_lookbehind) - check_subject -= re->max_lookbehind; + start_match -= re->max_lookbehind; else - check_subject = subject; + start_match = subject; #endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ - } - /* Validate the relevant portion of the subject. After an error, adjust the - offset to be an absolute offset in the whole string. */ + /* Validate the relevant portion of the subject. Adjust the offset of an + invalid code point to be an absolute offset in the whole string. */ - match_data->rc = PRIV(valid_utf)(check_subject, - length - (check_subject - subject), &(match_data->startchar)); - if (match_data->rc != 0) - { - match_data->startchar += check_subject - subject; - return match_data->rc; + match_data->rc = PRIV(valid_utf)(start_match, + length - (start_match - subject), &(match_data->startchar)); + if (match_data->rc != 0) + { + match_data->startchar += start_match - subject; + return match_data->rc; + } + jit_checked_utf = TRUE; } - } #endif /* SUPPORT_UNICODE */ -/* It is an error to set an offset limit without setting the flag at compile -time. */ - -if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET && - (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0) - return PCRE2_ERROR_BADOFFSETLIMIT; - -/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT, -free the memory that was obtained. Set the field to NULL for no match cases. */ + /* If JIT returns BADOPTION, which means that the selected complete or + partial matching mode was not compiled, fall through to the interpreter. */ -if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0) - { - match_data->memctl.free((void *)match_data->subject, - match_data->memctl.memory_data); - match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT; - } -match_data->subject = NULL; - -/* If the pattern was successfully studied with JIT support, run the JIT -executable instead of the rest of this function. Most options must be set at -compile time for the JIT code to be usable. Fallback to the normal code path if -an unsupported option is set or if JIT returns BADOPTION (which means that the -selected normal or partial matching mode was not compiled). */ - -#ifdef SUPPORT_JIT -if (re->executable_jit != NULL && (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0) - { rc = pcre2_jit_match(code, subject, length, start_offset, options, match_data, mcontext); if (rc != PCRE2_ERROR_JIT_BADOPTION) @@ -6209,10 +6283,152 @@ if (re->executable_jit != NULL && (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0) return rc; } } +#endif /* SUPPORT_JIT */ + +/* ========================= End of JIT matching ========================== */ + + +/* Proceed with non-JIT matching. The default is to allow lookbehinds to the +start of the subject. A UTF check when there is a non-zero offset may change +this. */ + +mb->check_subject = subject; + +/* If a UTF subject string was not checked for validity in the JIT code above, +check it here, and handle support for invalid UTF strings. The check above +happens only when invalid UTF is not supported and PCRE2_NO_CHECK_UTF is unset. +If we get here in those circumstances, it means the subject string is valid, +but for some reason JIT matching was not successful. There is no need to check +the subject again. + +We check only the portion of the subject that might be be inspected during +matching - from the offset minus the maximum lookbehind to the given length. +This saves time when a small part of a large subject is being matched by the +use of a starting offset. Note that the maximum lookbehind is a number of +characters, not code units. + +Note also that support for invalid UTF forces a check, overriding the setting +of PCRE2_NO_CHECK_UTF. */ + +#ifdef SUPPORT_UNICODE +if (utf && +#ifdef SUPPORT_JIT + !jit_checked_utf && +#endif + ((options & PCRE2_NO_UTF_CHECK) == 0 || allow_invalid)) + { +#if PCRE2_CODE_UNIT_WIDTH != 32 + BOOL skipped_bad_start = FALSE; +#endif + + /* For 8-bit and 16-bit UTF, check that the first code unit is a valid + character start. If we are handling invalid UTF, just skip over such code + units. Otherwise, give an appropriate error. */ + +#if PCRE2_CODE_UNIT_WIDTH != 32 + if (allow_invalid) + { + while (start_match < end_subject && NOT_FIRSTCU(*start_match)) + { + start_match++; + skipped_bad_start = TRUE; + } + } + else if (start_match < end_subject && NOT_FIRSTCU(*start_match)) + { + if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET; +#if PCRE2_CODE_UNIT_WIDTH == 8 + return PCRE2_ERROR_UTF8_ERR20; /* Isolated 0x80 byte */ +#else + return PCRE2_ERROR_UTF16_ERR3; /* Isolated low surrogate */ +#endif + } +#endif /* WIDTH != 32 */ + + /* The mb->check_subject field points to the start of UTF checking; + lookbehinds can go back no further than this. */ + + mb->check_subject = start_match; + + /* Move back by the maximum lookbehind, just in case it happens at the very + start of matching, but don't do this if we skipped bad 8-bit or 16-bit code + units above. */ + +#if PCRE2_CODE_UNIT_WIDTH != 32 + if (!skipped_bad_start) + { + unsigned int i; + for (i = re->max_lookbehind; i > 0 && mb->check_subject > subject; i--) + { + mb->check_subject--; + while (mb->check_subject > subject && +#if PCRE2_CODE_UNIT_WIDTH == 8 + (*mb->check_subject & 0xc0) == 0x80) +#else /* 16-bit */ + (*mb->check_subject & 0xfc00) == 0xdc00) +#endif + mb->check_subject--; + } + } +#else /* PCRE2_CODE_UNIT_WIDTH != 32 */ + + /* In the 32-bit library, one code unit equals one character. However, + we cannot just subtract the lookbehind and then compare pointers, because + a very large lookbehind could create an invalid pointer. */ + + if (start_offset >= re->max_lookbehind) + mb->check_subject -= re->max_lookbehind; + else + mb->check_subject = subject; +#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */ + + /* Validate the relevant portion of the subject. There's a loop in case we + encounter bad UTF in the characters preceding start_match which we are + scanning because of a lookbehind. */ + + for (;;) + { + match_data->rc = PRIV(valid_utf)(mb->check_subject, + length - (mb->check_subject - subject), &(match_data->startchar)); + + if (match_data->rc == 0) break; /* Valid UTF string */ + + /* Invalid UTF string. Adjust the offset to be an absolute offset in the + whole string. If we are handling invalid UTF strings, set end_subject to + stop before the bad code unit, and set the options to "not end of line". + Otherwise return the error. */ + + match_data->startchar += mb->check_subject - subject; + if (!allow_invalid || match_data->rc > 0) return match_data->rc; + end_subject = subject + match_data->startchar; + + /* If the end precedes start_match, it means there is invalid UTF in the + extra code units we reversed over because of a lookbehind. Advance past the + first bad code unit, and then skip invalid character starting code units in + 8-bit and 16-bit modes, and try again. */ + + if (end_subject < start_match) + { + mb->check_subject = end_subject + 1; +#if PCRE2_CODE_UNIT_WIDTH != 32 + while (mb->check_subject < start_match && NOT_FIRSTCU(*mb->check_subject)) + mb->check_subject++; #endif + } + + /* Otherwise, set the not end of line option, and do the match. */ + + else + { + fragment_options = PCRE2_NOTEOL; + break; + } + } + } +#endif /* SUPPORT_UNICODE */ -/* Carry on with non-JIT matching. A NULL match context means "use a default -context", but we take the memory control functions from the pattern. */ +/* A NULL match context means "use a default context", but we take the memory +control functions from the pattern. */ if (mcontext == NULL) { @@ -6224,8 +6440,8 @@ else mb->memctl = mcontext->memctl; anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0; firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0; startline = (re->flags & PCRE2_STARTLINE) != 0; -bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)? - end_subject : subject + mcontext->offset_limit; +bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)? + true_end_subject : subject + mcontext->offset_limit; /* Initialize and set up the fixed fields in the callout block, with a pointer in the match block. */ @@ -6236,7 +6452,8 @@ cb.subject = subject; cb.subject_length = (PCRE2_SIZE)(end_subject - subject); cb.callout_flags = 0; -/* Fill in the remaining fields in the match block. */ +/* Fill in the remaining fields in the match block, except for moptions, which +gets set later. */ mb->callout = mcontext->callout; mb->callout_data = mcontext->callout_data; @@ -6245,13 +6462,11 @@ mb->start_subject = subject; mb->start_offset = start_offset; mb->end_subject = end_subject; mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0; - -mb->moptions = options; /* Match options */ -mb->poptions = re->overall_options; /* Pattern options */ - +mb->allowemptypartial = (re->max_lookbehind > 0) || + (re->flags & PCRE2_MATCH_EMPTY) != 0; +mb->poptions = re->overall_options; /* Pattern options */ mb->ignore_skip_arg = 0; -mb->mark = mb->nomatch_mark = NULL; /* In case never set */ -mb->hitend = FALSE; +mb->mark = mb->nomatch_mark = NULL; /* In case never set */ /* The name table is needed for finding all the numbers associated with a given name, for condition testing. The code follows the name table. */ @@ -6404,6 +6619,13 @@ if ((re->flags & PCRE2_LASTSET) != 0) /* Loop for handling unanchored repeated matching attempts; for anchored regexs the loop runs just once. */ +#ifdef SUPPORT_UNICODE +FRAGMENT_RESTART: +#endif + +start_partial = match_partial = NULL; +mb->hitend = FALSE; + for(;;) { PCRE2_SPTR new_start_match; @@ -6473,7 +6695,10 @@ for(;;) /* Not anchored. Advance to a unique first code unit if there is one. In 8-bit mode, the use of memchr() gives a big speed up, even though we have to call it twice in caseless mode, in order to find the earliest occurrence - of the character in either of its cases. */ + of the character in either of its cases. If a call to memchr() that + searches the rest of the subject fails to find one case, remember that in + order not to keep on repeating the search. This can make a huge difference + when the strings are very long and only one case is present. */ else { @@ -6487,11 +6712,29 @@ for(;;) (smc = UCHAR21TEST(start_match)) != first_cu && smc != first_cu2) start_match++; + #else /* 8-bit code units */ - PCRE2_SPTR pp1 = - memchr(start_match, first_cu, end_subject-start_match); - PCRE2_SPTR pp2 = - memchr(start_match, first_cu2, end_subject-start_match); + PCRE2_SPTR pp1 = NULL; + PCRE2_SPTR pp2 = NULL; + PCRE2_SIZE cu2size = end_subject - start_match; + + if (!memchr_not_found_first_cu) + { + pp1 = memchr(start_match, first_cu, end_subject - start_match); + if (pp1 == NULL) memchr_not_found_first_cu = TRUE; + else cu2size = pp1 - start_match; + } + + /* If pp1 is not NULL, we have arranged to search only as far as pp1, + to see if the other case is earlier, so we can set "not found" only + when both searches have returned NULL. */ + + if (!memchr_not_found_first_cu2) + { + pp2 = memchr(start_match, first_cu2, cu2size); + memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL); + } + if (pp1 == NULL) start_match = (pp2 == NULL)? end_subject : pp2; else @@ -6523,7 +6766,7 @@ for(;;) we also let the cycle run, because the matching string is legitimately allowed to start with the first code unit of a newline. */ - if (!mb->partial && start_match >= mb->end_subject) + if (mb->partial == 0 && start_match >= mb->end_subject) { rc = MATCH_NOMATCH; break; @@ -6582,7 +6825,7 @@ for(;;) /* See comment above in first_cu checking about the next few lines. */ - if (!mb->partial && start_match >= mb->end_subject) + if (mb->partial == 0 && start_match >= mb->end_subject) { rc = MATCH_NOMATCH; break; @@ -6596,8 +6839,10 @@ for(;;) /* The following two optimizations must be disabled for partial matching. */ - if (!mb->partial) + if (mb->partial == 0) { + PCRE2_SPTR p; + /* The minimum matching length is a lower bound; no string of that length may actually match the pattern. Although the value is, strictly, in characters, we treat it as code units to avoid spending too much time in @@ -6621,60 +6866,57 @@ for(;;) memchr() twice in the caseless case because we only need to check for the presence of the character in either case, not find the first occurrence. + The search can be skipped if the code unit was found later than the + current starting point in a previous iteration of the bumpalong loop. + HOWEVER: when the subject string is very, very long, searching to its end can take a long time, and give bad performance on quite ordinary - patterns. This showed up when somebody was matching something like - /^\d+C/ on a 32-megabyte string... so we don't do this when the string is - sufficiently long. */ + anchored patterns. This showed up when somebody was matching something + like /^\d+C/ on a 32-megabyte string... so we don't do this when the + string is sufficiently long, but it's worth searching a lot more for + unanchored patterns. */ - if (has_req_cu && end_subject - start_match < REQ_CU_MAX) + p = start_match + (has_first_cu? 1:0); + if (has_req_cu && p > req_cu_ptr) { - PCRE2_SPTR p = start_match + (has_first_cu? 1:0); - - /* We don't need to repeat the search if we haven't yet reached the - place we found it last time round the bumpalong loop. */ + PCRE2_SIZE check_length = end_subject - start_match; - if (p > req_cu_ptr) + if (check_length < REQ_CU_MAX || + (!anchored && check_length < REQ_CU_MAX * 1000)) { - if (p < end_subject) + if (req_cu != req_cu2) /* Caseless */ { - if (req_cu != req_cu2) /* Caseless */ - { #if PCRE2_CODE_UNIT_WIDTH != 8 - do - { - uint32_t pp = UCHAR21INCTEST(p); - if (pp == req_cu || pp == req_cu2) { p--; break; } - } - while (p < end_subject); - + while (p < end_subject) + { + uint32_t pp = UCHAR21INCTEST(p); + if (pp == req_cu || pp == req_cu2) { p--; break; } + } #else /* 8-bit code units */ - PCRE2_SPTR pp = p; - p = memchr(pp, req_cu, end_subject - pp); - if (p == NULL) - { - p = memchr(pp, req_cu2, end_subject - pp); - if (p == NULL) p = end_subject; - } -#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ + PCRE2_SPTR pp = p; + p = memchr(pp, req_cu, end_subject - pp); + if (p == NULL) + { + p = memchr(pp, req_cu2, end_subject - pp); + if (p == NULL) p = end_subject; } +#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ + } - /* The caseful case */ + /* The caseful case */ - else - { + else + { #if PCRE2_CODE_UNIT_WIDTH != 8 - do - { - if (UCHAR21INCTEST(p) == req_cu) { p--; break; } - } - while (p < end_subject); + while (p < end_subject) + { + if (UCHAR21INCTEST(p) == req_cu) { p--; break; } + } #else /* 8-bit code units */ - p = memchr(p, req_cu, end_subject - p); - if (p == NULL) p = end_subject; + p = memchr(p, req_cu, end_subject - p); + if (p == NULL) p = end_subject; #endif - } } /* If we can't find the required code unit, break the bumpalong loop, @@ -6714,6 +6956,11 @@ for(;;) mb->start_used_ptr = start_match; mb->last_used_ptr = start_match; +#ifdef SUPPORT_UNICODE + mb->moptions = options | fragment_options; +#else + mb->moptions = options; +#endif mb->match_call_count = 0; mb->end_offset_top = 0; mb->skip_arg_count = 0; @@ -6839,6 +7086,68 @@ for(;;) ENDLOOP: +/* If end_subject != true_end_subject, it means we are handling invalid UTF, +and have just processed a non-terminal fragment. If this resulted in no match +or a partial match we must carry on to the next fragment (a partial match is +returned to the caller only at the very end of the subject). A loop is used to +avoid trying to match against empty fragments; if the pattern can match an +empty string it would have done so already. */ + +#ifdef SUPPORT_UNICODE +if (utf && end_subject != true_end_subject && + (rc == MATCH_NOMATCH || rc == PCRE2_ERROR_PARTIAL)) + { + for (;;) + { + /* Advance past the first bad code unit, and then skip invalid character + starting code units in 8-bit and 16-bit modes. */ + + start_match = end_subject + 1; +#if PCRE2_CODE_UNIT_WIDTH != 32 + while (start_match < true_end_subject && NOT_FIRSTCU(*start_match)) + start_match++; +#endif + + /* If we have hit the end of the subject, there isn't another non-empty + fragment, so give up. */ + + if (start_match >= true_end_subject) + { + rc = MATCH_NOMATCH; /* In case it was partial */ + break; + } + + /* Check the rest of the subject */ + + mb->check_subject = start_match; + rc = PRIV(valid_utf)(start_match, length - (start_match - subject), + &(match_data->startchar)); + + /* The rest of the subject is valid UTF. */ + + if (rc == 0) + { + mb->end_subject = end_subject = true_end_subject; + fragment_options = PCRE2_NOTBOL; + goto FRAGMENT_RESTART; + } + + /* A subsequent UTF error has been found; if the next fragment is + non-empty, set up to process it. Otherwise, let the loop advance. */ + + else if (rc < 0) + { + mb->end_subject = end_subject = start_match + match_data->startchar; + if (end_subject > start_match) + { + fragment_options = PCRE2_NOTBOL|PCRE2_NOTEOL; + goto FRAGMENT_RESTART; + } + } + } + } +#endif /* SUPPORT_UNICODE */ + /* Release an enlarged frame vector that is on the heap. */ if (mb->match_frames != mb->stack_frames) |