diff options
Diffstat (limited to 'thirdparty/pcre2/src/pcre2_match.c')
| -rw-r--r-- | thirdparty/pcre2/src/pcre2_match.c | 603 | 
1 files changed, 456 insertions, 147 deletions
diff --git a/thirdparty/pcre2/src/pcre2_match.c b/thirdparty/pcre2/src/pcre2_match.c index 419561fd64..48e7b9dbb2 100644 --- a/thirdparty/pcre2/src/pcre2_match.c +++ b/thirdparty/pcre2/src/pcre2_match.c @@ -415,8 +415,7 @@ if (caseless)    else  #endif -    /* Not in UTF mode */ - +  /* Not in UTF mode */      {      for (; length > 0; length--)        { @@ -491,27 +490,32 @@ heap is used for a larger vector.  *************************************************/  /* These macros pack up tests that are used for partial matching several times -in the code. We set the "hit end" flag if the pointer is at the end of the -subject and also past the earliest inspected character (i.e. something has been -matched, even if not part of the actual matched string). For hard partial -matching, we then return immediately. The second one is used when we already -know we are past the end of the subject. */ +in the code. The second one is used when we already know we are past the end of +the subject. We set the "hit end" flag if the pointer is at the end of the +subject and either (a) the pointer is past the earliest inspected character +(i.e. something has been matched, even if not part of the actual matched +string), or (b) the pattern contains a lookbehind. These are the conditions for +which adding more characters may allow the current match to continue. + +For hard partial matching, we immediately return a partial match. Otherwise, +carrying on means that a complete match on the current subject will be sought. +A partial match is returned only if no complete match can be found. 
*/  #define CHECK_PARTIAL()\ -  if (mb->partial != 0 && Feptr >= mb->end_subject && \ -      Feptr > mb->start_used_ptr) \ +  if (Feptr >= mb->end_subject) \      { \ -    mb->hitend = TRUE; \ -    if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \ +    SCHECK_PARTIAL(); \      }  #define SCHECK_PARTIAL()\ -  if (mb->partial != 0 && Feptr > mb->start_used_ptr) \ +  if (mb->partial != 0 && \ +      (Feptr > mb->start_used_ptr || mb->allowemptypartial)) \      { \      mb->hitend = TRUE; \      if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; \      } +  /* These macros are used to implement backtracking. They simulate a recursive  call to the match() function by means of a local vector of frames which  remember the backtracking points. */ @@ -5127,6 +5131,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode);      case OP_ASSERT:      case OP_ASSERTBACK: +    case OP_ASSERT_NA: +    case OP_ASSERTBACK_NA:      Lframe_type = GF_NOCAPTURE | Fop;      for (;;)        { @@ -5412,7 +5418,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);        {        while (number-- > 0)          { -        if (Feptr <= mb->start_subject) RRETURN(MATCH_NOMATCH); +        if (Feptr <= mb->check_subject) RRETURN(MATCH_NOMATCH);          Feptr--;          BACKCHAR(Feptr);          } @@ -5420,7 +5426,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);      else  #endif -    /* No UTF-8 support, or not in UTF-8 mode: count is byte count */ +    /* No UTF-8 support, or not in UTF-8 mode: count is code unit count */        {        if ((ptrdiff_t)number > Feptr - mb->start_subject) RRETURN(MATCH_NOMATCH); @@ -5472,15 +5478,16 @@ fprintf(stderr, "++ op=%d\n", *Fecode);        /* If we are at the end of an assertion that is a condition, return a        match, discarding any intermediate backtracking points. Copy back the -      captures into the frame before N so that they are set on return. Doing -      this for all assertions, both positive and negative, seems to match what -      Perl does. 
*/ +      mark setting and the captures into the frame before N so that they are +      set on return. Doing this for all assertions, both positive and negative, +      seems to match what Perl does. */        if (GF_IDMASK(N->group_frame_type) == GF_CONDASSERT)          {          memcpy((char *)P + offsetof(heapframe, ovector), Fovector,            Foffset_top * sizeof(PCRE2_SIZE));          P->offset_top = Foffset_top; +        P->mark = Fmark;          Fback_frame = (char *)F - (char *)P;          RRETURN(MATCH_MATCH);          } @@ -5496,10 +5503,20 @@ fprintf(stderr, "++ op=%d\n", *Fecode);        case OP_SCOND:        break; -      /* Positive assertions are like OP_ONCE, except that in addition the +      /* Non-atomic positive assertions are like OP_BRA, except that the        subject pointer must be put back to where it was at the start of the        assertion. */ +      case OP_ASSERT_NA: +      case OP_ASSERTBACK_NA: +      if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; +      Feptr = P->eptr; +      break; + +      /* Atomic positive assertions are like OP_ONCE, except that in addition +      the subject pointer must be put back to where it was at the start of the +      assertion. */ +        case OP_ASSERT:        case OP_ASSERTBACK:        if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; @@ -5640,7 +5657,11 @@ fprintf(stderr, "++ op=%d\n", *Fecode);      case OP_EOD:      if (Feptr < mb->end_subject) RRETURN(MATCH_NOMATCH); -    SCHECK_PARTIAL(); +    if (mb->partial != 0) +      { +      mb->hitend = TRUE; +      if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; +      }      Fecode++;      break; @@ -5665,7 +5686,11 @@ fprintf(stderr, "++ op=%d\n", *Fecode);      /* Either at end of string or \n before end. 
*/ -    SCHECK_PARTIAL(); +    if (mb->partial != 0) +      { +      mb->hitend = TRUE; +      if (mb->partial > 1) return PCRE2_ERROR_PARTIAL; +      }      Fecode++;      break; @@ -5743,7 +5768,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode);      case OP_NOT_WORD_BOUNDARY:      case OP_WORD_BOUNDARY: -    if (Feptr == mb->start_subject) prev_is_word = FALSE; else +    if (Feptr == mb->check_subject) prev_is_word = FALSE; else        {        PCRE2_SPTR lastptr = Feptr - 1;  #ifdef SUPPORT_UNICODE @@ -5946,6 +5971,7 @@ in rrc. */  #define LBL(val) case val: goto L_RM##val;  RETURN_SWITCH: +if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr;  if (Frdepth == 0) return rrc;                     /* Exit from the top level */  F = (heapframe *)((char *)F - Fback_frame);       /* Backtrack */  mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */ @@ -5999,9 +6025,9 @@ Arguments:  Returns:          > 0 => success; value is the number of ovector pairs filled                    = 0 => success, but ovector is not big enough -                   -1 => failed to match (PCRE2_ERROR_NOMATCH) -                   -2 => partial match (PCRE2_ERROR_PARTIAL) -                 < -2 => some kind of unexpected problem +                  = -1 => failed to match (PCRE2_ERROR_NOMATCH) +                  = -2 => partial match (PCRE2_ERROR_PARTIAL) +                  < -2 => some kind of unexpected problem  */  PCRE2_EXP_DEFN int PCRE2_CALL_CONVENTION @@ -6014,7 +6040,6 @@ int was_zero_terminated = 0;  const uint8_t *start_bits = NULL;  const pcre2_real_code *re = (const pcre2_real_code *)code; -  BOOL anchored;  BOOL firstline;  BOOL has_first_cu = FALSE; @@ -6022,6 +6047,11 @@ BOOL has_req_cu = FALSE;  BOOL startline;  BOOL utf; +#if PCRE2_CODE_UNIT_WIDTH == 8 +BOOL memchr_not_found_first_cu = FALSE; +BOOL memchr_not_found_first_cu2 = FALSE; +#endif +  PCRE2_UCHAR first_cu = 0;  PCRE2_UCHAR first_cu2 = 0;  PCRE2_UCHAR req_cu = 0; @@ -6029,10 +6059,23 @@ PCRE2_UCHAR 
req_cu2 = 0;  PCRE2_SPTR bumpalong_limit;  PCRE2_SPTR end_subject; +PCRE2_SPTR true_end_subject;  PCRE2_SPTR start_match = subject + start_offset;  PCRE2_SPTR req_cu_ptr = start_match - 1; -PCRE2_SPTR start_partial = NULL; -PCRE2_SPTR match_partial = NULL; +PCRE2_SPTR start_partial; +PCRE2_SPTR match_partial; + +#ifdef SUPPORT_JIT +BOOL use_jit; +#endif + +#ifdef SUPPORT_UNICODE +BOOL allow_invalid; +uint32_t fragment_options = 0; +#ifdef SUPPORT_JIT +BOOL jit_checked_utf = FALSE; +#endif +#endif  PCRE2_SIZE frame_size; @@ -6059,7 +6102,7 @@ if (length == PCRE2_ZERO_TERMINATED)    length = PRIV(strlen)(subject);    was_zero_terminated = 1;    } -end_subject = subject + length; +true_end_subject = end_subject = subject + length;  /* Plausibility checks */ @@ -6095,12 +6138,24 @@ options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));  #undef FF  #undef OO -/* These two settings are used in the code for checking a UTF string that -follows immediately afterwards. Other values in the mb block are used only -during interpretive processing, not when the JIT support is in use, so they are -set up later. */ +/* If the pattern was successfully studied with JIT support, we will run the +JIT executable instead of the rest of this function. Most options must be set +at compile time for the JIT code to be usable. */ + +#ifdef SUPPORT_JIT +use_jit = (re->executable_jit != NULL && +          (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0); +#endif + +/* Initialize UTF parameters. */  utf = (re->overall_options & PCRE2_UTF) != 0; +#ifdef SUPPORT_UNICODE +allow_invalid = (re->overall_options & PCRE2_MATCH_INVALID_UTF) != 0; +#endif + +/* Convert the partial matching flags into an integer. */ +  mb->partial = ((options & PCRE2_PARTIAL_HARD) != 0)? 2 :                ((options & PCRE2_PARTIAL_SOFT) != 0)? 
1 : 0; @@ -6111,88 +6166,107 @@ if (mb->partial != 0 &&     ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)    return PCRE2_ERROR_BADOPTION; -/* Check a UTF string for validity if required. For 8-bit and 16-bit strings, -we must also check that a starting offset does not point into the middle of a -multiunit character. We check only the portion of the subject that is going to -be inspected during matching - from the offset minus the maximum back reference -to the given length. This saves time when a small part of a large subject is -being matched by the use of a starting offset. Note that the maximum lookbehind -is a number of characters, not code units. */ +/* It is an error to set an offset limit without setting the flag at compile +time. */ -#ifdef SUPPORT_UNICODE -if (utf && (options & PCRE2_NO_UTF_CHECK) == 0) +if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET && +     (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0) +  return PCRE2_ERROR_BADOFFSETLIMIT; + +/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT, +free the memory that was obtained. Set the field to NULL for no match cases. */ + +if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)    { -  PCRE2_SPTR check_subject = start_match;  /* start_match includes offset */ +  match_data->memctl.free((void *)match_data->subject, +    match_data->memctl.memory_data); +  match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT; +  } +match_data->subject = NULL; + +/* Zero the error offset in case the first code unit is invalid UTF. */ + +match_data->startchar = 0; + + +/* ============================= JIT matching ============================== */ + +/* Prepare for JIT matching. Check a UTF string for validity unless no check is +requested or invalid UTF can be handled. We check only the portion of the +subject that might be inspected during matching - from the offset minus the +maximum lookbehind to the given length. 
This saves time when a small part of a +large subject is being matched by the use of a starting offset. Note that the +maximum lookbehind is a number of characters, not code units. */ -  if (start_offset > 0) +#ifdef SUPPORT_JIT +if (use_jit) +  { +#ifdef SUPPORT_UNICODE +  if (utf && (options & PCRE2_NO_UTF_CHECK) == 0 && !allow_invalid)      {  #if PCRE2_CODE_UNIT_WIDTH != 32      unsigned int i; +#endif + +    /* For 8-bit and 16-bit UTF, check that the first code unit is a valid +    character start. */ + +#if PCRE2_CODE_UNIT_WIDTH != 32      if (start_match < end_subject && NOT_FIRSTCU(*start_match)) -      return PCRE2_ERROR_BADUTFOFFSET; -    for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)        { -      check_subject--; -      while (check_subject > subject && +      if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET; +#if PCRE2_CODE_UNIT_WIDTH == 8 +      return PCRE2_ERROR_UTF8_ERR20;  /* Isolated 0x80 byte */ +#else +      return PCRE2_ERROR_UTF16_ERR3;  /* Isolated low surrogate */ +#endif +      } +#endif  /* WIDTH != 32 */ + +    /* Move back by the maximum lookbehind, just in case it happens at the very +    start of matching. */ + +#if PCRE2_CODE_UNIT_WIDTH != 32 +    for (i = re->max_lookbehind; i > 0 && start_match > subject; i--) +      { +      start_match--; +      while (start_match > subject &&  #if PCRE2_CODE_UNIT_WIDTH == 8 -      (*check_subject & 0xc0) == 0x80) +      (*start_match & 0xc0) == 0x80)  #else  /* 16-bit */ -      (*check_subject & 0xfc00) == 0xdc00) -#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */ -        check_subject--; +      (*start_match & 0xfc00) == 0xdc00) +#endif +        start_match--;        } -#else +#else  /* PCRE2_CODE_UNIT_WIDTH != 32 */ +      /* In the 32-bit library, one code unit equals one character. However,      we cannot just subtract the lookbehind and then compare pointers, because      a very large lookbehind could create an invalid pointer. 
*/      if (start_offset >= re->max_lookbehind) -      check_subject -= re->max_lookbehind; +      start_match -= re->max_lookbehind;      else -      check_subject = subject; +      start_match = subject;  #endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */ -    } -  /* Validate the relevant portion of the subject. After an error, adjust the -  offset to be an absolute offset in the whole string. */ +    /* Validate the relevant portion of the subject. Adjust the offset of an +    invalid code point to be an absolute offset in the whole string. */ -  match_data->rc = PRIV(valid_utf)(check_subject, -    length - (check_subject - subject), &(match_data->startchar)); -  if (match_data->rc != 0) -    { -    match_data->startchar += check_subject - subject; -    return match_data->rc; +    match_data->rc = PRIV(valid_utf)(start_match, +      length - (start_match - subject), &(match_data->startchar)); +    if (match_data->rc != 0) +      { +      match_data->startchar += start_match - subject; +      return match_data->rc; +      } +    jit_checked_utf = TRUE;      } -  }  #endif  /* SUPPORT_UNICODE */ -/* It is an error to set an offset limit without setting the flag at compile -time. */ - -if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET && -     (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0) -  return PCRE2_ERROR_BADOFFSETLIMIT; - -/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT, -free the memory that was obtained. Set the field to NULL for no match cases. */ +  /* If JIT returns BADOPTION, which means that the selected complete or +  partial matching mode was not compiled, fall through to the interpreter. 
*/ -if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0) -  { -  match_data->memctl.free((void *)match_data->subject, -    match_data->memctl.memory_data); -  match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT; -  } -match_data->subject = NULL; - -/* If the pattern was successfully studied with JIT support, run the JIT -executable instead of the rest of this function. Most options must be set at -compile time for the JIT code to be usable. Fallback to the normal code path if -an unsupported option is set or if JIT returns BADOPTION (which means that the -selected normal or partial matching mode was not compiled). */ - -#ifdef SUPPORT_JIT -if (re->executable_jit != NULL && (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0) -  {    rc = pcre2_jit_match(code, subject, length, start_offset, options,      match_data, mcontext);    if (rc != PCRE2_ERROR_JIT_BADOPTION) @@ -6209,10 +6283,152 @@ if (re->executable_jit != NULL && (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0)      return rc;      }    } +#endif  /* SUPPORT_JIT */ + +/* ========================= End of JIT matching ========================== */ + + +/* Proceed with non-JIT matching. The default is to allow lookbehinds to the +start of the subject. A UTF check when there is a non-zero offset may change +this. */ + +mb->check_subject = subject; + +/* If a UTF subject string was not checked for validity in the JIT code above, +check it here, and handle support for invalid UTF strings. The check above +happens only when invalid UTF is not supported and PCRE2_NO_UTF_CHECK is unset. +If we get here in those circumstances, it means the subject string is valid, +but for some reason JIT matching was not successful. There is no need to check +the subject again. + +We check only the portion of the subject that might be inspected during +matching - from the offset minus the maximum lookbehind to the given length. +This saves time when a small part of a large subject is being matched by the +use of a starting offset. 
Note that the maximum lookbehind is a number of +characters, not code units. + +Note also that support for invalid UTF forces a check, overriding the setting +of PCRE2_NO_UTF_CHECK. */ + +#ifdef SUPPORT_UNICODE +if (utf && +#ifdef SUPPORT_JIT +    !jit_checked_utf && +#endif +    ((options & PCRE2_NO_UTF_CHECK) == 0 || allow_invalid)) +  { +#if PCRE2_CODE_UNIT_WIDTH != 32 +  BOOL skipped_bad_start = FALSE; +#endif + +  /* For 8-bit and 16-bit UTF, check that the first code unit is a valid +  character start. If we are handling invalid UTF, just skip over such code +  units. Otherwise, give an appropriate error. */ + +#if PCRE2_CODE_UNIT_WIDTH != 32 +  if (allow_invalid) +    { +    while (start_match < end_subject && NOT_FIRSTCU(*start_match)) +      { +      start_match++; +      skipped_bad_start = TRUE; +      } +    } +  else if (start_match < end_subject && NOT_FIRSTCU(*start_match)) +    { +    if (start_offset > 0) return PCRE2_ERROR_BADUTFOFFSET; +#if PCRE2_CODE_UNIT_WIDTH == 8 +    return PCRE2_ERROR_UTF8_ERR20;  /* Isolated 0x80 byte */ +#else +    return PCRE2_ERROR_UTF16_ERR3;  /* Isolated low surrogate */ +#endif +    } +#endif  /* WIDTH != 32 */ + +  /* The mb->check_subject field points to the start of UTF checking; +  lookbehinds can go back no further than this. */ + +  mb->check_subject = start_match; + +  /* Move back by the maximum lookbehind, just in case it happens at the very +  start of matching, but don't do this if we skipped bad 8-bit or 16-bit code +  units above. 
*/ + +#if PCRE2_CODE_UNIT_WIDTH != 32 +  if (!skipped_bad_start) +    { +    unsigned int i; +    for (i = re->max_lookbehind; i > 0 && mb->check_subject > subject; i--) +      { +      mb->check_subject--; +      while (mb->check_subject > subject && +#if PCRE2_CODE_UNIT_WIDTH == 8 +      (*mb->check_subject & 0xc0) == 0x80) +#else  /* 16-bit */ +      (*mb->check_subject & 0xfc00) == 0xdc00) +#endif +        mb->check_subject--; +      } +    } +#else  /* PCRE2_CODE_UNIT_WIDTH != 32 */ + +  /* In the 32-bit library, one code unit equals one character. However, +  we cannot just subtract the lookbehind and then compare pointers, because +  a very large lookbehind could create an invalid pointer. */ + +  if (start_offset >= re->max_lookbehind) +    mb->check_subject -= re->max_lookbehind; +  else +    mb->check_subject = subject; +#endif  /* PCRE2_CODE_UNIT_WIDTH != 32 */ + +  /* Validate the relevant portion of the subject. There's a loop in case we +  encounter bad UTF in the characters preceding start_match which we are +  scanning because of a lookbehind. */ + +  for (;;) +    { +    match_data->rc = PRIV(valid_utf)(mb->check_subject, +      length - (mb->check_subject - subject), &(match_data->startchar)); + +    if (match_data->rc == 0) break;   /* Valid UTF string */ + +    /* Invalid UTF string. Adjust the offset to be an absolute offset in the +    whole string. If we are handling invalid UTF strings, set end_subject to +    stop before the bad code unit, and set the options to "not end of line". +    Otherwise return the error. */ + +    match_data->startchar += mb->check_subject - subject; +    if (!allow_invalid || match_data->rc > 0) return match_data->rc; +    end_subject = subject + match_data->startchar; + +    /* If the end precedes start_match, it means there is invalid UTF in the +    extra code units we reversed over because of a lookbehind. 
Advance past the +    first bad code unit, and then skip invalid character starting code units in +    8-bit and 16-bit modes, and try again. */ + +    if (end_subject < start_match) +      { +      mb->check_subject = end_subject + 1; +#if PCRE2_CODE_UNIT_WIDTH != 32 +      while (mb->check_subject < start_match && NOT_FIRSTCU(*mb->check_subject)) +        mb->check_subject++;  #endif +      } + +    /* Otherwise, set the not end of line option, and do the match. */ + +    else +      { +      fragment_options = PCRE2_NOTEOL; +      break; +      } +    } +  } +#endif  /* SUPPORT_UNICODE */ -/* Carry on with non-JIT matching. A NULL match context means "use a default -context", but we take the memory control functions from the pattern. */ +/* A NULL match context means "use a default context", but we take the memory +control functions from the pattern. */  if (mcontext == NULL)    { @@ -6224,8 +6440,8 @@ else mb->memctl = mcontext->memctl;  anchored = ((re->overall_options | options) & PCRE2_ANCHORED) != 0;  firstline = (re->overall_options & PCRE2_FIRSTLINE) != 0;  startline = (re->flags & PCRE2_STARTLINE) != 0; -bumpalong_limit =  (mcontext->offset_limit == PCRE2_UNSET)? -  end_subject : subject + mcontext->offset_limit; +bumpalong_limit = (mcontext->offset_limit == PCRE2_UNSET)? +  true_end_subject : subject + mcontext->offset_limit;  /* Initialize and set up the fixed fields in the callout block, with a pointer  in the match block. */ @@ -6236,7 +6452,8 @@ cb.subject = subject;  cb.subject_length = (PCRE2_SIZE)(end_subject - subject);  cb.callout_flags = 0; -/* Fill in the remaining fields in the match block. */ +/* Fill in the remaining fields in the match block, except for moptions, which +gets set later. 
*/  mb->callout = mcontext->callout;  mb->callout_data = mcontext->callout_data; @@ -6245,13 +6462,11 @@ mb->start_subject = subject;  mb->start_offset = start_offset;  mb->end_subject = end_subject;  mb->hasthen = (re->flags & PCRE2_HASTHEN) != 0; - -mb->moptions = options;                 /* Match options */ -mb->poptions = re->overall_options;     /* Pattern options */ - +mb->allowemptypartial = (re->max_lookbehind > 0) || +    (re->flags & PCRE2_MATCH_EMPTY) != 0; +mb->poptions = re->overall_options;          /* Pattern options */  mb->ignore_skip_arg = 0; -mb->mark = mb->nomatch_mark = NULL;     /* In case never set */ -mb->hitend = FALSE; +mb->mark = mb->nomatch_mark = NULL;          /* In case never set */  /* The name table is needed for finding all the numbers associated with a  given name, for condition testing. The code follows the name table. */ @@ -6404,6 +6619,13 @@ if ((re->flags & PCRE2_LASTSET) != 0)  /* Loop for handling unanchored repeated matching attempts; for anchored regexs  the loop runs just once. */ +#ifdef SUPPORT_UNICODE +FRAGMENT_RESTART: +#endif + +start_partial = match_partial = NULL; +mb->hitend = FALSE; +  for(;;)    {    PCRE2_SPTR new_start_match; @@ -6473,7 +6695,10 @@ for(;;)      /* Not anchored. Advance to a unique first code unit if there is one. In      8-bit mode, the use of memchr() gives a big speed up, even though we have      to call it twice in caseless mode, in order to find the earliest occurrence -    of the character in either of its cases. */ +    of the character in either of its cases. If a call to memchr() that +    searches the rest of the subject fails to find one case, remember that in +    order not to keep on repeating the search. This can make a huge difference +    when the strings are very long and only one case is present. 
*/      else        { @@ -6487,11 +6712,29 @@ for(;;)                  (smc = UCHAR21TEST(start_match)) != first_cu &&                    smc != first_cu2)              start_match++; +  #else  /* 8-bit code units */ -          PCRE2_SPTR pp1 = -            memchr(start_match, first_cu, end_subject-start_match); -          PCRE2_SPTR pp2 = -            memchr(start_match, first_cu2, end_subject-start_match); +          PCRE2_SPTR pp1 = NULL; +          PCRE2_SPTR pp2 = NULL; +          PCRE2_SIZE cu2size = end_subject - start_match; + +          if (!memchr_not_found_first_cu) +            { +            pp1 = memchr(start_match, first_cu, end_subject - start_match); +            if (pp1 == NULL) memchr_not_found_first_cu = TRUE; +              else cu2size = pp1 - start_match; +            } + +          /* If pp1 is not NULL, we have arranged to search only as far as pp1, +          to see if the other case is earlier, so we can set "not found" only +          when both searches have returned NULL. */ + +          if (!memchr_not_found_first_cu2) +            { +            pp2 = memchr(start_match, first_cu2, cu2size); +            memchr_not_found_first_cu2 = (pp2 == NULL && pp1 == NULL); +            } +            if (pp1 == NULL)              start_match = (pp2 == NULL)? end_subject : pp2;            else @@ -6523,7 +6766,7 @@ for(;;)          we also let the cycle run, because the matching string is legitimately          allowed to start with the first code unit of a newline. */ -        if (!mb->partial && start_match >= mb->end_subject) +        if (mb->partial == 0 && start_match >= mb->end_subject)            {            rc = MATCH_NOMATCH;            break; @@ -6582,7 +6825,7 @@ for(;;)          /* See comment above in first_cu checking about the next few lines. 
*/ -        if (!mb->partial && start_match >= mb->end_subject) +        if (mb->partial == 0 && start_match >= mb->end_subject)            {            rc = MATCH_NOMATCH;            break; @@ -6596,8 +6839,10 @@ for(;;)      /* The following two optimizations must be disabled for partial matching. */ -    if (!mb->partial) +    if (mb->partial == 0)        { +      PCRE2_SPTR p; +        /* The minimum matching length is a lower bound; no string of that length        may actually match the pattern. Although the value is, strictly, in        characters, we treat it as code units to avoid spending too much time in @@ -6621,60 +6866,57 @@ for(;;)        memchr() twice in the caseless case because we only need to check for the        presence of the character in either case, not find the first occurrence. +      The search can be skipped if the code unit was found later than the +      current starting point in a previous iteration of the bumpalong loop. +        HOWEVER: when the subject string is very, very long, searching to its end        can take a long time, and give bad performance on quite ordinary -      patterns. This showed up when somebody was matching something like -      /^\d+C/ on a 32-megabyte string... so we don't do this when the string is -      sufficiently long. */ +      anchored patterns. This showed up when somebody was matching something +      like /^\d+C/ on a 32-megabyte string... so we don't do this when the +      string is sufficiently long, but it's worth searching a lot more for +      unanchored patterns. */ -      if (has_req_cu && end_subject - start_match < REQ_CU_MAX) +      p = start_match + (has_first_cu? 1:0); +      if (has_req_cu && p > req_cu_ptr)          { -        PCRE2_SPTR p = start_match + (has_first_cu? 1:0); - -        /* We don't need to repeat the search if we haven't yet reached the -        place we found it last time round the bumpalong loop. 
*/ +        PCRE2_SIZE check_length = end_subject - start_match; -        if (p > req_cu_ptr) +        if (check_length < REQ_CU_MAX || +              (!anchored && check_length < REQ_CU_MAX * 1000))            { -          if (p < end_subject) +          if (req_cu != req_cu2)  /* Caseless */              { -            if (req_cu != req_cu2)  /* Caseless */ -              {  #if PCRE2_CODE_UNIT_WIDTH != 8 -              do -                { -                uint32_t pp = UCHAR21INCTEST(p); -                if (pp == req_cu || pp == req_cu2) { p--; break; } -                } -              while (p < end_subject); - +            while (p < end_subject) +              { +              uint32_t pp = UCHAR21INCTEST(p); +              if (pp == req_cu || pp == req_cu2) { p--; break; } +              }  #else  /* 8-bit code units */ -              PCRE2_SPTR pp = p; -              p = memchr(pp, req_cu, end_subject - pp); -              if (p == NULL) -                { -                p = memchr(pp, req_cu2, end_subject - pp); -                if (p == NULL) p = end_subject; -                } -#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ +            PCRE2_SPTR pp = p; +            p = memchr(pp, req_cu, end_subject - pp); +            if (p == NULL) +              { +              p = memchr(pp, req_cu2, end_subject - pp); +              if (p == NULL) p = end_subject;                } +#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */ +            } -            /* The caseful case */ +          /* The caseful case */ -            else -              { +          else +            {  #if PCRE2_CODE_UNIT_WIDTH != 8 -              do -                { -                if (UCHAR21INCTEST(p) == req_cu) { p--; break; } -                } -              while (p < end_subject); +            while (p < end_subject) +              { +              if (UCHAR21INCTEST(p) == req_cu) { p--; break; } +              }  #else  /* 8-bit code units */ -              p = memchr(p, req_cu, 
end_subject - p); -              if (p == NULL) p = end_subject; +            p = memchr(p, req_cu, end_subject - p); +            if (p == NULL) p = end_subject;  #endif -              }              }            /* If we can't find the required code unit, break the bumpalong loop, @@ -6714,6 +6956,11 @@ for(;;)    mb->start_used_ptr = start_match;    mb->last_used_ptr = start_match; +#ifdef SUPPORT_UNICODE +  mb->moptions = options | fragment_options; +#else +  mb->moptions = options; +#endif    mb->match_call_count = 0;    mb->end_offset_top = 0;    mb->skip_arg_count = 0; @@ -6839,6 +7086,68 @@ for(;;)  ENDLOOP: +/* If end_subject != true_end_subject, it means we are handling invalid UTF, +and have just processed a non-terminal fragment. If this resulted in no match +or a partial match we must carry on to the next fragment (a partial match is +returned to the caller only at the very end of the subject). A loop is used to +avoid trying to match against empty fragments; if the pattern can match an +empty string it would have done so already. */ + +#ifdef SUPPORT_UNICODE +if (utf && end_subject != true_end_subject && +    (rc == MATCH_NOMATCH || rc == PCRE2_ERROR_PARTIAL)) +  { +  for (;;) +    { +    /* Advance past the first bad code unit, and then skip invalid character +    starting code units in 8-bit and 16-bit modes. */ + +    start_match = end_subject + 1; +#if PCRE2_CODE_UNIT_WIDTH != 32 +    while (start_match < true_end_subject && NOT_FIRSTCU(*start_match)) +      start_match++; +#endif + +    /* If we have hit the end of the subject, there isn't another non-empty +    fragment, so give up. 
*/ + +    if (start_match >= true_end_subject) +      { +      rc = MATCH_NOMATCH;  /* In case it was partial */ +      break; +      } + +    /* Check the rest of the subject */ + +    mb->check_subject = start_match; +    rc = PRIV(valid_utf)(start_match, length - (start_match - subject), +      &(match_data->startchar)); + +    /* The rest of the subject is valid UTF. */ + +    if (rc == 0) +      { +      mb->end_subject = end_subject = true_end_subject; +      fragment_options = PCRE2_NOTBOL; +      goto FRAGMENT_RESTART; +      } + +    /* A subsequent UTF error has been found; if the next fragment is +    non-empty, set up to process it. Otherwise, let the loop advance. */ + +    else if (rc < 0) +      { +      mb->end_subject = end_subject = start_match + match_data->startchar; +      if (end_subject > start_match) +        { +        fragment_options = PCRE2_NOTBOL|PCRE2_NOTEOL; +        goto FRAGMENT_RESTART; +        } +      } +    } +  } +#endif  /* SUPPORT_UNICODE */ +  /* Release an enlarged frame vector that is on the heap. */  if (mb->match_frames != mb->stack_frames)  |