diff options
Diffstat (limited to 'thirdparty/pcre2/src/pcre2_compile.c')
| -rw-r--r-- | thirdparty/pcre2/src/pcre2_compile.c | 197 | 
1 files changed, 94 insertions, 103 deletions
| diff --git a/thirdparty/pcre2/src/pcre2_compile.c b/thirdparty/pcre2/src/pcre2_compile.c index f2e6b6b5bd..e811f12f02 100644 --- a/thirdparty/pcre2/src/pcre2_compile.c +++ b/thirdparty/pcre2/src/pcre2_compile.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.                         Written by Philip Hazel       Original API code Copyright (c) 1997-2012 University of Cambridge -          New API code Copyright (c) 2016-2019 University of Cambridge +          New API code Copyright (c) 2016-2020 University of Cambridge  -----------------------------------------------------------------------------  Redistribution and use in source and binary forms, with or without @@ -1202,7 +1202,7 @@ in the decoded tables. */  if ((code->flags & PCRE2_DEREF_TABLES) != 0)    { -  ref_count = (PCRE2_SIZE *)(code->tables + tables_length); +  ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);    (*ref_count)++;    } @@ -1232,15 +1232,15 @@ if (newcode == NULL) return NULL;  memcpy(newcode, code, code->blocksize);  newcode->executable_jit = NULL; -newtables = code->memctl.malloc(tables_length + sizeof(PCRE2_SIZE), +newtables = code->memctl.malloc(TABLES_LENGTH + sizeof(PCRE2_SIZE),    code->memctl.memory_data);  if (newtables == NULL)    {    code->memctl.free((void *)newcode, code->memctl.memory_data);    return NULL;    } -memcpy(newtables, code->tables, tables_length); -ref_count = (PCRE2_SIZE *)(newtables + tables_length); +memcpy(newtables, code->tables, TABLES_LENGTH); +ref_count = (PCRE2_SIZE *)(newtables + TABLES_LENGTH);  *ref_count = 1;  newcode->tables = newtables; @@ -1270,7 +1270,7 @@ if (code != NULL)      be freed when there are no more references to them. The *ref_count should      always be > 0. */ -    ref_count = (PCRE2_SIZE *)(code->tables + tables_length); +    ref_count = (PCRE2_SIZE *)(code->tables + TABLES_LENGTH);      if (*ref_count > 0)        {        (*ref_count)--; @@ -2344,7 +2344,7 @@ if (ptr > *nameptr + MAX_NAME_SIZE)    *errorcodeptr = ERR48;    goto FAILED;    } -*namelenptr = ptr - *nameptr; +*namelenptr = (uint32_t)(ptr - *nameptr);  /* Subpattern names must not be empty, and their terminator is checked here.  (What follows a verb or alpha assertion name is checked separately.) */ @@ -3653,7 +3653,7 @@ while (ptr < ptrend)      if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;      /* If ( is not followed by ? it is either a capture or a special verb or an -    alpha assertion. */ +    alpha assertion or a positive non-atomic lookahead. */      if (*ptr != CHAR_QUESTION_MARK)        { @@ -3685,10 +3685,10 @@ while (ptr < ptrend)          break;        /* Handle "alpha assertions" such as (*pla:...). Most of these are -      synonyms for the historical symbolic assertions, but the script run ones -      are new. They are distinguished by starting with a lower case letter. -      Checking both ends of the alphabet makes this work in all character -      codes. */ +      synonyms for the historical symbolic assertions, but the script run and +      non-atomic lookaround ones are new. They are distinguished by starting +      with a lower case letter. Checking both ends of the alphabet makes this +      work in all character codes. */        else if (CHMAX_255(c) && (cb->ctypes[c] & ctype_lcletter) != 0)          { @@ -3747,9 +3747,7 @@ while (ptr < ptrend)            goto POSITIVE_LOOK_AHEAD;            case META_LOOKAHEAD_NA: -          *parsed_pattern++ = meta; -          ptr++; -          goto POST_ASSERTION; +          goto POSITIVE_NONATOMIC_LOOK_AHEAD;            case META_LOOKAHEADNOT:            goto NEGATIVE_LOOK_AHEAD; @@ -4333,6 +4331,7 @@ while (ptr < ptrend)            {            if (++ptr >= ptrend || !IS_DIGIT(*ptr)) goto BAD_VERSION_CONDITION;            minor = (*ptr++ - CHAR_0) * 10; +          if (ptr >= ptrend) goto BAD_VERSION_CONDITION;            if (IS_DIGIT(*ptr)) minor += *ptr++ - CHAR_0;            if (ptr >= ptrend || *ptr != CHAR_RIGHT_PARENTHESIS)              goto BAD_VERSION_CONDITION; @@ -4438,6 +4437,12 @@ while (ptr < ptrend)        ptr++;        goto POST_ASSERTION; +      case CHAR_ASTERISK: +      POSITIVE_NONATOMIC_LOOK_AHEAD:         /* Come from (?* */ +      *parsed_pattern++ = META_LOOKAHEAD_NA; +      ptr++; +      goto POST_ASSERTION; +        case CHAR_EXCLAMATION_MARK:        NEGATIVE_LOOK_AHEAD:                   /* Come from (*nla: */        *parsed_pattern++ = META_LOOKAHEADNOT; @@ -4447,20 +4452,23 @@ while (ptr < ptrend)        /* ---- Lookbehind assertions ---- */ -      /* (?< followed by = or ! is a lookbehind assertion. Otherwise (?< is the -      start of the name of a capturing group. */ +      /* (?< followed by = or ! or * is a lookbehind assertion. Otherwise (?< +      is the start of the name of a capturing group. */        case CHAR_LESS_THAN_SIGN:        if (ptrend - ptr <= 1 || -         (ptr[1] != CHAR_EQUALS_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK)) +         (ptr[1] != CHAR_EQUALS_SIGN && +          ptr[1] != CHAR_EXCLAMATION_MARK && +          ptr[1] != CHAR_ASTERISK))          {          terminator = CHAR_GREATER_THAN_SIGN;          goto DEFINE_NAME;          }        *parsed_pattern++ = (ptr[1] == CHAR_EQUALS_SIGN)? -        META_LOOKBEHIND : META_LOOKBEHINDNOT; +        META_LOOKBEHIND : (ptr[1] == CHAR_EXCLAMATION_MARK)? +        META_LOOKBEHINDNOT : META_LOOKBEHIND_NA; -      POST_LOOKBEHIND:              /* Come from (*plb: (*naplb: and (*nlb: */ +      POST_LOOKBEHIND:           /* Come from (*plb: (*naplb: and (*nlb: */        *has_lookbehind = TRUE;        offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);        PUTOFFSET(offset, parsed_pattern); @@ -4633,8 +4641,6 @@ while (ptr < ptrend)          *parsed_pattern++ = META_KET;          } - -        if (top_nest == (nest_save *)(cb->start_workspace)) top_nest = NULL;          else top_nest--;        } @@ -4899,7 +4905,7 @@ range. */  if ((options & PCRE2_CASELESS) != 0)    {  #ifdef SUPPORT_UNICODE -  if ((options & PCRE2_UTF) != 0) +  if ((options & (PCRE2_UTF|PCRE2_UCP)) != 0)      {      int rc;      uint32_t oc, od; @@ -5314,7 +5320,8 @@ dynamically as we process the pattern. */  #ifdef SUPPORT_UNICODE  BOOL utf = (options & PCRE2_UTF) != 0; -#else  /* No UTF support */ +BOOL ucp = (options & PCRE2_UCP) != 0; +#else  /* No Unicode support */  BOOL utf = FALSE;  #endif @@ -5559,12 +5566,12 @@ for (;; pptr++)        zerofirstcu = firstcu;        zerofirstcuflags = firstcuflags; -      /* For caseless UTF mode, check whether this character has more than -      one other case. If so, generate a special OP_NOTPROP item instead of +      /* For caseless UTF or UCP mode, check whether this character has more +      than one other case. If so, generate a special OP_NOTPROP item instead of        OP_NOTI. */  #ifdef SUPPORT_UNICODE -      if (utf && (options & PCRE2_CASELESS) != 0 && +      if ((utf||ucp) && (options & PCRE2_CASELESS) != 0 &&            (d = UCD_CASESET(c)) != 0)          {          *code++ = OP_NOTPROP; @@ -5597,7 +5604,7 @@ for (;; pptr++)          uint32_t d;  #ifdef SUPPORT_UNICODE -        if (utf && c > 127) d = UCD_OTHERCASE(c); else +        if ((utf || ucp) && c > 127) d = UCD_OTHERCASE(c); else  #endif            {  #if PCRE2_CODE_UNIT_WIDTH != 8 @@ -6671,23 +6678,11 @@ for (;; pptr++)              }            /* For a back reference, update the back reference map and the -          maximum back reference. Then, for each group, we must check to -          see if it is recursive, that is, it is inside the group that it -          references. A flag is set so that the group can be made atomic. -          */ +          maximum back reference. */            cb->backref_map |= (groupnumber < 32)? (1u << groupnumber) : 1;            if (groupnumber > cb->top_backref)              cb->top_backref = groupnumber; - -          for (oc = cb->open_caps; oc != NULL; oc = oc->next) -            { -            if (oc->number == groupnumber) -              { -              oc->flag = TRUE; -              break; -              } -            }            }          } @@ -7081,15 +7076,18 @@ for (;; pptr++)              previous[GET(previous, 1)] != OP_ALT)            goto END_REPEAT; -        /* There is no sense in actually repeating assertions. The only -        potential use of repetition is in cases when the assertion is optional. -        Therefore, if the minimum is greater than zero, just ignore the repeat. -        If the maximum is not zero or one, set it to 1. */ +        /* Perl allows all assertions to be quantified, and when they contain +        capturing parentheses and/or are optional there are potential uses for +        this feature. PCRE2 used to force the maximum quantifier to 1 on the +        invalid grounds that further repetition was never useful. This was +        always a bit pointless, since an assertion could be wrapped with a +        repeated group to achieve the effect. General repetition is now +        permitted, but if the maximum is unlimited it is set to one more than +        the minimum. */          if (op_previous < OP_ONCE)    /* Assertion */            { -          if (repeat_min > 0) goto END_REPEAT; -          if (repeat_max > 1) repeat_max = 1; +          if (repeat_max == REPEAT_UNLIMITED) repeat_max = repeat_min + 1;            }          /* The case of a zero minimum is special because of the need to stick @@ -7682,19 +7680,6 @@ for (;; pptr++)      cb->backref_map |= (meta_arg < 32)? (1u << meta_arg) : 1;      if (meta_arg > cb->top_backref) cb->top_backref = meta_arg; - -    /* Check to see if this back reference is recursive, that it, it -    is inside the group that it references. A flag is set so that the -    group can be made atomic. */ - -    for (oc = cb->open_caps; oc != NULL; oc = oc->next) -      { -      if (oc->number == meta_arg) -        { -        oc->flag = TRUE; -        break; -        } -      }      break; @@ -7840,11 +7825,12 @@ for (;; pptr++)      NORMAL_CHAR_SET:  /* Character is already in meta */      matched_char = TRUE; -    /* For caseless UTF mode, check whether this character has more than one -    other case. If so, generate a special OP_PROP item instead of OP_CHARI. */ +    /* For caseless UTF or UCP mode, check whether this character has more than +    one other case. If so, generate a special OP_PROP item instead of OP_CHARI. +    */  #ifdef SUPPORT_UNICODE -    if (utf && (options & PCRE2_CASELESS) != 0) +    if ((utf||ucp) && (options & PCRE2_CASELESS) != 0)        {        uint32_t caseset = UCD_CASESET(meta);        if (caseset != 0) @@ -8053,7 +8039,6 @@ if (*code == OP_CBRA)    capnumber = GET2(code, 1 + LINK_SIZE);    capitem.number = capnumber;    capitem.next = cb->open_caps; -  capitem.flag = FALSE;    capitem.assert_depth = cb->assert_depth;    cb->open_caps = &capitem;    } @@ -8182,26 +8167,9 @@ for (;;)      PUT(code, 1, (int)(code - start_bracket));      code += 1 + LINK_SIZE; -    /* If it was a capturing subpattern, check to see if it contained any -    recursive back references. If so, we must wrap it in atomic brackets. In -    any event, remove the block from the chain. */ +    /* If it was a capturing subpattern, remove the block from the chain. */ -    if (capnumber > 0) -      { -      if (cb->open_caps->flag) -        { -        (void)memmove(start_bracket + 1 + LINK_SIZE, start_bracket, -          CU2BYTES(code - start_bracket)); -        *start_bracket = OP_ONCE; -        code += 1 + LINK_SIZE; -        PUT(start_bracket, 1, (int)(code - start_bracket)); -        *code = OP_KET; -        PUT(code, 1, (int)(code - start_bracket)); -        code += 1 + LINK_SIZE; -        length += 2 + 2*LINK_SIZE; -        } -      cb->open_caps = cb->open_caps->next; -      } +    if (capnumber > 0) cb->open_caps = cb->open_caps->next;      /* Set values to pass back */ @@ -8836,9 +8804,10 @@ memset(slot + IMM2_SIZE + length, 0,  /* This function is called to skip parts of the parsed pattern when finding the  length of a lookbehind branch. It is called after (*ACCEPT) and (*FAIL) to find -the end of the branch, it is called to skip over an internal lookaround, and it -is also called to skip to the end of a class, during which it will never -encounter nested groups (but there's no need to have special code for that). +the end of the branch, it is called to skip over an internal lookaround or +(DEFINE) group, and it is also called to skip to the end of a class, during +which it will never encounter nested groups (but there's no need to have +special code for that).  When called to find the end of a branch or group, pptr must point to the first  meta code inside the branch, not the branch-starting code. In other cases it @@ -9316,14 +9285,21 @@ for (;; pptr++)      itemlength = grouplength;      break; -    /* Check nested groups - advance past the initial data for each type and -    then seek a fixed length with get_grouplength(). */ +    /* A (DEFINE) group is never obeyed inline and so it does not contribute to +    the length of this branch. Skip from the following item to the next +    unpaired ket. */ + +    case META_COND_DEFINE: +    pptr = parsed_skip(pptr + 1, PSKIP_KET); +    break; + +    /* Check other nested groups - advance past the initial data for each type +    and then seek a fixed length with get_grouplength(). */      case META_COND_NAME:      case META_COND_NUMBER:      case META_COND_RNAME:      case META_COND_RNUMBER: -    case META_COND_DEFINE:      pptr += 2 + SIZEOFFSET;      goto CHECK_GROUP; @@ -9580,6 +9556,10 @@ for (; *pptr != META_END; pptr++)      break;      case META_COND_DEFINE: +    pptr += SIZEOFFSET; +    nestlevel++; +    break; +      case META_COND_NAME:      case META_COND_NUMBER:      case META_COND_RNAME: @@ -9660,6 +9640,7 @@ pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE patlen, uint32_t options,     int *errorptr, PCRE2_SIZE *erroroffset, pcre2_compile_context *ccontext)  {  BOOL utf;                             /* Set TRUE for UTF mode */ +BOOL ucp;                             /* Set TRUE for UCP mode */  BOOL has_lookbehind = FALSE;          /* Set TRUE if a lookbehind is found */  BOOL zero_terminated;                 /* Set TRUE for zero-terminated pattern */  pcre2_real_code *re = NULL;           /* What we will return */ @@ -9947,8 +9928,8 @@ if (utf)  /* Check UCP lockout. */ -if ((cb.external_options & (PCRE2_UCP|PCRE2_NEVER_UCP)) == -    (PCRE2_UCP|PCRE2_NEVER_UCP)) +ucp = (cb.external_options & PCRE2_UCP) != 0; +if (ucp && (cb.external_options & PCRE2_NEVER_UCP) != 0)    {    errorcode = ERR75;    goto HAD_EARLY_ERROR; @@ -10324,7 +10305,7 @@ function call. */  if (errorcode == 0 && (re->overall_options & PCRE2_NO_AUTO_POSSESS) == 0)    {    PCRE2_UCHAR *temp = (PCRE2_UCHAR *)codestart; -  if (PRIV(auto_possessify)(temp, utf, &cb) != 0) errorcode = ERR80; +  if (PRIV(auto_possessify)(temp, &cb) != 0) errorcode = ERR80;    }  /* Failed to compile, or error while post-processing. */ @@ -10372,21 +10353,25 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)      if ((firstcuflags & REQ_CASELESS) != 0)        { -      if (firstcu < 128 || (!utf && firstcu < 255)) +      if (firstcu < 128 || (!utf && !ucp && firstcu < 255))          {          if (cb.fcc[firstcu] != firstcu) re->flags |= PCRE2_FIRSTCASELESS;          } -      /* The first code unit is > 128 in UTF mode, or > 255 otherwise. In -      8-bit UTF mode, codepoints in the range 128-255 are introductory code -      points and cannot have another case. In 16-bit and 32-bit modes, we can -      check wide characters when UTF (and therefore UCP) is supported. */ +      /* The first code unit is > 128 in UTF or UCP mode, or > 255 otherwise. +      In 8-bit UTF mode, codepoints in the range 128-255 are introductory code +      points and cannot have another case, but if UCP is set they may do. */ -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 -      else if (firstcu <= MAX_UTF_CODE_POINT && +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 +      else if (ucp && !utf && UCD_OTHERCASE(firstcu) != firstcu) +        re->flags |= PCRE2_FIRSTCASELESS; +#else +      else if ((utf || ucp) && firstcu <= MAX_UTF_CODE_POINT &&                 UCD_OTHERCASE(firstcu) != firstcu)          re->flags |= PCRE2_FIRSTCASELESS;  #endif +#endif  /* SUPPORT_UNICODE */        }      } @@ -10435,14 +10420,20 @@ if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0)        if ((reqcuflags & REQ_CASELESS) != 0)          { -        if (reqcu < 128 || (!utf && reqcu < 255)) +        if (reqcu < 128 || (!utf && !ucp && reqcu < 255))            {            if (cb.fcc[reqcu] != reqcu) re->flags |= PCRE2_LASTCASELESS;            } -#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 8 -        else if (reqcu <= MAX_UTF_CODE_POINT && UCD_OTHERCASE(reqcu) != reqcu) -          re->flags |= PCRE2_LASTCASELESS; +#ifdef SUPPORT_UNICODE +#if PCRE2_CODE_UNIT_WIDTH == 8 +      else if (ucp && !utf && UCD_OTHERCASE(reqcu) != reqcu) +        re->flags |= PCRE2_LASTCASELESS; +#else +      else if ((utf || ucp) && reqcu <= MAX_UTF_CODE_POINT && +               UCD_OTHERCASE(reqcu) != reqcu) +        re->flags |= PCRE2_LASTCASELESS;  #endif +#endif  /* SUPPORT_UNICODE */          }        }      } |