diff options
Diffstat (limited to 'thirdparty/pcre2/src/pcre2_match.c')
-rw-r--r-- | thirdparty/pcre2/src/pcre2_match.c | 142 |
1 files changed, 105 insertions, 37 deletions
diff --git a/thirdparty/pcre2/src/pcre2_match.c b/thirdparty/pcre2/src/pcre2_match.c index 79cc93f918..419561fd64 100644 --- a/thirdparty/pcre2/src/pcre2_match.c +++ b/thirdparty/pcre2/src/pcre2_match.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2015-2018 University of Cambridge + New API code Copyright (c) 2015-2019 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -43,11 +43,11 @@ POSSIBILITY OF SUCH DAMAGE. #include "config.h" #endif -/* These defines enables debugging code */ +/* These defines enable debugging code */ -//#define DEBUG_FRAMES_DISPLAY -//#define DEBUG_SHOW_OPS -//#define DEBUG_SHOW_RMATCH +/* #define DEBUG_FRAMES_DISPLAY */ +/* #define DEBUG_SHOW_OPS */ +/* #define DEBUG_SHOW_RMATCH */ #ifdef DEBUG_FRAME_DISPLAY #include <stdarg.h> @@ -69,11 +69,12 @@ information, and fields within it. */ #define PUBLIC_MATCH_OPTIONS \ (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \ PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \ - PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT) + PCRE2_PARTIAL_SOFT|PCRE2_NO_JIT|PCRE2_COPY_MATCHED_SUBJECT) #define PUBLIC_JIT_MATCH_OPTIONS \ (PCRE2_NO_UTF_CHECK|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY|\ - PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_SOFT|PCRE2_PARTIAL_HARD) + PCRE2_NOTEMPTY_ATSTART|PCRE2_PARTIAL_SOFT|PCRE2_PARTIAL_HARD|\ + PCRE2_COPY_MATCHED_SUBJECT) /* Non-error returns from and within the match() function. Error returns are externally defined PCRE2_ERROR_xxx codes, which are all negative. */ @@ -149,7 +150,7 @@ changed, the code at RETURN_SWITCH below must be updated in sync. */ enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10, RM11, RM12, RM13, RM14, RM15, RM16, RM17, RM18, RM19, RM20, RM21, RM22, RM23, RM24, RM25, RM26, RM27, RM28, RM29, RM30, - RM31, RM32, RM33, RM34, RM35 }; + RM31, RM32, RM33, RM34, RM35, RM36 }; #ifdef SUPPORT_WIDE_CHARS enum { RM100=100, RM101 }; @@ -770,7 +771,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); /* ===================================================================== */ /* Real or forced end of the pattern, assertion, or recursion. In an assertion ACCEPT, update the last used pointer and remember the current - frame so that the captures can be fished out of it. */ + frame so that the captures and mark can be fished out of it. */ case OP_ASSERT_ACCEPT: if (Feptr > mb->last_used_ptr) mb->last_used_ptr = Feptr; @@ -1776,7 +1777,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); /* ===================================================================== */ - /* Match a bit-mapped character class, possibly repeatedly. These op codes + /* Match a bit-mapped character class, possibly repeatedly. These opcodes are used when all the characters in the class have values in the range 0-255, and either the matching is caseful, or the characters are in the range 0-127 when UTF processing is enabled. The only difference between @@ -1848,7 +1849,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); } else - if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); + if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); } } else @@ -1870,7 +1871,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); } else #endif - if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); + if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); } } @@ -1902,7 +1903,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (Fop == OP_CLASS) RRETURN(MATCH_NOMATCH); } else - if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); + if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); } } else @@ -1927,7 +1928,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); } else #endif - if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); + if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) RRETURN(MATCH_NOMATCH); } } /* Control never gets here */ @@ -1956,17 +1957,21 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (Fop == OP_CLASS) break; } else - if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) break; + if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break; Feptr += len; } if (reptype == REPTYPE_POS) continue; /* No backtracking */ + /* After \C in UTF mode, Lstart_eptr might be in the middle of a + Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't + go too far. */ + for (;;) { RMATCH(Fecode, RM201); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Feptr-- == Lstart_eptr) break; /* Tried at original position */ + if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */ BACKCHAR(Feptr); } } @@ -1989,7 +1994,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); } else #endif - if ((Lbyte_map[fc/8] & (1 << (fc&7))) == 0) break; + if ((Lbyte_map[fc/8] & (1u << (fc&7))) == 0) break; Feptr++; } @@ -2126,11 +2131,15 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (reptype == REPTYPE_POS) continue; /* No backtracking */ + /* After \C in UTF mode, Lstart_eptr might be in the middle of a + Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't + go too far. */ + for(;;) { RMATCH(Fecode, RM101); if (rrc != MATCH_NOMATCH) RRETURN(rrc); - if (Feptr-- == Lstart_eptr) break; /* Tried at original position */ + if (Feptr-- <= Lstart_eptr) break; /* Tried at original position */ #ifdef SUPPORT_UNICODE if (utf) BACKCHAR(Feptr); #endif @@ -2456,7 +2465,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); /* ===================================================================== */ /* Match a single character type repeatedly. Note that the property type - does not need to be in a stack frame as it not used within an RMATCH() + does not need to be in a stack frame as it is not used within an RMATCH() loop. */ #define Lstart_eptr F->temp_sptr[0] @@ -4002,8 +4011,8 @@ fprintf(stderr, "++ op=%d\n", *Fecode); if (reptype == REPTYPE_POS) continue; /* No backtracking */ /* After \C in UTF mode, Lstart_eptr might be in the middle of a - Unicode character. Use <= pp to ensure backtracking doesn't go too far. - */ + Unicode character. Use <= Lstart_eptr to ensure backtracking doesn't + go too far. */ for(;;) { @@ -4076,7 +4085,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); GETCHAR(fc, fptr); } lgb = UCD_GRAPHBREAK(fc); - if ((PRIV(ucp_gbtable)[lgb] & (1 << rgb)) == 0) break; + if ((PRIV(ucp_gbtable)[lgb] & (1u << rgb)) == 0) break; Feptr = fptr; rgb = lgb; } @@ -4135,7 +4144,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); } break; - /* The "byte" (i.e. "code unit") case is the same as non-UTF */ + /* The "byte" (i.e. "code unit") case is the same as non-UTF */ case OP_ANYBYTE: fc = Lmax - Lmin; @@ -5006,6 +5015,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); must record a backtracking point and also set up a chained frame. */ case OP_ONCE: + case OP_SCRIPT_RUN: case OP_SBRA: Lframe_type = GF_NOCAPTURE | Fop; @@ -5111,7 +5121,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); /* Positive assertions are like other groups except that PCRE doesn't allow the effect of (*THEN) to escape beyond an assertion; it is therefore treated as NOMATCH. (*ACCEPT) is treated as successful assertion, with its - captures retained. Any other return is an error. */ + captures and mark retained. Any other return is an error. */ #define Lframe_type F->temp_32[0] @@ -5128,6 +5138,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); (char *)assert_accept_frame + offsetof(heapframe, ovector), assert_accept_frame->offset_top * sizeof(PCRE2_SIZE)); Foffset_top = assert_accept_frame->offset_top; + Fmark = assert_accept_frame->mark; break; } if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc); @@ -5416,7 +5427,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); Feptr -= number; } - /* Save the earliest consulted character, then skip to next op code */ + /* Save the earliest consulted character, then skip to next opcode */ if (Feptr < mb->start_used_ptr) mb->start_used_ptr = Feptr; Fecode += 1 + LINK_SIZE; @@ -5501,7 +5512,7 @@ fprintf(stderr, "++ op=%d\n", *Fecode); frame so that it points to the final branch. */ case OP_ONCE: - Fback_frame = ((char *)F - (char *)P) + frame_size; + Fback_frame = ((char *)F - (char *)P); for (;;) { uint32_t y = GET(P->ecode,1); @@ -5517,6 +5528,14 @@ fprintf(stderr, "++ op=%d\n", *Fecode); case OP_ASSERTBACK_NOT: RRETURN(MATCH_MATCH); + /* At the end of a script run, apply the script-checking rules. This code + will never by exercised if Unicode support it not compiled, because in + that environment script runs cause an error at compile time. */ + + case OP_SCRIPT_RUN: + if (!PRIV(script_run)(P->eptr, Feptr, utf)) RRETURN(MATCH_NOMATCH); + break; + /* Whole-pattern recursion is coded as a recurse into group 0, so it won't be picked up here. Instead, we catch it when the OP_END is reached. Other recursion is handled here. */ @@ -5829,6 +5848,13 @@ fprintf(stderr, "++ op=%d\n", *Fecode); mb->verb_current_recurse = Fcurrent_recurse; RRETURN(MATCH_COMMIT); + case OP_COMMIT_ARG: + Fmark = mb->nomatch_mark = Fecode + 2; + RMATCH(Fecode + PRIV(OP_lengths)[*Fecode] + Fecode[1], RM36); + if (rrc != MATCH_NOMATCH) RRETURN(rrc); + mb->verb_current_recurse = Fcurrent_recurse; + RRETURN(MATCH_COMMIT); + case OP_PRUNE: RMATCH(Fecode + PRIV(OP_lengths)[*Fecode], RM14); if (rrc != MATCH_NOMATCH) RRETURN(rrc); @@ -5921,7 +5947,7 @@ in rrc. */ RETURN_SWITCH: if (Frdepth == 0) return rrc; /* Exit from the top level */ -F = (heapframe *)((char *)F - Fback_frame); /* Back track */ +F = (heapframe *)((char *)F - Fback_frame); /* Backtrack */ mb->cb->callout_flags |= PCRE2_CALLOUT_BACKTRACK; /* Note for callouts */ #ifdef DEBUG_SHOW_RMATCH @@ -5934,7 +5960,7 @@ switch (Freturn_id) LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(16) LBL(17) LBL(18) LBL(19) LBL(20) LBL(21) LBL(22) LBL(23) LBL(24) LBL(25) LBL(26) LBL(27) LBL(28) LBL(29) LBL(30) LBL(31) LBL(32) - LBL(33) LBL(34) LBL(35) + LBL(33) LBL(34) LBL(35) LBL(36) #ifdef SUPPORT_WIDE_CHARS LBL(100) LBL(101) @@ -5984,10 +6010,11 @@ pcre2_match(const pcre2_code *code, PCRE2_SPTR subject, PCRE2_SIZE length, pcre2_match_context *mcontext) { int rc; +int was_zero_terminated = 0; const uint8_t *start_bits = NULL; - const pcre2_real_code *re = (const pcre2_real_code *)code; + BOOL anchored; BOOL firstline; BOOL has_first_cu = FALSE; @@ -6027,7 +6054,11 @@ mb->stack_frames = (heapframe *)stack_frames_vector; /* A length equal to PCRE2_ZERO_TERMINATED implies a zero-terminated subject string. */ -if (length == PCRE2_ZERO_TERMINATED) length = PRIV(strlen)(subject); +if (length == PCRE2_ZERO_TERMINATED) + { + length = PRIV(strlen)(subject); + was_zero_terminated = 1; + } end_subject = subject + length; /* Plausibility checks */ @@ -6142,6 +6173,17 @@ if (mcontext != NULL && mcontext->offset_limit != PCRE2_UNSET && (re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0) return PCRE2_ERROR_BADOFFSETLIMIT; +/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT, +free the memory that was obtained. Set the field to NULL for no match cases. */ + +if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0) + { + match_data->memctl.free((void *)match_data->subject, + match_data->memctl.memory_data); + match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT; + } +match_data->subject = NULL; + /* If the pattern was successfully studied with JIT support, run the JIT executable instead of the rest of this function. Most options must be set at compile time for the JIT code to be usable. Fallback to the normal code path if @@ -6153,7 +6195,19 @@ if (re->executable_jit != NULL && (options & ~PUBLIC_JIT_MATCH_OPTIONS) == 0) { rc = pcre2_jit_match(code, subject, length, start_offset, options, match_data, mcontext); - if (rc != PCRE2_ERROR_JIT_BADOPTION) return rc; + if (rc != PCRE2_ERROR_JIT_BADOPTION) + { + if (rc >= 0 && (options & PCRE2_COPY_MATCHED_SUBJECT) != 0) + { + length = CU2BYTES(length + was_zero_terminated); + match_data->subject = match_data->memctl.malloc(length, + match_data->memctl.memory_data); + if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY; + memcpy((void *)match_data->subject, subject, length); + match_data->flags |= PCRE2_MD_COPIED_SUBJECT; + } + return rc; + } } #endif @@ -6275,7 +6329,7 @@ mb->match_limit_depth = (mcontext->depth_limit < re->limit_depth)? /* If a pattern has very many capturing parentheses, the frame size may be very large. Ensure that there are at least 10 available frames by getting an initial vector on the heap if necessary, except when the heap limit prevents this. Get -fewer if possible. (The heap limit is in kilobytes.) */ +fewer if possible. (The heap limit is in kibibytes.) */ if (frame_size <= START_FRAMES_SIZE/10) { @@ -6405,7 +6459,7 @@ for(;;) #if PCRE2_CODE_UNIT_WIDTH != 8 if (c > 255) c = 255; #endif - ok = (start_bits[c/8] & (1 << (c&7))) != 0; + ok = (start_bits[c/8] & (1u << (c&7))) != 0; } } if (!ok) @@ -6522,7 +6576,7 @@ for(;;) #if PCRE2_CODE_UNIT_WIDTH != 8 if (c > 255) c = 255; #endif - if ((start_bits[c/8] & (1 << (c&7))) != 0) break; + if ((start_bits[c/8] & (1u << (c&7))) != 0) break; start_match++; } @@ -6793,13 +6847,13 @@ if (mb->match_frames != mb->stack_frames) /* Fill in fields that are always returned in the match data. */ match_data->code = re; -match_data->subject = subject; match_data->mark = mb->mark; match_data->matchedby = PCRE2_MATCHEDBY_INTERPRETER; /* Handle a fully successful match. Set the return code to the number of captured strings, or 0 if there were too many to fit into the ovector, and then -set the remaining returned values before returning. */ +set the remaining returned values before returning. Make a copy of the subject +string if requested. */ if (rc == MATCH_MATCH) { @@ -6809,6 +6863,16 @@ if (rc == MATCH_MATCH) match_data->leftchar = mb->start_used_ptr - subject; match_data->rightchar = ((mb->last_used_ptr > mb->end_match_ptr)? mb->last_used_ptr : mb->end_match_ptr) - subject; + if ((options & PCRE2_COPY_MATCHED_SUBJECT) != 0) + { + length = CU2BYTES(length + was_zero_terminated); + match_data->subject = match_data->memctl.malloc(length, + match_data->memctl.memory_data); + if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY; + memcpy((void *)match_data->subject, subject, length); + match_data->flags |= PCRE2_MD_COPIED_SUBJECT; + } + else match_data->subject = subject; return match_data->rc; } @@ -6822,10 +6886,14 @@ match_data->mark = mb->nomatch_mark; if (rc != MATCH_NOMATCH && rc != PCRE2_ERROR_PARTIAL) match_data->rc = rc; -/* Handle a partial match. */ +/* Handle a partial match. If a "soft" partial match was requested, searching +for a complete match will have continued, and the value of rc at this point +will be MATCH_NOMATCH. For a "hard" partial match, it will already be +PCRE2_ERROR_PARTIAL. */ else if (match_partial != NULL) { + match_data->subject = subject; match_data->ovector[0] = match_partial - subject; match_data->ovector[1] = end_subject - subject; match_data->startchar = match_partial - subject; |