diff options
Diffstat (limited to 'thirdparty/pcre2/src/pcre2_script_run.c')
-rw-r--r-- | thirdparty/pcre2/src/pcre2_script_run.c | 441 |
1 files changed, 441 insertions, 0 deletions
diff --git a/thirdparty/pcre2/src/pcre2_script_run.c b/thirdparty/pcre2/src/pcre2_script_run.c new file mode 100644 index 0000000000..91a4833028 --- /dev/null +++ b/thirdparty/pcre2/src/pcre2_script_run.c @@ -0,0 +1,441 @@ +/************************************************* +* Perl-Compatible Regular Expressions * +*************************************************/ + +/* PCRE is a library of functions to support regular expressions whose syntax +and semantics are as close as possible to those of the Perl 5 language. + + Written by Philip Hazel + Original API code Copyright (c) 1997-2012 University of Cambridge + New API code Copyright (c) 2016-2018 University of Cambridge + +----------------------------------------------------------------------------- +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of the University of Cambridge nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. +----------------------------------------------------------------------------- +*/ + +/* This module contains the function for checking a script run. */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "pcre2_internal.h" + + +/************************************************* +* Check script run * +*************************************************/ + +/* A script run is conceptually a sequence of characters all in the same +Unicode script. However, it isn't quite that simple. There are special rules +for scripts that are commonly used together, and also special rules for digits. +This function implements the appropriate checks, which is possible only when +PCRE2 is compiled with Unicode support. The function returns TRUE if there is +no Unicode support; however, it should never be called in that circumstance +because an error is given by pcre2_compile() if a script run is called for in a +version of PCRE2 compiled without Unicode support. + +Arguments: + pgr point to the first character + endptr point after the last character + utf TRUE if in UTF mode + +Returns: TRUE if this is a valid script run +*/ + +/* These dummy values must be less than the negation of the largest offset in +the PRIV(ucd_script_sets) vector, which is held in a 16-bit field in UCD +records (and is only likely to be a few hundred). */ + +#define SCRIPT_UNSET (-99999) +#define SCRIPT_HANPENDING (-99998) +#define SCRIPT_HANHIRAKATA (-99997) +#define SCRIPT_HANBOPOMOFO (-99996) +#define SCRIPT_HANHANGUL (-99995) +#define SCRIPT_LIST (-99994) + +#define INTERSECTION_LIST_SIZE 50 + +BOOL +PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf) +{ +#ifdef SUPPORT_UNICODE +int require_script = SCRIPT_UNSET; +uint8_t intersection_list[INTERSECTION_LIST_SIZE]; +const uint8_t *require_list = NULL; +uint32_t require_digitset = 0; +uint32_t c; + +#if PCRE2_CODE_UNIT_WIDTH == 32 +(void)utf; /* Avoid compiler warning */ +#endif + +/* Any string containing fewer than 2 characters is a valid script run. */ + +if (ptr >= endptr) return TRUE; +GETCHARINCTEST(c, ptr); +if (ptr >= endptr) return TRUE; + +/* Scan strings of two or more characters, checking the Unicode characteristics +of each code point. We make use of the Script Extensions property. There is +special code for scripts that can be combined with characters from the Han +Chinese script. This may be used in conjunction with four other scripts in +these combinations: + +. Han with Hiragana and Katakana is allowed (for Japanese). +. Han with Bopomofo is allowed (for Taiwanese Mandarin). +. Han with Hangul is allowed (for Korean). + +If the first significant character's script is one of the four, the required +script type is immediately known. However, if the first significant +character's script is Han, we have to keep checking for a non-Han character. +Hence the SCRIPT_HANPENDING state. */ + +for (;;) + { + const ucd_record *ucd = GET_UCD(c); + int32_t scriptx = ucd->scriptx; + + /* If the script extension is Unknown, the string is not a valid script run. + Such characters can only form script runs of length one. */ + + if (scriptx == ucp_Unknown) return FALSE; + + /* A character whose script extension is Inherited is always accepted with + any script, and plays no further part in this testing. A character whose + script is Common is always accepted, but must still be tested for a digit + below. The scriptx value at this point is non-zero, because zero is + ucp_Unknown, tested for above. */ + + if (scriptx != ucp_Inherited) + { + if (scriptx != ucp_Common) + { + /* If the script extension value is positive, the character is not a mark + that can be used with many scripts. In the simple case we either set or + compare with the required script. However, handling the scripts that can + combine with Han are more complicated, as is the case when the previous + characters have been man-script marks. */ + + if (scriptx > 0) + { + switch(require_script) + { + /* Either the first significant character (require_script unset) or + after only Han characters. */ + + case SCRIPT_UNSET: + case SCRIPT_HANPENDING: + switch(scriptx) + { + case ucp_Han: + require_script = SCRIPT_HANPENDING; + break; + + case ucp_Hiragana: + case ucp_Katakana: + require_script = SCRIPT_HANHIRAKATA; + break; + + case ucp_Bopomofo: + require_script = SCRIPT_HANBOPOMOFO; + break; + + case ucp_Hangul: + require_script = SCRIPT_HANHANGUL; + break; + + /* Not a Han-related script. If expecting one, fail. Otherise set + the requirement to this script. */ + + default: + if (require_script == SCRIPT_HANPENDING) return FALSE; + require_script = scriptx; + break; + } + break; + + /* Previously encountered one of the "with Han" scripts. Check that + this character is appropriate. */ + + case SCRIPT_HANHIRAKATA: + if (scriptx != ucp_Han && scriptx != ucp_Hiragana && + scriptx != ucp_Katakana) + return FALSE; + break; + + case SCRIPT_HANBOPOMOFO: + if (scriptx != ucp_Han && scriptx != ucp_Bopomofo) return FALSE; + break; + + case SCRIPT_HANHANGUL: + if (scriptx != ucp_Han && scriptx != ucp_Hangul) return FALSE; + break; + + /* We have a list of scripts to check that is derived from one or + more previous characters. This is either one of the lists in + ucd_script_sets[] (for one previous character) or the intersection of + several lists for multiple characters. */ + + case SCRIPT_LIST: + { + const uint8_t *list; + for (list = require_list; *list != 0; list++) + { + if (*list == scriptx) break; + } + if (*list == 0) return FALSE; + } + + /* The rest of the string must be in this script, but we have to + allow for the Han complications. */ + + switch(scriptx) + { + case ucp_Han: + require_script = SCRIPT_HANPENDING; + break; + + case ucp_Hiragana: + case ucp_Katakana: + require_script = SCRIPT_HANHIRAKATA; + break; + + case ucp_Bopomofo: + require_script = SCRIPT_HANBOPOMOFO; + break; + + case ucp_Hangul: + require_script = SCRIPT_HANHANGUL; + break; + + default: + require_script = scriptx; + break; + } + break; + + /* This is the easy case when a single script is required. */ + + default: + if (scriptx != require_script) return FALSE; + break; + } + } /* End of handing positive scriptx */ + + /* If scriptx is negative, this character is a mark-type character that + has a list of permitted scripts. */ + + else + { + uint32_t chspecial; + const uint8_t *clist, *rlist; + const uint8_t *list = PRIV(ucd_script_sets) - scriptx; + + switch(require_script) + { + case SCRIPT_UNSET: + require_list = PRIV(ucd_script_sets) - scriptx; + require_script = SCRIPT_LIST; + break; + + /* An inspection of the Unicode 11.0.0 files shows that there are the + following types of Script Extension list that involve the Han, + Bopomofo, Hiragana, Katakana, and Hangul scripts: + + . Bopomofo + Han + . Han + Hiragana + Katakana + . Hiragana + Katakana + . Bopopmofo + Hangul + Han + Hiragana + Katakana + + The following code tries to make sense of this. */ + +#define FOUND_BOPOMOFO 1 +#define FOUND_HIRAGANA 2 +#define FOUND_KATAKANA 4 +#define FOUND_HANGUL 8 + + case SCRIPT_HANPENDING: + chspecial = 0; + for (; *list != 0; list++) + { + switch (*list) + { + case ucp_Bopomofo: chspecial |= FOUND_BOPOMOFO; break; + case ucp_Hiragana: chspecial |= FOUND_HIRAGANA; break; + case ucp_Katakana: chspecial |= FOUND_KATAKANA; break; + case ucp_Hangul: chspecial |= FOUND_HANGUL; break; + default: break; + } + } + + if (chspecial == 0) return FALSE; + + if (chspecial == FOUND_BOPOMOFO) + { + require_script = SCRIPT_HANBOPOMOFO; + } + else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA)) + { + require_script = SCRIPT_HANHIRAKATA; + } + + /* Otherwise it must be allowed with all of them, so remain in + the pending state. */ + + break; + + case SCRIPT_HANHIRAKATA: + for (; *list != 0; list++) + { + if (*list == ucp_Hiragana || *list == ucp_Katakana) break; + } + if (*list == 0) return FALSE; + break; + + case SCRIPT_HANBOPOMOFO: + for (; *list != 0; list++) + { + if (*list == ucp_Bopomofo) break; + } + if (*list == 0) return FALSE; + break; + + case SCRIPT_HANHANGUL: + for (; *list != 0; list++) + { + if (*list == ucp_Hangul) break; + } + if (*list == 0) return FALSE; + break; + + /* Previously encountered one or more characters that are allowed + with a list of scripts. Build the intersection of the required list + with this character's list in intersection_list[]. This code is + written so that it still works OK if the required list is already in + that vector. */ + + case SCRIPT_LIST: + { + int i = 0; + for (rlist = require_list; *rlist != 0; rlist++) + { + for (clist = list; *clist != 0; clist++) + { + if (*rlist == *clist) + { + intersection_list[i++] = *rlist; + break; + } + } + } + if (i == 0) return FALSE; /* No scripts in common */ + + /* If there's just one script in common, we can set it as the + unique required script. Otherwise, terminate the intersection list + and make it the required list. */ + + if (i == 1) + { + require_script = intersection_list[0]; + } + else + { + intersection_list[i] = 0; + require_list = intersection_list; + } + } + break; + + /* The previously set required script is a single script, not + Han-related. Check that it is in this character's list. */ + + default: + for (; *list != 0; list++) + { + if (*list == require_script) break; + } + if (*list == 0) return FALSE; + break; + } + } /* End of handling negative scriptx */ + } /* End of checking non-Common character */ + + /* The character is in an acceptable script. We must now ensure that all + decimal digits in the string come from the same set. Some scripts (e.g. + Common, Arabic) have more than one set of decimal digits. This code does + not allow mixing sets, even within the same script. The vector called + PRIV(ucd_digit_sets)[] contains, in its first element, the number of + following elements, and then, in ascending order, the code points of the + '9' characters in every set of 10 digits. Each set is identified by the + offset in the vector of its '9' character. An initial check of the first + value picks up ASCII digits quickly. Otherwise, a binary chop is used. */ + + if (ucd->chartype == ucp_Nd) + { + uint32_t digitset; + + if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else + { + int mid; + int bot = 1; + int top = PRIV(ucd_digit_sets)[0]; + for (;;) + { + if (top <= bot + 1) /* <= rather than == is paranoia */ + { + digitset = top; + break; + } + mid = (top + bot) / 2; + if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid; + } + } + + /* A required value of 0 means "unset". */ + + if (require_digitset == 0) require_digitset = digitset; + else if (digitset != require_digitset) return FALSE; + } /* End digit handling */ + } /* End checking non-Inherited character */ + + /* If we haven't yet got to the end, pick up the next character. */ + + if (ptr >= endptr) return TRUE; + GETCHARINCTEST(c, ptr); + } /* End checking loop */ + +#else /* NOT SUPPORT_UNICODE */ +(void)ptr; +(void)endptr; +(void)utf; +return TRUE; +#endif /* SUPPORT_UNICODE */ +} + +/* End of pcre2_script_run.c */ |