diff options
Diffstat (limited to 'thirdparty/pcre2/src/pcre2_script_run.c')
-rw-r--r-- | thirdparty/pcre2/src/pcre2_script_run.c | 473 |
1 files changed, 188 insertions, 285 deletions
diff --git a/thirdparty/pcre2/src/pcre2_script_run.c b/thirdparty/pcre2/src/pcre2_script_run.c index 91a4833028..4926fa63bb 100644 --- a/thirdparty/pcre2/src/pcre2_script_run.c +++ b/thirdparty/pcre2/src/pcre2_script_run.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language. Written by Philip Hazel Original API code Copyright (c) 1997-2012 University of Cambridge - New API code Copyright (c) 2016-2018 University of Cambridge + New API code Copyright (c) 2016-2021 University of Cambridge ----------------------------------------------------------------------------- Redistribution and use in source and binary forms, with or without @@ -68,26 +68,26 @@ Arguments: Returns: TRUE if this is a valid script run */ -/* These dummy values must be less than the negation of the largest offset in -the PRIV(ucd_script_sets) vector, which is held in a 16-bit field in UCD -records (and is only likely to be a few hundred). */ +/* These are states in the checking process. */ -#define SCRIPT_UNSET (-99999) -#define SCRIPT_HANPENDING (-99998) -#define SCRIPT_HANHIRAKATA (-99997) -#define SCRIPT_HANBOPOMOFO (-99996) -#define SCRIPT_HANHANGUL (-99995) -#define SCRIPT_LIST (-99994) +enum { SCRIPT_UNSET, /* Requirement as yet unknown */ + SCRIPT_MAP, /* Bitmap contains acceptable scripts */ + SCRIPT_HANPENDING, /* Have had only Han characters */ + SCRIPT_HANHIRAKATA, /* Expect Han or Hirikata */ + SCRIPT_HANBOPOMOFO, /* Expect Han or Bopomofo */ + SCRIPT_HANHANGUL /* Expect Han or Hangul */ + }; -#define INTERSECTION_LIST_SIZE 50 +#define UCD_MAPSIZE (ucp_Unknown/32 + 1) +#define FULL_MAPSIZE (ucp_Script_Count/32 + 1) BOOL PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf) { #ifdef SUPPORT_UNICODE -int require_script = SCRIPT_UNSET; -uint8_t intersection_list[INTERSECTION_LIST_SIZE]; -const uint8_t *require_list = NULL; +uint32_t require_state = SCRIPT_UNSET; +uint32_t require_map[FULL_MAPSIZE]; +uint32_t map[FULL_MAPSIZE]; uint32_t require_digitset = 0; uint32_t c; @@ -101,11 +101,17 @@ if (ptr >= endptr) return TRUE; GETCHARINCTEST(c, ptr); if (ptr >= endptr) return TRUE; +/* Initialize the require map. This is a full-size bitmap that has a bit for +every script, as opposed to the maps in ucd_script_sets, which only have bits +for scripts less than ucp_Unknown - those that appear in script extension +lists. */ + +for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0; + /* Scan strings of two or more characters, checking the Unicode characteristics -of each code point. We make use of the Script Extensions property. There is -special code for scripts that can be combined with characters from the Han -Chinese script. This may be used in conjunction with four other scripts in -these combinations: +of each code point. There is special code for scripts that can be combined with +characters from the Han Chinese script. This may be used in conjunction with +four other scripts in these combinations: . Han with Hiragana and Katakana is allowed (for Japanese). . Han with Bopomofo is allowed (for Taiwanese Mandarin). @@ -119,310 +125,207 @@ Hence the SCRIPT_HANPENDING state. */ for (;;) { const ucd_record *ucd = GET_UCD(c); - int32_t scriptx = ucd->scriptx; + uint32_t script = ucd->script; - /* If the script extension is Unknown, the string is not a valid script run. - Such characters can only form script runs of length one. */ + /* If the script is Unknown, the string is not a valid script run. Such + characters can only form script runs of length one (see test above). */ - if (scriptx == ucp_Unknown) return FALSE; + if (script == ucp_Unknown) return FALSE; - /* A character whose script extension is Inherited is always accepted with - any script, and plays no further part in this testing. A character whose - script is Common is always accepted, but must still be tested for a digit - below. The scriptx value at this point is non-zero, because zero is - ucp_Unknown, tested for above. */ + /* A character without any script extensions whose script is Inherited or + Common is always accepted with any script. If there are extensions, the + following processing happens for all scripts. */ - if (scriptx != ucp_Inherited) + if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common)) { - if (scriptx != ucp_Common) + BOOL OK; + + /* Set up a full-sized map for this character that can include bits for all + scripts. Copy the scriptx map for this character (which covers those + scripts that appear in script extension lists), set the remaining values to + zero, and then, except for Common or Inherited, add this script's bit to + the map. */ + + memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t)); + memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t)); + if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script); + + /* Handle the different checking states */ + + switch(require_state) { - /* If the script extension value is positive, the character is not a mark - that can be used with many scripts. In the simple case we either set or - compare with the required script. However, handling the scripts that can - combine with Han are more complicated, as is the case when the previous - characters have been man-script marks. */ + /* First significant character - it might follow Common or Inherited + characters that do not have any script extensions. */ - if (scriptx > 0) + case SCRIPT_UNSET: + switch(script) { - switch(require_script) - { - /* Either the first significant character (require_script unset) or - after only Han characters. */ - - case SCRIPT_UNSET: - case SCRIPT_HANPENDING: - switch(scriptx) - { - case ucp_Han: - require_script = SCRIPT_HANPENDING; - break; - - case ucp_Hiragana: - case ucp_Katakana: - require_script = SCRIPT_HANHIRAKATA; - break; - - case ucp_Bopomofo: - require_script = SCRIPT_HANBOPOMOFO; - break; - - case ucp_Hangul: - require_script = SCRIPT_HANHANGUL; - break; - - /* Not a Han-related script. If expecting one, fail. Otherise set - the requirement to this script. */ - - default: - if (require_script == SCRIPT_HANPENDING) return FALSE; - require_script = scriptx; - break; - } - break; + case ucp_Han: + require_state = SCRIPT_HANPENDING; + break; + + case ucp_Hiragana: + case ucp_Katakana: + require_state = SCRIPT_HANHIRAKATA; + break; + + case ucp_Bopomofo: + require_state = SCRIPT_HANBOPOMOFO; + break; + + case ucp_Hangul: + require_state = SCRIPT_HANHANGUL; + break; + + default: + memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t)); + require_state = SCRIPT_MAP; + break; + } + break; - /* Previously encountered one of the "with Han" scripts. Check that - this character is appropriate. */ + /* The first significant character was Han. An inspection of the Unicode + 11.0.0 files shows that there are the following types of Script Extension + list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul + scripts: - case SCRIPT_HANHIRAKATA: - if (scriptx != ucp_Han && scriptx != ucp_Hiragana && - scriptx != ucp_Katakana) - return FALSE; - break; + . Bopomofo + Han + . Han + Hiragana + Katakana + . Hiragana + Katakana + . Bopopmofo + Hangul + Han + Hiragana + Katakana - case SCRIPT_HANBOPOMOFO: - if (scriptx != ucp_Han && scriptx != ucp_Bopomofo) return FALSE; - break; + The following code tries to make sense of this. */ - case SCRIPT_HANHANGUL: - if (scriptx != ucp_Han && scriptx != ucp_Hangul) return FALSE; - break; +#define FOUND_BOPOMOFO 1 +#define FOUND_HIRAGANA 2 +#define FOUND_KATAKANA 4 +#define FOUND_HANGUL 8 - /* We have a list of scripts to check that is derived from one or - more previous characters. This is either one of the lists in - ucd_script_sets[] (for one previous character) or the intersection of - several lists for multiple characters. */ - - case SCRIPT_LIST: - { - const uint8_t *list; - for (list = require_list; *list != 0; list++) - { - if (*list == scriptx) break; - } - if (*list == 0) return FALSE; - } - - /* The rest of the string must be in this script, but we have to - allow for the Han complications. */ - - switch(scriptx) - { - case ucp_Han: - require_script = SCRIPT_HANPENDING; - break; - - case ucp_Hiragana: - case ucp_Katakana: - require_script = SCRIPT_HANHIRAKATA; - break; - - case ucp_Bopomofo: - require_script = SCRIPT_HANBOPOMOFO; - break; - - case ucp_Hangul: - require_script = SCRIPT_HANHANGUL; - break; - - default: - require_script = scriptx; - break; - } - break; + case SCRIPT_HANPENDING: + if (script != ucp_Han) /* Another Han does nothing */ + { + uint32_t chspecial = 0; - /* This is the easy case when a single script is required. */ + if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO; + if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA; + if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA; + if (MAPBIT(map, ucp_Hangul) != 0) chspecial |= FOUND_HANGUL; - default: - if (scriptx != require_script) return FALSE; - break; - } - } /* End of handing positive scriptx */ + if (chspecial == 0) return FALSE; /* Not allowed with Han */ - /* If scriptx is negative, this character is a mark-type character that - has a list of permitted scripts. */ + if (chspecial == FOUND_BOPOMOFO) + require_state = SCRIPT_HANBOPOMOFO; + else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA)) + require_state = SCRIPT_HANHIRAKATA; - else - { - uint32_t chspecial; - const uint8_t *clist, *rlist; - const uint8_t *list = PRIV(ucd_script_sets) - scriptx; - - switch(require_script) - { - case SCRIPT_UNSET: - require_list = PRIV(ucd_script_sets) - scriptx; - require_script = SCRIPT_LIST; - break; + /* Otherwise this character must be allowed with all of them, so remain + in the pending state. */ + } + break; - /* An inspection of the Unicode 11.0.0 files shows that there are the - following types of Script Extension list that involve the Han, - Bopomofo, Hiragana, Katakana, and Hangul scripts: + /* Previously encountered one of the "with Han" scripts. Check that + this character is appropriate. */ - . Bopomofo + Han - . Han + Hiragana + Katakana - . Hiragana + Katakana - . Bopopmofo + Hangul + Han + Hiragana + Katakana + case SCRIPT_HANHIRAKATA: + if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) + + MAPBIT(map, ucp_Katakana) == 0) return FALSE; + break; - The following code tries to make sense of this. */ + case SCRIPT_HANBOPOMOFO: + if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE; + break; -#define FOUND_BOPOMOFO 1 -#define FOUND_HIRAGANA 2 -#define FOUND_KATAKANA 4 -#define FOUND_HANGUL 8 + case SCRIPT_HANHANGUL: + if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE; + break; - case SCRIPT_HANPENDING: - chspecial = 0; - for (; *list != 0; list++) - { - switch (*list) - { - case ucp_Bopomofo: chspecial |= FOUND_BOPOMOFO; break; - case ucp_Hiragana: chspecial |= FOUND_HIRAGANA; break; - case ucp_Katakana: chspecial |= FOUND_KATAKANA; break; - case ucp_Hangul: chspecial |= FOUND_HANGUL; break; - default: break; - } - } - - if (chspecial == 0) return FALSE; - - if (chspecial == FOUND_BOPOMOFO) - { - require_script = SCRIPT_HANBOPOMOFO; - } - else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA)) - { - require_script = SCRIPT_HANHIRAKATA; - } - - /* Otherwise it must be allowed with all of them, so remain in - the pending state. */ + /* Previously encountered one or more characters that are allowed with a + list of scripts. */ - break; + case SCRIPT_MAP: + OK = FALSE; - case SCRIPT_HANHIRAKATA: - for (; *list != 0; list++) - { - if (*list == ucp_Hiragana || *list == ucp_Katakana) break; - } - if (*list == 0) return FALSE; + for (int i = 0; i < FULL_MAPSIZE; i++) + { + if ((require_map[i] & map[i]) != 0) + { + OK = TRUE; break; + } + } - case SCRIPT_HANBOPOMOFO: - for (; *list != 0; list++) - { - if (*list == ucp_Bopomofo) break; - } - if (*list == 0) return FALSE; - break; + if (!OK) return FALSE; - case SCRIPT_HANHANGUL: - for (; *list != 0; list++) - { - if (*list == ucp_Hangul) break; - } - if (*list == 0) return FALSE; - break; + /* The rest of the string must be in this script, but we have to + allow for the Han complications. */ - /* Previously encountered one or more characters that are allowed - with a list of scripts. Build the intersection of the required list - with this character's list in intersection_list[]. This code is - written so that it still works OK if the required list is already in - that vector. */ - - case SCRIPT_LIST: - { - int i = 0; - for (rlist = require_list; *rlist != 0; rlist++) - { - for (clist = list; *clist != 0; clist++) - { - if (*rlist == *clist) - { - intersection_list[i++] = *rlist; - break; - } - } - } - if (i == 0) return FALSE; /* No scripts in common */ - - /* If there's just one script in common, we can set it as the - unique required script. Otherwise, terminate the intersection list - and make it the required list. */ - - if (i == 1) - { - require_script = intersection_list[0]; - } - else - { - intersection_list[i] = 0; - require_list = intersection_list; - } - } - break; + switch(script) + { + case ucp_Han: + require_state = SCRIPT_HANPENDING; + break; - /* The previously set required script is a single script, not - Han-related. Check that it is in this character's list. */ + case ucp_Hiragana: + case ucp_Katakana: + require_state = SCRIPT_HANHIRAKATA; + break; - default: - for (; *list != 0; list++) - { - if (*list == require_script) break; - } - if (*list == 0) return FALSE; - break; - } - } /* End of handling negative scriptx */ - } /* End of checking non-Common character */ - - /* The character is in an acceptable script. We must now ensure that all - decimal digits in the string come from the same set. Some scripts (e.g. - Common, Arabic) have more than one set of decimal digits. This code does - not allow mixing sets, even within the same script. The vector called - PRIV(ucd_digit_sets)[] contains, in its first element, the number of - following elements, and then, in ascending order, the code points of the - '9' characters in every set of 10 digits. Each set is identified by the - offset in the vector of its '9' character. An initial check of the first - value picks up ASCII digits quickly. Otherwise, a binary chop is used. */ - - if (ucd->chartype == ucp_Nd) - { - uint32_t digitset; + case ucp_Bopomofo: + require_state = SCRIPT_HANBOPOMOFO; + break; + + case ucp_Hangul: + require_state = SCRIPT_HANHANGUL; + break; + + /* Compute the intersection of the required list of scripts and the + allowed scripts for this character. */ - if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else + default: + for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i]; + break; + } + + break; + } + } /* End checking character's script and extensions. */ + + /* The character is in an acceptable script. We must now ensure that all + decimal digits in the string come from the same set. Some scripts (e.g. + Common, Arabic) have more than one set of decimal digits. This code does + not allow mixing sets, even within the same script. The vector called + PRIV(ucd_digit_sets)[] contains, in its first element, the number of + following elements, and then, in ascending order, the code points of the + '9' characters in every set of 10 digits. Each set is identified by the + offset in the vector of its '9' character. An initial check of the first + value picks up ASCII digits quickly. Otherwise, a binary chop is used. */ + + if (ucd->chartype == ucp_Nd) + { + uint32_t digitset; + + if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else + { + int mid; + int bot = 1; + int top = PRIV(ucd_digit_sets)[0]; + for (;;) { - int mid; - int bot = 1; - int top = PRIV(ucd_digit_sets)[0]; - for (;;) + if (top <= bot + 1) /* <= rather than == is paranoia */ { - if (top <= bot + 1) /* <= rather than == is paranoia */ - { - digitset = top; - break; - } - mid = (top + bot) / 2; - if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid; + digitset = top; + break; } + mid = (top + bot) / 2; + if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid; } + } - /* A required value of 0 means "unset". */ + /* A required value of 0 means "unset". */ - if (require_digitset == 0) require_digitset = digitset; - else if (digitset != require_digitset) return FALSE; - } /* End digit handling */ - } /* End checking non-Inherited character */ + if (require_digitset == 0) require_digitset = digitset; + else if (digitset != require_digitset) return FALSE; + } /* End digit handling */ /* If we haven't yet got to the end, pick up the next character. */ |