summaryrefslogtreecommitdiff
path: root/thirdparty/pcre2/src/pcre2_script_run.c
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/pcre2/src/pcre2_script_run.c')
-rw-r--r--thirdparty/pcre2/src/pcre2_script_run.c473
1 files changed, 188 insertions, 285 deletions
diff --git a/thirdparty/pcre2/src/pcre2_script_run.c b/thirdparty/pcre2/src/pcre2_script_run.c
index 91a4833028..4926fa63bb 100644
--- a/thirdparty/pcre2/src/pcre2_script_run.c
+++ b/thirdparty/pcre2/src/pcre2_script_run.c
@@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.
Written by Philip Hazel
Original API code Copyright (c) 1997-2012 University of Cambridge
- New API code Copyright (c) 2016-2018 University of Cambridge
+ New API code Copyright (c) 2016-2021 University of Cambridge
-----------------------------------------------------------------------------
Redistribution and use in source and binary forms, with or without
@@ -68,26 +68,26 @@ Arguments:
Returns: TRUE if this is a valid script run
*/
-/* These dummy values must be less than the negation of the largest offset in
-the PRIV(ucd_script_sets) vector, which is held in a 16-bit field in UCD
-records (and is only likely to be a few hundred). */
+/* These are states in the checking process. */
-#define SCRIPT_UNSET (-99999)
-#define SCRIPT_HANPENDING (-99998)
-#define SCRIPT_HANHIRAKATA (-99997)
-#define SCRIPT_HANBOPOMOFO (-99996)
-#define SCRIPT_HANHANGUL (-99995)
-#define SCRIPT_LIST (-99994)
+enum { SCRIPT_UNSET, /* Requirement as yet unknown */
+ SCRIPT_MAP, /* Bitmap contains acceptable scripts */
+ SCRIPT_HANPENDING, /* Have had only Han characters */
+ SCRIPT_HANHIRAKATA, /* Expect Han or Hirikata */
+ SCRIPT_HANBOPOMOFO, /* Expect Han or Bopomofo */
+ SCRIPT_HANHANGUL /* Expect Han or Hangul */
+ };
-#define INTERSECTION_LIST_SIZE 50
+#define UCD_MAPSIZE (ucp_Unknown/32 + 1)
+#define FULL_MAPSIZE (ucp_Script_Count/32 + 1)
BOOL
PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)
{
#ifdef SUPPORT_UNICODE
-int require_script = SCRIPT_UNSET;
-uint8_t intersection_list[INTERSECTION_LIST_SIZE];
-const uint8_t *require_list = NULL;
+uint32_t require_state = SCRIPT_UNSET;
+uint32_t require_map[FULL_MAPSIZE];
+uint32_t map[FULL_MAPSIZE];
uint32_t require_digitset = 0;
uint32_t c;
@@ -101,11 +101,17 @@ if (ptr >= endptr) return TRUE;
GETCHARINCTEST(c, ptr);
if (ptr >= endptr) return TRUE;
+/* Initialize the require map. This is a full-size bitmap that has a bit for
+every script, as opposed to the maps in ucd_script_sets, which only have bits
+for scripts less than ucp_Unknown - those that appear in script extension
+lists. */
+
+for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0;
+
/* Scan strings of two or more characters, checking the Unicode characteristics
-of each code point. We make use of the Script Extensions property. There is
-special code for scripts that can be combined with characters from the Han
-Chinese script. This may be used in conjunction with four other scripts in
-these combinations:
+of each code point. There is special code for scripts that can be combined with
+characters from the Han Chinese script. This may be used in conjunction with
+four other scripts in these combinations:
. Han with Hiragana and Katakana is allowed (for Japanese).
. Han with Bopomofo is allowed (for Taiwanese Mandarin).
@@ -119,310 +125,207 @@ Hence the SCRIPT_HANPENDING state. */
for (;;)
{
const ucd_record *ucd = GET_UCD(c);
- int32_t scriptx = ucd->scriptx;
+ uint32_t script = ucd->script;
- /* If the script extension is Unknown, the string is not a valid script run.
- Such characters can only form script runs of length one. */
+ /* If the script is Unknown, the string is not a valid script run. Such
+ characters can only form script runs of length one (see test above). */
- if (scriptx == ucp_Unknown) return FALSE;
+ if (script == ucp_Unknown) return FALSE;
- /* A character whose script extension is Inherited is always accepted with
- any script, and plays no further part in this testing. A character whose
- script is Common is always accepted, but must still be tested for a digit
- below. The scriptx value at this point is non-zero, because zero is
- ucp_Unknown, tested for above. */
+ /* A character without any script extensions whose script is Inherited or
+ Common is always accepted with any script. If there are extensions, the
+ following processing happens for all scripts. */
- if (scriptx != ucp_Inherited)
+ if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common))
{
- if (scriptx != ucp_Common)
+ BOOL OK;
+
+ /* Set up a full-sized map for this character that can include bits for all
+ scripts. Copy the scriptx map for this character (which covers those
+ scripts that appear in script extension lists), set the remaining values to
+ zero, and then, except for Common or Inherited, add this script's bit to
+ the map. */
+
+ memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t));
+ memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t));
+ if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script);
+
+ /* Handle the different checking states */
+
+ switch(require_state)
{
- /* If the script extension value is positive, the character is not a mark
- that can be used with many scripts. In the simple case we either set or
- compare with the required script. However, handling the scripts that can
- combine with Han are more complicated, as is the case when the previous
- characters have been man-script marks. */
+ /* First significant character - it might follow Common or Inherited
+ characters that do not have any script extensions. */
- if (scriptx > 0)
+ case SCRIPT_UNSET:
+ switch(script)
{
- switch(require_script)
- {
- /* Either the first significant character (require_script unset) or
- after only Han characters. */
-
- case SCRIPT_UNSET:
- case SCRIPT_HANPENDING:
- switch(scriptx)
- {
- case ucp_Han:
- require_script = SCRIPT_HANPENDING;
- break;
-
- case ucp_Hiragana:
- case ucp_Katakana:
- require_script = SCRIPT_HANHIRAKATA;
- break;
-
- case ucp_Bopomofo:
- require_script = SCRIPT_HANBOPOMOFO;
- break;
-
- case ucp_Hangul:
- require_script = SCRIPT_HANHANGUL;
- break;
-
- /* Not a Han-related script. If expecting one, fail. Otherise set
- the requirement to this script. */
-
- default:
- if (require_script == SCRIPT_HANPENDING) return FALSE;
- require_script = scriptx;
- break;
- }
- break;
+ case ucp_Han:
+ require_state = SCRIPT_HANPENDING;
+ break;
+
+ case ucp_Hiragana:
+ case ucp_Katakana:
+ require_state = SCRIPT_HANHIRAKATA;
+ break;
+
+ case ucp_Bopomofo:
+ require_state = SCRIPT_HANBOPOMOFO;
+ break;
+
+ case ucp_Hangul:
+ require_state = SCRIPT_HANHANGUL;
+ break;
+
+ default:
+ memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t));
+ require_state = SCRIPT_MAP;
+ break;
+ }
+ break;
- /* Previously encountered one of the "with Han" scripts. Check that
- this character is appropriate. */
+ /* The first significant character was Han. An inspection of the Unicode
+ 11.0.0 files shows that there are the following types of Script Extension
+ list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul
+ scripts:
- case SCRIPT_HANHIRAKATA:
- if (scriptx != ucp_Han && scriptx != ucp_Hiragana &&
- scriptx != ucp_Katakana)
- return FALSE;
- break;
+ . Bopomofo + Han
+ . Han + Hiragana + Katakana
+ . Hiragana + Katakana
+ . Bopopmofo + Hangul + Han + Hiragana + Katakana
- case SCRIPT_HANBOPOMOFO:
- if (scriptx != ucp_Han && scriptx != ucp_Bopomofo) return FALSE;
- break;
+ The following code tries to make sense of this. */
- case SCRIPT_HANHANGUL:
- if (scriptx != ucp_Han && scriptx != ucp_Hangul) return FALSE;
- break;
+#define FOUND_BOPOMOFO 1
+#define FOUND_HIRAGANA 2
+#define FOUND_KATAKANA 4
+#define FOUND_HANGUL 8
- /* We have a list of scripts to check that is derived from one or
- more previous characters. This is either one of the lists in
- ucd_script_sets[] (for one previous character) or the intersection of
- several lists for multiple characters. */
-
- case SCRIPT_LIST:
- {
- const uint8_t *list;
- for (list = require_list; *list != 0; list++)
- {
- if (*list == scriptx) break;
- }
- if (*list == 0) return FALSE;
- }
-
- /* The rest of the string must be in this script, but we have to
- allow for the Han complications. */
-
- switch(scriptx)
- {
- case ucp_Han:
- require_script = SCRIPT_HANPENDING;
- break;
-
- case ucp_Hiragana:
- case ucp_Katakana:
- require_script = SCRIPT_HANHIRAKATA;
- break;
-
- case ucp_Bopomofo:
- require_script = SCRIPT_HANBOPOMOFO;
- break;
-
- case ucp_Hangul:
- require_script = SCRIPT_HANHANGUL;
- break;
-
- default:
- require_script = scriptx;
- break;
- }
- break;
+ case SCRIPT_HANPENDING:
+ if (script != ucp_Han) /* Another Han does nothing */
+ {
+ uint32_t chspecial = 0;
- /* This is the easy case when a single script is required. */
+ if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO;
+ if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA;
+ if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA;
+ if (MAPBIT(map, ucp_Hangul) != 0) chspecial |= FOUND_HANGUL;
- default:
- if (scriptx != require_script) return FALSE;
- break;
- }
- } /* End of handing positive scriptx */
+ if (chspecial == 0) return FALSE; /* Not allowed with Han */
- /* If scriptx is negative, this character is a mark-type character that
- has a list of permitted scripts. */
+ if (chspecial == FOUND_BOPOMOFO)
+ require_state = SCRIPT_HANBOPOMOFO;
+ else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
+ require_state = SCRIPT_HANHIRAKATA;
- else
- {
- uint32_t chspecial;
- const uint8_t *clist, *rlist;
- const uint8_t *list = PRIV(ucd_script_sets) - scriptx;
-
- switch(require_script)
- {
- case SCRIPT_UNSET:
- require_list = PRIV(ucd_script_sets) - scriptx;
- require_script = SCRIPT_LIST;
- break;
+ /* Otherwise this character must be allowed with all of them, so remain
+ in the pending state. */
+ }
+ break;
- /* An inspection of the Unicode 11.0.0 files shows that there are the
- following types of Script Extension list that involve the Han,
- Bopomofo, Hiragana, Katakana, and Hangul scripts:
+ /* Previously encountered one of the "with Han" scripts. Check that
+ this character is appropriate. */
- . Bopomofo + Han
- . Han + Hiragana + Katakana
- . Hiragana + Katakana
- . Bopopmofo + Hangul + Han + Hiragana + Katakana
+ case SCRIPT_HANHIRAKATA:
+ if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) +
+ MAPBIT(map, ucp_Katakana) == 0) return FALSE;
+ break;
- The following code tries to make sense of this. */
+ case SCRIPT_HANBOPOMOFO:
+ if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE;
+ break;
-#define FOUND_BOPOMOFO 1
-#define FOUND_HIRAGANA 2
-#define FOUND_KATAKANA 4
-#define FOUND_HANGUL 8
+ case SCRIPT_HANHANGUL:
+ if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE;
+ break;
- case SCRIPT_HANPENDING:
- chspecial = 0;
- for (; *list != 0; list++)
- {
- switch (*list)
- {
- case ucp_Bopomofo: chspecial |= FOUND_BOPOMOFO; break;
- case ucp_Hiragana: chspecial |= FOUND_HIRAGANA; break;
- case ucp_Katakana: chspecial |= FOUND_KATAKANA; break;
- case ucp_Hangul: chspecial |= FOUND_HANGUL; break;
- default: break;
- }
- }
-
- if (chspecial == 0) return FALSE;
-
- if (chspecial == FOUND_BOPOMOFO)
- {
- require_script = SCRIPT_HANBOPOMOFO;
- }
- else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA))
- {
- require_script = SCRIPT_HANHIRAKATA;
- }
-
- /* Otherwise it must be allowed with all of them, so remain in
- the pending state. */
+ /* Previously encountered one or more characters that are allowed with a
+ list of scripts. */
- break;
+ case SCRIPT_MAP:
+ OK = FALSE;
- case SCRIPT_HANHIRAKATA:
- for (; *list != 0; list++)
- {
- if (*list == ucp_Hiragana || *list == ucp_Katakana) break;
- }
- if (*list == 0) return FALSE;
+ for (int i = 0; i < FULL_MAPSIZE; i++)
+ {
+ if ((require_map[i] & map[i]) != 0)
+ {
+ OK = TRUE;
break;
+ }
+ }
- case SCRIPT_HANBOPOMOFO:
- for (; *list != 0; list++)
- {
- if (*list == ucp_Bopomofo) break;
- }
- if (*list == 0) return FALSE;
- break;
+ if (!OK) return FALSE;
- case SCRIPT_HANHANGUL:
- for (; *list != 0; list++)
- {
- if (*list == ucp_Hangul) break;
- }
- if (*list == 0) return FALSE;
- break;
+ /* The rest of the string must be in this script, but we have to
+ allow for the Han complications. */
- /* Previously encountered one or more characters that are allowed
- with a list of scripts. Build the intersection of the required list
- with this character's list in intersection_list[]. This code is
- written so that it still works OK if the required list is already in
- that vector. */
-
- case SCRIPT_LIST:
- {
- int i = 0;
- for (rlist = require_list; *rlist != 0; rlist++)
- {
- for (clist = list; *clist != 0; clist++)
- {
- if (*rlist == *clist)
- {
- intersection_list[i++] = *rlist;
- break;
- }
- }
- }
- if (i == 0) return FALSE; /* No scripts in common */
-
- /* If there's just one script in common, we can set it as the
- unique required script. Otherwise, terminate the intersection list
- and make it the required list. */
-
- if (i == 1)
- {
- require_script = intersection_list[0];
- }
- else
- {
- intersection_list[i] = 0;
- require_list = intersection_list;
- }
- }
- break;
+ switch(script)
+ {
+ case ucp_Han:
+ require_state = SCRIPT_HANPENDING;
+ break;
- /* The previously set required script is a single script, not
- Han-related. Check that it is in this character's list. */
+ case ucp_Hiragana:
+ case ucp_Katakana:
+ require_state = SCRIPT_HANHIRAKATA;
+ break;
- default:
- for (; *list != 0; list++)
- {
- if (*list == require_script) break;
- }
- if (*list == 0) return FALSE;
- break;
- }
- } /* End of handling negative scriptx */
- } /* End of checking non-Common character */
-
- /* The character is in an acceptable script. We must now ensure that all
- decimal digits in the string come from the same set. Some scripts (e.g.
- Common, Arabic) have more than one set of decimal digits. This code does
- not allow mixing sets, even within the same script. The vector called
- PRIV(ucd_digit_sets)[] contains, in its first element, the number of
- following elements, and then, in ascending order, the code points of the
- '9' characters in every set of 10 digits. Each set is identified by the
- offset in the vector of its '9' character. An initial check of the first
- value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
-
- if (ucd->chartype == ucp_Nd)
- {
- uint32_t digitset;
+ case ucp_Bopomofo:
+ require_state = SCRIPT_HANBOPOMOFO;
+ break;
+
+ case ucp_Hangul:
+ require_state = SCRIPT_HANHANGUL;
+ break;
+
+ /* Compute the intersection of the required list of scripts and the
+ allowed scripts for this character. */
- if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
+ default:
+ for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i];
+ break;
+ }
+
+ break;
+ }
+ } /* End checking character's script and extensions. */
+
+ /* The character is in an acceptable script. We must now ensure that all
+ decimal digits in the string come from the same set. Some scripts (e.g.
+ Common, Arabic) have more than one set of decimal digits. This code does
+ not allow mixing sets, even within the same script. The vector called
+ PRIV(ucd_digit_sets)[] contains, in its first element, the number of
+ following elements, and then, in ascending order, the code points of the
+ '9' characters in every set of 10 digits. Each set is identified by the
+ offset in the vector of its '9' character. An initial check of the first
+ value picks up ASCII digits quickly. Otherwise, a binary chop is used. */
+
+ if (ucd->chartype == ucp_Nd)
+ {
+ uint32_t digitset;
+
+ if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else
+ {
+ int mid;
+ int bot = 1;
+ int top = PRIV(ucd_digit_sets)[0];
+ for (;;)
{
- int mid;
- int bot = 1;
- int top = PRIV(ucd_digit_sets)[0];
- for (;;)
+ if (top <= bot + 1) /* <= rather than == is paranoia */
{
- if (top <= bot + 1) /* <= rather than == is paranoia */
- {
- digitset = top;
- break;
- }
- mid = (top + bot) / 2;
- if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
+ digitset = top;
+ break;
}
+ mid = (top + bot) / 2;
+ if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;
}
+ }
- /* A required value of 0 means "unset". */
+ /* A required value of 0 means "unset". */
- if (require_digitset == 0) require_digitset = digitset;
- else if (digitset != require_digitset) return FALSE;
- } /* End digit handling */
- } /* End checking non-Inherited character */
+ if (require_digitset == 0) require_digitset = digitset;
+ else if (digitset != require_digitset) return FALSE;
+ } /* End digit handling */
/* If we haven't yet got to the end, pick up the next character. */