diff options
Diffstat (limited to 'thirdparty/pcre2/src/pcre2_script_run.c')
| -rw-r--r-- | thirdparty/pcre2/src/pcre2_script_run.c | 473 | 
1 files changed, 188 insertions, 285 deletions
diff --git a/thirdparty/pcre2/src/pcre2_script_run.c b/thirdparty/pcre2/src/pcre2_script_run.c index 91a4833028..4926fa63bb 100644 --- a/thirdparty/pcre2/src/pcre2_script_run.c +++ b/thirdparty/pcre2/src/pcre2_script_run.c @@ -7,7 +7,7 @@ and semantics are as close as possible to those of the Perl 5 language.                         Written by Philip Hazel       Original API code Copyright (c) 1997-2012 University of Cambridge -          New API code Copyright (c) 2016-2018 University of Cambridge +          New API code Copyright (c) 2016-2021 University of Cambridge  -----------------------------------------------------------------------------  Redistribution and use in source and binary forms, with or without @@ -68,26 +68,26 @@ Arguments:  Returns:    TRUE if this is a valid script run  */ -/* These dummy values must be less than the negation of the largest offset in -the PRIV(ucd_script_sets) vector, which is held in a 16-bit field in UCD -records (and is only likely to be a few hundred). */ +/* These are states in the checking process. */ -#define SCRIPT_UNSET        (-99999) -#define SCRIPT_HANPENDING   (-99998) -#define SCRIPT_HANHIRAKATA  (-99997) -#define SCRIPT_HANBOPOMOFO  (-99996) -#define SCRIPT_HANHANGUL    (-99995) -#define SCRIPT_LIST         (-99994) +enum { SCRIPT_UNSET,          /* Requirement as yet unknown */ +       SCRIPT_MAP,            /* Bitmap contains acceptable scripts */ +       SCRIPT_HANPENDING,     /* Have had only Han characters */ +       SCRIPT_HANHIRAKATA,    /* Expect Han or Hirikata */ +       SCRIPT_HANBOPOMOFO,    /* Expect Han or Bopomofo */ +       SCRIPT_HANHANGUL       /* Expect Han or Hangul */ +       }; -#define INTERSECTION_LIST_SIZE 50 +#define UCD_MAPSIZE (ucp_Unknown/32 + 1) +#define FULL_MAPSIZE (ucp_Script_Count/32 + 1)  BOOL  PRIV(script_run)(PCRE2_SPTR ptr, PCRE2_SPTR endptr, BOOL utf)  {  #ifdef SUPPORT_UNICODE -int require_script = SCRIPT_UNSET; -uint8_t intersection_list[INTERSECTION_LIST_SIZE]; -const uint8_t *require_list = NULL; +uint32_t require_state = SCRIPT_UNSET; +uint32_t require_map[FULL_MAPSIZE]; +uint32_t map[FULL_MAPSIZE];  uint32_t require_digitset = 0;  uint32_t c; @@ -101,11 +101,17 @@ if (ptr >= endptr) return TRUE;  GETCHARINCTEST(c, ptr);  if (ptr >= endptr) return TRUE; +/* Initialize the require map. This is a full-size bitmap that has a bit for +every script, as opposed to the maps in ucd_script_sets, which only have bits +for scripts less than ucp_Unknown - those that appear in script extension +lists. */ + +for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] = 0; +  /* Scan strings of two or more characters, checking the Unicode characteristics -of each code point. We make use of the Script Extensions property. There is -special code for scripts that can be combined with characters from the Han -Chinese script. This may be used in conjunction with four other scripts in -these combinations: +of each code point. There is special code for scripts that can be combined with +characters from the Han Chinese script. This may be used in conjunction with +four other scripts in these combinations:  . Han with Hiragana and Katakana is allowed (for Japanese).  . Han with Bopomofo is allowed (for Taiwanese Mandarin). @@ -119,310 +125,207 @@ Hence the SCRIPT_HANPENDING state. */  for (;;)    {    const ucd_record *ucd = GET_UCD(c); -  int32_t scriptx = ucd->scriptx; +  uint32_t script = ucd->script; -  /* If the script extension is Unknown, the string is not a valid script run. -  Such characters can only form script runs of length one. */ +  /* If the script is Unknown, the string is not a valid script run. Such +  characters can only form script runs of length one (see test above). */ -  if (scriptx == ucp_Unknown) return FALSE; +  if (script == ucp_Unknown) return FALSE; -  /* A character whose script extension is Inherited is always accepted with -  any script, and plays no further part in this testing. A character whose -  script is Common is always accepted, but must still be tested for a digit -  below. The scriptx value at this point is non-zero, because zero is -  ucp_Unknown, tested for above. */ +  /* A character without any script extensions whose script is Inherited or +  Common is always accepted with any script. If there are extensions, the +  following processing happens for all scripts. */ -  if (scriptx != ucp_Inherited) +  if (UCD_SCRIPTX_PROP(ucd) != 0 || (script != ucp_Inherited && script != ucp_Common))      { -    if (scriptx != ucp_Common) +    BOOL OK; + +    /* Set up a full-sized map for this character that can include bits for all +    scripts. Copy the scriptx map for this character (which covers those +    scripts that appear in script extension lists), set the remaining values to +    zero, and then, except for Common or Inherited, add this script's bit to +    the map. */ + +    memcpy(map, PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(ucd), UCD_MAPSIZE * sizeof(uint32_t)); +    memset(map + UCD_MAPSIZE, 0, (FULL_MAPSIZE - UCD_MAPSIZE) * sizeof(uint32_t)); +    if (script != ucp_Common && script != ucp_Inherited) MAPSET(map, script); + +    /* Handle the different checking states */ + +    switch(require_state)        { -      /* If the script extension value is positive, the character is not a mark -      that can be used with many scripts. In the simple case we either set or -      compare with the required script. However, handling the scripts that can -      combine with Han are more complicated, as is the case when the previous -      characters have been man-script marks. */ +      /* First significant character - it might follow Common or Inherited +      characters that do not have any script extensions. */ -      if (scriptx > 0) +      case SCRIPT_UNSET: +      switch(script)          { -        switch(require_script) -          { -          /* Either the first significant character (require_script unset) or -          after only Han characters. */ - -          case SCRIPT_UNSET: -          case SCRIPT_HANPENDING: -          switch(scriptx) -            { -            case ucp_Han: -            require_script = SCRIPT_HANPENDING; -            break; - -            case ucp_Hiragana: -            case ucp_Katakana: -            require_script = SCRIPT_HANHIRAKATA; -            break; - -            case ucp_Bopomofo: -            require_script = SCRIPT_HANBOPOMOFO; -            break; - -            case ucp_Hangul: -            require_script = SCRIPT_HANHANGUL; -            break; - -            /* Not a Han-related script. If expecting one, fail. Otherise set -            the requirement to this script. */ - -            default: -            if (require_script == SCRIPT_HANPENDING) return FALSE; -            require_script = scriptx; -            break; -            } -          break; +        case ucp_Han: +        require_state = SCRIPT_HANPENDING; +        break; + +        case ucp_Hiragana: +        case ucp_Katakana: +        require_state = SCRIPT_HANHIRAKATA; +        break; + +        case ucp_Bopomofo: +        require_state = SCRIPT_HANBOPOMOFO; +        break; + +        case ucp_Hangul: +        require_state = SCRIPT_HANHANGUL; +        break; + +        default: +        memcpy(require_map, map, FULL_MAPSIZE * sizeof(uint32_t)); +        require_state = SCRIPT_MAP; +        break; +        } +      break; -          /* Previously encountered one of the "with Han" scripts. Check that -          this character is appropriate. */ +      /* The first significant character was Han. An inspection of the Unicode +      11.0.0 files shows that there are the following types of Script Extension +      list that involve the Han, Bopomofo, Hiragana, Katakana, and Hangul +      scripts: -          case SCRIPT_HANHIRAKATA: -          if (scriptx != ucp_Han && scriptx != ucp_Hiragana &&  -              scriptx != ucp_Katakana) -            return FALSE; -          break; +      . Bopomofo + Han +      . Han + Hiragana + Katakana +      . Hiragana + Katakana +      . Bopopmofo + Hangul + Han + Hiragana + Katakana -          case SCRIPT_HANBOPOMOFO: -          if (scriptx != ucp_Han && scriptx != ucp_Bopomofo) return FALSE; -          break; +      The following code tries to make sense of this. */ -          case SCRIPT_HANHANGUL: -          if (scriptx != ucp_Han && scriptx != ucp_Hangul) return FALSE; -          break; +#define FOUND_BOPOMOFO 1 +#define FOUND_HIRAGANA 2 +#define FOUND_KATAKANA 4 +#define FOUND_HANGUL   8 -          /* We have a list of scripts to check that is derived from one or -          more previous characters. This is either one of the lists in -          ucd_script_sets[] (for one previous character) or the intersection of -          several lists for multiple characters. */ - -          case SCRIPT_LIST: -            { -            const uint8_t *list; -            for (list = require_list; *list != 0; list++) -              { -              if (*list == scriptx) break; -              } -            if (*list == 0) return FALSE; -            } - -          /* The rest of the string must be in this script, but we have to  -          allow for the Han complications. */ -           -          switch(scriptx) -            { -            case ucp_Han: -            require_script = SCRIPT_HANPENDING; -            break; - -            case ucp_Hiragana: -            case ucp_Katakana: -            require_script = SCRIPT_HANHIRAKATA; -            break; - -            case ucp_Bopomofo: -            require_script = SCRIPT_HANBOPOMOFO; -            break; - -            case ucp_Hangul: -            require_script = SCRIPT_HANHANGUL; -            break; - -            default: -            require_script = scriptx; -            break; -            }   -          break; +      case SCRIPT_HANPENDING: +      if (script != ucp_Han)   /* Another Han does nothing */ +        { +        uint32_t chspecial = 0; -          /* This is the easy case when a single script is required. */ +        if (MAPBIT(map, ucp_Bopomofo) != 0) chspecial |= FOUND_BOPOMOFO; +        if (MAPBIT(map, ucp_Hiragana) != 0) chspecial |= FOUND_HIRAGANA; +        if (MAPBIT(map, ucp_Katakana) != 0) chspecial |= FOUND_KATAKANA; +        if (MAPBIT(map, ucp_Hangul) != 0)   chspecial |= FOUND_HANGUL; -          default: -          if (scriptx != require_script) return FALSE; -          break; -          } -        }  /* End of handing positive scriptx */ +        if (chspecial == 0) return FALSE;   /* Not allowed with Han */ -      /* If scriptx is negative, this character is a mark-type character that -      has a list of permitted scripts. */ +        if (chspecial == FOUND_BOPOMOFO) +          require_state = SCRIPT_HANBOPOMOFO; +        else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA)) +          require_state = SCRIPT_HANHIRAKATA; -      else -        { -        uint32_t chspecial; -        const uint8_t *clist, *rlist; -        const uint8_t *list = PRIV(ucd_script_sets) - scriptx; -         -        switch(require_script) -          { -          case SCRIPT_UNSET: -          require_list = PRIV(ucd_script_sets) - scriptx; -          require_script = SCRIPT_LIST; -          break; +        /* Otherwise this character must be allowed with all of them, so remain +        in the pending state. */ +        } +      break; -          /* An inspection of the Unicode 11.0.0 files shows that there are the -          following types of Script Extension list that involve the Han, -          Bopomofo, Hiragana, Katakana, and Hangul scripts: +      /* Previously encountered one of the "with Han" scripts. Check that +      this character is appropriate. */ -          . Bopomofo + Han -          . Han + Hiragana + Katakana -          . Hiragana + Katakana -          . Bopopmofo + Hangul + Han + Hiragana + Katakana +      case SCRIPT_HANHIRAKATA: +      if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hiragana) + +          MAPBIT(map, ucp_Katakana) == 0) return FALSE; +      break; -          The following code tries to make sense of this. */ +      case SCRIPT_HANBOPOMOFO: +      if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Bopomofo) == 0) return FALSE; +      break; -#define FOUND_BOPOMOFO 1 -#define FOUND_HIRAGANA 2 -#define FOUND_KATAKANA 4 -#define FOUND_HANGUL   8 +      case SCRIPT_HANHANGUL: +      if (MAPBIT(map, ucp_Han) + MAPBIT(map, ucp_Hangul) == 0) return FALSE; +      break; -          case SCRIPT_HANPENDING: -          chspecial = 0; -          for (; *list != 0; list++) -            { -            switch (*list) -              { -              case ucp_Bopomofo: chspecial |= FOUND_BOPOMOFO; break; -              case ucp_Hiragana: chspecial |= FOUND_HIRAGANA; break; -              case ucp_Katakana: chspecial |= FOUND_KATAKANA; break; -              case ucp_Hangul:   chspecial |= FOUND_HANGUL; break; -              default: break; -              } -            } - -           if (chspecial == 0) return FALSE; - -           if (chspecial == FOUND_BOPOMOFO) -             { -             require_script = SCRIPT_HANBOPOMOFO; -             } -           else if (chspecial == (FOUND_HIRAGANA|FOUND_KATAKANA)) -             { -             require_script = SCRIPT_HANHIRAKATA; -             } - -          /* Otherwise it must be allowed with all of them, so remain in -          the pending state. */ +      /* Previously encountered one or more characters that are allowed with a +      list of scripts. */ -          break; +      case SCRIPT_MAP: +      OK = FALSE; -          case SCRIPT_HANHIRAKATA: -          for (; *list != 0; list++) -            { -            if (*list == ucp_Hiragana || *list == ucp_Katakana) break; -            } -          if (*list == 0) return FALSE; +      for (int i = 0; i < FULL_MAPSIZE; i++) +        { +        if ((require_map[i] & map[i]) != 0) +          { +          OK = TRUE;            break; +          } +        } -          case SCRIPT_HANBOPOMOFO: -          for (; *list != 0; list++) -            { -            if (*list == ucp_Bopomofo) break; -            } -          if (*list == 0) return FALSE; -          break; +      if (!OK) return FALSE; -          case SCRIPT_HANHANGUL: -          for (; *list != 0; list++) -            { -            if (*list == ucp_Hangul) break; -            } -          if (*list == 0) return FALSE; -          break; +      /* The rest of the string must be in this script, but we have to +      allow for the Han complications. */ -          /* Previously encountered one or more characters that are allowed -          with a list of scripts. Build the intersection of the required list -          with this character's list in intersection_list[]. This code is -          written so that it still works OK if the required list is already in -          that vector. */ - -          case SCRIPT_LIST: -            { -            int i = 0; -            for (rlist = require_list; *rlist != 0; rlist++) -              { -              for (clist = list; *clist != 0; clist++) -                { -                if (*rlist == *clist) -                  { -                  intersection_list[i++] = *rlist; -                  break; -                  } -                } -              } -            if (i == 0) return FALSE;  /* No scripts in common */ - -            /* If there's just one script in common, we can set it as the -            unique required script. Otherwise, terminate the intersection list -            and make it the required list. */ - -            if (i == 1) -              { -              require_script = intersection_list[0]; -              } -            else -              { -              intersection_list[i] = 0; -              require_list = intersection_list; -              } -            } -          break; +      switch(script) +        { +        case ucp_Han: +        require_state = SCRIPT_HANPENDING; +        break; -          /* The previously set required script is a single script, not -          Han-related. Check that it is in this character's list. */ +        case ucp_Hiragana: +        case ucp_Katakana: +        require_state = SCRIPT_HANHIRAKATA; +        break; -          default: -          for (; *list != 0; list++) -            { -            if (*list == require_script) break; -            } -          if (*list == 0) return FALSE; -          break; -          } -        }  /* End of handling negative scriptx */ -      }    /* End of checking non-Common character */ - -    /* The character is in an acceptable script. We must now ensure that all -    decimal digits in the string come from the same set. Some scripts (e.g. -    Common, Arabic) have more than one set of decimal digits. This code does -    not allow mixing sets, even within the same script. The vector called -    PRIV(ucd_digit_sets)[] contains, in its first element, the number of -    following elements, and then, in ascending order, the code points of the -    '9' characters in every set of 10 digits. Each set is identified by the -    offset in the vector of its '9' character. An initial check of the first -    value picks up ASCII digits quickly. Otherwise, a binary chop is used. */ - -    if (ucd->chartype == ucp_Nd) -      { -      uint32_t digitset; +        case ucp_Bopomofo: +        require_state = SCRIPT_HANBOPOMOFO; +        break; + +        case ucp_Hangul: +        require_state = SCRIPT_HANHANGUL; +        break; + +        /* Compute the intersection of the required list of scripts and the +        allowed scripts for this character. */ -      if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else +        default: +        for (int i = 0; i < FULL_MAPSIZE; i++) require_map[i] &= map[i]; +        break; +        } + +      break; +      } +    }   /* End checking character's script and extensions. */ + +  /* The character is in an acceptable script. We must now ensure that all +  decimal digits in the string come from the same set. Some scripts (e.g. +  Common, Arabic) have more than one set of decimal digits. This code does +  not allow mixing sets, even within the same script. The vector called +  PRIV(ucd_digit_sets)[] contains, in its first element, the number of +  following elements, and then, in ascending order, the code points of the +  '9' characters in every set of 10 digits. Each set is identified by the +  offset in the vector of its '9' character. An initial check of the first +  value picks up ASCII digits quickly. Otherwise, a binary chop is used. */ + +  if (ucd->chartype == ucp_Nd) +    { +    uint32_t digitset; + +    if (c <= PRIV(ucd_digit_sets)[1]) digitset = 1; else +      { +      int mid; +      int bot = 1; +      int top = PRIV(ucd_digit_sets)[0]; +      for (;;)          { -        int mid; -        int bot = 1; -        int top = PRIV(ucd_digit_sets)[0]; -        for (;;) +        if (top <= bot + 1)    /* <= rather than == is paranoia */            { -          if (top <= bot + 1)    /* <= rather than == is paranoia */ -            { -            digitset = top; -            break; -            } -          mid = (top + bot) / 2; -          if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid; +          digitset = top; +          break;            } +        mid = (top + bot) / 2; +        if (c <= PRIV(ucd_digit_sets)[mid]) top = mid; else bot = mid;          } +      } -      /* A required value of 0 means "unset". */ +    /* A required value of 0 means "unset". */ -      if (require_digitset == 0) require_digitset = digitset; -        else if (digitset != require_digitset) return FALSE; -      }   /* End digit handling */ -    }     /* End checking non-Inherited character */ +    if (require_digitset == 0) require_digitset = digitset; +      else if (digitset != require_digitset) return FALSE; +    }   /* End digit handling */    /* If we haven't yet got to the end, pick up the next character. */  |