[v4,1/3] Add support for locales with zero collation rules.

Message ID 20210729063515.1541388-2-carlos@redhat.com
State Superseded
Headers
Series C.UTF-8 |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Carlos O'Donell July 29, 2021, 6:35 a.m. UTC
  While there is code to handle 'nrules == 0' in various locations
within posix/fnmatch_loop.c, posix/regcomp.c and posix/regexec.c,
these conditionals do not work.  The only collation with zero
rules in effect today is the builtin C/POSIX locale which is
built by hand, and despite have zero rules it has a collseqmb
and collseqwc tables stored in the locale data. These tables are
simple identity tables which are not actually required and could
be removed at a later date after this change.  The changes are in
order to prepare for C.UTF-8 which has zero rules and has no
collation sequence tables (multibyte or widechar).

No regressions on x86_64 or i686.
---
 posix/fnmatch_loop.c | 95 +++++++++++++++++++++++++++-----------------
 posix/regcomp.c      | 12 +++---
 posix/regexec.c      | 85 ++++++++++++++++++---------------------
 3 files changed, 104 insertions(+), 88 deletions(-)
  

Comments

Florian Weimer July 29, 2021, 9:44 a.m. UTC | #1
* Carlos O'Donell via Libc-alpha:

> @@ -600,42 +597,51 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
>                      if (c == L_('-') && *p != L_(']'))
>                        {
>  #if _LIBC
> -                        /* We have to find the collation sequence
> -                           value for C.  Collation sequence is nothing
> -                           we can regularly access.  The sequence
> -                           value is defined by the order in which the
> -                           definitions of the collation values for the
> -                           various characters appear in the source
> -                           file.  A strange concept, nowhere
> -                           documented.  */
> -                        uint32_t fcollseq;
> -                        uint32_t lcollseq;
> +			/* We must find the collation sequence values for
> +			   the low part of the range, the high part of the
> +			   range and the searched value FN.  We do this by
> +			   using the POSIX concept of Collation Element
> +			   Ordering, which is the defined order of elements
> +			   in the source locale.  FCOLLSEQ is the searched
> +			   element in the range, while LCOLLSEQ is the low
> +			   element in the range.  If we have no collation
> +			   rules (nrules == 0) then we must fall back to a
> +			   basic code point value for the collation
> +			   sequence value (which is correct for ASCII and
> +			   UTF-8).  We must never use collseq if nrules ==
> +			   0 since none of the tables we need will be
> +			   present in the compiled binary locale.  We start
> +			   with fcollseq and lcollseq at unknown collation
> +			   sequences.  We only compute hcollseq, the high
> +			   part of the range if required.  */
> +                        uint32_t fcollseq = ~((uint32_t) 0);
> +                        uint32_t lcollseq = ~((uint32_t) 0);
>                          UCHAR cend = *p++;

Looks like ~((uint32_t) 0) needs to be added as a macro/constant to
__collseq_table_lookup.



>  
> +			if (nrules != 0)
> +			  {
>  # if WIDE_CHAR_VERSION
> +			    /* Search the collation data for the character.  */
> +			    fcollseq = __collseq_table_lookup (collseq, fn);
> +			    if (fcollseq == ~((uint32_t) 0))
> +			      /* We don't know anything about the character
> +				 we are supposed to match.  This means we are
> +				 failing.  */
> +			      goto range_not_matched;
> +
> +			    if (is_seqval)
> +			      lcollseq = cold;
> +			    else
> +			      lcollseq = __collseq_table_lookup (collseq, cold);
>  # else
> +			    fcollseq = collseq[fn];
> +			    lcollseq = is_seqval ? cold : collseq[(UCHAR) cold];
>  # endif
> +			  }
>  
>                          is_seqval = false;
>                          if (cend == L_('[') && *p == L_('.'))
>                            {
>                              const CHAR *startp = p;
>                              size_t c1 = 0;
>  
> @@ -752,14 +758,20 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
>                              cend = FOLD (cend);
>                            }
>  
> +			/* If we have rules, and the low sequence is lower than
> +			   the value of the searched sequence then we must
> +			   lookup the high collation sequence value and
> +			   determine if the fcollseq falls within the range.
> +			   If hcollseq is unknown then we could still match
> +			   fcollseq on the low end of the range.  If lcollseq
> +			   if unknown (0xffffffff) we will still fail to
> +			   match, but in the future we might consider matching
> +			   the high end of the range on an exact match.  */
> +                        if (nrules != 0 && (
>  # if WIDE_CHAR_VERSION
>                              lcollseq == 0xffffffff ||

This should use the same constant as the initialization.  The #if is now
unnecessary and can be removed.

>  # endif
> +                            lcollseq <= fcollseq))
>                            {
>                              /* We have to look at the upper bound.  */
>                              uint32_t hcollseq;
> @@ -789,6 +801,17 @@ FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
>                              if (lcollseq <= hcollseq && fcollseq <= hcollseq)
>                                goto matched;
>                            }
> +
> +			/* No rules, but we have a range.  */
> +			if (nrules == 0)
> +			  {
> +			    if (cend == L_('\0'))
> +			      return FNM_NOMATCH;
> +
> +			    /* Compare that fn is within the range.  */
> +			    if ((UCHAR) cold <= fn && fn <= cend)
> +			      goto matched;
> +

This part looks okay to me.

> diff --git a/posix/regcomp.c b/posix/regcomp.c
> index d93698ae78..f55d20cbfd 100644
> --- a/posix/regcomp.c
> +++ b/posix/regcomp.c
> @@ -2889,7 +2889,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
>  	  if (MB_CUR_MAX == 1)
>  	  */
>  	  if (nrules == 0)
> -	    return collseqmb[br_elem->opr.ch];
> +	    return br_elem->opr.ch;
>  	  else
>  	    {
>  	      wint_t wc = __btowc (br_elem->opr.ch);
> @@ -2900,6 +2900,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
>  	{
>  	  if (nrules != 0)
>  	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
> +	  else
> +	    return br_elem->opr.wch;
>  	}
>        else if (br_elem->type == COLL_SYM)
>  	{
> @@ -2935,7 +2937,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
>  		}
>  	    }
>  	  else if (sym_name_len == 1)
> -	    return collseqmb[br_elem->opr.name[0]];
> +	    return br_elem->opr.name[0];
>  	}
>        return UINT_MAX;
>      }
> @@ -3017,7 +3019,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
>  	  if (MB_CUR_MAX == 1)
>  	  */
>  	  if (nrules == 0)
> -	    ch_collseq = collseqmb[ch];
> +	    ch_collseq = ch;
>  	  else
>  	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
>  	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
> @@ -3103,11 +3105,11 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
>    int token_len;
>    bool first_round = true;
>  #ifdef _LIBC
> -  collseqmb = (const unsigned char *)
> -    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
>    nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
>    if (nrules)
>      {
> +      collseqmb = (const unsigned char *)
> +	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
>        /*
>        if (MB_CUR_MAX > 1)
>        */

These changes look good.

> diff --git a/posix/regexec.c b/posix/regexec.c
> index f7b4f9cfc3..6cc23831aa 100644
> --- a/posix/regexec.c
> +++ b/posix/regexec.c
> @@ -3858,62 +3858,53 @@ check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
>  }
>  
>  # ifdef _LIBC
> +#include <assert.h>

This really should got to the start of the file.

> +
>  static unsigned int
>  find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
>  {
> +  int32_t idx;
> +  const unsigned char *extra = (const unsigned char *)
>  	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
> +  int32_t extrasize = (const unsigned char *)
>  	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
> +  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
> +
> +  /* Only called from within 'if (nrules != 0)'.  */ 

Trailing whitespace on the last line.  Most of this is just
reindentation, so okay.

> +  assert (nrules != 0);

Right, the single caller is under nrules != 0.

strcoll and strxfrm are already correct.

Thanks,
Florian
  
Paul Eggert July 29, 2021, 7:12 p.m. UTC | #2
On 7/29/21 2:44 AM, Florian Weimer via Libc-alpha wrote:
> Looks like ~((uint32_t) 0) needs to be added as a macro/constant to
> __collseq_table_lookup.

A small point: UINT32_MAX would be simpler and cleaner definiens for 
that macro, as UINT32_MAX can be used in #if and is more portable to 
potential future platforms where int is wider than 32 bits.
  

Patch

diff --git a/posix/fnmatch_loop.c b/posix/fnmatch_loop.c
index 7f938af590..547952f0a9 100644
--- a/posix/fnmatch_loop.c
+++ b/posix/fnmatch_loop.c
@@ -51,6 +51,7 @@  FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
     _NL_CURRENT(LC_COLLATE, _NL_COLLATE_COLLSEQMB);
 # endif
 #endif
+  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
 
   while ((c = *p++) != L_('\0'))
     {
@@ -324,8 +325,6 @@  FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
                        diagnose a "used initialized" in a dead branch in the
                        findidx function.  */
                     UCHAR str;
-                    uint32_t nrules =
-                      _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
                     const CHAR *startp = p;
 
                     c = *++p;
@@ -437,8 +436,6 @@  FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
 
                     if (c == L_('[') && *p == L_('.'))
                       {
-                        uint32_t nrules =
-                          _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
                         const CHAR *startp = p;
                         size_t c1 = 0;
 
@@ -600,42 +597,51 @@  FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
                     if (c == L_('-') && *p != L_(']'))
                       {
 #if _LIBC
-                        /* We have to find the collation sequence
-                           value for C.  Collation sequence is nothing
-                           we can regularly access.  The sequence
-                           value is defined by the order in which the
-                           definitions of the collation values for the
-                           various characters appear in the source
-                           file.  A strange concept, nowhere
-                           documented.  */
-                        uint32_t fcollseq;
-                        uint32_t lcollseq;
+			/* We must find the collation sequence values for
+			   the low part of the range, the high part of the
+			   range and the searched value FN.  We do this by
+			   using the POSIX concept of Collation Element
+			   Ordering, which is the defined order of elements
+			   in the source locale.  FCOLLSEQ is the searched
+			   element in the range, while LCOLLSEQ is the low
+			   element in the range.  If we have no collation
+			   rules (nrules == 0) then we must fall back to a
+			   basic code point value for the collation
+			   sequence value (which is correct for ASCII and
+			   UTF-8).  We must never use collseq if nrules ==
+			   0 since none of the tables we need will be
+			   present in the compiled binary locale.  We start
+			   with fcollseq and lcollseq at unknown collation
+			   sequences.  We only compute hcollseq, the high
+			   part of the range if required.  */
+                        uint32_t fcollseq = ~((uint32_t) 0);
+                        uint32_t lcollseq = ~((uint32_t) 0);
                         UCHAR cend = *p++;
 
+			if (nrules != 0)
+			  {
 # if WIDE_CHAR_VERSION
-                        /* Search in the 'names' array for the characters.  */
-                        fcollseq = __collseq_table_lookup (collseq, fn);
-                        if (fcollseq == ~((uint32_t) 0))
-                          /* XXX We don't know anything about the character
-                             we are supposed to match.  This means we are
-                             failing.  */
-                          goto range_not_matched;
-
-                        if (is_seqval)
-                          lcollseq = cold;
-                        else
-                          lcollseq = __collseq_table_lookup (collseq, cold);
+			    /* Search the collation data for the character.  */
+			    fcollseq = __collseq_table_lookup (collseq, fn);
+			    if (fcollseq == ~((uint32_t) 0))
+			      /* We don't know anything about the character
+				 we are supposed to match.  This means we are
+				 failing.  */
+			      goto range_not_matched;
+
+			    if (is_seqval)
+			      lcollseq = cold;
+			    else
+			      lcollseq = __collseq_table_lookup (collseq, cold);
 # else
-                        fcollseq = collseq[fn];
-                        lcollseq = is_seqval ? cold : collseq[(UCHAR) cold];
+			    fcollseq = collseq[fn];
+			    lcollseq = is_seqval ? cold : collseq[(UCHAR) cold];
 # endif
+			  }
 
                         is_seqval = false;
                         if (cend == L_('[') && *p == L_('.'))
                           {
-                            uint32_t nrules =
-                              _NL_CURRENT_WORD (LC_COLLATE,
-                                                _NL_COLLATE_NRULES);
                             const CHAR *startp = p;
                             size_t c1 = 0;
 
@@ -752,14 +758,20 @@  FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
                             cend = FOLD (cend);
                           }
 
-                        /* XXX It is not entirely clear to me how to handle
-                           characters which are not mentioned in the
-                           collation specification.  */
-                        if (
+			/* If we have rules, and the low sequence is lower than
+			   the value of the searched sequence then we must
+			   lookup the high collation sequence value and
+			   determine if the fcollseq falls within the range.
+			   If hcollseq is unknown then we could still match
+			   fcollseq on the low end of the range.  If lcollseq
+			   if unknown (0xffffffff) we will still fail to
+			   match, but in the future we might consider matching
+			   the high end of the range on an exact match.  */
+                        if (nrules != 0 && (
 # if WIDE_CHAR_VERSION
                             lcollseq == 0xffffffff ||
 # endif
-                            lcollseq <= fcollseq)
+                            lcollseq <= fcollseq))
                           {
                             /* We have to look at the upper bound.  */
                             uint32_t hcollseq;
@@ -789,6 +801,17 @@  FCT (const CHAR *pattern, const CHAR *string, const CHAR *string_end,
                             if (lcollseq <= hcollseq && fcollseq <= hcollseq)
                               goto matched;
                           }
+
+			/* No rules, but we have a range.  */
+			if (nrules == 0)
+			  {
+			    if (cend == L_('\0'))
+			      return FNM_NOMATCH;
+
+			    /* Compare that fn is within the range.  */
+			    if ((UCHAR) cold <= fn && fn <= cend)
+			      goto matched;
+			  }
 # if WIDE_CHAR_VERSION
                       range_not_matched:
 # endif
diff --git a/posix/regcomp.c b/posix/regcomp.c
index d93698ae78..f55d20cbfd 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -2889,7 +2889,7 @@  parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 	  if (MB_CUR_MAX == 1)
 	  */
 	  if (nrules == 0)
-	    return collseqmb[br_elem->opr.ch];
+	    return br_elem->opr.ch;
 	  else
 	    {
 	      wint_t wc = __btowc (br_elem->opr.ch);
@@ -2900,6 +2900,8 @@  parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 	{
 	  if (nrules != 0)
 	    return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
+	  else
+	    return br_elem->opr.wch;
 	}
       else if (br_elem->type == COLL_SYM)
 	{
@@ -2935,7 +2937,7 @@  parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 		}
 	    }
 	  else if (sym_name_len == 1)
-	    return collseqmb[br_elem->opr.name[0]];
+	    return br_elem->opr.name[0];
 	}
       return UINT_MAX;
     }
@@ -3017,7 +3019,7 @@  parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
 	  if (MB_CUR_MAX == 1)
 	  */
 	  if (nrules == 0)
-	    ch_collseq = collseqmb[ch];
+	    ch_collseq = ch;
 	  else
 	    ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
 	  if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
@@ -3103,11 +3105,11 @@  parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
   int token_len;
   bool first_round = true;
 #ifdef _LIBC
-  collseqmb = (const unsigned char *)
-    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
   nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
   if (nrules)
     {
+      collseqmb = (const unsigned char *)
+	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
       /*
       if (MB_CUR_MAX > 1)
       */
diff --git a/posix/regexec.c b/posix/regexec.c
index f7b4f9cfc3..6cc23831aa 100644
--- a/posix/regexec.c
+++ b/posix/regexec.c
@@ -3858,62 +3858,53 @@  check_node_accept_bytes (const re_dfa_t *dfa, Idx node_idx,
 }
 
 # ifdef _LIBC
+#include <assert.h>
+
 static unsigned int
 find_collation_sequence_value (const unsigned char *mbs, size_t mbs_len)
 {
-  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
-  if (nrules == 0)
-    {
-      if (mbs_len == 1)
-	{
-	  /* No valid character.  Match it as a single byte character.  */
-	  const unsigned char *collseq = (const unsigned char *)
-	    _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
-	  return collseq[mbs[0]];
-	}
-      return UINT_MAX;
-    }
-  else
-    {
-      int32_t idx;
-      const unsigned char *extra = (const unsigned char *)
+  int32_t idx;
+  const unsigned char *extra = (const unsigned char *)
 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB);
-      int32_t extrasize = (const unsigned char *)
+  int32_t extrasize = (const unsigned char *)
 	_NL_CURRENT (LC_COLLATE, _NL_COLLATE_SYMB_EXTRAMB + 1) - extra;
+  uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
+
+  /* Only called from within 'if (nrules != 0)'.  */ 
+  assert (nrules != 0);
 
-      for (idx = 0; idx < extrasize;)
+  for (idx = 0; idx < extrasize;)
+    {
+      int mbs_cnt;
+      bool found = false;
+      int32_t elem_mbs_len;
+      /* Skip the name of collating element name.  */
+      idx = idx + extra[idx] + 1;
+      elem_mbs_len = extra[idx++];
+      if (mbs_len == elem_mbs_len)
 	{
-	  int mbs_cnt;
-	  bool found = false;
-	  int32_t elem_mbs_len;
-	  /* Skip the name of collating element name.  */
-	  idx = idx + extra[idx] + 1;
-	  elem_mbs_len = extra[idx++];
-	  if (mbs_len == elem_mbs_len)
-	    {
-	      for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
-		if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
-		  break;
-	      if (mbs_cnt == elem_mbs_len)
-		/* Found the entry.  */
-		found = true;
-	    }
-	  /* Skip the byte sequence of the collating element.  */
-	  idx += elem_mbs_len;
-	  /* Adjust for the alignment.  */
-	  idx = (idx + 3) & ~3;
-	  /* Skip the collation sequence value.  */
-	  idx += sizeof (uint32_t);
-	  /* Skip the wide char sequence of the collating element.  */
-	  idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
-	  /* If we found the entry, return the sequence value.  */
-	  if (found)
-	    return *(uint32_t *) (extra + idx);
-	  /* Skip the collation sequence value.  */
-	  idx += sizeof (uint32_t);
+	  for (mbs_cnt = 0; mbs_cnt < elem_mbs_len; ++mbs_cnt)
+	    if (extra[idx + mbs_cnt] != mbs[mbs_cnt])
+	      break;
+	  if (mbs_cnt == elem_mbs_len)
+	    /* Found the entry.  */
+	    found = true;
 	}
-      return UINT_MAX;
+      /* Skip the byte sequence of the collating element.  */
+      idx += elem_mbs_len;
+      /* Adjust for the alignment.  */
+      idx = (idx + 3) & ~3;
+      /* Skip the collation sequence value.  */
+      idx += sizeof (uint32_t);
+      /* Skip the wide char sequence of the collating element.  */
+      idx = idx + sizeof (uint32_t) * (*(int32_t *) (extra + idx) + 1);
+      /* If we found the entry, return the sequence value.  */
+      if (found)
+        return *(uint32_t *) (extra + idx);
+      /* Skip the collation sequence value.  */
+      idx += sizeof (uint32_t);
     }
+  return UINT_MAX;
 }
 # endif /* _LIBC */
 #endif /* RE_ENABLE_I18N */