[v4,1/4] Add support for processing wide ellipsis ranges in UTF-8.

Message ID 20210428130033.3196848-2-carlos@redhat.com
State Superseded
Headers
Series Add new C.UTF-8 locale (Bug 17318) |

Commit Message

Carlos O'Donell April 28, 2021, 1 p.m. UTC
  If the input character map is UTF-8 then the ellipsis handling is
relaxed with regards to the POSIX requirement for null byte
output and instead a custom increment function is used to
correctly handle the ellipsis output to generate valid UTF-8
code points.

Developers of locales want to be able to write large ellipsis
sequences without having a priori knowledge of the encoding that
would require them to split the ellipsis to avoid null byte
output.

Tested on x86_64 and i686 without regression.
---
 locale/programs/charmap.c | 174 ++++++++++++++++++++++++++++++++++----
 1 file changed, 156 insertions(+), 18 deletions(-)
  

Comments

Florian Weimer April 29, 2021, 2:11 p.m. UTC | #1
* Carlos O'Donell:

> If the input character map is UTF-8 then the ellipsis handling is
> relaxed with regards to the POSIX requirement for null byte
> output and instead a custom increment function is used to
> correctly handle the ellipsis output to generate valid UTF-8
> code points.
>
> Developers of locales want to be able to write large ellipsis
> sequences without having a priori knowledge of the encoding that
> would require them to split the ellipsis to avoid null byte
> output.

This looks okay to me.

Thanks,
Florian
  

Patch

diff --git a/locale/programs/charmap.c b/locale/programs/charmap.c
index 3d51e702dc..cb134e3b8a 100644
--- a/locale/programs/charmap.c
+++ b/locale/programs/charmap.c
@@ -49,7 +49,7 @@  static void new_width (struct linereader *cmfile, struct charmap_t *result,
 static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
 			      size_t nbytes, unsigned char *bytes,
 			      const char *from, const char *to,
-			      int decimal_ellipsis, int step);
+			      int decimal_ellipsis, int step, bool is_utf8);
 
 
 bool enc_not_ascii_compatible;
@@ -285,6 +285,27 @@  parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
   enum token_t ellipsis = 0;
   int step = 1;
 
+  /* POSIX explicitly requires that ellipsis processing do the
+     following: "Bytes shall be treated as unsigned octets, and carry
+     shall be propagated between the bytes as necessary to represent the
+     range."  It then goes on to say that such a declaration should
+     never be specified because it creates null bytes.  Therefore we
+     error on this condition (see charmap_new_char).  However this still
+     leaves a problem for encodings which use less than the full 8-bits,
+     like UTF-8, and in such encodings you can use an ellipsis to
+     silently and accidentally create invalid ranges.  In UTF-8 you have
+     only N-bits of the first byte and if your ellipsis covers a code
+     point range larger than this code point block the output is going
+     to be an invalid non-UTF-8 multi-byte sequence.  Thus for
+     UTF-8 we add a special ellipsis handling loop that can increment
+     UTF-8 multi-byte output effectively and for UTF-8 we allow larger
+     ellipsis ranges without error.  There may still be other encodings
+     for which the ellipsis will still generate invalid multi-byte
+     output, but not for UTF-8.  The only alternative would be to call
+     gconv for each Unicode code point in the loop to convert it to the
+     appropriate multi-byte output, but that would be slow.  */
+  bool is_utf8 = false;
+
   /* We don't want symbolic names in string to be translated.  */
   cmfile->translate_strings = 0;
 
@@ -385,9 +406,14 @@  parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
 		}
 
 	      if (nowtok == tok_code_set_name)
-		result->code_set_name = obstack_copy0 (&result->mem_pool,
-						       arg->val.str.startmb,
-						       arg->val.str.lenmb);
+		{
+		  result->code_set_name = obstack_copy0 (&result->mem_pool,
+							 arg->val.str.startmb,
+							 arg->val.str.lenmb);
+
+		  if (strcmp (result->code_set_name, "UTF-8") == 0)
+		    is_utf8 = true;
+		}
 	      else
 		result->repertoiremap = obstack_copy0 (&result->mem_pool,
 						       arg->val.str.startmb,
@@ -570,7 +596,7 @@  character sets with locking states are not supported"));
 	  else
 	    charmap_new_char (cmfile, result, now->val.charcode.nbytes,
 			      now->val.charcode.bytes, from_name, to_name,
-			      ellipsis != tok_ellipsis2, step);
+			      ellipsis != tok_ellipsis2, step, is_utf8);
 
 	  /* Ignore trailing comment silently.  */
 	  lr_ignore_rest (cmfile, 0);
@@ -929,12 +955,81 @@  charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
 	  < 0 ? NULL : (struct charseq *) result);
 }
 
+/* Encode the Unicode code point CP as UTF-8 into the unsigned
+   character array BYTES, which has room for at most NBYTES bytes;
+   only the bytes needed to encode CP are written.
+
+   If CP requires more than NBYTES to be encoded then we return an
+   error of -1.
+
+   If CP is above 0x10ffff (and NBYTES is at least 4, otherwise -1
+   is returned first) then we return an error of -2.
+
+   Otherwise we return the number of bytes encoded (1 to 4).  */
+static int
+output_utf8_bytes (unsigned int cp, size_t nbytes, unsigned char *bytes)
+{
+  /* Nothing can be stored in an empty buffer.  */
+  if (nbytes < 1)
+    return -1;
+
+  /* One byte range, stored as-is (CP is unsigned, so >= 0x0 holds).  */
+  if (cp >= 0x0 && cp <= 0x7f)
+    {
+      bytes[0] = cp;
+      return 1;
+    }
+
+  /* CP is above 0x7f from here on, so we need at least 2 bytes.  */
+  if (nbytes < 2)
+    return -1;
+
+  /* Two byte range: 0xc0 lead with 5 bits, then 6 low bits.  */
+  if (cp >= 0x80 && cp <= 0x7ff)
+    {
+      bytes[0] = 0xc0 | ((cp & 0x07c0) >> 6);
+      bytes[1] = 0x80 | (cp & 0x003f);
+      return 2;
+    }
+
+  /* CP is above 0x7ff from here on, so we need at least 3 bytes.  */
+  if (nbytes < 3)
+    return -1;
+
+  /* Three byte range: 0xe0 lead with 4 bits, then 6 + 6 bits.  The
+     surrogate range 0xd800 to 0xdfff is explicitly allowed since we want
+     consistent sorting of invalid values that might appear in UTF-8.  */
+  if (cp >= 0x800 && cp <= 0xffff)
+    {
+      bytes[0] = 0xe0 | ((cp & 0xf000) >> 12);
+      bytes[1] = 0x80 | ((cp & 0x0fc0) >> 6);
+      bytes[2] = 0x80 | (cp & 0x003f);
+      return 3;
+    }
+
+  /* CP is above 0xffff from here on, so we need at least 4 bytes.  */
+  if (nbytes < 4)
+    return -1;
+
+  /* Four byte range: 0xf0 lead with 3 bits, then 6 + 6 + 6 bits.  */
+  if (cp >= 0x10000 && cp <= 0x10ffff)
+    {
+      bytes[0] = 0xf0 | ((cp & 0x1c0000) >> 18);
+      bytes[1] = 0x80 | ((cp & 0x03f000) >> 12);
+      bytes[2] = 0x80 | ((cp & 0x000fc0) >> 6);
+      bytes[3] = 0x80 | (cp & 0x00003f);
+      return 4;
+    }
+
+  /* CP is above 0x10ffff: not a valid Unicode code point.  */
+  return -2;
+}
 
 static void
 charmap_new_char (struct linereader *lr, struct charmap_t *cm,
 		  size_t nbytes, unsigned char *bytes,
 		  const char *from, const char *to,
-		  int decimal_ellipsis, int step)
+		  int decimal_ellipsis, int step, bool is_utf8)
 {
   hash_table *ht = &cm->char_table;
   hash_table *bt = &cm->byte_table;
@@ -1039,11 +1134,56 @@  hexadecimal range format should use only capital characters"));
   for (cnt = from_nr; cnt <= to_nr; cnt += step)
     {
       char *name_end;
+      unsigned char ubytes[4] = { '\0', '\0', '\0', '\0' };
       obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
 		      prefix_len, from, len1 - prefix_len, cnt);
       obstack_1grow (ob, '\0');
       name_end = obstack_finish (ob);
 
+      /* Either we have a UTF-8 charmap, and we compute the bytes (see
+	 comment above), or we have a non-UTF-8 charmap and we follow
+	 POSIX rules as further below for incrementing the bytes in an
+	 ellipsis.  */
+      if (is_utf8)
+	{
+	  int nubytes;
+
+	  /* Directly convert the code point to the UTF-8 encoded bytes.  */
+	  nubytes = output_utf8_bytes (cnt, 4, ubytes);
+
+	  /* This should not happen, but we check for it just in case.  */
+	  if (nubytes == -1)
+	    lr_error (lr,
+		      _("not enough space to output UTF-8 encoding."));
+
+	  /* The other defect here could be that we have a mismatch
+	     between the code point and the encoded value or number of
+	     output bytes.  For example you specify U0000 but assign it
+	     an encoded value that is 3-bytes long (an error), or U0000
+	     is assigned a value of /x01.  */
+	  if (cnt == from_nr)
+	    {
+	      if (nubytes != nbytes)
+		lr_error (lr,
+			  _("encoding length does not match "
+			    "Unicode code point."));
+	      else
+		if (memcmp (bytes, ubytes, nbytes) != 0)
+		  lr_error (lr,
+			    _("encoded value does not match "
+			      "Unicode code point."));
+	    }
+
+	  /* The range does not cover one of the 4 UTF-8 code point ranges.  */
+	  if (nubytes == -2)
+	    lr_error (lr,
+		      _("invalid code point in the range."));
+
+	  /* Use the generated UTF-8 bytes.  */
+	  bytes = ubytes;
+	  nbytes = nubytes;
+	}
+
       newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
       newp->nbytes = nbytes;
       memcpy (newp->bytes, bytes, nbytes);
@@ -1081,19 +1221,17 @@  hexadecimal range format should use only capital characters"));
       /* Please note we don't examine the return value since it is no error
 	 if we have two definitions for a symbol.  */
 
-      /* Increment the value in the byte sequence.  */
-      if (++bytes[nbytes - 1] == '\0')
-	{
-	  int b = nbytes - 2;
+      /* Increment the byte stream following POSIX rules.  */
+      if (!is_utf8)
+        bytes[nbytes - 1]++;
 
-	  do
-	    if (b < 0)
-	      {
-		lr_error (lr,
-			  _("resulting bytes for range not representable."));
-		return;
-	      }
-	  while (++bytes[b--] == 0);
+      /* If we overflowed then that generates a null byte which is an invalid
+	 specification according to POSIX and we issue a parser error.  */
+      if (bytes[nbytes - 1] == '\0')
+	{
+	  lr_error (lr,
+		    _("resulting bytes for range would contain null byte."));
+	  return;
 	}
     }
 }