diff mbox

Handle surrogate pairs in c16rtomb (bug 23794, DR#488, C2X)

Message ID alpine.DEB.2.21.1810182138270.4093@digraph.polyomino.org.uk
State New, archived
Headers show

Commit Message

Joseph Myers Oct. 18, 2018, 9:39 p.m. UTC
The c16rtomb implementation has:

  // XXX The ISO C 11 spec I have does not say anything about handling
  // XXX surrogates in this interface.

The DR#488 resolution, as applied to C2X, requires surrogate pairs to
be handled here (so the first call returns 0 and stores the high
surrogate in the mbstate_t, while the second call combines the
surrogates, produces a multibyte character and returns the number of
bytes written).  This patch implements that.  (mbrtoc16 already
handled producing surrogates as output.)

Tested for x86_64.

2018-10-18  Joseph Myers  <joseph@codesourcery.com>

	[BZ #23794]
	* wcsmbs/c16rtomb.c (c16rtomb): Save first character of surrogate
	pair and return 0 in that case, and use saved character to
	interpret following character.
	* wcsmbs/tst-c16-surrogate.c: New file.
	* wcsmbs/Makefile (tests): Add tst-c16-surrogate.c.
	[$(run-built-tests) = yes] ($(objpfx)tst-c16-surrogate.out):
	Depend on $(gen-locales)

Comments

Florian Weimer Oct. 19, 2018, 5:42 a.m. UTC | #1
* Joseph Myers:

> +	/* This is not a low surrogate; ensure an EILSEQ error by
> +	   trying to decode the high surrogate as a wide character on
> +	   its own.  */
> +	wc = ps->__value.__wch;

Is this an attempt to support CESU-8 in the future, which is a
UTF-8-style multi-byte encoding for UCS-2, so it can encode lone
surrogate pairs?  Or just something to reduce code size?

> +  /* Test errors for invalid conversions.  */
> +  static const char16_t err_cases[][2] =
> +    {
> +      /* High surrogate followed by non-surrogate.  */
> +      { 0xd800, 0x1 },
> +      /* High surrogate followed by another high surrogate.  */
> +      { 0xd800, 0xd800 },
> +      /* Low surrogate not following high surrogate.  */
> +      { 0xdc00, 0 }

You could add a test for a low surrogate/high surrogate sequence.

Most of the TEST_VERIFY comparisons could use TEST_COMPARE for improved
error diagnostics.

Looks good otherwise.  Thanks.

Florian
diff mbox

Patch

diff --git a/wcsmbs/Makefile b/wcsmbs/Makefile
index 2e02b22c3b..a58d36ff8f 100644
--- a/wcsmbs/Makefile
+++ b/wcsmbs/Makefile
@@ -51,7 +51,8 @@  tests := tst-wcstof wcsmbs-tst1 tst-wcsnlen tst-btowc tst-mbrtowc \
 	 tst-wcrtomb tst-wcpncpy tst-mbsrtowcs tst-wchar-h tst-mbrtowc2 \
 	 tst-c16c32-1 wcsatcliff tst-wcstol-locale tst-wcstod-nan-locale \
 	 tst-wcstod-round test-char-types tst-fgetwc-after-eof \
-	 tst-wcstod-nan-sign $(addprefix test-,$(strop-tests))
+	 tst-wcstod-nan-sign tst-c16-surrogate \
+	 $(addprefix test-,$(strop-tests))
 
 include ../Rules
 
@@ -68,6 +69,7 @@  $(objpfx)tst-wcrtomb.out: $(gen-locales)
 $(objpfx)wcsmbs-tst1.out: $(gen-locales)
 $(objpfx)tst-wcstol-locale.out: $(gen-locales)
 $(objpfx)tst-wcstod-nan-locale.out: $(gen-locales)
+$(objpfx)tst-c16-surrogate.out: $(gen-locales)
 endif
 
 $(objpfx)tst-wcstod-round: $(libm)
diff --git a/wcsmbs/c16rtomb.c b/wcsmbs/c16rtomb.c
index 48a63d067b..2b45a0f443 100644
--- a/wcsmbs/c16rtomb.c
+++ b/wcsmbs/c16rtomb.c
@@ -26,7 +26,40 @@  static mbstate_t state;
 size_t
 c16rtomb (char *s, char16_t c16, mbstate_t *ps)
 {
-  // XXX The ISO C 11 spec I have does not say anything about handling
-  // XXX surrogates in this interface.
-  return wcrtomb (s, c16, ps ?: &state);
+  wchar_t wc = c16;
+
+  if (ps == NULL)
+    ps = &state;
+
+  if (s == NULL)
+    {
+      /* Reset any state relating to surrogate pairs.  */
+      ps->__count &= 0x7fffffff;
+      wc = 0;
+    }
+
+  if (ps->__count & 0x80000000)
+    {
+      /* The previous call passed in the first surrogate of a
+	 surrogate pair.  */
+      ps->__count &= 0x7fffffff;
+      if (wc >= 0xdc00 && wc < 0xe000)
+	wc = (0x10000
+	      + ((ps->__value.__wch & 0x3ff) << 10)
+	      + (wc & 0x3ff));
+      else
+	/* This is not a low surrogate; ensure an EILSEQ error by
+	   trying to decode the high surrogate as a wide character on
+	   its own.  */
+	wc = ps->__value.__wch;
+    }
+  else if (wc >= 0xd800 && wc < 0xdc00)
+    {
+      /* The high part of a surrogate pair.  */
+      ps->__count |= 0x80000000;
+      ps->__value.__wch = wc;
+      return 0;
+    }
+
+  return wcrtomb (s, wc, ps);
 }
diff --git a/wcsmbs/tst-c16-surrogate.c b/wcsmbs/tst-c16-surrogate.c
new file mode 100644
index 0000000000..e40426256f
--- /dev/null
+++ b/wcsmbs/tst-c16-surrogate.c
@@ -0,0 +1,89 @@ 
+/* Test c16rtomb handling of surrogate pairs (DR#488, bug 23794).
+   Copyright (C) 2018 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <locale.h>
+#include <stdio.h>
+#include <string.h>
+#include <uchar.h>
+#include <wchar.h>
+#include <array_length.h>
+#include <support/check.h>
+
+static int
+do_test (void)
+{
+  TEST_VERIFY_EXIT (setlocale (LC_ALL, "de_DE.UTF-8") != NULL);
+  /* Test conversions of surrogate pairs.  */
+  for (char32_t c = 0x10000; c <= 0x10ffff; c += 0x123)
+    {
+      char32_t c_pos = c - 0x10000;
+      char16_t c_hi = (c_pos >> 10) + 0xd800;
+      char16_t c_lo = (c_pos & 0x3ff) + 0xdc00;
+      printf ("testing U+0x%08x (0x%x 0x%x)\n",
+	      (unsigned int) c, (unsigned int) c_hi, (unsigned int) c_lo);
+      char buf[16] = { 0 };
+      size_t ret_hi = c16rtomb (buf, c_hi, NULL);
+      TEST_VERIFY (ret_hi == 0);
+      size_t ret_lo = c16rtomb (buf, c_lo, NULL);
+      TEST_VERIFY (ret_lo == 4);
+      wchar_t wc = 0;
+      size_t ret_wc = mbrtowc (&wc, buf, 4, NULL);
+      TEST_VERIFY (ret_wc == 4);
+      TEST_VERIFY (wc == (wchar_t) c);
+    }
+  /* Test errors for invalid conversions.  */
+  static const char16_t err_cases[][2] =
+    {
+      /* High surrogate followed by non-surrogate.  */
+      { 0xd800, 0x1 },
+      /* High surrogate followed by another high surrogate.  */
+      { 0xd800, 0xd800 },
+      /* Low surrogate not following high surrogate.  */
+      { 0xdc00, 0 }
+    };
+  for (size_t i = 0; i < array_length (err_cases); i++)
+    {
+      char16_t c_hi = err_cases[i][0];
+      char16_t c_lo = err_cases[i][1];
+      printf ("testing error case: 0x%x 0x%x\n", (unsigned int) c_hi,
+	      (unsigned int) c_lo);
+      c16rtomb (NULL, 0, NULL);
+      char buf[16] = { 0 };
+      errno = 0;
+      size_t ret_hi = c16rtomb (buf, c_hi, NULL);
+      if (c_lo == 0)
+	{
+	  /* Unmatched low surrogate in first place.  */
+	  TEST_VERIFY (ret_hi == (size_t) -1);
+	  TEST_VERIFY (errno == EILSEQ);
+	}
+      else
+	{
+	  /* High surrogate; error in second place.  */
+	  TEST_VERIFY (ret_hi == 0);
+	  errno = 0;
+	  size_t ret_lo = c16rtomb (buf, c_lo, NULL);
+	  TEST_VERIFY (ret_lo == (size_t) -1);
+	  TEST_VERIFY (errno == EILSEQ);
+	}
+    }
+  return 0;
+}
+
+#include <support/test-driver.c>