diff mbox series

[v13] POSIX locale covers every byte [BZ# 29511]

Message ID	37rnsj5e3zfbkbin4hhl7ltvuwzo2voxiys2qeond5efvxrku6@kpj3uugguwgg
State	Superseded
Headers	DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org C59833857707 Date: Mon, 29 May 2023 15:54:17 +0200 To: Florian Weimer <fweimer@redhat.com> Cc: libc-alpha@sourceware.org, Victor Stinner <vstinner@redhat.com> Subject: [PATCH v13] POSIX locale covers every byte [BZ# 29511] Message-ID: <37rnsj5e3zfbkbin4hhl7ltvuwzo2voxiys2qeond5efvxrku6@kpj3uugguwgg> References: <phkvqhkfyyxodn4fiwii7of6stxa7iwekqb5e3lwkpnap3ravd@b2lotjr65iik> MIME-Version: 1.0 Content-Type: multipart/signed; micalg=pgp-sha512; protocol="application/pgp-signature"; boundary="caaxfumhjwccxsmc" Content-Disposition: inline In-Reply-To: <phkvqhkfyyxodn4fiwii7of6stxa7iwekqb5e3lwkpnap3ravd@b2lotjr65iik> User-Agent: NeoMutt/20230517 Precedence: list From: =?utf-8?b?0L3QsNCxIHZpYSBMaWJjLWFscGhh?= <libc-alpha@sourceware.org> Reply-To: =?utf-8?b?0L3QsNCx?= <nabijaczleweli@nabijaczleweli.xyz> Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" <libc-alpha-bounces+patchwork=sourceware.org@sourceware.org>
Series	[v13] POSIX locale covers every byte [BZ# 29511] \| [v13] POSIX locale covers every byte [BZ# 29511]

Checks

Context	Check	Description
redhat-pt-bot/TryBot-apply_patch	success	Patch applied to master at the time it was sent
redhat-pt-bot/TryBot-32bit	success	Build for i686
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64	success	Testing passed
linaro-tcwg-bot/tcwg_glibc_build--master-arm	success	Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm	pending	Patch applied
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64	success	Testing passed

Commit Message

Ahelenia Ziemiańska May 29, 2023, 1:54 p.m. UTC

  This largely duplicates the ASCII code, with the error path changed.

There are two user-facing changes:
  * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
  * mbrtowc() and friends return b if b <= 0x7F else <UDC00>+b

Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively,
  (a) is 1-byte, stateless, and contains 256 characters
  (b) they collate in byte order
  (c) the first 128 characters are equivalent to ASCII (like previous)
cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
changes to the standard;
in short, this means that mbrtowc() must never fail and must return
  b if b <= 0x7F else ab+c for all bytes b
  where c is some constant >=0x80
    and a is a positive integer constant

By strategically picking c=<UDC00> we land at the same point (DC80-DCFF) of
the Unicode Low Surrogate Area (DC00-DFFF), described as
  > Isolated surrogate code points have no interpretation;
  > consequently, no character code charts or names lists
  > are provided for this range.
as the Python UTF-8 errors=surrogateescape encoding.

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
Clean rebase on current master.

 NEWS                                |   8 ++
 iconv/Makefile                      |   2 +-
 iconv/gconv_builtin.h               |   8 ++
 iconv/gconv_int.h                   |   8 ++
 iconv/gconv_posix.c                 |  94 ++++++++++++++++++
 iconv/tst-iconv_prog.sh             |  43 +++++++++
 iconvdata/tst-tables.sh             |   1 +
 inet/tst-idna_name_classify.c       |   6 +-
 locale/C_name.c                     |   2 +-
 locale/tst-C-locale.c               |  44 +++++++++
 localedata/charmaps/POSIX           | 136 ++++++++++++++++++++++++++
 localedata/locales/POSIX            | 143 +++++++++++++++++++++++++++-
 localedata/tst-c-utf8-consistency.c |  24 ++---
 stdio-common/Makefile               |   1 +
 stdio-common/tst-printf-bz25691.c   |   2 +
 wcsmbs/wcsmbsload.c                 |  14 +--
 16 files changed, 512 insertions(+), 24 deletions(-)
 create mode 100644 iconv/gconv_posix.c
 create mode 100644 localedata/charmaps/POSIX

diff mbox series

Patch

diff --git a/NEWS b/NEWS
index a52c17c677..4fdc39b14e 100644
--- a/NEWS
+++ b/NEWS
@@ -35,6 +35,14 @@  Major new features:
   The symbol names follow the AArch64 vector ABI, they are declared
   in math.h and have to be called manually at this point.
 
+* The default/"POSIX"/"C" locale's character set is now "POSIX",
+  instead of "ANSI_X3.4-1968" ‒ this is a new fully-reversible
+  8-bit transparent encoding for compatibility with POSIX Issue 7 TC 2,
+  identity-mapping bytes in the ASCII [0, 0x7F] range,
+  and mapping [0x80, 0xFF] bytes to [<U+DC80>, <U+DCFF>].
+  The standard now requires the "POSIX"/"C" locale to have an encoding
+  with these features ‒ 8-bit transparency and a continuous collation sequence.
+
 Deprecated and removed features, and other changes affecting compatibility:
 
 * In the Linux kernel for the hppa/parisc architecture some of the
diff --git a/iconv/Makefile b/iconv/Makefile
index afb3fb7bdb..b61e130377 100644
--- a/iconv/Makefile
+++ b/iconv/Makefile
@@ -25,7 +25,7 @@  include ../Makeconfig
 headers		= iconv.h gconv.h
 routines	= iconv_open iconv iconv_close \
 		  gconv_open gconv gconv_close gconv_db gconv_conf \
-		  gconv_builtin gconv_simple gconv_trans gconv_cache
+		  gconv_builtin gconv_simple gconv_posix gconv_trans gconv_cache
 routines	+= gconv_dl gconv_charset
 
 vpath %.c ../locale/programs ../intl
diff --git a/iconv/gconv_builtin.h b/iconv/gconv_builtin.h
index 35608b4461..d2dcdd44a3 100644
--- a/iconv/gconv_builtin.h
+++ b/iconv/gconv_builtin.h
@@ -89,6 +89,14 @@  BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->ascii",
 			__gconv_transform_internal_ascii, NULL, 4, 4, 1, 1)
 
 
+BUILTIN_TRANSFORMATION ("POSIX//", "INTERNAL", 1, "=posix->INTERNAL",
+			__gconv_transform_posix_internal, __gconv_btwoc_posix,
+			1, 1, 4, 4)
+
+BUILTIN_TRANSFORMATION ("INTERNAL", "POSIX//", 1, "=INTERNAL->posix",
+			__gconv_transform_internal_posix, NULL, 4, 4, 1, 1)
+
+
 #if BYTE_ORDER == BIG_ENDIAN
 BUILTIN_ALIAS ("UNICODEBIG//", "ISO-10646/UCS2/")
 BUILTIN_ALIAS ("UCS-2BE//", "ISO-10646/UCS2/")
diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h
index 19d042faff..3d0889b321 100644
--- a/iconv/gconv_int.h
+++ b/iconv/gconv_int.h
@@ -309,6 +309,8 @@  extern int __gconv_compare_alias (const char *name1, const char *name2)
 
 __BUILTIN_TRANSFORM (__gconv_transform_ascii_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_ascii);
+__BUILTIN_TRANSFORM (__gconv_transform_posix_internal);
+__BUILTIN_TRANSFORM (__gconv_transform_internal_posix);
 __BUILTIN_TRANSFORM (__gconv_transform_utf8_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_utf8);
 __BUILTIN_TRANSFORM (__gconv_transform_ucs2_internal);
@@ -327,6 +329,12 @@  __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal);
    only ASCII characters.  */
 extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c);
 
+/* Specialized conversion function for a single byte to INTERNAL:
+   bytes [0, 0x7F] map to themselves, and bytes [0x80, 0xFF] are offset by
+   0xDC00 into the Low Surrogate Area, giving [U+DC80, U+DCFF].  */
+extern wint_t __gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+     attribute_hidden;
+
 #endif
 
 __END_DECLS
diff --git a/iconv/gconv_posix.c b/iconv/gconv_posix.c
new file mode 100644
index 0000000000..885929baca
--- /dev/null
+++ b/iconv/gconv_posix.c
@@ -0,0 +1,94 @@ 
+/* "POSIX" locale transformation functions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <gconv_int.h>
+
+
+/* Specialized conversion function for a single byte to INTERNAL:
+   bytes [0, 0x7F] map to themselves, and bytes [0x80, 0xFF] are offset by
+   0xDC00 into the Low Surrogate Area, giving [U+DC80, U+DCFF].  */
+wint_t
+__gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+{
+  if (c < 0x80)		/* ASCII range: identity.  STEP is never read.  */
+    return c;
+  else
+    return 0xdc00 + c;	/* 0x80..0xFF -> 0xDC80..0xDCFF.  */
+}
+
+
+/* Convert from {[0, 0x7F] => ISO 646-IRV; [0x80, 0xFF] => [U+DC80, U+DCFF]}
+   to the internal (UCS4-like) format.  BODY has no error path.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		1
+#define MIN_NEEDED_TO		4
+#define FROM_DIRECTION		1
+#define FROM_LOOP		posix_internal_loop
+#define TO_LOOP			posix_internal_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_posix_internal
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    if (__glibc_unlikely (*inptr > '\x7f'))				      \
+      *((uint32_t *) outptr) = 0xdc00 + *inptr++; /* High byte: lift.  */   \
+    else								      \
+      *((uint32_t *) outptr) = *inptr++;	/* ASCII: identity.  */	      \
+    outptr += sizeof (uint32_t);	/* One uint32_t per input byte.  */ \
+  }
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
+
+
+/* Convert from the internal (UCS4-like) format to
+   {ISO 646-IRV => [0, 0x7F]; [U+DC80, U+DCFF] => [0x80, 0xFF]}.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		4
+#define MIN_NEEDED_TO		1
+#define FROM_DIRECTION		1
+#define FROM_LOOP		internal_posix_loop
+#define TO_LOOP			internal_posix_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_internal_posix
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    uint32_t val = *((const uint32_t *) inptr);				      \
+    if (__glibc_unlikely ((val > 0x7f && val < 0xdc80) || val > 0xdcff))      \
+      {	/* Neither [0, 0x7F] nor [U+DC80, U+DCFF]: not encodable.  */	      \
+	UNICODE_TAG_HANDLER (val, 4);					      \
+	STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
+      }									      \
+    else								      \
+      {									      \
+	*outptr++ = val & 0xff;	/* Low 8 bits are the output byte.  */	      \
+	inptr += sizeof (uint32_t);					      \
+      }									      \
+  }
+#define LOOP_NEED_FLAGS
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
diff --git a/iconv/tst-iconv_prog.sh b/iconv/tst-iconv_prog.sh
index 76400cddfc..c757fb2c40 100644
--- a/iconv/tst-iconv_prog.sh
+++ b/iconv/tst-iconv_prog.sh
@@ -285,3 +285,46 @@  for errorcommand in "${errorarray[@]}"; do
   execute_test
   check_errtest_result
 done
+
+allbytes ()
+{  # Write every byte value 0..255 once, in ascending order, to stdout.
+  for (( i = 0; i <= 255; i++ )); do
+    printf '\'"$(printf "%o" "$i")"  # inner printf supplies the octal digits of a \NNN escape
+  done
+}
+
+allucs4be ()
+{  # Write the expected UCS-4BE form of allbytes: four bytes per value.
+  for (( i = 0; i <= 127; i++ )); do
+    printf '\0\0\0\'"$(printf "%o" "$i")"  # 0..127 -> 00 00 00 NN
+  done
+  for (( i = 128; i <= 255; i++ )); do
+    printf '\0\0\xdc\'"$(printf "%o" "$i")"  # 128..255 -> 00 00 DC NN
+  done
+}
+
+check_posix_result ()
+{  # $1 = source charset, $2 = target charset; both are used only in the report line.
+  if [ $? -eq 0 ]; then  # $? is still the status of the command run just before this call
+    result=PASS
+  else
+    result=FAIL
+  fi
+
+  echo "$result: from \"$1\", to: \"$2\""
+
+  if [ "$result" != "PASS" ]; then
+    exit 1  # abort the whole script on the first failure
+  fi
+}
+
+check_posix_encoding ()
+{  # Convert all 256 byte values in each direction and compare with the expected stream.
+  eval PROG=\"$ICONV\"
+  allbytes  | $PROG -f POSIX -t UCS-4BE | cmp -s - <(allucs4be)
+  check_posix_result POSIX UCS-4BE  # must directly follow the pipeline: it reads its $?
+  allucs4be | $PROG -f UCS-4BE -t POSIX | cmp -s - <(allbytes)
+  check_posix_result UCS-4BE POSIX  # must directly follow the pipeline: it reads its $?
+}
+
+check_posix_encoding
diff --git a/iconvdata/tst-tables.sh b/iconvdata/tst-tables.sh
index ddac85daa1..badce3e4ca 100755
--- a/iconvdata/tst-tables.sh
+++ b/iconvdata/tst-tables.sh
@@ -31,6 +31,7 @@  cat <<EOF |
   # Keep this list in the same order as gconv-modules.
   #
   # charset name    table name          comment
+  POSIX
   ASCII             ANSI_X3.4-1968
   ISO646-GB         BS_4730
   ISO646-CA         CSA_Z243.4-1985-1
diff --git a/inet/tst-idna_name_classify.c b/inet/tst-idna_name_classify.c
index bb1c0b5331..0f645cbca5 100644
--- a/inet/tst-idna_name_classify.c
+++ b/inet/tst-idna_name_classify.c
@@ -37,11 +37,11 @@  do_test (void)
   puts ("info: C locale tests");
   locale_insensitive_tests ();
   TEST_COMPARE (__idna_name_classify ("abc\200def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
   TEST_COMPARE (__idna_name_classify ("abc\200\\def"),
-                idna_name_encoding_error);
+                idna_name_nonascii_backslash);
   TEST_COMPARE (__idna_name_classify ("abc\377def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
 
   puts ("info: en_US.ISO-8859-1 locale tests");
   if (setlocale (LC_CTYPE, "en_US.ISO-8859-1") == 0)
diff --git a/locale/C_name.c b/locale/C_name.c
index 7612544f2f..2f52636828 100644
--- a/locale/C_name.c
+++ b/locale/C_name.c
@@ -8,4 +8,4 @@  const char _nl_C_name[] = "C";
 const char _nl_POSIX_name[] = "POSIX";
 
 /* The standard codeset.  */
-const char _nl_C_codeset[] = "ANSI_X3.4-1968";
+const char _nl_C_codeset[] = "POSIX";
diff --git a/locale/tst-C-locale.c b/locale/tst-C-locale.c
index d4c22b749a..a25bff4910 100644
--- a/locale/tst-C-locale.c
+++ b/locale/tst-C-locale.c
@@ -20,6 +20,7 @@ 
 #include <langinfo.h>
 #include <limits.h>
 #include <locale.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <string.h>
 #include <wchar.h>
@@ -229,6 +230,49 @@  run_test (const char *locname)
   STRTEST (YESSTR, "");
   STRTEST (NOSTR, "");
 
+  for(int i = 0; i <= 0xff; ++i)
+    {
+      unsigned char bs[] = {i, 0};
+      mbstate_t ctx = {};
+      wchar_t wc = -1, exp = i <= 0x7f ? i : (0xdc00 + i);
+      size_t sz = mbrtowc(&wc, (char *) bs, 1, &ctx);
+      if (sz != !!i)
+	{
+	  printf ("mbrtowc(%02hhx) width in locale %s wrong "
+		  "(is %zd, should be %d)\n", *bs, locname, sz, !!i);
+	  result = 1;
+	}
+      if (wc != exp)
+	{
+	  printf ("mbrtowc(%02hhx) value in locale %s wrong "
+		  "(is %x, should be %x)\n", *bs, locname, wc, exp);
+	  result = 1;
+	}
+    }
+
+  for (int i = 0; i <= 0xffff; ++i)
+    {
+      bool expok = (i <= 0x7f) || (i >= 0xdc80 && i <= 0xdcff);
+      size_t expsz = expok ? 1 : (size_t) -1;
+      unsigned char expob = expok ? (i & 0xff) : (unsigned char) -1;
+
+      unsigned char ob = -1;
+      mbstate_t ctx = {};
+      size_t sz = wcrtomb ((char *) &ob, i, &ctx);
+      if (sz != expsz)
+	{
+	  printf ("wcrtomb(%x) width in locale %s wrong "
+		  "(is %zd, should be %zd)\n", i, locname, sz, expsz);
+	  result = 1;
+	}
+      if (ob != expob)
+	{
+	  printf ("wcrtomb(%x) value in locale %s wrong "
+		  "(is %hhx, should be %hhx)\n", i, locname, ob, expob);
+	  result = 1;
+	}
+    }
+
   /* Test the new locale mechanisms.  */
   loc = newlocale (LC_ALL_MASK, locname, NULL);
   if (loc == NULL)
diff --git a/localedata/charmaps/POSIX b/localedata/charmaps/POSIX
new file mode 100644
index 0000000000..69bdf6b485
--- /dev/null
+++ b/localedata/charmaps/POSIX
@@ -0,0 +1,136 @@ 
+<code_set_name> POSIX
+<comment_char> %
+<escape_char> /
+% source: cf. localedata/locales/POSIX, LC_COLLATE
+
+CHARMAP
+<U0000>     /x00         NULL (NUL)
+<U0001>     /x01         START OF HEADING (SOH)
+<U0002>     /x02         START OF TEXT (STX)
+<U0003>     /x03         END OF TEXT (ETX)
+<U0004>     /x04         END OF TRANSMISSION (EOT)
+<U0005>     /x05         ENQUIRY (ENQ)
+<U0006>     /x06         ACKNOWLEDGE (ACK)
+<U0007>     /x07         BELL (BEL)
+<U0008>     /x08         BACKSPACE (BS)
+<U0009>     /x09         CHARACTER TABULATION (HT)
+<U000A>     /x0a         LINE FEED (LF)
+<U000B>     /x0b         LINE TABULATION (VT)
+<U000C>     /x0c         FORM FEED (FF)
+<U000D>     /x0d         CARRIAGE RETURN (CR)
+<U000E>     /x0e         SHIFT OUT (SO)
+<U000F>     /x0f         SHIFT IN (SI)
+<U0010>     /x10         DATALINK ESCAPE (DLE)
+<U0011>     /x11         DEVICE CONTROL ONE (DC1)
+<U0012>     /x12         DEVICE CONTROL TWO (DC2)
+<U0013>     /x13         DEVICE CONTROL THREE (DC3)
+<U0014>     /x14         DEVICE CONTROL FOUR (DC4)
+<U0015>     /x15         NEGATIVE ACKNOWLEDGE (NAK)
+<U0016>     /x16         SYNCHRONOUS IDLE (SYN)
+<U0017>     /x17         END OF TRANSMISSION BLOCK (ETB)
+<U0018>     /x18         CANCEL (CAN)
+<U0019>     /x19         END OF MEDIUM (EM)
+<U001A>     /x1a         SUBSTITUTE (SUB)
+<U001B>     /x1b         ESCAPE (ESC)
+<U001C>     /x1c         FILE SEPARATOR (IS4)
+<U001D>     /x1d         GROUP SEPARATOR (IS3)
+<U001E>     /x1e         RECORD SEPARATOR (IS2)
+<U001F>     /x1f         UNIT SEPARATOR (IS1)
+<U0020>     /x20         SPACE
+<U0021>     /x21         EXCLAMATION MARK
+<U0022>     /x22         QUOTATION MARK
+<U0023>     /x23         NUMBER SIGN
+<U0024>     /x24         DOLLAR SIGN
+<U0025>     /x25         PERCENT SIGN
+<U0026>     /x26         AMPERSAND
+<U0027>     /x27         APOSTROPHE
+<U0028>     /x28         LEFT PARENTHESIS
+<U0029>     /x29         RIGHT PARENTHESIS
+<U002A>     /x2a         ASTERISK
+<U002B>     /x2b         PLUS SIGN
+<U002C>     /x2c         COMMA
+<U002D>     /x2d         HYPHEN-MINUS
+<U002E>     /x2e         FULL STOP
+<U002F>     /x2f         SOLIDUS
+<U0030>     /x30         DIGIT ZERO
+<U0031>     /x31         DIGIT ONE
+<U0032>     /x32         DIGIT TWO
+<U0033>     /x33         DIGIT THREE
+<U0034>     /x34         DIGIT FOUR
+<U0035>     /x35         DIGIT FIVE
+<U0036>     /x36         DIGIT SIX
+<U0037>     /x37         DIGIT SEVEN
+<U0038>     /x38         DIGIT EIGHT
+<U0039>     /x39         DIGIT NINE
+<U003A>     /x3a         COLON
+<U003B>     /x3b         SEMICOLON
+<U003C>     /x3c         LESS-THAN SIGN
+<U003D>     /x3d         EQUALS SIGN
+<U003E>     /x3e         GREATER-THAN SIGN
+<U003F>     /x3f         QUESTION MARK
+<U0040>     /x40         COMMERCIAL AT
+<U0041>     /x41         LATIN CAPITAL LETTER A
+<U0042>     /x42         LATIN CAPITAL LETTER B
+<U0043>     /x43         LATIN CAPITAL LETTER C
+<U0044>     /x44         LATIN CAPITAL LETTER D
+<U0045>     /x45         LATIN CAPITAL LETTER E
+<U0046>     /x46         LATIN CAPITAL LETTER F
+<U0047>     /x47         LATIN CAPITAL LETTER G
+<U0048>     /x48         LATIN CAPITAL LETTER H
+<U0049>     /x49         LATIN CAPITAL LETTER I
+<U004A>     /x4a         LATIN CAPITAL LETTER J
+<U004B>     /x4b         LATIN CAPITAL LETTER K
+<U004C>     /x4c         LATIN CAPITAL LETTER L
+<U004D>     /x4d         LATIN CAPITAL LETTER M
+<U004E>     /x4e         LATIN CAPITAL LETTER N
+<U004F>     /x4f         LATIN CAPITAL LETTER O
+<U0050>     /x50         LATIN CAPITAL LETTER P
+<U0051>     /x51         LATIN CAPITAL LETTER Q
+<U0052>     /x52         LATIN CAPITAL LETTER R
+<U0053>     /x53         LATIN CAPITAL LETTER S
+<U0054>     /x54         LATIN CAPITAL LETTER T
+<U0055>     /x55         LATIN CAPITAL LETTER U
+<U0056>     /x56         LATIN CAPITAL LETTER V
+<U0057>     /x57         LATIN CAPITAL LETTER W
+<U0058>     /x58         LATIN CAPITAL LETTER X
+<U0059>     /x59         LATIN CAPITAL LETTER Y
+<U005A>     /x5a         LATIN CAPITAL LETTER Z
+<U005B>     /x5b         LEFT SQUARE BRACKET
+<U005C>     /x5c         REVERSE SOLIDUS
+<U005D>     /x5d         RIGHT SQUARE BRACKET
+<U005E>     /x5e         CIRCUMFLEX ACCENT
+<U005F>     /x5f         LOW LINE
+<U0060>     /x60         GRAVE ACCENT
+<U0061>     /x61         LATIN SMALL LETTER A
+<U0062>     /x62         LATIN SMALL LETTER B
+<U0063>     /x63         LATIN SMALL LETTER C
+<U0064>     /x64         LATIN SMALL LETTER D
+<U0065>     /x65         LATIN SMALL LETTER E
+<U0066>     /x66         LATIN SMALL LETTER F
+<U0067>     /x67         LATIN SMALL LETTER G
+<U0068>     /x68         LATIN SMALL LETTER H
+<U0069>     /x69         LATIN SMALL LETTER I
+<U006A>     /x6a         LATIN SMALL LETTER J
+<U006B>     /x6b         LATIN SMALL LETTER K
+<U006C>     /x6c         LATIN SMALL LETTER L
+<U006D>     /x6d         LATIN SMALL LETTER M
+<U006E>     /x6e         LATIN SMALL LETTER N
+<U006F>     /x6f         LATIN SMALL LETTER O
+<U0070>     /x70         LATIN SMALL LETTER P
+<U0071>     /x71         LATIN SMALL LETTER Q
+<U0072>     /x72         LATIN SMALL LETTER R
+<U0073>     /x73         LATIN SMALL LETTER S
+<U0074>     /x74         LATIN SMALL LETTER T
+<U0075>     /x75         LATIN SMALL LETTER U
+<U0076>     /x76         LATIN SMALL LETTER V
+<U0077>     /x77         LATIN SMALL LETTER W
+<U0078>     /x78         LATIN SMALL LETTER X
+<U0079>     /x79         LATIN SMALL LETTER Y
+<U007A>     /x7a         LATIN SMALL LETTER Z
+<U007B>     /x7b         LEFT CURLY BRACKET
+<U007C>     /x7c         VERTICAL LINE
+<U007D>     /x7d         RIGHT CURLY BRACKET
+<U007E>     /x7e         TILDE
+<U007F>     /x7f         DELETE (DEL)
+<UDC80>..<UDCFF> /x80
+END CHARMAP
diff --git a/localedata/locales/POSIX b/localedata/locales/POSIX
index 7ec7f1c577..45f2fa0b31 100644
--- a/localedata/locales/POSIX
+++ b/localedata/locales/POSIX
@@ -97,6 +97,20 @@  END LC_CTYPE
 LC_COLLATE
 % This is the POSIX Locale definition for the LC_COLLATE category.
 % The order is the same as in the ASCII code set.
+% Values above <DEL> (<U007F>) inserted in order, per Issue 7 TC2,
+% XBD, 7.3.2, LC_COLLATE Category in the POSIX Locale:
+% > All characters not explicitly listed here shall be inserted
+% > in the character collation order after the listed characters
+% > and shall be assigned unique primary weights. If the listed
+% > characters have ASCII encoding, the other characters shall
+% > be in ascending order according to their coded character set values
+% Since Issue 7 TC2 (XBD, 6.2 Character Encoding):
+% > The POSIX locale shall contain 256 single-byte characters [...]
+% (cf. bug 663, 674).
+% This is in contrast to previous issues, which limited the POSIX
+% locale to the Portable Character Set (7-bit ASCII).
+% We use the same part of the Low Surrogate Area as Python
+% to contain these, yielding [<UDC80>, <UDCFF>]
 order_start forward
 <U0000>
 <U0001>
@@ -226,7 +240,134 @@  order_start forward
 <U007D>
 <U007E>
 <U007F>
-UNDEFINED
+<UDC80>
+<UDC81>
+<UDC82>
+<UDC83>
+<UDC84>
+<UDC85>
+<UDC86>
+<UDC87>
+<UDC88>
+<UDC89>
+<UDC8A>
+<UDC8B>
+<UDC8C>
+<UDC8D>
+<UDC8E>
+<UDC8F>
+<UDC90>
+<UDC91>
+<UDC92>
+<UDC93>
+<UDC94>
+<UDC95>
+<UDC96>
+<UDC97>
+<UDC98>
+<UDC99>
+<UDC9A>
+<UDC9B>
+<UDC9C>
+<UDC9D>
+<UDC9E>
+<UDC9F>
+<UDCA0>
+<UDCA1>
+<UDCA2>
+<UDCA3>
+<UDCA4>
+<UDCA5>
+<UDCA6>
+<UDCA7>
+<UDCA8>
+<UDCA9>
+<UDCAA>
+<UDCAB>
+<UDCAC>
+<UDCAD>
+<UDCAE>
+<UDCAF>
+<UDCB0>
+<UDCB1>
+<UDCB2>
+<UDCB3>
+<UDCB4>
+<UDCB5>
+<UDCB6>
+<UDCB7>
+<UDCB8>
+<UDCB9>
+<UDCBA>
+<UDCBB>
+<UDCBC>
+<UDCBD>
+<UDCBE>
+<UDCBF>
+<UDCC0>
+<UDCC1>
+<UDCC2>
+<UDCC3>
+<UDCC4>
+<UDCC5>
+<UDCC6>
+<UDCC7>
+<UDCC8>
+<UDCC9>
+<UDCCA>
+<UDCCB>
+<UDCCC>
+<UDCCD>
+<UDCCE>
+<UDCCF>
+<UDCD0>
+<UDCD1>
+<UDCD2>
+<UDCD3>
+<UDCD4>
+<UDCD5>
+<UDCD6>
+<UDCD7>
+<UDCD8>
+<UDCD9>
+<UDCDA>
+<UDCDB>
+<UDCDC>
+<UDCDD>
+<UDCDE>
+<UDCDF>
+<UDCE0>
+<UDCE1>
+<UDCE2>
+<UDCE3>
+<UDCE4>
+<UDCE5>
+<UDCE6>
+<UDCE7>
+<UDCE8>
+<UDCE9>
+<UDCEA>
+<UDCEB>
+<UDCEC>
+<UDCED>
+<UDCEE>
+<UDCEF>
+<UDCF0>
+<UDCF1>
+<UDCF2>
+<UDCF3>
+<UDCF4>
+<UDCF5>
+<UDCF6>
+<UDCF7>
+<UDCF8>
+<UDCF9>
+<UDCFA>
+<UDCFB>
+<UDCFC>
+<UDCFD>
+<UDCFE>
+<UDCFF>
 order_end
 %
 END LC_COLLATE
diff --git a/localedata/tst-c-utf8-consistency.c b/localedata/tst-c-utf8-consistency.c
index 1625e4dd0b..bd2f56834c 100644
--- a/localedata/tst-c-utf8-consistency.c
+++ b/localedata/tst-c-utf8-consistency.c
@@ -253,7 +253,7 @@  one_pass (void)
   TEST_COMPARE_STRING_WIDE (wstr (_NL_W_DATE_FMT), wstr_utf8 (_NL_W_DATE_FMT));
 
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_TIME_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_TIME_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_TIME_CODESET), "UTF-8");
 
   TEST_COMPARE_STRING (str (ALTMON_1), str_utf8 (ALTMON_1));
@@ -321,11 +321,11 @@  one_pass (void)
                             wstr_utf8 (_NL_WABALTMON_12));
 
   /* LC_COLLATE.  Mostly untested, only expected differences.  */
-  TEST_COMPARE_STRING (str (_NL_COLLATE_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_COLLATE_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_COLLATE_CODESET), "UTF-8");
 
   /* LC_CTYPE.  Mostly untested, only expected differences.  */
-  TEST_COMPARE_STRING (str (CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (CODESET), "UTF-8");
 
   /* LC_MONETARY.  */
@@ -401,7 +401,7 @@  one_pass (void)
   TEST_COMPARE (word (_NL_MONETARY_THOUSANDS_SEP_WC),
                 word_utf8 (_NL_MONETARY_THOUSANDS_SEP_WC));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_MONETARY_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_MONETARY_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_MONETARY_CODESET), "UTF-8");
 
   /* LC_NUMERIC.  */
@@ -416,7 +416,7 @@  one_pass (void)
   TEST_COMPARE (word (_NL_NUMERIC_THOUSANDS_SEP_WC),
                 word_utf8 (_NL_NUMERIC_THOUSANDS_SEP_WC));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_NUMERIC_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_NUMERIC_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_NUMERIC_CODESET), "UTF-8");
 
   /* LC_MESSAGES.  */
@@ -426,7 +426,7 @@  one_pass (void)
   TEST_COMPARE_STRING (str (YESSTR), str_utf8 (YESSTR));
   TEST_COMPARE_STRING (str (NOSTR), str_utf8 (NOSTR));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_MESSAGES_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_MESSAGES_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_MESSAGES_CODESET), "UTF-8");
 
   /* LC_PAPER.  */
@@ -434,7 +434,7 @@  one_pass (void)
   TEST_COMPARE (word (_NL_PAPER_HEIGHT), word_utf8 (_NL_PAPER_HEIGHT));
   TEST_COMPARE (word (_NL_PAPER_WIDTH), word_utf8 (_NL_PAPER_WIDTH));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_PAPER_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_PAPER_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_PAPER_CODESET), "UTF-8");
 
   /* LC_NAME.  */
@@ -452,7 +452,7 @@  one_pass (void)
   TEST_COMPARE_STRING (str (_NL_NAME_NAME_MS),
                        str_utf8 (_NL_NAME_NAME_MS));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_NAME_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_NAME_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_NAME_CODESET), "UTF-8");
 
   /* LC_ADDRESS.  */
@@ -482,7 +482,7 @@  one_pass (void)
   TEST_COMPARE_STRING (str (_NL_ADDRESS_LANG_LIB),
                        str_utf8 (_NL_ADDRESS_LANG_LIB));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_ADDRESS_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_ADDRESS_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_ADDRESS_CODESET), "UTF-8");
 
   /* LC_TELEPHONE.  */
@@ -496,7 +496,7 @@  one_pass (void)
   TEST_COMPARE_STRING (str (_NL_TELEPHONE_INT_PREFIX),
                        str_utf8 (_NL_TELEPHONE_INT_PREFIX));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_TELEPHONE_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_TELEPHONE_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_TELEPHONE_CODESET), "UTF-8");
 
   /* LC_MEASUREMENT.  */
@@ -504,7 +504,7 @@  one_pass (void)
   TEST_COMPARE (byte (_NL_MEASUREMENT_MEASUREMENT),
                 byte_utf8 (_NL_MEASUREMENT_MEASUREMENT));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_MEASUREMENT_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_MEASUREMENT_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_MEASUREMENT_CODESET), "UTF-8");
 
   /* LC_IDENTIFICATION is skipped since C.UTF-8 is distinct from C.  */
@@ -512,7 +512,7 @@  one_pass (void)
   /* _NL_IDENTIFICATION_CATEGORY cannot be tested because it is a
      string array.  */
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_IDENTIFICATION_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_IDENTIFICATION_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_IDENTIFICATION_CODESET), "UTF-8");
 }
 
diff --git a/stdio-common/Makefile b/stdio-common/Makefile
index 4c15b97683..291f502878 100644
--- a/stdio-common/Makefile
+++ b/stdio-common/Makefile
@@ -359,6 +359,7 @@  $(objpfx)test-vfprintf.out: $(gen-locales)
 $(objpfx)tst-grouping.out: $(gen-locales)
 $(objpfx)tst-grouping2.out: $(gen-locales)
 $(objpfx)tst-grouping_iterator.out: $(gen-locales)
+$(objpfx)tst-printf-bz25691-mem.out: $(gen-locales)
 $(objpfx)tst-sprintf.out: $(gen-locales)
 $(objpfx)tst-sscanf.out: $(gen-locales)
 $(objpfx)tst-swprintf.out: $(gen-locales)
diff --git a/stdio-common/tst-printf-bz25691.c b/stdio-common/tst-printf-bz25691.c
index e6fa2433fa..f953e0d956 100644
--- a/stdio-common/tst-printf-bz25691.c
+++ b/stdio-common/tst-printf-bz25691.c
@@ -30,6 +30,8 @@ 
 static int
 do_test (void)
 {
+  setlocale(LC_CTYPE, "C.UTF-8");
+
   mtrace ();
 
   /* For 's' conversion specifier with 'l' modifier the array must be
diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c
index 7b338b6775..86666e8231 100644
--- a/wcsmbs/wcsmbsload.c
+++ b/wcsmbs/wcsmbsload.c
@@ -33,10 +33,10 @@  static const struct __gconv_step to_wc =
   .__shlib_handle = NULL,
   .__modname = NULL,
   .__counter = INT_MAX,
-  .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
+  .__from_name = (char *) "POSIX",
   .__to_name = (char *) "INTERNAL",
-  .__fct = __gconv_transform_ascii_internal,
-  .__btowc_fct = __gconv_btwoc_ascii,
+  .__fct = __gconv_transform_posix_internal,
+  .__btowc_fct = __gconv_btwoc_posix,
   .__init_fct = NULL,
   .__end_fct = NULL,
   .__min_needed_from = 1,
@@ -53,8 +53,8 @@  static const struct __gconv_step to_mb =
   .__modname = NULL,
   .__counter = INT_MAX,
   .__from_name = (char *) "INTERNAL",
-  .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
-  .__fct = __gconv_transform_internal_ascii,
+  .__to_name = (char *) "POSIX",
+  .__fct = __gconv_transform_internal_posix,
   .__btowc_fct = NULL,
   .__init_fct = NULL,
   .__end_fct = NULL,
@@ -67,7 +67,9 @@  static const struct __gconv_step to_mb =
 };
 
 
-/* For the default locale we only have to handle ANSI_X3.4-1968.  */
+/* The default/"POSIX"/"C" locale is an 8-bit-clean mapping
+   with ANSI_X3.4-1968 in the first 128 characters;
+   we lift the remaining bytes by <UDC00>.  */
 const struct gconv_fcts __wcsmbs_gconv_fcts_c =
 {
   .towc = (struct __gconv_step *) &to_wc,