[v2] Use pending character state in IBM1390, IBM1399 character sets (CVE-2026-4046)

Message ID lhucy0b6l2w.fsf@oldenburg.str.redhat.com (mailing list archive)
State Superseded
Headers
Series [v2] Use pending character state in IBM1390, IBM1399 character sets (CVE-2026-4046) |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Test passed
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Test passed

Commit Message

Florian Weimer April 7, 2026, 11:55 a.m. UTC
  Follow the example in iso-2022-jp-3.c and use the __count state
variable to store the pending character.  This avoids restarting
the conversion if the output buffer ends between two 4-byte UCS-4
code points, so that the assert reported in the bug can no longer
happen.

Even though the fix is applied to ibm1364.c, the change is only
effective for the two HAS_COMBINED codecs for IBM1390, IBM1399.

The test case was mostly auto-generated using
claude-4.6-opus-high-thinking, and composer-2-fast shows up in the log
as well.  During review, gpt-5.4-xhigh flagged that the original version
of the test case was not exercising the new character flush logic.

This fixes bug 33980.

---
v2: Update commit message to reflect that IBM1390, IBM1399 are the only
    impacted codecs.  Include test case.

 iconvdata/Makefile       |   4 +-
 iconvdata/ibm1364.c      |  70 ++++++++++++++++++-----
 iconvdata/tst-bug33980.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 202 insertions(+), 16 deletions(-)


base-commit: 1b2f868fb4958fd59875695c1828d9804b116dc2
  

Comments

Andreas Schwab April 7, 2026, 2:45 p.m. UTC | #1
On Apr 07 2026, Florian Weimer wrote:

> Follow the example in iso-2022-jp-3.c and use the __count state
> variable to store the pending character.  This avoids restarting
> the conversion if the output buffer ends between two 4-byte UCS-4
> code points, so that the assert reported in the bug can no longer
> happen.
>
> Even though the fix is applied to ibm1364.c, the change is only
> effective for the two HAS_COMBINED codecs for IBM1390, IBM1399.

Ok.
  
Arjun Shankar April 9, 2026, 12:31 p.m. UTC | #2
Hi Andreas,

> > Follow the example in iso-2022-jp-3.c and use the __count state
> > variable to store the pending character.  This avoids restarting
> > the conversion if the output buffer ends between two 4-byte UCS-4
> > code points, so that the assert reported in the bug can no longer
> > happen.
> >
> > Even though the fix is applied to ibm1364.c, the change is only
> > effective for the two HAS_COMBINED codecs for IBM1390, IBM1399.
>
> Ok.

Florian mentioned that the test is auto-generated by an LLM:

> The test case was mostly auto-generated using
> claude-4.6-opus-high-thinking, and composer-2-fast shows up in the log
> as well.  During review, gpt-5.4-xhigh flagged that the original version
> of the test case was not exercising the new character flush logic.

Do you have any objections to this? I'm asking specifically because we
tend to treat maintainers' OK/ACK as post-review approval to commit.

Thanks!
Arjun
  

Patch

diff --git a/iconvdata/Makefile b/iconvdata/Makefile
index 26e888b443..fbb0067302 100644
--- a/iconvdata/Makefile
+++ b/iconvdata/Makefile
@@ -76,7 +76,7 @@  tests = bug-iconv1 bug-iconv2 tst-loading tst-e2big tst-iconv4 bug-iconv4 \
 	tst-iconv6 bug-iconv5 bug-iconv6 tst-iconv7 bug-iconv8 bug-iconv9 \
 	bug-iconv10 bug-iconv11 bug-iconv12 tst-iconv-big5-hkscs-to-2ucs4 \
 	bug-iconv13 bug-iconv14 bug-iconv15 \
-	tst-iconv-iso-2022-cn-ext
+	tst-iconv-iso-2022-cn-ext tst-bug33980
 ifeq ($(have-thread-library),yes)
 tests += bug-iconv3
 endif
@@ -333,6 +333,8 @@  $(objpfx)bug-iconv15.out: $(addprefix $(objpfx), $(gconv-modules)) \
 			  $(addprefix $(objpfx),$(modules.so))
 $(objpfx)tst-iconv-iso-2022-cn-ext.out: $(addprefix $(objpfx), $(gconv-modules)) \
 					$(addprefix $(objpfx),$(modules.so))
+$(objpfx)tst-bug33980.out: $(addprefix $(objpfx), $(gconv-modules)) \
+			   $(addprefix $(objpfx),$(modules.so))
 
 $(objpfx)iconv-test.out: run-iconv-test.sh \
 			 $(addprefix $(objpfx), $(gconv-modules)) \
diff --git a/iconvdata/ibm1364.c b/iconvdata/ibm1364.c
index 4f41f22c12..8df66ea048 100644
--- a/iconvdata/ibm1364.c
+++ b/iconvdata/ibm1364.c
@@ -67,12 +67,29 @@ 
 
 /* Since this is a stateful encoding we have to provide code which resets
    the output state to the initial state.  This has to be done during the
-   flushing.  */
+   flushing.  For the to-internal direction (FROM_DIRECTION is true),
+   there may be a pending character that needs flushing.  */
 #define EMIT_SHIFT_TO_INIT \
   if ((data->__statep->__count & ~7) != sb)				      \
     {									      \
       if (FROM_DIRECTION)						      \
-	data->__statep->__count &= 7;					      \
+	{								      \
+	  uint32_t ch = data->__statep->__count >> 7;			      \
+	  if (__glibc_unlikely (ch != 0))				      \
+	    {								      \
+	      if (__glibc_unlikely (outend - outbuf < 4))		      \
+		status = __GCONV_FULL_OUTPUT;				      \
+	      else							      \
+		{							      \
+		  put32 (outbuf, ch);					      \
+		  outbuf += 4;						      \
+		  /* Clear character and db bit.  */			      \
+		  data->__statep->__count &= 7;				      \
+		}							      \
+	    }								      \
+	  else								      \
+	    data->__statep->__count &= 7;				      \
+	}								      \
       else								      \
 	{								      \
 	  /* We are not in the initial state.  To switch back we have	      \
@@ -99,11 +116,13 @@ 
     *curcsp = save_curcs
 
 
-/* Current codeset type.  */
+/* Current codeset type.  The bit is stored in the __count variable of
+   the conversion state.  If the db bit is set, bit 7 and above store
+   a pending UCS-4 code point if non-zero.  */
 enum
 {
-  sb = 0,
-  db = 64
+  sb = 0,			/* Single byte mode.  */
+  db = 64			/* Double byte mode.  */
 };
 
 
@@ -119,21 +138,29 @@  enum
       }									      \
     else								      \
       {									      \
-	/* This is a combined character.  Make sure we have room.  */	      \
-	if (__glibc_unlikely (outptr + 8 > outend))			      \
-	  {								      \
-	    result = __GCONV_FULL_OUTPUT;				      \
-	    break;							      \
-	  }								      \
-									      \
 	const struct divide *cmbp					      \
 	  = &DB_TO_UCS4_COMB[ch - __TO_UCS4_COMBINED_MIN];		      \
 	assert (cmbp->res1 != 0 && cmbp->res2 != 0);			      \
 									      \
 	put32 (outptr, cmbp->res1);					      \
 	outptr += 4;							      \
-	put32 (outptr, cmbp->res2);					      \
-	outptr += 4;							      \
+									      \
+	/* See whether we have room for the second character.  */	      \
+	if (outend - outptr >= 4)					      \
+	  {								      \
+	    put32 (outptr, cmbp->res2);					      \
+	    outptr += 4;						      \
+	  }								      \
+	else								      \
+	  {								      \
+	    /* Otherwise store only the first character now, and	      \
+	       put the second one into the queue.  */			      \
+	    curcs |= cmbp->res2 << 7;					      \
+	    inptr += 2;							      \
+	    /* Tell the caller why we terminate the loop.  */		      \
+	    result = __GCONV_FULL_OUTPUT;				      \
+	    break;							      \
+	  }								      \
       }									      \
   }
 #else
@@ -153,7 +180,20 @@  enum
 #define LOOPFCT 		FROM_LOOP
 #define BODY \
   {									      \
-    uint32_t ch = *inptr;						      \
+    uint32_t ch;							      \
+									      \
+    ch = curcs >> 7;							      \
+    if (__glibc_unlikely (ch != 0))					      \
+      {									      \
+	put32 (outptr, ch);						      \
+	outptr += 4;							      \
+	/* Remove the pending character, but preserve state bits.  */	      \
+	curcs &= (1 << 7) - 1;						      \
+	continue;							      \
+      }									      \
+									      \
+    /* Otherwise read the next input byte.  */				      \
+    ch = *inptr;							      \
 									      \
     if (__builtin_expect (ch, 0) == SO)					      \
       {									      \
diff --git a/iconvdata/tst-bug33980.c b/iconvdata/tst-bug33980.c
new file mode 100644
index 0000000000..a5bc3154b4
--- /dev/null
+++ b/iconvdata/tst-bug33980.c
@@ -0,0 +1,144 @@ 
+/* Test for bug 33980: combining characters in IBM1390/IBM1399.
+   Copyright (C) 2026 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <alloc_buffer.h>
+#include <errno.h>
+#include <iconv.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include <support/check.h>
+#include <support/next_to_fault.h>
+#include <support/support.h>
+
+/* Run iconv in a loop with a small output buffer of OUTBUFSIZE bytes
+   starting at OUTBUF.  OUTBUF should be right before an unmapped page
+   so that writing past the end will fault.  Skip SHIFT bytes and
+   input and output, to exercise different buffer alignment.
+   TRUNCATE indicates skipped bytes at the end of input.  */
+static void
+test_one (const char *encoding, unsigned int shift, unsigned int truncate,
+          char *outbuf, size_t outbufsize)
+{
+  /* In IBM1390 and IBM1399, the DBCS code 0xECB5 expands to two
+     Unicode code points when translated.  */
+  static const char input[] =
+    {
+      /* 8 letters X.  */
+      0xe7, 0xe7, 0xe7, 0xe7, 0xe7, 0xe7, 0xe7, 0xe7,
+      /* SO, 0xECB5, SI: shift to DBCS, special character, shift back.  */
+      0x0e, 0xec, 0xb5, 0x0f
+    };
+
+  /* Expected output after UTF-8 conversion.  */
+  static const char expected[] =
+    {
+      'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X',
+      /* U+304B (HIRAGANA LETTER KA).  */
+      0xe3, 0x81, 0x8b,
+      /* U+309A (COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK).  */
+      0xe3, 0x82, 0x9a
+    };
+
+  iconv_t cd = iconv_open ("UTF-8", encoding);
+  TEST_VERIFY_EXIT (cd != (iconv_t) -1);
+
+  char result_storage[64];
+  struct alloc_buffer result_buf
+    = alloc_buffer_create (result_storage, sizeof (result_storage));
+
+  const char *inptr = &input[shift];
+  size_t inleft = sizeof (input) - shift - truncate;
+
+  while (inleft > 0)
+    {
+      char *outptr = outbuf;
+      size_t outleft = outbufsize;
+      size_t inleft_before = inleft;
+
+      size_t ret = iconv (cd, (char **) &inptr, &inleft, &outptr, &outleft);
+      size_t produced = outptr - outbuf;
+      alloc_buffer_copy_bytes (&result_buf, outbuf, produced);
+
+      if (ret == (size_t) -1 && errno == E2BIG)
+        {
+          if (produced == 0 && inleft == inleft_before)
+            {
+              /* Output buffer too small to make progress.  This is
+                 expected for very small output buffer sizes.  */
+              TEST_VERIFY_EXIT (outbufsize < 3);
+              break;
+            }
+          continue;
+        }
+      if (ret == (size_t) -1)
+        FAIL_EXIT1 ("%s (outbufsize %zu): iconv: %m", encoding, outbufsize);
+      break;
+    }
+
+  /* Flush any pending state (e.g. a buffered combined character).  */
+  while (true)
+    {
+      char *outptr = outbuf;
+      size_t outleft = outbufsize;
+
+      size_t ret = iconv (cd, NULL, NULL, &outptr, &outleft);
+      size_t produced = outptr - outbuf;
+      alloc_buffer_copy_bytes (&result_buf, outbuf, produced);
+
+      if (ret == (size_t) -1 && errno == E2BIG)
+        continue;
+      TEST_VERIFY_EXIT (ret == 0);
+      break;
+    }
+
+  TEST_VERIFY_EXIT (!alloc_buffer_has_failed (&result_buf));
+  size_t result_used
+    = sizeof (result_storage) - alloc_buffer_size (&result_buf);
+
+  if (outbufsize >= 3)
+    {
+      TEST_COMPARE (inleft, 0);
+      TEST_COMPARE (result_used, sizeof (expected) - shift);
+      TEST_COMPARE_BLOB (result_storage, result_used,
+                         &expected[shift], sizeof (expected) - shift);
+    }
+
+  TEST_VERIFY_EXIT (iconv_close (cd) == 0);
+}
+
+static int
+do_test (void)
+{
+  struct support_next_to_fault ntf
+    = support_next_to_fault_allocate (8);
+
+  for (int shift = 0; shift <= 8; ++shift)
+    for (int truncate = 0; truncate < 2; ++truncate)
+      for (size_t outbufsize = 1; outbufsize <= 8; outbufsize++)
+        {
+          char *outbuf = ntf.buffer + ntf.length - outbufsize;
+          test_one ("IBM1390", shift, truncate, outbuf, outbufsize);
+          test_one ("IBM1399", shift, truncate, outbuf, outbufsize);
+        }
+
+  support_next_to_fault_free (&ntf);
+  return 0;
+}
+
+#include <support/test-driver.c>