@@ -76,7 +76,7 @@ tests = bug-iconv1 bug-iconv2 tst-loading tst-e2big tst-iconv4 bug-iconv4 \
tst-iconv6 bug-iconv5 bug-iconv6 tst-iconv7 bug-iconv8 bug-iconv9 \
bug-iconv10 bug-iconv11 bug-iconv12 tst-iconv-big5-hkscs-to-2ucs4 \
bug-iconv13 bug-iconv14 bug-iconv15 \
- tst-iconv-iso-2022-cn-ext
+ tst-iconv-iso-2022-cn-ext tst-bug33980
ifeq ($(have-thread-library),yes)
tests += bug-iconv3
endif
@@ -333,6 +333,8 @@ $(objpfx)bug-iconv15.out: $(addprefix $(objpfx), $(gconv-modules)) \
$(addprefix $(objpfx),$(modules.so))
$(objpfx)tst-iconv-iso-2022-cn-ext.out: $(addprefix $(objpfx), $(gconv-modules)) \
$(addprefix $(objpfx),$(modules.so))
+$(objpfx)tst-bug33980.out: $(addprefix $(objpfx), $(gconv-modules)) \
+ $(addprefix $(objpfx),$(modules.so))
$(objpfx)iconv-test.out: run-iconv-test.sh \
$(addprefix $(objpfx), $(gconv-modules)) \
@@ -67,12 +67,29 @@
/* Since this is a stateful encoding we have to provide code which resets
the output state to the initial state. This has to be done during the
- flushing. */
+ flushing. For the to-internal direction (FROM_DIRECTION is true),
+ there may be a pending character that needs flushing. */
#define EMIT_SHIFT_TO_INIT \
if ((data->__statep->__count & ~7) != sb) \
{ \
if (FROM_DIRECTION) \
- data->__statep->__count &= 7; \
+ { \
+ uint32_t ch = data->__statep->__count >> 7; \
+ if (__glibc_unlikely (ch != 0)) \
+ { \
+ if (__glibc_unlikely (outend - outbuf < 4)) \
+ status = __GCONV_FULL_OUTPUT; \
+ else \
+ { \
+ put32 (outbuf, ch); \
+ outbuf += 4; \
+ /* Clear character and db bit. */ \
+ data->__statep->__count &= 7; \
+ } \
+ } \
+ else \
+ data->__statep->__count &= 7; \
+ } \
else \
{ \
/* We are not in the initial state. To switch back we have \
@@ -99,11 +116,13 @@
*curcsp = save_curcs
-/* Current codeset type. */
+/* Current codeset type. The bit is stored in the __count variable of
+ the conversion state. If the db bit is set, bit 7 and above store
+ a pending UCS-4 code point if non-zero. */
enum
{
- sb = 0,
- db = 64
+ sb = 0, /* Single byte mode. */
+ db = 64 /* Double byte mode. */
};
@@ -119,21 +138,29 @@ enum
} \
else \
{ \
- /* This is a combined character. Make sure we have room. */ \
- if (__glibc_unlikely (outptr + 8 > outend)) \
- { \
- result = __GCONV_FULL_OUTPUT; \
- break; \
- } \
- \
const struct divide *cmbp \
= &DB_TO_UCS4_COMB[ch - __TO_UCS4_COMBINED_MIN]; \
assert (cmbp->res1 != 0 && cmbp->res2 != 0); \
\
put32 (outptr, cmbp->res1); \
outptr += 4; \
- put32 (outptr, cmbp->res2); \
- outptr += 4; \
+ \
+ /* See whether we have room for the second character. */ \
+ if (outend - outptr >= 4) \
+ { \
+ put32 (outptr, cmbp->res2); \
+ outptr += 4; \
+ } \
+ else \
+ { \
+ /* Otherwise store only the first character now, and \
+ put the second one into the queue. */ \
+ curcs |= cmbp->res2 << 7; \
+ inptr += 2; \
+ /* Tell the caller why we terminate the loop. */ \
+ result = __GCONV_FULL_OUTPUT; \
+ break; \
+ } \
} \
}
#else
@@ -153,7 +180,20 @@ enum
#define LOOPFCT FROM_LOOP
#define BODY \
{ \
- uint32_t ch = *inptr; \
+ uint32_t ch; \
+ \
+ ch = curcs >> 7; \
+ if (__glibc_unlikely (ch != 0)) \
+ { \
+ put32 (outptr, ch); \
+ outptr += 4; \
+ /* Remove the pending character, but preserve state bits. */ \
+ curcs &= (1 << 7) - 1; \
+ continue; \
+ } \
+ \
+ /* Otherwise read the next input byte. */ \
+ ch = *inptr; \
\
if (__builtin_expect (ch, 0) == SO) \
{ \
new file mode 100644
@@ -0,0 +1,144 @@
+/* Test for bug 33980: combining characters in IBM1390/IBM1399.
+ Copyright (C) 2026 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <alloc_buffer.h>
+#include <errno.h>
+#include <iconv.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include <support/check.h>
+#include <support/next_to_fault.h>
+#include <support/support.h>
+
+/* Run iconv in a loop with a small output buffer of OUTBUFSIZE bytes
+ starting at OUTBUF. OUTBUF should be right before an unmapped page
+ so that writing past the end will fault. Skip SHIFT bytes and
+ input and output, to exercise different buffer alignment.
+ TRUNCATE indicates skipped bytes at the end of input. */
+static void
+test_one (const char *encoding, unsigned int shift, unsigned int truncate,
+ char *outbuf, size_t outbufsize)
+{
+ /* In IBM1390 and IBM1399, the DBCS code 0xECB5 expands to two
+ Unicode code points when translated. */
+ static const char input[] =
+ {
+ /* 8 letters X. */
+ 0xe7, 0xe7, 0xe7, 0xe7, 0xe7, 0xe7, 0xe7, 0xe7,
+ /* SO, 0xECB5, SI: shift to DBCS, special character, shift back. */
+ 0x0e, 0xec, 0xb5, 0x0f
+ };
+
+ /* Expected output after UTF-8 conversion. */
+ static const char expected[] =
+ {
+ 'X', 'X', 'X', 'X', 'X', 'X', 'X', 'X',
+ /* U+304B (HIRAGANA LETTER KA). */
+ 0xe3, 0x81, 0x8b,
+ /* U+309A (COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK). */
+ 0xe3, 0x82, 0x9a
+ };
+
+ iconv_t cd = iconv_open ("UTF-8", encoding);
+ TEST_VERIFY_EXIT (cd != (iconv_t) -1);
+
+ char result_storage[64];
+ struct alloc_buffer result_buf
+ = alloc_buffer_create (result_storage, sizeof (result_storage));
+
+ const char *inptr = &input[shift];
+ size_t inleft = sizeof (input) - shift - truncate;
+
+ while (inleft > 0)
+ {
+ char *outptr = outbuf;
+ size_t outleft = outbufsize;
+ size_t inleft_before = inleft;
+
+ size_t ret = iconv (cd, (char **) &inptr, &inleft, &outptr, &outleft);
+ size_t produced = outptr - outbuf;
+ alloc_buffer_copy_bytes (&result_buf, outbuf, produced);
+
+ if (ret == (size_t) -1 && errno == E2BIG)
+ {
+ if (produced == 0 && inleft == inleft_before)
+ {
+ /* Output buffer too small to make progress. This is
+ expected for very small output buffer sizes. */
+ TEST_VERIFY_EXIT (outbufsize < 3);
+ break;
+ }
+ continue;
+ }
+ if (ret == (size_t) -1)
+ FAIL_EXIT1 ("%s (outbufsize %zu): iconv: %m", encoding, outbufsize);
+ break;
+ }
+
+ /* Flush any pending state (e.g. a buffered combined character). */
+ while (true)
+ {
+ char *outptr = outbuf;
+ size_t outleft = outbufsize;
+
+ size_t ret = iconv (cd, NULL, NULL, &outptr, &outleft);
+ size_t produced = outptr - outbuf;
+ alloc_buffer_copy_bytes (&result_buf, outbuf, produced);
+
+ if (ret == (size_t) -1 && errno == E2BIG)
+ continue;
+ TEST_VERIFY_EXIT (ret == 0);
+ break;
+ }
+
+ TEST_VERIFY_EXIT (!alloc_buffer_has_failed (&result_buf));
+ size_t result_used
+ = sizeof (result_storage) - alloc_buffer_size (&result_buf);
+
+ if (outbufsize >= 3)
+ {
+ TEST_COMPARE (inleft, 0);
+ TEST_COMPARE (result_used, sizeof (expected) - shift);
+ TEST_COMPARE_BLOB (result_storage, result_used,
+ &expected[shift], sizeof (expected) - shift);
+ }
+
+ TEST_VERIFY_EXIT (iconv_close (cd) == 0);
+}
+
+static int
+do_test (void)
+{
+ struct support_next_to_fault ntf
+ = support_next_to_fault_allocate (8);
+
+ for (int shift = 0; shift <= 8; ++shift)
+ for (int truncate = 0; truncate < 2; ++truncate)
+ for (size_t outbufsize = 1; outbufsize <= 8; outbufsize++)
+ {
+ char *outbuf = ntf.buffer + ntf.length - outbufsize;
+ test_one ("IBM1390", shift, truncate, outbuf, outbufsize);
+ test_one ("IBM1399", shift, truncate, outbuf, outbufsize);
+ }
+
+ support_next_to_fault_free (&ntf);
+ return 0;
+}
+
+#include <support/test-driver.c>