[v4,1/3] gconv: Correct Big5-HKSCS conversion to preserve all state bits. [BZ #25744]

Message ID: 20220630125215.6052-2-tom@honermann.net
State: Committed
Commit: 598f790fb17bcfff7fedde5209933a82d7748328
Headers
Series: C++20 P0482R6 and C2X N2653: char8_t, mbrtoc8(), and c8rtomb()

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Tom Honermann June 30, 2022, 12:52 p.m. UTC
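This patch corrects the Big5-HKSCS converter to preserve the lowest 3 bits of
the mbstate_t __count data member when the converter encounters an incomplete
multibyte character.  The converter keeps a queued character in the bits above
those 3 (the value is stored shifted left by 3), and it previously overwrote
the entire member whenever it stored or cleared that queued character, which
discarded the low bits.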

This fixes BZ #25744.
---
 iconvdata/big5hkscs.c                     | 16 +++---
 iconvdata/tst-iconv-big5-hkscs-to-2ucs4.c | 65 +++++++++++++++++++++++
 2 files changed, 73 insertions(+), 8 deletions(-)
  

Comments

Adhemerval Zanella Netto July 4, 2022, 6:16 p.m. UTC | #1
> On 30 Jun 2022, at 09:52, Tom Honermann via Libc-alpha <libc-alpha@sourceware.org> wrote:
> 
> This patch corrects the Big5-HKSCS converter to preserve the lowest 3 bits of
> the mbstate_t __count data member when the converter encounters an incomplete
> multibyte character.
> 
> This fixes BZ #25744.

LGTM, thanks.

Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>

> ---
> iconvdata/big5hkscs.c                     | 16 +++---
> iconvdata/tst-iconv-big5-hkscs-to-2ucs4.c | 65 +++++++++++++++++++++++
> 2 files changed, 73 insertions(+), 8 deletions(-)
> 
> diff --git a/iconvdata/big5hkscs.c b/iconvdata/big5hkscs.c
> index a28b18a5ec..d12389b2e3 100644
> --- a/iconvdata/big5hkscs.c
> +++ b/iconvdata/big5hkscs.c
> @@ -17769,7 +17769,7 @@ static struct
>    the output state to the initial state.  This has to be done during the
>    flushing.  */
> #define EMIT_SHIFT_TO_INIT \
> -  if (data->__statep->__count != 0)					      \
> +  if ((data->__statep->__count >> 3) != 0)				      \
>     {									      \
>       if (FROM_DIRECTION)						      \
> 	{								      \
> @@ -17778,7 +17778,7 @@ static struct
> 	      /* Write out the last character.  */			      \
> 	      *((uint32_t *) outbuf) = data->__statep->__count >> 3;	      \
> 	      outbuf += sizeof (uint32_t);				      \
> -	      data->__statep->__count = 0;				      \
> +	      data->__statep->__count &= 7;				      \
> 	    }								      \
> 	  else								      \
> 	    /* We don't have enough room in the output buffer.  */	      \
> @@ -17792,7 +17792,7 @@ static struct
> 	      uint32_t lasttwo = data->__statep->__count >> 3;		      \
> 	      *outbuf++ = (lasttwo >> 8) & 0xff;			      \
> 	      *outbuf++ = lasttwo & 0xff;				      \
> -	      data->__statep->__count = 0;				      \
> +	      data->__statep->__count &= 7;				      \
> 	    }								      \
> 	  else								      \
> 	    /* We don't have enough room in the output buffer.  */	      \
> @@ -17878,7 +17878,7 @@ static struct
> 									      \
> 		/* Otherwise store only the first character now, and	      \
> 		   put the second one into the queue.  */		      \
> -		*statep = ch2 << 3;					      \
> +		*statep = (ch2 << 3) | (*statep & 7);			      \
> 		/* Tell the caller why we terminate the loop.  */	      \
> 		result = __GCONV_FULL_OUTPUT;				      \
> 		break;							      \
> @@ -17895,7 +17895,7 @@ static struct
>       }									      \
>     else								      \
>       /* Clear the queue and proceed to output the saved character.  */	      \
> -      *statep = 0;							      \
> +      *statep &= 7;							      \
> 									      \
>     put32 (outptr, ch);							      \
>     outptr += 4;							      \
> @@ -17946,7 +17946,7 @@ static struct
> 	  }								      \
> 	*outptr++ = (ch >> 8) & 0xff;					      \
> 	*outptr++ = ch & 0xff;						      \
> -	*statep = 0;							      \
> +	*statep &= 7;							      \
> 	inptr += 4;							      \
> 	continue;							      \
> 									      \
> @@ -17959,7 +17959,7 @@ static struct
> 	  }								      \
> 	*outptr++ = (lasttwo >> 8) & 0xff;				      \
> 	*outptr++ = lasttwo & 0xff;					      \
> -	*statep = 0;							      \
> +	*statep &= 7;							      \
> 	continue;							      \
>       }									      \
> 									      \
> @@ -17996,7 +17996,7 @@ static struct
> 	   /* Check for possible combining character.  */		      \
> 	    if (__glibc_unlikely (ch == 0xca || ch == 0xea))		      \
> 	      {								      \
> -		*statep = ((cp[0] << 8) | cp[1]) << 3;			      \
> +		*statep = (((cp[0] << 8) | cp[1]) << 3) | (*statep & 7);      \
> 		inptr += 4;						      \
> 		continue;						      \
> 	      }								      \
> diff --git a/iconvdata/tst-iconv-big5-hkscs-to-2ucs4.c b/iconvdata/tst-iconv-big5-hkscs-to-2ucs4.c
> index 9601b6c1d9..e1472dc2e2 100644
> --- a/iconvdata/tst-iconv-big5-hkscs-to-2ucs4.c
> +++ b/iconvdata/tst-iconv-big5-hkscs-to-2ucs4.c
> @@ -128,6 +128,71 @@ check_conversion (struct testdata test)
>       printf ("error: Result of third conversion was wrong.\n");
>       err++;
>     }
> +
> +  /* Now perform the same test as above consuming one byte at a time.  */
> +  mbs = test.input;
> +  memset (&st, 0, sizeof (st));
> +
> +  /* Consume the first byte; expect an incomplete multibyte character.  */
> +  ret = mbrtowc (&wc, mbs, 1, &st);
> +  if (ret != -2)
> +    {
> +      printf ("error: First byte conversion returned %zd.\n", ret);
> +      err++;
> +    }
> +  /* Advance past the first consumed byte.  */
> +  mbs += 1;
> +  /* Consume the second byte; expect the first wchar_t.  */
> +  ret = mbrtowc (&wc, mbs, 1, &st);
> +  if (ret != 1)
> +    {
> +      printf ("error: Second byte conversion returned %zd.\n", ret);
> +      err++;
> +    }
> +  /* Advance past the second consumed byte.  */
> +  mbs += 1;
> +  if (wc != test.expected[0])
> +    {
> +      printf ("error: Result of first wchar_t conversion was wrong.\n");
> +      err++;
> +    }
> +  /* Consume no bytes; expect the second wchar_t.  */
> +  ret = mbrtowc (&wc, mbs, 1, &st);
> +  if (ret != 0)
> +    {
> +      printf ("error: First attempt of third byte conversion returned %zd.\n", ret);
> +      err++;
> +    }
> +  /* Do not advance past the third byte.  */
> +  mbs += 0;
> +  if (wc != test.expected[1])
> +    {
> +      printf ("error: Result of second wchar_t conversion was wrong.\n");
> +      err++;
> +    }
> +  /* After the second wchar_t conversion, the converter should be in
> +     the initial state since the two input BIG5-HKSCS bytes have been
> +     consumed and the two wchar_t's have been output.  */
> +  if (mbsinit (&st) == 0)
> +    {
> +      printf ("error: Converter not in initial state.\n");
> +      err++;
> +    }
> +  /* Consume the third byte; expect the third wchar_t.  */
> +  ret = mbrtowc (&wc, mbs, 1, &st);
> +  if (ret != 1)
> +    {
> +      printf ("error: Third byte conversion returned %zd.\n", ret);
> +      err++;
> +    }
> +  /* Advance past the third consumed byte.  */
> +  mbs += 1;
> +  if (wc != test.expected[2])
> +    {
> +      printf ("error: Result of third wchar_t conversion was wrong.\n");
> +      err++;
> +    }
> +
>   /* Return 0 if we saw no errors.  */
>   return err;
> }
> -- 
> 2.32.0
>
  

Patch

diff --git a/iconvdata/big5hkscs.c b/iconvdata/big5hkscs.c
index a28b18a5ec..d12389b2e3 100644
--- a/iconvdata/big5hkscs.c
+++ b/iconvdata/big5hkscs.c
@@ -17769,7 +17769,7 @@  static struct
    the output state to the initial state.  This has to be done during the
    flushing.  */
 #define EMIT_SHIFT_TO_INIT \
-  if (data->__statep->__count != 0)					      \
+  if ((data->__statep->__count >> 3) != 0)				      \
     {									      \
       if (FROM_DIRECTION)						      \
 	{								      \
@@ -17778,7 +17778,7 @@  static struct
 	      /* Write out the last character.  */			      \
 	      *((uint32_t *) outbuf) = data->__statep->__count >> 3;	      \
 	      outbuf += sizeof (uint32_t);				      \
-	      data->__statep->__count = 0;				      \
+	      data->__statep->__count &= 7;				      \
 	    }								      \
 	  else								      \
 	    /* We don't have enough room in the output buffer.  */	      \
@@ -17792,7 +17792,7 @@  static struct
 	      uint32_t lasttwo = data->__statep->__count >> 3;		      \
 	      *outbuf++ = (lasttwo >> 8) & 0xff;			      \
 	      *outbuf++ = lasttwo & 0xff;				      \
-	      data->__statep->__count = 0;				      \
+	      data->__statep->__count &= 7;				      \
 	    }								      \
 	  else								      \
 	    /* We don't have enough room in the output buffer.  */	      \
@@ -17878,7 +17878,7 @@  static struct
 									      \
 		/* Otherwise store only the first character now, and	      \
 		   put the second one into the queue.  */		      \
-		*statep = ch2 << 3;					      \
+		*statep = (ch2 << 3) | (*statep & 7);			      \
 		/* Tell the caller why we terminate the loop.  */	      \
 		result = __GCONV_FULL_OUTPUT;				      \
 		break;							      \
@@ -17895,7 +17895,7 @@  static struct
       }									      \
     else								      \
       /* Clear the queue and proceed to output the saved character.  */	      \
-      *statep = 0;							      \
+      *statep &= 7;							      \
 									      \
     put32 (outptr, ch);							      \
     outptr += 4;							      \
@@ -17946,7 +17946,7 @@  static struct
 	  }								      \
 	*outptr++ = (ch >> 8) & 0xff;					      \
 	*outptr++ = ch & 0xff;						      \
-	*statep = 0;							      \
+	*statep &= 7;							      \
 	inptr += 4;							      \
 	continue;							      \
 									      \
@@ -17959,7 +17959,7 @@  static struct
 	  }								      \
 	*outptr++ = (lasttwo >> 8) & 0xff;				      \
 	*outptr++ = lasttwo & 0xff;					      \
-	*statep = 0;							      \
+	*statep &= 7;							      \
 	continue;							      \
       }									      \
 									      \
@@ -17996,7 +17996,7 @@  static struct
 	   /* Check for possible combining character.  */		      \
 	    if (__glibc_unlikely (ch == 0xca || ch == 0xea))		      \
 	      {								      \
-		*statep = ((cp[0] << 8) | cp[1]) << 3;			      \
+		*statep = (((cp[0] << 8) | cp[1]) << 3) | (*statep & 7);      \
 		inptr += 4;						      \
 		continue;						      \
 	      }								      \
diff --git a/iconvdata/tst-iconv-big5-hkscs-to-2ucs4.c b/iconvdata/tst-iconv-big5-hkscs-to-2ucs4.c
index 9601b6c1d9..e1472dc2e2 100644
--- a/iconvdata/tst-iconv-big5-hkscs-to-2ucs4.c
+++ b/iconvdata/tst-iconv-big5-hkscs-to-2ucs4.c
@@ -128,6 +128,71 @@  check_conversion (struct testdata test)
       printf ("error: Result of third conversion was wrong.\n");
       err++;
     }
+
+  /* Now perform the same test as above consuming one byte at a time.  */
+  mbs = test.input;
+  memset (&st, 0, sizeof (st));
+
+  /* Consume the first byte; expect an incomplete multibyte character.  */
+  ret = mbrtowc (&wc, mbs, 1, &st);
+  if (ret != -2)
+    {
+      printf ("error: First byte conversion returned %zd.\n", ret);
+      err++;
+    }
+  /* Advance past the first consumed byte.  */
+  mbs += 1;
+  /* Consume the second byte; expect the first wchar_t.  */
+  ret = mbrtowc (&wc, mbs, 1, &st);
+  if (ret != 1)
+    {
+      printf ("error: Second byte conversion returned %zd.\n", ret);
+      err++;
+    }
+  /* Advance past the second consumed byte.  */
+  mbs += 1;
+  if (wc != test.expected[0])
+    {
+      printf ("error: Result of first wchar_t conversion was wrong.\n");
+      err++;
+    }
+  /* Consume no bytes; expect the second wchar_t.  */
+  ret = mbrtowc (&wc, mbs, 1, &st);
+  if (ret != 0)
+    {
+      printf ("error: First attempt of third byte conversion returned %zd.\n", ret);
+      err++;
+    }
+  /* Do not advance past the third byte.  */
+  mbs += 0;
+  if (wc != test.expected[1])
+    {
+      printf ("error: Result of second wchar_t conversion was wrong.\n");
+      err++;
+    }
+  /* After the second wchar_t conversion, the converter should be in
+     the initial state since the two input BIG5-HKSCS bytes have been
+     consumed and the two wchar_t's have been output.  */
+  if (mbsinit (&st) == 0)
+    {
+      printf ("error: Converter not in initial state.\n");
+      err++;
+    }
+  /* Consume the third byte; expect the third wchar_t.  */
+  ret = mbrtowc (&wc, mbs, 1, &st);
+  if (ret != 1)
+    {
+      printf ("error: Third byte conversion returned %zd.\n", ret);
+      err++;
+    }
+  /* Advance past the third consumed byte.  */
+  mbs += 1;
+  if (wc != test.expected[2])
+    {
+      printf ("error: Result of third wchar_t conversion was wrong.\n");
+      err++;
+    }
+
   /* Return 0 if we saw no errors.  */
   return err;
 }