[v5,2/4] iconv: Better mapping to RFC for UTF-7

Message ID YjdZIdeskC6Ncq0F@ol-mgautier
State Committed
Headers
Series None |

Commit Message

Max Gautier March 20, 2022, 4:41 p.m. UTC
  - Direct use of characters instead of arcane arrays
- isxbase64 is not the Modified BASE64 alphabet, but the characters who
  needs to trigger an explicit shift back to US-ASCII. Make that clearer

Signed-off-by: Max Gautier <mg@max.gautier.name>
---
 iconvdata/utf-7.c | 64 ++++++++++++++++++++++++-----------------------
 1 file changed, 33 insertions(+), 31 deletions(-)
  

Comments

Adhemerval Zanella March 21, 2022, 11:53 a.m. UTC | #1
On 20/03/2022 13:41, Max Gautier via Libc-alpha wrote:
> - Direct use of characters instead of arcane arrays
> - isxbase64 is not the Modified BASE64 alphabet, but the characters who
>   needs to trigger an explicit shift back to US-ASCII. Make that clearer
> 
> Signed-off-by: Max Gautier <mg@max.gautier.name>


LGTM, thanks.

Reviewed-by: Adhemerval Zanellla  <adhemerval.zanella@linaro.org>

> ---
>  iconvdata/utf-7.c | 64 ++++++++++++++++++++++++-----------------------
>  1 file changed, 33 insertions(+), 31 deletions(-)
> 
> diff --git a/iconvdata/utf-7.c b/iconvdata/utf-7.c
> index 9ba0974959..15f3669ac8 100644
> --- a/iconvdata/utf-7.c
> +++ b/iconvdata/utf-7.c
> @@ -30,20 +30,27 @@
>  
>  
>  
> +static bool
> +between (uint32_t const ch,
> +	 uint32_t const lower_bound, uint32_t const upper_bound)
> +{
> +  return (ch >= lower_bound && ch <= upper_bound);
> +}
> +
>  /* The set of "direct characters":
>     A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
>  */
>  
> -static const unsigned char direct_tab[128 / 8] =
> -  {
> -    0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87,
> -    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
> -  };
> -
> -static int
> -isdirect (uint32_t ch)
> +static bool
> +isdirect (uint32_t ch, enum variant var)
>  {
> -  return (ch < 128 && ((direct_tab[ch >> 3] >> (ch & 7)) & 1));
> +  return (between (ch, 'A', 'Z')
> +	  || between (ch, 'a', 'z')
> +	  || between (ch, '0', '9')
> +	  || ch == '\'' || ch == '(' || ch == ')'
> +	  || between (ch, ',', '/')
> +	  || ch == ':' || ch == '?'
> +	  || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
>  }
>  
>  
> @@ -52,33 +59,27 @@ isdirect (uint32_t ch)
>     ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
>  */
>  
> -static const unsigned char xdirect_tab[128 / 8] =
> -  {
> -    0x00, 0x26, 0x00, 0x00, 0xff, 0xf7, 0xff, 0xff,
> -    0xff, 0xff, 0xff, 0xef, 0xff, 0xff, 0xff, 0x3f
> -  };
> -
> -static int
> -isxdirect (uint32_t ch)
> +static bool
> +isxdirect (uint32_t ch, enum variant var)
>  {
> -  return (ch < 128 && ((xdirect_tab[ch >> 3] >> (ch & 7)) & 1));
> +  return (ch == '\t'
> +	  || ch == '\n'
> +	  || ch == '\r'
> +	  || (between (ch, ' ', '}') && ch != '+' && ch != '\\'));
>  }
>  
>  
> -/* The set of "extended base64 characters":
> +/* Characters which needs to trigger an explicit shift back to US-ASCII (UTF-7
> +   only): Modified base64 + '-' (shift back character)
>     A-Z a-z 0-9 + / -
>  */
>  
> -static const unsigned char xbase64_tab[128 / 8] =
> -  {
> -    0x00, 0x00, 0x00, 0x00, 0x00, 0xa8, 0xff, 0x03,
> -    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
> -  };
> -
> -static int
> -isxbase64 (uint32_t ch)
> +static bool
> +needs_explicit_shift (uint32_t ch)
>  {
> -  return (ch < 128 && ((xbase64_tab[ch >> 3] >> (ch & 7)) & 1));
> +  return (between (ch, 'A', 'Z')
> +	  || between (ch, 'a', 'z')
> +	  || between (ch, '/', '9') || ch == '+' || ch == '-');
>  }
>  
>  
> @@ -252,7 +253,7 @@ base64 (unsigned int i)
>  		   indeed form a Low Surrogate.  */			      \
>  		uint32_t wc2 = wch & 0xffff;				      \
>  									      \
> -		if (! __builtin_expect (wc2 >= 0xdc00 && wc2 < 0xe000, 1))    \
> +		if (! __glibc_likely (wc2 >= 0xdc00 && wc2 < 0xe000))	      \
>  		  {							      \
>  		    STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1));\
>  		  }							      \
> @@ -372,7 +373,8 @@ base64 (unsigned int i)
>  	    /* deactivate base64 encoding */				      \
>  	    size_t count;						      \
>  									      \
> -	    count = ((statep->__count & 0x18) >= 0x10) + isxbase64 (ch) + 1;  \
> +	    count = ((statep->__count & 0x18) >= 0x10)			      \
> +	      + needs_explicit_shift (ch) + 1;				      \
>  	    if (__glibc_unlikely (outptr + count > outend))		      \
>  	      {								      \
>  		result = __GCONV_FULL_OUTPUT;				      \
> @@ -381,7 +383,7 @@ base64 (unsigned int i)
>  									      \
>  	    if ((statep->__count & 0x18) >= 0x10)			      \
>  	      *outptr++ = base64 ((statep->__count >> 3) & ~3);		      \
> -	    if (isxbase64 (ch))						      \
> +	    if (needs_explicit_shift (ch))				      \
>  	      *outptr++ = '-';						      \
>  	    *outptr++ = (unsigned char) ch;				      \
>  	    statep->__count = 0;					      \
  
Adhemerval Zanella March 21, 2022, 11:59 a.m. UTC | #2
On 21/03/2022 08:53, Adhemerval Zanella wrote:
> 
> 
> On 20/03/2022 13:41, Max Gautier via Libc-alpha wrote:
>> - Direct use of characters instead of arcane arrays
>> - isxbase64 is not the Modified BASE64 alphabet, but the characters who
>>   needs to trigger an explicit shift back to US-ASCII. Make that clearer
>>
>> Signed-off-by: Max Gautier <mg@max.gautier.name>
> 
> 
> LGTM, thanks.
> 
> Reviewed-by: Adhemerval Zanellla  <adhemerval.zanella@linaro.org>
> 
>> ---
>>  iconvdata/utf-7.c | 64 ++++++++++++++++++++++++-----------------------
>>  1 file changed, 33 insertions(+), 31 deletions(-)
>>
>> diff --git a/iconvdata/utf-7.c b/iconvdata/utf-7.c
>> index 9ba0974959..15f3669ac8 100644
>> --- a/iconvdata/utf-7.c
>> +++ b/iconvdata/utf-7.c
>> @@ -30,20 +30,27 @@
>>  
>>  
>>  
>> +static bool
>> +between (uint32_t const ch,
>> +	 uint32_t const lower_bound, uint32_t const upper_bound)
>> +{
>> +  return (ch >= lower_bound && ch <= upper_bound);
>> +}
>> +
>>  /* The set of "direct characters":
>>     A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
>>  */
>>  
>> -static const unsigned char direct_tab[128 / 8] =
>> -  {
>> -    0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87,
>> -    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
>> -  };
>> -
>> -static int
>> -isdirect (uint32_t ch)
>> +static bool
>> +isdirect (uint32_t ch, enum variant var)
>>  {

In fact I am seeing this failure:

utf-7.c:45:29: error: ‘enum variant’ declared inside parameter list will not be visible outside of this definition o
r declaration [-Werror]
   45 | isdirect (uint32_t ch, enum variant var)
      |                             ^~~~~~~

Since 'enum variant' in only defined on next patch.  Usually the best
practice is keep each patch consistent, so could you move the definition
on this patch?

Or I can fix it for you before installing, it is up to you.

>> -  return (ch < 128 && ((direct_tab[ch >> 3] >> (ch & 7)) & 1));
>> +  return (between (ch, 'A', 'Z')
>> +	  || between (ch, 'a', 'z')
>> +	  || between (ch, '0', '9')
>> +	  || ch == '\'' || ch == '(' || ch == ')'
>> +	  || between (ch, ',', '/')
>> +	  || ch == ':' || ch == '?'
>> +	  || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
>>  }
>>  
>>  
>> @@ -52,33 +59,27 @@ isdirect (uint32_t ch)
>>     ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
>>  */
>>  
>> -static const unsigned char xdirect_tab[128 / 8] =
>> -  {
>> -    0x00, 0x26, 0x00, 0x00, 0xff, 0xf7, 0xff, 0xff,
>> -    0xff, 0xff, 0xff, 0xef, 0xff, 0xff, 0xff, 0x3f
>> -  };
>> -
>> -static int
>> -isxdirect (uint32_t ch)
>> +static bool
>> +isxdirect (uint32_t ch, enum variant var)
>>  {
>> -  return (ch < 128 && ((xdirect_tab[ch >> 3] >> (ch & 7)) & 1));
>> +  return (ch == '\t'
>> +	  || ch == '\n'
>> +	  || ch == '\r'
>> +	  || (between (ch, ' ', '}') && ch != '+' && ch != '\\'));
>>  }
>>  
>>  
>> -/* The set of "extended base64 characters":
>> +/* Characters which needs to trigger an explicit shift back to US-ASCII (UTF-7
>> +   only): Modified base64 + '-' (shift back character)
>>     A-Z a-z 0-9 + / -
>>  */
>>  
>> -static const unsigned char xbase64_tab[128 / 8] =
>> -  {
>> -    0x00, 0x00, 0x00, 0x00, 0x00, 0xa8, 0xff, 0x03,
>> -    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
>> -  };
>> -
>> -static int
>> -isxbase64 (uint32_t ch)
>> +static bool
>> +needs_explicit_shift (uint32_t ch)
>>  {
>> -  return (ch < 128 && ((xbase64_tab[ch >> 3] >> (ch & 7)) & 1));
>> +  return (between (ch, 'A', 'Z')
>> +	  || between (ch, 'a', 'z')
>> +	  || between (ch, '/', '9') || ch == '+' || ch == '-');
>>  }
>>  
>>  
>> @@ -252,7 +253,7 @@ base64 (unsigned int i)
>>  		   indeed form a Low Surrogate.  */			      \
>>  		uint32_t wc2 = wch & 0xffff;				      \
>>  									      \
>> -		if (! __builtin_expect (wc2 >= 0xdc00 && wc2 < 0xe000, 1))    \
>> +		if (! __glibc_likely (wc2 >= 0xdc00 && wc2 < 0xe000))	      \
>>  		  {							      \
>>  		    STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1));\
>>  		  }							      \
>> @@ -372,7 +373,8 @@ base64 (unsigned int i)
>>  	    /* deactivate base64 encoding */				      \
>>  	    size_t count;						      \
>>  									      \
>> -	    count = ((statep->__count & 0x18) >= 0x10) + isxbase64 (ch) + 1;  \
>> +	    count = ((statep->__count & 0x18) >= 0x10)			      \
>> +	      + needs_explicit_shift (ch) + 1;				      \
>>  	    if (__glibc_unlikely (outptr + count > outend))		      \
>>  	      {								      \
>>  		result = __GCONV_FULL_OUTPUT;				      \
>> @@ -381,7 +383,7 @@ base64 (unsigned int i)
>>  									      \
>>  	    if ((statep->__count & 0x18) >= 0x10)			      \
>>  	      *outptr++ = base64 ((statep->__count >> 3) & ~3);		      \
>> -	    if (isxbase64 (ch))						      \
>> +	    if (needs_explicit_shift (ch))				      \
>>  	      *outptr++ = '-';						      \
>>  	    *outptr++ = (unsigned char) ch;				      \
>>  	    statep->__count = 0;					      \
  
Adhemerval Zanella March 21, 2022, 12:06 p.m. UTC | #3
On 21/03/2022 08:59, Adhemerval Zanella wrote:
> 
> 
> On 21/03/2022 08:53, Adhemerval Zanella wrote:
>>
>>
>> On 20/03/2022 13:41, Max Gautier via Libc-alpha wrote:
>>> - Direct use of characters instead of arcane arrays
>>> - isxbase64 is not the Modified BASE64 alphabet, but the characters who
>>>   needs to trigger an explicit shift back to US-ASCII. Make that clearer
>>>
>>> Signed-off-by: Max Gautier <mg@max.gautier.name>
>>
>>
>> LGTM, thanks.
>>
>> Reviewed-by: Adhemerval Zanellla  <adhemerval.zanella@linaro.org>
>>
>>> ---
>>>  iconvdata/utf-7.c | 64 ++++++++++++++++++++++++-----------------------
>>>  1 file changed, 33 insertions(+), 31 deletions(-)
>>>
>>> diff --git a/iconvdata/utf-7.c b/iconvdata/utf-7.c
>>> index 9ba0974959..15f3669ac8 100644
>>> --- a/iconvdata/utf-7.c
>>> +++ b/iconvdata/utf-7.c
>>> @@ -30,20 +30,27 @@
>>>  
>>>  
>>>  
>>> +static bool
>>> +between (uint32_t const ch,
>>> +	 uint32_t const lower_bound, uint32_t const upper_bound)
>>> +{
>>> +  return (ch >= lower_bound && ch <= upper_bound);
>>> +}
>>> +
>>>  /* The set of "direct characters":
>>>     A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
>>>  */
>>>  
>>> -static const unsigned char direct_tab[128 / 8] =
>>> -  {
>>> -    0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87,
>>> -    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
>>> -  };
>>> -
>>> -static int
>>> -isdirect (uint32_t ch)
>>> +static bool
>>> +isdirect (uint32_t ch, enum variant var)
>>>  {
> 
> In fact I am seeing this failure:
> 
> utf-7.c:45:29: error: ‘enum variant’ declared inside parameter list will not be visible outside of this definition o
> r declaration [-Werror]
>    45 | isdirect (uint32_t ch, enum variant var)
>       |                             ^~~~~~~
> 
> Since 'enum variant' in only defined on next patch.  Usually the best
> practice is keep each patch consistent, so could you move the definition
> on this patch?
> 
> Or I can fix it for you before installing, it is up to you.

And it does not actually required the variant argument on neither, isdirect or
isxdirect.  The obvious fix for this patch is:

diff --git a/iconvdata/utf-7.c b/iconvdata/utf-7.c
index 4a89de235a..815b1891c7 100644
--- a/iconvdata/utf-7.c
+++ b/iconvdata/utf-7.c
@@ -42,7 +42,7 @@ between (uint32_t const ch,
 */
 
 static bool
-isdirect (uint32_t ch, enum variant var)
+isdirect (uint32_t ch)
 {
   return (between (ch, 'A', 'Z')
          || between (ch, 'a', 'z')
@@ -60,7 +60,7 @@ isdirect (uint32_t ch, enum variant var)
 */
 
 static bool
-isxdirect (uint32_t ch, enum variant var)
+isxdirect (uint32_t ch)
 {
   return (ch == '\t'
          || ch == '\n'
  
Max Gautier March 21, 2022, 2:07 p.m. UTC | #4
On Mon, Mar 21, 2022 at 08:59:27AM -0300, Adhemerval Zanella wrote:
> 
> 
> On 21/03/2022 08:53, Adhemerval Zanella wrote:
> > 
> > 
> > On 20/03/2022 13:41, Max Gautier via Libc-alpha wrote:
> >> - Direct use of characters instead of arcane arrays
> >> - isxbase64 is not the Modified BASE64 alphabet, but the characters who
> >>   needs to trigger an explicit shift back to US-ASCII. Make that clearer
> >>
> >> Signed-off-by: Max Gautier <mg@max.gautier.name>
> > 
> > 
> > LGTM, thanks.
> > 
> > Reviewed-by: Adhemerval Zanellla  <adhemerval.zanella@linaro.org>
> > 
> >> ---
> >>  iconvdata/utf-7.c | 64 ++++++++++++++++++++++++-----------------------
> >>  1 file changed, 33 insertions(+), 31 deletions(-)
> >>
> >> diff --git a/iconvdata/utf-7.c b/iconvdata/utf-7.c
> >> index 9ba0974959..15f3669ac8 100644
> >> --- a/iconvdata/utf-7.c
> >> +++ b/iconvdata/utf-7.c
> >> @@ -30,20 +30,27 @@
> >>  
> >>  
> >>  
> >> +static bool
> >> +between (uint32_t const ch,
> >> +	 uint32_t const lower_bound, uint32_t const upper_bound)
> >> +{
> >> +  return (ch >= lower_bound && ch <= upper_bound);
> >> +}
> >> +
> >>  /* The set of "direct characters":
> >>     A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
> >>  */
> >>  
> >> -static const unsigned char direct_tab[128 / 8] =
> >> -  {
> >> -    0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87,
> >> -    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
> >> -  };
> >> -
> >> -static int
> >> -isdirect (uint32_t ch)
> >> +static bool
> >> +isdirect (uint32_t ch, enum variant var)
> >>  {
> 
> In fact I am seeing this failure:
> 
> utf-7.c:45:29: error: ‘enum variant’ declared inside parameter list will not be visible outside of this definition o
> r declaration [-Werror]
>    45 | isdirect (uint32_t ch, enum variant var)
>       |                             ^~~~~~~
> 
> Since 'enum variant' in only defined on next patch.  Usually the best
> practice is keep each patch consistent, so could you move the definition
> on this patch?
> 
> Or I can fix it for you before installing, it is up to you.
> 

I think I mixed up my patches while integrating the corrections and
style fixes you mentionned, sorry.
No problem for me I you fix it before applying.

Thanks !
  

Patch

diff --git a/iconvdata/utf-7.c b/iconvdata/utf-7.c
index 9ba0974959..15f3669ac8 100644
--- a/iconvdata/utf-7.c
+++ b/iconvdata/utf-7.c
@@ -30,20 +30,27 @@ 
 
 
 
+static bool
+between (uint32_t const ch,
+	 uint32_t const lower_bound, uint32_t const upper_bound)
+{
+  return (ch >= lower_bound && ch <= upper_bound);
+}
+
 /* The set of "direct characters":
    A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
 */
 
-static const unsigned char direct_tab[128 / 8] =
-  {
-    0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87,
-    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
-  };
-
-static int
-isdirect (uint32_t ch)
+static bool
+isdirect (uint32_t ch, enum variant var)
 {
-  return (ch < 128 && ((direct_tab[ch >> 3] >> (ch & 7)) & 1));
+  return (between (ch, 'A', 'Z')
+	  || between (ch, 'a', 'z')
+	  || between (ch, '0', '9')
+	  || ch == '\'' || ch == '(' || ch == ')'
+	  || between (ch, ',', '/')
+	  || ch == ':' || ch == '?'
+	  || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
 }
 
 
@@ -52,33 +59,27 @@  isdirect (uint32_t ch)
    ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
 */
 
-static const unsigned char xdirect_tab[128 / 8] =
-  {
-    0x00, 0x26, 0x00, 0x00, 0xff, 0xf7, 0xff, 0xff,
-    0xff, 0xff, 0xff, 0xef, 0xff, 0xff, 0xff, 0x3f
-  };
-
-static int
-isxdirect (uint32_t ch)
+static bool
+isxdirect (uint32_t ch, enum variant var)
 {
-  return (ch < 128 && ((xdirect_tab[ch >> 3] >> (ch & 7)) & 1));
+  return (ch == '\t'
+	  || ch == '\n'
+	  || ch == '\r'
+	  || (between (ch, ' ', '}') && ch != '+' && ch != '\\'));
 }
 
 
-/* The set of "extended base64 characters":
+/* Characters which needs to trigger an explicit shift back to US-ASCII (UTF-7
+   only): Modified base64 + '-' (shift back character)
    A-Z a-z 0-9 + / -
 */
 
-static const unsigned char xbase64_tab[128 / 8] =
-  {
-    0x00, 0x00, 0x00, 0x00, 0x00, 0xa8, 0xff, 0x03,
-    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
-  };
-
-static int
-isxbase64 (uint32_t ch)
+static bool
+needs_explicit_shift (uint32_t ch)
 {
-  return (ch < 128 && ((xbase64_tab[ch >> 3] >> (ch & 7)) & 1));
+  return (between (ch, 'A', 'Z')
+	  || between (ch, 'a', 'z')
+	  || between (ch, '/', '9') || ch == '+' || ch == '-');
 }
 
 
@@ -252,7 +253,7 @@  base64 (unsigned int i)
 		   indeed form a Low Surrogate.  */			      \
 		uint32_t wc2 = wch & 0xffff;				      \
 									      \
-		if (! __builtin_expect (wc2 >= 0xdc00 && wc2 < 0xe000, 1))    \
+		if (! __glibc_likely (wc2 >= 0xdc00 && wc2 < 0xe000))	      \
 		  {							      \
 		    STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1));\
 		  }							      \
@@ -372,7 +373,8 @@  base64 (unsigned int i)
 	    /* deactivate base64 encoding */				      \
 	    size_t count;						      \
 									      \
-	    count = ((statep->__count & 0x18) >= 0x10) + isxbase64 (ch) + 1;  \
+	    count = ((statep->__count & 0x18) >= 0x10)			      \
+	      + needs_explicit_shift (ch) + 1;				      \
 	    if (__glibc_unlikely (outptr + count > outend))		      \
 	      {								      \
 		result = __GCONV_FULL_OUTPUT;				      \
@@ -381,7 +383,7 @@  base64 (unsigned int i)
 									      \
 	    if ((statep->__count & 0x18) >= 0x10)			      \
 	      *outptr++ = base64 ((statep->__count >> 3) & ~3);		      \
-	    if (isxbase64 (ch))						      \
+	    if (needs_explicit_shift (ch))				      \
 	      *outptr++ = '-';						      \
 	    *outptr++ = (unsigned char) ch;				      \
 	    statep->__count = 0;					      \