[v4,2/4] iconv: Better mapping to RFC for UTF-7

Message ID 20211209093152.313872-3-mg@max.gautier.name
State Superseded
Headers
Series iconv: Add support for UTF-7-IMAP |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Max Gautier Dec. 9, 2021, 9:31 a.m. UTC
- Direct use of characters instead of arcane arrays
- isxbase64 is not the Modified BASE64 alphabet, but the set of characters
  that need to trigger an explicit shift back to US-ASCII. Make that clearer.

Signed-off-by: Max Gautier <mg@max.gautier.name>
---
 iconvdata/utf-7.c | 56 +++++++++++++++++++++++++++--------------------
 1 file changed, 32 insertions(+), 24 deletions(-)
  

Comments

Adhemerval Zanella Netto March 7, 2022, 12:14 p.m. UTC | #1
On 09/12/2021 06:31, Max Gautier via Libc-alpha wrote:
> - Direct use of characters instead of arcane arrays
> - isxbase64 is not the Modified BASE64 alphabet, but the set of characters
>   that need to trigger an explicit shift back to US-ASCII. Make that clearer.
> 
> Signed-off-by: Max Gautier <mg@max.gautier.name>

LGTM with style fixes below.

Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>

> ---
>  iconvdata/utf-7.c | 56 +++++++++++++++++++++++++++--------------------
>  1 file changed, 32 insertions(+), 24 deletions(-)
> 
> diff --git a/iconvdata/utf-7.c b/iconvdata/utf-7.c
> index 9ba0974959..ac7d78141a 100644
> --- a/iconvdata/utf-7.c
> +++ b/iconvdata/utf-7.c
> @@ -30,20 +30,27 @@
>  
>  
>  
> +static int
> +between(uint32_t const ch,

Add a space before '(' here and in the other usages below.  Also, 'const' does
not change much here.

> +        uint32_t const lower_bound, uint32_t const upper_bound)
> +{
> +    return (ch >= lower_bound && ch <= upper_bound);
> +}
> +
>  /* The set of "direct characters":
>     A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
>  */
>  
> -static const unsigned char direct_tab[128 / 8] =
> -  {
> -    0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87,
> -    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
> -  };
> -
>  static int
>  isdirect (uint32_t ch)
>  {
> -  return (ch < 128 && ((direct_tab[ch >> 3] >> (ch & 7)) & 1));
> +    return (between(ch, 'A', 'Z')

Ok, it is indeed clear.

> +	    || between(ch, 'a', 'z')
> +	    || between(ch, '0', '9')
> +	    || ch == '\'' || ch == '(' || ch == ')'
> +	    || between(ch, ',', '/')
> +	    || ch == ':' || ch == '?'
> +	    || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
>  }
>  
>  
> @@ -52,33 +59,33 @@ isdirect (uint32_t ch)
>     ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
>  */
>  
> -static const unsigned char xdirect_tab[128 / 8] =
> -  {
> -    0x00, 0x26, 0x00, 0x00, 0xff, 0xf7, 0xff, 0xff,
> -    0xff, 0xff, 0xff, 0xef, 0xff, 0xff, 0xff, 0x3f
> -  };
>  
>  static int
>  isxdirect (uint32_t ch)
>  {
> -  return (ch < 128 && ((xdirect_tab[ch >> 3] >> (ch & 7)) & 1));
> +    return (ch == '\t'
> +            || ch == '\n'
> +            || ch == '\r'
> +            || (between(ch, ' ','}')
> +                && ch != '+' && ch != '\\')
> +           );
>  }
>  
>  

Ok.

> -/* The set of "extended base64 characters":
> +/* Characters which needs to trigger an explicit shift back to US-ASCII (UTF-7
> +   only): Modified base64 + '-' (shift back character)
>     A-Z a-z 0-9 + / -
>  */
>  
> -static const unsigned char xbase64_tab[128 / 8] =
> -  {
> -    0x00, 0x00, 0x00, 0x00, 0x00, 0xa8, 0xff, 0x03,
> -    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
> -  };
> -
>  static int
> -isxbase64 (uint32_t ch)
> +needs_explicit_shift (uint32_t ch)
>  {
> -  return (ch < 128 && ((xbase64_tab[ch >> 3] >> (ch & 7)) & 1));
> +  return (between(ch, 'A', 'Z')
> +          || between(ch, 'a', 'z')
> +          || between(ch, '/', '9')
> +          || ch == '+'
> +          || ch == '-'
> +          );
>  }
>  
>  

Ok.

> @@ -372,7 +379,8 @@ base64 (unsigned int i)
>  	    /* deactivate base64 encoding */				      \
>  	    size_t count;						      \
>  									      \
> -	    count = ((statep->__count & 0x18) >= 0x10) + isxbase64 (ch) + 1;  \
> +	    count = ((statep->__count & 0x18) >= 0x10)			      \
> +	      + needs_explicit_shift (ch) + 1;				      \
>  	    if (__glibc_unlikely (outptr + count > outend))		      \
>  	      {								      \
>  		result = __GCONV_FULL_OUTPUT;				      \
> @@ -381,7 +389,7 @@ base64 (unsigned int i)
>  									      \
>  	    if ((statep->__count & 0x18) >= 0x10)			      \
>  	      *outptr++ = base64 ((statep->__count >> 3) & ~3);		      \
> -	    if (isxbase64 (ch))						      \
> +	    if (needs_explicit_shift (ch))				      \
>  	      *outptr++ = '-';						      \
>  	    *outptr++ = (unsigned char) ch;				      \
>  	    statep->__count = 0;					      \

Ok, it just changes the function name.
  

Patch

diff --git a/iconvdata/utf-7.c b/iconvdata/utf-7.c
index 9ba0974959..ac7d78141a 100644
--- a/iconvdata/utf-7.c
+++ b/iconvdata/utf-7.c
@@ -30,20 +30,27 @@ 
 
 
 
+static int
+between(uint32_t const ch,
+        uint32_t const lower_bound, uint32_t const upper_bound)
+{
+    return (ch >= lower_bound && ch <= upper_bound);
+}
+
 /* The set of "direct characters":
    A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
 */
 
-static const unsigned char direct_tab[128 / 8] =
-  {
-    0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87,
-    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
-  };
-
 static int
 isdirect (uint32_t ch)
 {
-  return (ch < 128 && ((direct_tab[ch >> 3] >> (ch & 7)) & 1));
+    return (between(ch, 'A', 'Z')
+	    || between(ch, 'a', 'z')
+	    || between(ch, '0', '9')
+	    || ch == '\'' || ch == '(' || ch == ')'
+	    || between(ch, ',', '/')
+	    || ch == ':' || ch == '?'
+	    || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
 }
 
 
@@ -52,33 +59,33 @@  isdirect (uint32_t ch)
    ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
 */
 
-static const unsigned char xdirect_tab[128 / 8] =
-  {
-    0x00, 0x26, 0x00, 0x00, 0xff, 0xf7, 0xff, 0xff,
-    0xff, 0xff, 0xff, 0xef, 0xff, 0xff, 0xff, 0x3f
-  };
 
 static int
 isxdirect (uint32_t ch)
 {
-  return (ch < 128 && ((xdirect_tab[ch >> 3] >> (ch & 7)) & 1));
+    return (ch == '\t'
+            || ch == '\n'
+            || ch == '\r'
+            || (between(ch, ' ','}')
+                && ch != '+' && ch != '\\')
+           );
 }
 
 
-/* The set of "extended base64 characters":
+/* Characters which needs to trigger an explicit shift back to US-ASCII (UTF-7
+   only): Modified base64 + '-' (shift back character)
    A-Z a-z 0-9 + / -
 */
 
-static const unsigned char xbase64_tab[128 / 8] =
-  {
-    0x00, 0x00, 0x00, 0x00, 0x00, 0xa8, 0xff, 0x03,
-    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
-  };
-
 static int
-isxbase64 (uint32_t ch)
+needs_explicit_shift (uint32_t ch)
 {
-  return (ch < 128 && ((xbase64_tab[ch >> 3] >> (ch & 7)) & 1));
+  return (between(ch, 'A', 'Z')
+          || between(ch, 'a', 'z')
+          || between(ch, '/', '9')
+          || ch == '+'
+          || ch == '-'
+          );
 }
 
 
@@ -372,7 +379,8 @@  base64 (unsigned int i)
 	    /* deactivate base64 encoding */				      \
 	    size_t count;						      \
 									      \
-	    count = ((statep->__count & 0x18) >= 0x10) + isxbase64 (ch) + 1;  \
+	    count = ((statep->__count & 0x18) >= 0x10)			      \
+	      + needs_explicit_shift (ch) + 1;				      \
 	    if (__glibc_unlikely (outptr + count > outend))		      \
 	      {								      \
 		result = __GCONV_FULL_OUTPUT;				      \
@@ -381,7 +389,7 @@  base64 (unsigned int i)
 									      \
 	    if ((statep->__count & 0x18) >= 0x10)			      \
 	      *outptr++ = base64 ((statep->__count >> 3) & ~3);		      \
-	    if (isxbase64 (ch))						      \
+	    if (needs_explicit_shift (ch))				      \
 	      *outptr++ = '-';						      \
 	    *outptr++ = (unsigned char) ch;				      \
 	    statep->__count = 0;					      \