[3/5] Transform UTF-7 to MODIFIED-UTF-7

Message ID 20200819230702.229822-4-mg@max.gautier.name
State Superseded
Headers
Series iconv: module for MODIFIED-UTF-7 |

Commit Message

Max Gautier Aug. 19, 2020, 11:07 p.m. UTC
* shift character is '&' instead of '+'
* No "optional direct characters" set
* modified base64 character set
* use direct comparison instead of arrays and bitwise op
---
Regarding the fourth item, if there are reasons to prefer the bitwise approach,
please let me know.
 iconvdata/modified-utf-7.c | 97 ++++++++++++--------------------------
 1 file changed, 31 insertions(+), 66 deletions(-)
  

Patch

diff --git a/iconvdata/modified-utf-7.c b/iconvdata/modified-utf-7.c
index fc6a8dfcfd..e6eb784891 100644
--- a/iconvdata/modified-utf-7.c
+++ b/iconvdata/modified-utf-7.c
@@ -1,4 +1,4 @@ 
-/* Conversion module for UTF-7.
+/* Conversion module for Modified UTF-7.
    Copyright (C) 2000-2020 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,12 +16,12 @@ 
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-/* UTF-7 is a legacy encoding used for transmitting Unicode within the
-   ASCII character set, used primarily by mail agents.  New programs
-   are encouraged to use UTF-8 instead.
+/* Modified UTF-7 is a legacy encoding used for transmitting Unicode within the
+   ASCII character set, used primarily by IMAP servers and clients.
+   New programs are encouraged to use UTF-8 instead.
 
-   UTF-7 is specified in RFC 2152 (and old RFC 1641, RFC 1642).  The
-   original Base64 encoding is defined in RFC 2045.  */
+   Modified UTF-7 is specified in RFC 3501 as part of the IMAPv4 specification.
+   The original Base64 encoding is defined in RFC 2045.  */
 
 #include <dlfcn.h>
 #include <gconv.h>
@@ -29,64 +29,29 @@ 
 #include <stdlib.h>
 
 
-/* Define this to 1 if you want the so-called "optional direct" characters
-      ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
-   to be encoded. Define to 0 if you want them to be passed straight
-   through, like the so-called "direct" characters.
-   We set this to 1 because it's safer.
- */
-#define UTF7_ENCODE_OPTIONAL_CHARS 1
-
-
 /* The set of "direct characters":
    A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
+   ! " # $ % + * ; < = > @ [ \ ] ^ _ ` { | } ~
 */
 
-static const unsigned char direct_tab[128 / 8] =
-  {
-    0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87,
-    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
-  };
-
 static int
 isdirect (uint32_t ch)
 {
-  return (ch < 128 && ((direct_tab[ch >> 3] >> (ch & 7)) & 1));
-}
-
-
-/* The set of "direct and optional direct characters":
-   A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
-   ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
-*/
-
-static const unsigned char xdirect_tab[128 / 8] =
-  {
-    0x00, 0x26, 0x00, 0x00, 0xff, 0xf7, 0xff, 0xff,
-    0xff, 0xff, 0xff, 0xef, 0xff, 0xff, 0xff, 0x3f
-  };
-
-static int
-isxdirect (uint32_t ch)
-{
-  return (ch < 128 && ((xdirect_tab[ch >> 3] >> (ch & 7)) & 1));
+  return ((ch == '\n' || ch == '\t' || ch == '\r')
+		  || (ch >= 0x20 && ch <= 0x7e && ch != '&'));
 }
 
-
-/* The set of "extended base64 characters":
-   A-Z a-z 0-9 + / -
+/* The set of "modified base64 characters":
+   A-Z a-z 0-9 + , -
 */
 
-static const unsigned char xbase64_tab[128 / 8] =
-  {
-    0x00, 0x00, 0x00, 0x00, 0x00, 0xa8, 0xff, 0x03,
-    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
-  };
-
 static int
-isxbase64 (uint32_t ch)
+ismbase64 (uint32_t ch)
 {
-  return (ch < 128 && ((xbase64_tab[ch >> 3] >> (ch & 7)) & 1));
+  /* '-' counts too: after base64 it must be preceded by a terminator.  */
+  return ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')
+	  || (ch >= '0' && ch <= '9')
+	  || ch == '+' || ch == ',' || ch == '-');
 }
 
 
@@ -103,18 +68,18 @@  base64 (unsigned int i)
   else if (i == 62)
     return '+';
   else if (i == 63)
-    return '/';
+    return ',';
   else
     abort ();
 }
 
 
 /* Definitions used in the body of the `gconv' function.  */
-#define CHARSET_NAME		"UTF-7//"
+#define CHARSET_NAME		"MODIFIED-UTF-7//"
 #define DEFINE_INIT		1
 #define DEFINE_FINI		1
-#define FROM_LOOP		from_utf7_loop
-#define TO_LOOP			to_utf7_loop
+#define FROM_LOOP		from_m_utf7_loop
+#define TO_LOOP			to_m_utf7_loop
 #define MIN_NEEDED_FROM		1
 #define MAX_NEEDED_FROM		6
 #define MIN_NEEDED_TO		4
@@ -161,13 +126,13 @@  base64 (unsigned int i)
     if ((statep->__count >> 3) == 0)					      \
       {									      \
 	/* base64 encoding inactive.  */				      \
-	if (isxdirect (ch))						      \
+	if (isdirect (ch))						      \
 	  {								      \
 	    inptr++;							      \
 	    put32 (outptr, ch);						      \
 	    outptr += 4;						      \
 	  }								      \
-	else if (__glibc_likely (ch == '+'))				      \
+	else if (__glibc_likely (ch == '&'))				      \
 	  {								      \
 	    if (__glibc_unlikely (inptr + 2 > inend))			      \
 	      {								      \
@@ -209,7 +174,7 @@  base64 (unsigned int i)
 	  i = ch - '0' + 52;						      \
 	else if (ch == '+')						      \
 	  i = 62;							      \
-	else if (ch == '/')						      \
+	else if (ch == ',')						      \
 	  i = 63;							      \
 	else								      \
 	  {								      \
@@ -323,7 +288,7 @@  base64 (unsigned int i)
     if ((statep->__count & 0x18) == 0)					      \
       {									      \
 	/* base64 encoding inactive */					      \
-	if (UTF7_ENCODE_OPTIONAL_CHARS ? isdirect (ch) : isxdirect (ch))      \
+	if (isdirect (ch))      \
 	  {								      \
 	    *outptr++ = (unsigned char) ch;				      \
 	  }								      \
@@ -331,7 +296,7 @@  base64 (unsigned int i)
 	  {								      \
 	    size_t count;						      \
 									      \
-	    if (ch == '+')						      \
+	    if (ch == '&')						      \
 	      count = 2;						      \
 	    else if (ch < 0x10000)					      \
 	      count = 3;						      \
@@ -346,8 +311,8 @@  base64 (unsigned int i)
 		break;							      \
 	      }								      \
 									      \
-	    *outptr++ = '+';						      \
-	    if (ch == '+')						      \
+	    *outptr++ = '&';						      \
+	    if (ch == '&')						      \
 	      *outptr++ = '-';						      \
 	    else if (ch < 0x10000)					      \
 	      {								      \
@@ -375,12 +340,12 @@  base64 (unsigned int i)
     else								      \
       {									      \
 	/* base64 encoding active */					      \
-	if (UTF7_ENCODE_OPTIONAL_CHARS ? isdirect (ch) : isxdirect (ch))      \
+	if (isdirect (ch))      \
 	  {								      \
 	    /* deactivate base64 encoding */				      \
 	    size_t count;						      \
 									      \
-	    count = ((statep->__count & 0x18) >= 0x10) + isxbase64 (ch) + 1;  \
+	    count = ((statep->__count & 0x18) >= 0x10) + ismbase64 (ch) + 1;  \
 	    if (__glibc_unlikely (outptr + count > outend))		      \
 	      {								      \
 		result = __GCONV_FULL_OUTPUT;				      \
@@ -389,7 +354,7 @@  base64 (unsigned int i)
 									      \
 	    if ((statep->__count & 0x18) >= 0x10)			      \
 	      *outptr++ = base64 ((statep->__count >> 3) & ~3);		      \
-	    if (isxbase64 (ch))						      \
+	    if (ismbase64 (ch))						      \
 	      *outptr++ = '-';						      \
 	    *outptr++ = (unsigned char) ch;				      \
 	    statep->__count = 0;					      \
@@ -499,7 +464,7 @@  base64 (unsigned int i)
     memset (data->__statep, '\0', sizeof (mbstate_t));			      \
   else									      \
     {									      \
-      /* The "to UTF-7" direction.  Flush the remaining bits and terminate    \
+      /* The "to M-UTF-7" direction.  Flush the remaining bits and terminate    \
 	 with a '-' byte.  This will guarantee correct decoding if more	      \
 	 UTF-7 encoded text is added afterwards.  */			      \
       int state = data->__statep->__count;				      \