[v5,2/4] iconv: Better mapping to RFC for UTF-7
Commit Message
- Direct use of characters instead of arcane arrays
- isxbase64 is not the Modified BASE64 alphabet, but the characters who
needs to trigger an explicit shift back to US-ASCII. Make that clearer
Signed-off-by: Max Gautier <mg@max.gautier.name>
---
iconvdata/utf-7.c | 64 ++++++++++++++++++++++++-----------------------
1 file changed, 33 insertions(+), 31 deletions(-)
Comments
On 20/03/2022 13:41, Max Gautier via Libc-alpha wrote:
> - Direct use of characters instead of arcane arrays
> - isxbase64 is not the Modified BASE64 alphabet, but the characters who
> needs to trigger an explicit shift back to US-ASCII. Make that clearer
>
> Signed-off-by: Max Gautier <mg@max.gautier.name>
LGTM, thanks.
Reviewed-by: Adhemerval Zanellla <adhemerval.zanella@linaro.org>
> ---
> iconvdata/utf-7.c | 64 ++++++++++++++++++++++++-----------------------
> 1 file changed, 33 insertions(+), 31 deletions(-)
>
> diff --git a/iconvdata/utf-7.c b/iconvdata/utf-7.c
> index 9ba0974959..15f3669ac8 100644
> --- a/iconvdata/utf-7.c
> +++ b/iconvdata/utf-7.c
> @@ -30,20 +30,27 @@
>
>
>
> +static bool
> +between (uint32_t const ch,
> + uint32_t const lower_bound, uint32_t const upper_bound)
> +{
> + return (ch >= lower_bound && ch <= upper_bound);
> +}
> +
> /* The set of "direct characters":
> A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
> */
>
> -static const unsigned char direct_tab[128 / 8] =
> - {
> - 0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87,
> - 0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
> - };
> -
> -static int
> -isdirect (uint32_t ch)
> +static bool
> +isdirect (uint32_t ch, enum variant var)
> {
> - return (ch < 128 && ((direct_tab[ch >> 3] >> (ch & 7)) & 1));
> + return (between (ch, 'A', 'Z')
> + || between (ch, 'a', 'z')
> + || between (ch, '0', '9')
> + || ch == '\'' || ch == '(' || ch == ')'
> + || between (ch, ',', '/')
> + || ch == ':' || ch == '?'
> + || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
> }
>
>
> @@ -52,33 +59,27 @@ isdirect (uint32_t ch)
> ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
> */
>
> -static const unsigned char xdirect_tab[128 / 8] =
> - {
> - 0x00, 0x26, 0x00, 0x00, 0xff, 0xf7, 0xff, 0xff,
> - 0xff, 0xff, 0xff, 0xef, 0xff, 0xff, 0xff, 0x3f
> - };
> -
> -static int
> -isxdirect (uint32_t ch)
> +static bool
> +isxdirect (uint32_t ch, enum variant var)
> {
> - return (ch < 128 && ((xdirect_tab[ch >> 3] >> (ch & 7)) & 1));
> + return (ch == '\t'
> + || ch == '\n'
> + || ch == '\r'
> + || (between (ch, ' ', '}') && ch != '+' && ch != '\\'));
> }
>
>
> -/* The set of "extended base64 characters":
> +/* Characters which needs to trigger an explicit shift back to US-ASCII (UTF-7
> + only): Modified base64 + '-' (shift back character)
> A-Z a-z 0-9 + / -
> */
>
> -static const unsigned char xbase64_tab[128 / 8] =
> - {
> - 0x00, 0x00, 0x00, 0x00, 0x00, 0xa8, 0xff, 0x03,
> - 0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
> - };
> -
> -static int
> -isxbase64 (uint32_t ch)
> +static bool
> +needs_explicit_shift (uint32_t ch)
> {
> - return (ch < 128 && ((xbase64_tab[ch >> 3] >> (ch & 7)) & 1));
> + return (between (ch, 'A', 'Z')
> + || between (ch, 'a', 'z')
> + || between (ch, '/', '9') || ch == '+' || ch == '-');
> }
>
>
> @@ -252,7 +253,7 @@ base64 (unsigned int i)
> indeed form a Low Surrogate. */ \
> uint32_t wc2 = wch & 0xffff; \
> \
> - if (! __builtin_expect (wc2 >= 0xdc00 && wc2 < 0xe000, 1)) \
> + if (! __glibc_likely (wc2 >= 0xdc00 && wc2 < 0xe000)) \
> { \
> STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1));\
> } \
> @@ -372,7 +373,8 @@ base64 (unsigned int i)
> /* deactivate base64 encoding */ \
> size_t count; \
> \
> - count = ((statep->__count & 0x18) >= 0x10) + isxbase64 (ch) + 1; \
> + count = ((statep->__count & 0x18) >= 0x10) \
> + + needs_explicit_shift (ch) + 1; \
> if (__glibc_unlikely (outptr + count > outend)) \
> { \
> result = __GCONV_FULL_OUTPUT; \
> @@ -381,7 +383,7 @@ base64 (unsigned int i)
> \
> if ((statep->__count & 0x18) >= 0x10) \
> *outptr++ = base64 ((statep->__count >> 3) & ~3); \
> - if (isxbase64 (ch)) \
> + if (needs_explicit_shift (ch)) \
> *outptr++ = '-'; \
> *outptr++ = (unsigned char) ch; \
> statep->__count = 0; \
On 21/03/2022 08:53, Adhemerval Zanella wrote:
>
>
> On 20/03/2022 13:41, Max Gautier via Libc-alpha wrote:
>> - Direct use of characters instead of arcane arrays
>> - isxbase64 is not the Modified BASE64 alphabet, but the characters who
>> needs to trigger an explicit shift back to US-ASCII. Make that clearer
>>
>> Signed-off-by: Max Gautier <mg@max.gautier.name>
>
>
> LGTM, thanks.
>
> Reviewed-by: Adhemerval Zanellla <adhemerval.zanella@linaro.org>
>
>> ---
>> iconvdata/utf-7.c | 64 ++++++++++++++++++++++++-----------------------
>> 1 file changed, 33 insertions(+), 31 deletions(-)
>>
>> diff --git a/iconvdata/utf-7.c b/iconvdata/utf-7.c
>> index 9ba0974959..15f3669ac8 100644
>> --- a/iconvdata/utf-7.c
>> +++ b/iconvdata/utf-7.c
>> @@ -30,20 +30,27 @@
>>
>>
>>
>> +static bool
>> +between (uint32_t const ch,
>> + uint32_t const lower_bound, uint32_t const upper_bound)
>> +{
>> + return (ch >= lower_bound && ch <= upper_bound);
>> +}
>> +
>> /* The set of "direct characters":
>> A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
>> */
>>
>> -static const unsigned char direct_tab[128 / 8] =
>> - {
>> - 0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87,
>> - 0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
>> - };
>> -
>> -static int
>> -isdirect (uint32_t ch)
>> +static bool
>> +isdirect (uint32_t ch, enum variant var)
>> {
In fact I am seeing this failure:
utf-7.c:45:29: error: ‘enum variant’ declared inside parameter list will not be visible outside of this definition o
r declaration [-Werror]
45 | isdirect (uint32_t ch, enum variant var)
| ^~~~~~~
Since 'enum variant' in only defined on next patch. Usually the best
practice is keep each patch consistent, so could you move the definition
on this patch?
Or I can fix it for you before installing, it is up to you.
>> - return (ch < 128 && ((direct_tab[ch >> 3] >> (ch & 7)) & 1));
>> + return (between (ch, 'A', 'Z')
>> + || between (ch, 'a', 'z')
>> + || between (ch, '0', '9')
>> + || ch == '\'' || ch == '(' || ch == ')'
>> + || between (ch, ',', '/')
>> + || ch == ':' || ch == '?'
>> + || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
>> }
>>
>>
>> @@ -52,33 +59,27 @@ isdirect (uint32_t ch)
>> ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
>> */
>>
>> -static const unsigned char xdirect_tab[128 / 8] =
>> - {
>> - 0x00, 0x26, 0x00, 0x00, 0xff, 0xf7, 0xff, 0xff,
>> - 0xff, 0xff, 0xff, 0xef, 0xff, 0xff, 0xff, 0x3f
>> - };
>> -
>> -static int
>> -isxdirect (uint32_t ch)
>> +static bool
>> +isxdirect (uint32_t ch, enum variant var)
>> {
>> - return (ch < 128 && ((xdirect_tab[ch >> 3] >> (ch & 7)) & 1));
>> + return (ch == '\t'
>> + || ch == '\n'
>> + || ch == '\r'
>> + || (between (ch, ' ', '}') && ch != '+' && ch != '\\'));
>> }
>>
>>
>> -/* The set of "extended base64 characters":
>> +/* Characters which needs to trigger an explicit shift back to US-ASCII (UTF-7
>> + only): Modified base64 + '-' (shift back character)
>> A-Z a-z 0-9 + / -
>> */
>>
>> -static const unsigned char xbase64_tab[128 / 8] =
>> - {
>> - 0x00, 0x00, 0x00, 0x00, 0x00, 0xa8, 0xff, 0x03,
>> - 0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
>> - };
>> -
>> -static int
>> -isxbase64 (uint32_t ch)
>> +static bool
>> +needs_explicit_shift (uint32_t ch)
>> {
>> - return (ch < 128 && ((xbase64_tab[ch >> 3] >> (ch & 7)) & 1));
>> + return (between (ch, 'A', 'Z')
>> + || between (ch, 'a', 'z')
>> + || between (ch, '/', '9') || ch == '+' || ch == '-');
>> }
>>
>>
>> @@ -252,7 +253,7 @@ base64 (unsigned int i)
>> indeed form a Low Surrogate. */ \
>> uint32_t wc2 = wch & 0xffff; \
>> \
>> - if (! __builtin_expect (wc2 >= 0xdc00 && wc2 < 0xe000, 1)) \
>> + if (! __glibc_likely (wc2 >= 0xdc00 && wc2 < 0xe000)) \
>> { \
>> STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1));\
>> } \
>> @@ -372,7 +373,8 @@ base64 (unsigned int i)
>> /* deactivate base64 encoding */ \
>> size_t count; \
>> \
>> - count = ((statep->__count & 0x18) >= 0x10) + isxbase64 (ch) + 1; \
>> + count = ((statep->__count & 0x18) >= 0x10) \
>> + + needs_explicit_shift (ch) + 1; \
>> if (__glibc_unlikely (outptr + count > outend)) \
>> { \
>> result = __GCONV_FULL_OUTPUT; \
>> @@ -381,7 +383,7 @@ base64 (unsigned int i)
>> \
>> if ((statep->__count & 0x18) >= 0x10) \
>> *outptr++ = base64 ((statep->__count >> 3) & ~3); \
>> - if (isxbase64 (ch)) \
>> + if (needs_explicit_shift (ch)) \
>> *outptr++ = '-'; \
>> *outptr++ = (unsigned char) ch; \
>> statep->__count = 0; \
On 21/03/2022 08:59, Adhemerval Zanella wrote:
>
>
> On 21/03/2022 08:53, Adhemerval Zanella wrote:
>>
>>
>> On 20/03/2022 13:41, Max Gautier via Libc-alpha wrote:
>>> - Direct use of characters instead of arcane arrays
>>> - isxbase64 is not the Modified BASE64 alphabet, but the characters who
>>> needs to trigger an explicit shift back to US-ASCII. Make that clearer
>>>
>>> Signed-off-by: Max Gautier <mg@max.gautier.name>
>>
>>
>> LGTM, thanks.
>>
>> Reviewed-by: Adhemerval Zanellla <adhemerval.zanella@linaro.org>
>>
>>> ---
>>> iconvdata/utf-7.c | 64 ++++++++++++++++++++++++-----------------------
>>> 1 file changed, 33 insertions(+), 31 deletions(-)
>>>
>>> diff --git a/iconvdata/utf-7.c b/iconvdata/utf-7.c
>>> index 9ba0974959..15f3669ac8 100644
>>> --- a/iconvdata/utf-7.c
>>> +++ b/iconvdata/utf-7.c
>>> @@ -30,20 +30,27 @@
>>>
>>>
>>>
>>> +static bool
>>> +between (uint32_t const ch,
>>> + uint32_t const lower_bound, uint32_t const upper_bound)
>>> +{
>>> + return (ch >= lower_bound && ch <= upper_bound);
>>> +}
>>> +
>>> /* The set of "direct characters":
>>> A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
>>> */
>>>
>>> -static const unsigned char direct_tab[128 / 8] =
>>> - {
>>> - 0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87,
>>> - 0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
>>> - };
>>> -
>>> -static int
>>> -isdirect (uint32_t ch)
>>> +static bool
>>> +isdirect (uint32_t ch, enum variant var)
>>> {
>
> In fact I am seeing this failure:
>
> utf-7.c:45:29: error: ‘enum variant’ declared inside parameter list will not be visible outside of this definition o
> r declaration [-Werror]
> 45 | isdirect (uint32_t ch, enum variant var)
> | ^~~~~~~
>
> Since 'enum variant' in only defined on next patch. Usually the best
> practice is keep each patch consistent, so could you move the definition
> on this patch?
>
> Or I can fix it for you before installing, it is up to you.
And it does not actually required the variant argument on neither, isdirect or
isxdirect. The obvious fix for this patch is:
diff --git a/iconvdata/utf-7.c b/iconvdata/utf-7.c
index 4a89de235a..815b1891c7 100644
--- a/iconvdata/utf-7.c
+++ b/iconvdata/utf-7.c
@@ -42,7 +42,7 @@ between (uint32_t const ch,
*/
static bool
-isdirect (uint32_t ch, enum variant var)
+isdirect (uint32_t ch)
{
return (between (ch, 'A', 'Z')
|| between (ch, 'a', 'z')
@@ -60,7 +60,7 @@ isdirect (uint32_t ch, enum variant var)
*/
static bool
-isxdirect (uint32_t ch, enum variant var)
+isxdirect (uint32_t ch)
{
return (ch == '\t'
|| ch == '\n'
On Mon, Mar 21, 2022 at 08:59:27AM -0300, Adhemerval Zanella wrote:
>
>
> On 21/03/2022 08:53, Adhemerval Zanella wrote:
> >
> >
> > On 20/03/2022 13:41, Max Gautier via Libc-alpha wrote:
> >> - Direct use of characters instead of arcane arrays
> >> - isxbase64 is not the Modified BASE64 alphabet, but the characters who
> >> needs to trigger an explicit shift back to US-ASCII. Make that clearer
> >>
> >> Signed-off-by: Max Gautier <mg@max.gautier.name>
> >
> >
> > LGTM, thanks.
> >
> > Reviewed-by: Adhemerval Zanellla <adhemerval.zanella@linaro.org>
> >
> >> ---
> >> iconvdata/utf-7.c | 64 ++++++++++++++++++++++++-----------------------
> >> 1 file changed, 33 insertions(+), 31 deletions(-)
> >>
> >> diff --git a/iconvdata/utf-7.c b/iconvdata/utf-7.c
> >> index 9ba0974959..15f3669ac8 100644
> >> --- a/iconvdata/utf-7.c
> >> +++ b/iconvdata/utf-7.c
> >> @@ -30,20 +30,27 @@
> >>
> >>
> >>
> >> +static bool
> >> +between (uint32_t const ch,
> >> + uint32_t const lower_bound, uint32_t const upper_bound)
> >> +{
> >> + return (ch >= lower_bound && ch <= upper_bound);
> >> +}
> >> +
> >> /* The set of "direct characters":
> >> A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
> >> */
> >>
> >> -static const unsigned char direct_tab[128 / 8] =
> >> - {
> >> - 0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87,
> >> - 0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
> >> - };
> >> -
> >> -static int
> >> -isdirect (uint32_t ch)
> >> +static bool
> >> +isdirect (uint32_t ch, enum variant var)
> >> {
>
> In fact I am seeing this failure:
>
> utf-7.c:45:29: error: ‘enum variant’ declared inside parameter list will not be visible outside of this definition o
> r declaration [-Werror]
> 45 | isdirect (uint32_t ch, enum variant var)
> | ^~~~~~~
>
> Since 'enum variant' in only defined on next patch. Usually the best
> practice is keep each patch consistent, so could you move the definition
> on this patch?
>
> Or I can fix it for you before installing, it is up to you.
>
I think I mixed up my patches while integrating the corrections and
style fixes you mentionned, sorry.
No problem for me I you fix it before applying.
Thanks !
@@ -30,20 +30,27 @@
+static bool
+between (uint32_t const ch,
+ uint32_t const lower_bound, uint32_t const upper_bound)
+{
+ return (ch >= lower_bound && ch <= upper_bound);
+}
+
/* The set of "direct characters":
A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
*/
-static const unsigned char direct_tab[128 / 8] =
- {
- 0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87,
- 0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
- };
-
-static int
-isdirect (uint32_t ch)
+static bool
+isdirect (uint32_t ch, enum variant var)
{
- return (ch < 128 && ((direct_tab[ch >> 3] >> (ch & 7)) & 1));
+ return (between (ch, 'A', 'Z')
+ || between (ch, 'a', 'z')
+ || between (ch, '0', '9')
+ || ch == '\'' || ch == '(' || ch == ')'
+ || between (ch, ',', '/')
+ || ch == ':' || ch == '?'
+ || ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r');
}
@@ -52,33 +59,27 @@ isdirect (uint32_t ch)
! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
*/
-static const unsigned char xdirect_tab[128 / 8] =
- {
- 0x00, 0x26, 0x00, 0x00, 0xff, 0xf7, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xef, 0xff, 0xff, 0xff, 0x3f
- };
-
-static int
-isxdirect (uint32_t ch)
+static bool
+isxdirect (uint32_t ch, enum variant var)
{
- return (ch < 128 && ((xdirect_tab[ch >> 3] >> (ch & 7)) & 1));
+ return (ch == '\t'
+ || ch == '\n'
+ || ch == '\r'
+ || (between (ch, ' ', '}') && ch != '+' && ch != '\\'));
}
-/* The set of "extended base64 characters":
+/* Characters which needs to trigger an explicit shift back to US-ASCII (UTF-7
+ only): Modified base64 + '-' (shift back character)
A-Z a-z 0-9 + / -
*/
-static const unsigned char xbase64_tab[128 / 8] =
- {
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xa8, 0xff, 0x03,
- 0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
- };
-
-static int
-isxbase64 (uint32_t ch)
+static bool
+needs_explicit_shift (uint32_t ch)
{
- return (ch < 128 && ((xbase64_tab[ch >> 3] >> (ch & 7)) & 1));
+ return (between (ch, 'A', 'Z')
+ || between (ch, 'a', 'z')
+ || between (ch, '/', '9') || ch == '+' || ch == '-');
}
@@ -252,7 +253,7 @@ base64 (unsigned int i)
indeed form a Low Surrogate. */ \
uint32_t wc2 = wch & 0xffff; \
\
- if (! __builtin_expect (wc2 >= 0xdc00 && wc2 < 0xe000, 1)) \
+ if (! __glibc_likely (wc2 >= 0xdc00 && wc2 < 0xe000)) \
{ \
STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1));\
} \
@@ -372,7 +373,8 @@ base64 (unsigned int i)
/* deactivate base64 encoding */ \
size_t count; \
\
- count = ((statep->__count & 0x18) >= 0x10) + isxbase64 (ch) + 1; \
+ count = ((statep->__count & 0x18) >= 0x10) \
+ + needs_explicit_shift (ch) + 1; \
if (__glibc_unlikely (outptr + count > outend)) \
{ \
result = __GCONV_FULL_OUTPUT; \
@@ -381,7 +383,7 @@ base64 (unsigned int i)
\
if ((statep->__count & 0x18) >= 0x10) \
*outptr++ = base64 ((statep->__count >> 3) & ~3); \
- if (isxbase64 (ch)) \
+ if (needs_explicit_shift (ch)) \
*outptr++ = '-'; \
*outptr++ = (unsigned char) ch; \
statep->__count = 0; \