[1/2] Show 4 byte UTF-8 as \Uxxxxxxxx Simplify this by using a single uint32 rather than four bytes.
Commit Message
---
binutils/doc/binutils.texi | 32 +++++++++++++++++---------------
binutils/nm.c | 9 +++++----
binutils/objdump.c | 9 +++++----
binutils/readelf.c | 10 +++++-----
binutils/strings.c | 2 +-
5 files changed, 33 insertions(+), 29 deletions(-)
Comments
Hello,
On Tue, 9 Sep 2025, Andrew C Aitchison wrote:
> diff --git a/binutils/readelf.c b/binutils/readelf.c
> index 8162cbb7003..2ed40f073fb 100644
> --- a/binutils/readelf.c
> +++ b/binutils/readelf.c
> @@ -847,11 +847,11 @@ print_symbol_name (signed int width, const char * symbol)
> case 4:
> if (width_remaining < 8)
> break;
This still checks for 8 characters fitting ...
> - printf ("\\u%02x%02x%02x",
> - ((bytes[0] & 0x07) << 2) | ((bytes[1] & 0x30) >> 4),
> - ((bytes[1] & 0x0f) << 4) | ((bytes[2] & 0x3c) >> 2),
> - ((bytes[2] & 0x03) << 6) | (bytes[3] & 0x3f));
> -
> + printf ("\\U%08x",
> + ((bytes[0] & 0x07) << 18) |
> + ((bytes[1] & 0x3f) << 12) |
> + ((bytes[2] & 0x3f) << 6) |
> + ((bytes[3] & 0x3f)) );
... while you now emit 10.
Ciao,
Michael.
On Tue, 9 Sep 2025, Michael Matz wrote:
> Hello,
>
> On Tue, 9 Sep 2025, Andrew C Aitchison wrote:
>
>> diff --git a/binutils/readelf.c b/binutils/readelf.c
>> index 8162cbb7003..2ed40f073fb 100644
>> --- a/binutils/readelf.c
>> +++ b/binutils/readelf.c
>> @@ -847,11 +847,11 @@ print_symbol_name (signed int width, const char * symbol)
>> case 4:
>> if (width_remaining < 8)
>> break;
>
> This still checks for 8 characters fitting ...
>
>> - printf ("\\u%02x%02x%02x",
>> - ((bytes[0] & 0x07) << 2) | ((bytes[1] & 0x30) >> 4),
>> - ((bytes[1] & 0x0f) << 4) | ((bytes[2] & 0x3c) >> 2),
>> - ((bytes[2] & 0x03) << 6) | (bytes[3] & 0x3f));
>> -
>> + printf ("\\U%08x",
>> + ((bytes[0] & 0x07) << 18) |
>> + ((bytes[1] & 0x3f) << 12) |
>> + ((bytes[2] & 0x3f) << 6) |
>> + ((bytes[3] & 0x3f)) );
>
> ... while you now emit 10.
Good catch.
Do I send a new full patch series or just a fix for this ?
On Tue, 9 Sep 2025, Andrew C Aitchison wrote:
> ---
> binutils/doc/binutils.texi | 32 +++++++++++++++++---------------
> binutils/nm.c | 9 +++++----
> binutils/objdump.c | 9 +++++----
> binutils/readelf.c | 10 +++++-----
> binutils/strings.c | 2 +-
> 5 files changed, 33 insertions(+), 29 deletions(-)
> diff --git a/binutils/objdump.c b/binutils/objdump.c
> index 34199e30ab8..7fc7b6fb86d 100644
> --- a/binutils/objdump.c
> +++ b/binutils/objdump.c
> @@ -658,10 +658,11 @@ display_utf8 (const unsigned char * in, char * out, unsigned int * consumed)
> break;
>
> case 4:
> - out += sprintf (out, "\\u%02x%02x%02x",
> - ((in[0] & 0x07) << 2) | ((in[1] & 0x30) >> 4),
> - ((in[1] & 0x0f) << 4) | ((in[2] & 0x3c) >> 2),
> - ((in[2] & 0x03) << 6) | ((in[3] & 0x3f)));
> + out += sprintf (out, "\\U%08x",
> + ((out[0] & 0x07) << 18) |
> + ((out[1] & 0x3f) << 12) |
> + ((out[2] & 0x3f) << 6) |
> + ((out[3] & 0x3f)) );
This is wrong.
We should be calculating with the values of in[] no out[].
I will cut a full new set of patches.
On Tue, Sep 09, 2025 at 08:56:10PM +0100, Andrew C Aitchison wrote:
> On Tue, 9 Sep 2025, Andrew C Aitchison wrote:
>
> > ---
> > binutils/doc/binutils.texi | 32 +++++++++++++++++---------------
> > binutils/nm.c | 9 +++++----
> > binutils/objdump.c | 9 +++++----
> > binutils/readelf.c | 10 +++++-----
> > binutils/strings.c | 2 +-
> > 5 files changed, 33 insertions(+), 29 deletions(-)
>
> > diff --git a/binutils/objdump.c b/binutils/objdump.c
> > index 34199e30ab8..7fc7b6fb86d 100644
> > --- a/binutils/objdump.c
> > +++ b/binutils/objdump.c
> > @@ -658,10 +658,11 @@ display_utf8 (const unsigned char * in, char * out, unsigned int * consumed)
> > break;
> >
> > case 4:
> > - out += sprintf (out, "\\u%02x%02x%02x",
> > - ((in[0] & 0x07) << 2) | ((in[1] & 0x30) >> 4),
> > - ((in[1] & 0x0f) << 4) | ((in[2] & 0x3c) >> 2),
> > - ((in[2] & 0x03) << 6) | ((in[3] & 0x3f)));
> > +
> > +
> > +
> > +
> > +
>
> This is wrong.
> We should be calculating with the values of in[] no out[].
>
> I will cut a full new set of patches.
Please fix the formatting when you do that. No trailing operators.
@@ -1213,10 +1213,11 @@ in the current locale, which may or may not support them. The options
hex byte sequences enclosed by either angle brackets or curly braces.
The @option{--unicode=escape} option displays them as escape sequences
-(@var{\uxxxx}) and the @option{--unicode=highlight} option displays
-them as escape sequences highlighted in red (if supported by the
-output device). The colouring is intended to draw attention to the
-presence of unicode sequences where they might not be expected.
+(@var{\uxxxx} and @var{\Uxxxxxxxx}) and the @option{--unicode=highlight}
+option displays them as escape sequences highlighted in red (if
+supported by the output device). The colouring is intended to draw
+attention to the presence of unicode sequences where they might not be
+expected.
@item -W
@itemx --no-weak
@@ -3079,10 +3080,11 @@ in the current locale, which may or may not support them. The options
hex byte sequences enclosed by either angle brackets or curly braces.
The @option{--unicode=escape} option displays them as escape sequences
-(@var{\uxxxx}) and the @option{--unicode=highlight} option displays
-them as escape sequences highlighted in red (if supported by the
-output device). The colouring is intended to draw attention to the
-presence of unicode sequences where they might not be expected.
+(@var{\uxxxx} and @var{\Uxxxxxxxx}) and the @option{--unicode=highlight}
+option displays them as escape sequences highlighted in red (if
+supported by the output device). The colouring is intended to draw
+attention to the presence of unicode sequences where they might not be
+expected.
@item -V
@itemx --version
@@ -3492,11 +3494,11 @@ The @option{--unicode=locale} option displays them in the current
locale, which may or may not support UTF-8 encoding. The
@option{--unicode=hex} option displays them as hex byte sequences
enclosed between @var{<>} characters. The @option{--unicode=escape}
-option displays them as escape sequences (@var{\uxxxx}) and the
-@option{--unicode=highlight} option displays them as escape sequences
-highlighted in red (if supported by the output device). The colouring
-is intended to draw attention to the presence of unicode sequences
-where they might not be expected.
+option displays them as escape sequences (@var{\uxxxx} and
+@var{\Uxxxxxxxx}) and the @option{--unicode=highlight} option displays
+them as escape sequences highlighted in red (if supported by the output
+device). The colouring is intended to draw attention to the presence of
+unicode sequences where they might not be expected.
@item -T @var{bfdname}
@itemx --target=@var{bfdname}
@@ -5231,8 +5233,8 @@ they are displayed as a sequence of hex bytes, encloses in curly
parethesis characters.
Using the @option{--unicode=escape} option will display the characters
-as as unicode escape sequences (@var{\uxxxx}). Using the
-@option{--unicode=hex} will display the characters as hex byte
+as as unicode escape sequences (@var{\uxxxx} and @var{\Uxxxxxxxx}).
+Using the @option{--unicode=hex} will display the characters as hex byte
sequences enclosed between angle brackets.
Using the @option{--unicode=highlight} will display the characters as
@@ -564,10 +564,11 @@ display_utf8 (const unsigned char * in, char * out, unsigned int * consumed)
break;
case 4:
- out += sprintf (out, "\\u%02x%02x%02x",
- ((in[0] & 0x07) << 2) | ((in[1] & 0x30) >> 4),
- ((in[1] & 0x0f) << 4) | ((in[2] & 0x3c) >> 2),
- ((in[2] & 0x03) << 6) | ((in[3] & 0x3f)));
+ out += sprintf (out, "\\U%08x",
+ ((in[0] & 0x07) << 18) |
+ ((in[1] & 0x3f) << 12) |
+ ((in[2] & 0x3f) << 6) |
+ ((in[3] & 0x3f)) );
break;
default:
/* URG. */
@@ -658,10 +658,11 @@ display_utf8 (const unsigned char * in, char * out, unsigned int * consumed)
break;
case 4:
- out += sprintf (out, "\\u%02x%02x%02x",
- ((in[0] & 0x07) << 2) | ((in[1] & 0x30) >> 4),
- ((in[1] & 0x0f) << 4) | ((in[2] & 0x3c) >> 2),
- ((in[2] & 0x03) << 6) | ((in[3] & 0x3f)));
+ out += sprintf (out, "\\U%08x",
+ ((out[0] & 0x07) << 18) |
+ ((out[1] & 0x3f) << 12) |
+ ((out[2] & 0x3f) << 6) |
+ ((out[3] & 0x3f)) );
break;
default:
/* URG. */
@@ -847,11 +847,11 @@ print_symbol_name (signed int width, const char * symbol)
case 4:
if (width_remaining < 8)
break;
- printf ("\\u%02x%02x%02x",
- ((bytes[0] & 0x07) << 2) | ((bytes[1] & 0x30) >> 4),
- ((bytes[1] & 0x0f) << 4) | ((bytes[2] & 0x3c) >> 2),
- ((bytes[2] & 0x03) << 6) | (bytes[3] & 0x3f));
-
+ printf ("\\U%08x",
+ ((bytes[0] & 0x07) << 18) |
+ ((bytes[1] & 0x3f) << 12) |
+ ((bytes[2] & 0x3f) << 6) |
+ ((bytes[3] & 0x3f)) );
break;
default:
/* URG. */
@@ -757,7 +757,7 @@ display_utf8_char (const unsigned char * buffer)
break;
case 4:
- printf ("\\u%06x",
+ printf ("\\U%08x",
((buffer[0] & 0x07) << 18) |
((buffer[1] & 0x3f) << 12) |
((buffer[2] & 0x3f) << 6) |