[1/2] Show 4 byte UTF-8 as \Uxxxxxxxx Simplify this by using a single uint32 rather than four bytes.

Message ID 20250909110440.3489774-2-werdna@aitchison.me.uk
State New
Headers
Series UTF-8 output correction and simplification |

Commit Message

Andrew C Aitchison Sept. 9, 2025, 11:04 a.m. UTC
  ---
 binutils/doc/binutils.texi | 32 +++++++++++++++++---------------
 binutils/nm.c              |  9 +++++----
 binutils/objdump.c         |  9 +++++----
 binutils/readelf.c         | 10 +++++-----
 binutils/strings.c         |  2 +-
 5 files changed, 33 insertions(+), 29 deletions(-)
  

Comments

Michael Matz Sept. 9, 2025, 1:54 p.m. UTC | #1
Hello,

On Tue, 9 Sep 2025, Andrew C Aitchison wrote:

> diff --git a/binutils/readelf.c b/binutils/readelf.c
> index 8162cbb7003..2ed40f073fb 100644
> --- a/binutils/readelf.c
> +++ b/binutils/readelf.c
> @@ -847,11 +847,11 @@ print_symbol_name (signed int width, const char * symbol)
>  		case 4:
>  		  if (width_remaining < 8)
>  		    break;

This still checks for 8 characters fitting ...

> -		  printf ("\\u%02x%02x%02x",
> -			  ((bytes[0] & 0x07) << 2) | ((bytes[1] & 0x30) >> 4),
> -			  ((bytes[1] & 0x0f) << 4) | ((bytes[2] & 0x3c) >> 2),
> -			  ((bytes[2] & 0x03) << 6) | (bytes[3] & 0x3f));
> -
> +		  printf ("\\U%08x",
> +			  ((bytes[0] & 0x07) << 18) |
> +			  ((bytes[1] & 0x3f) << 12) |
> +			  ((bytes[2] & 0x3f) << 6) |
> +			  ((bytes[3] & 0x3f)) );

... while you now emit 10.


Ciao,
Michael.
  
Andrew C Aitchison Sept. 9, 2025, 5:22 p.m. UTC | #2
On Tue, 9 Sep 2025, Michael Matz wrote:

> Hello,
>
> On Tue, 9 Sep 2025, Andrew C Aitchison wrote:
>
>> diff --git a/binutils/readelf.c b/binutils/readelf.c
>> index 8162cbb7003..2ed40f073fb 100644
>> --- a/binutils/readelf.c
>> +++ b/binutils/readelf.c
>> @@ -847,11 +847,11 @@ print_symbol_name (signed int width, const char * symbol)
>>  		case 4:
>>  		  if (width_remaining < 8)
>>  		    break;
>
> This still checks for 8 characters fitting ...
>
>> -		  printf ("\\u%02x%02x%02x",
>> -			  ((bytes[0] & 0x07) << 2) | ((bytes[1] & 0x30) >> 4),
>> -			  ((bytes[1] & 0x0f) << 4) | ((bytes[2] & 0x3c) >> 2),
>> -			  ((bytes[2] & 0x03) << 6) | (bytes[3] & 0x3f));
>> -
>> +		  printf ("\\U%08x",
>> +			  ((bytes[0] & 0x07) << 18) |
>> +			  ((bytes[1] & 0x3f) << 12) |
>> +			  ((bytes[2] & 0x3f) << 6) |
>> +			  ((bytes[3] & 0x3f)) );
>
> ... while you now emit 10.

Good catch.

Do I send a new full patch series or just a fix for this ?
  
Andrew C Aitchison Sept. 9, 2025, 7:56 p.m. UTC | #3
On Tue, 9 Sep 2025, Andrew C Aitchison wrote:

> ---
> binutils/doc/binutils.texi | 32 +++++++++++++++++---------------
> binutils/nm.c              |  9 +++++----
> binutils/objdump.c         |  9 +++++----
> binutils/readelf.c         | 10 +++++-----
> binutils/strings.c         |  2 +-
> 5 files changed, 33 insertions(+), 29 deletions(-)

> diff --git a/binutils/objdump.c b/binutils/objdump.c
> index 34199e30ab8..7fc7b6fb86d 100644
> --- a/binutils/objdump.c
> +++ b/binutils/objdump.c
> @@ -658,10 +658,11 @@ display_utf8 (const unsigned char * in, char * out, unsigned int * consumed)
> 	  break;
>
> 	case 4:
> -	  out += sprintf (out, "\\u%02x%02x%02x",
> -		  ((in[0] & 0x07) << 2) | ((in[1] & 0x30) >> 4),
> -		  ((in[1] & 0x0f) << 4) | ((in[2] & 0x3c) >> 2),
> -		  ((in[2] & 0x03) << 6) | ((in[3] & 0x3f)));
> +	  out += sprintf (out, "\\U%08x",
> +		  ((out[0] & 0x07) << 18) |
> +		  ((out[1] & 0x3f) << 12) |
> +		  ((out[2] & 0x3f) << 6) |
> +		  ((out[3] & 0x3f)) );

This is wrong.
We should be calculating with the values of in[] no out[].

I will cut a full new set of patches.
  
Alan Modra Sept. 9, 2025, 9:48 p.m. UTC | #4
On Tue, Sep 09, 2025 at 08:56:10PM +0100, Andrew C Aitchison wrote:
> On Tue, 9 Sep 2025, Andrew C Aitchison wrote:
> 
> > ---
> > binutils/doc/binutils.texi | 32 +++++++++++++++++---------------
> > binutils/nm.c              |  9 +++++----
> > binutils/objdump.c         |  9 +++++----
> > binutils/readelf.c         | 10 +++++-----
> > binutils/strings.c         |  2 +-
> > 5 files changed, 33 insertions(+), 29 deletions(-)
> 
> > diff --git a/binutils/objdump.c b/binutils/objdump.c
> > index 34199e30ab8..7fc7b6fb86d 100644
> > --- a/binutils/objdump.c
> > +++ b/binutils/objdump.c
> > @@ -658,10 +658,11 @@ display_utf8 (const unsigned char * in, char * out, unsigned int * consumed)
> > 	  break;
> > 
> > 	case 4:
> > -	  out += sprintf (out, "\\u%02x%02x%02x",
> > -		  ((in[0] & 0x07) << 2) | ((in[1] & 0x30) >> 4),
> > -		  ((in[1] & 0x0f) << 4) | ((in[2] & 0x3c) >> 2),
> > -		  ((in[2] & 0x03) << 6) | ((in[3] & 0x3f)));
> > +
> > +
> > +
> > +
> > +
> 
> This is wrong.
> We should be calculating with the values of in[] no out[].
> 
> I will cut a full new set of patches.

Please fix the formatting when you do that.  No trailing operators.
  

Patch

diff --git a/binutils/doc/binutils.texi b/binutils/doc/binutils.texi
index 89425b8a15b..60a448a36d0 100644
--- a/binutils/doc/binutils.texi
+++ b/binutils/doc/binutils.texi
@@ -1213,10 +1213,11 @@  in the current locale, which may or may not support them.  The options
 hex byte sequences enclosed by either angle brackets or curly braces.
 
 The @option{--unicode=escape} option displays them as escape sequences
-(@var{\uxxxx}) and the @option{--unicode=highlight} option displays
-them as escape sequences highlighted in red (if supported by the
-output device).  The colouring is intended to draw attention to the
-presence of unicode sequences where they might not be expected.
+(@var{\uxxxx} and @var{\Uxxxxxxxx}) and the @option{--unicode=highlight}
+option displays them as escape sequences highlighted in red (if
+supported by the output device).  The colouring is intended to draw
+attention to the presence of unicode sequences where they might not be
+expected.
 
 @item -W
 @itemx --no-weak
@@ -3079,10 +3080,11 @@  in the current locale, which may or may not support them.  The options
 hex byte sequences enclosed by either angle brackets or curly braces.
 
 The @option{--unicode=escape} option displays them as escape sequences
-(@var{\uxxxx}) and the @option{--unicode=highlight} option displays
-them as escape sequences highlighted in red (if supported by the
-output device).  The colouring is intended to draw attention to the
-presence of unicode sequences where they might not be expected.
+(@var{\uxxxx} and @var{\Uxxxxxxxx}) and the @option{--unicode=highlight}
+option displays them as escape sequences highlighted in red (if
+supported by the output device).  The colouring is intended to draw
+attention to the presence of unicode sequences where they might not be
+expected.
 
 @item -V
 @itemx --version
@@ -3492,11 +3494,11 @@  The @option{--unicode=locale} option displays them in the current
 locale, which may or may not support UTF-8 encoding.  The
 @option{--unicode=hex} option displays them as hex byte sequences
 enclosed between @var{<>} characters.  The @option{--unicode=escape}
-option displays them as escape sequences (@var{\uxxxx}) and the
-@option{--unicode=highlight} option displays them as escape sequences
-highlighted in red (if supported by the output device).  The colouring
-is intended to draw attention to the presence of unicode sequences
-where they might not be expected.
+option displays them as escape sequences (@var{\uxxxx} and
+@var{\Uxxxxxxxx}) and the @option{--unicode=highlight} option displays
+them as escape sequences highlighted in red (if supported by the output
+device).  The colouring is intended to draw attention to the presence of
+unicode sequences where they might not be expected.
 
 @item -T @var{bfdname}
 @itemx --target=@var{bfdname}
@@ -5231,8 +5233,8 @@  they are displayed as a sequence of hex bytes, encloses in curly
 parethesis characters.
 
 Using the @option{--unicode=escape} option will display the characters
-as as unicode escape sequences (@var{\uxxxx}).  Using the
-@option{--unicode=hex} will display the characters as hex byte
+as as unicode escape sequences (@var{\uxxxx} and @var{\Uxxxxxxxx}).
+Using the @option{--unicode=hex} will display the characters as hex byte
 sequences enclosed between angle brackets.
 
 Using the @option{--unicode=highlight} will display the characters as 
diff --git a/binutils/nm.c b/binutils/nm.c
index 6aa5143d6f2..831f348127b 100644
--- a/binutils/nm.c
+++ b/binutils/nm.c
@@ -564,10 +564,11 @@  display_utf8 (const unsigned char * in, char * out, unsigned int * consumed)
 	  break;
 
 	case 4:
-	  out += sprintf (out, "\\u%02x%02x%02x",
-		  ((in[0] & 0x07) << 2) | ((in[1] & 0x30) >> 4),
-		  ((in[1] & 0x0f) << 4) | ((in[2] & 0x3c) >> 2),
-		  ((in[2] & 0x03) << 6) | ((in[3] & 0x3f)));
+	  out += sprintf (out, "\\U%08x",
+		  ((in[0] & 0x07) << 18) |
+		  ((in[1] & 0x3f) << 12) |
+		  ((in[2] & 0x3f) << 6) |
+		  ((in[3] & 0x3f)) );
 	  break;
 	default:
 	  /* URG.  */
diff --git a/binutils/objdump.c b/binutils/objdump.c
index 34199e30ab8..7fc7b6fb86d 100644
--- a/binutils/objdump.c
+++ b/binutils/objdump.c
@@ -658,10 +658,11 @@  display_utf8 (const unsigned char * in, char * out, unsigned int * consumed)
 	  break;
 
 	case 4:
-	  out += sprintf (out, "\\u%02x%02x%02x",
-		  ((in[0] & 0x07) << 2) | ((in[1] & 0x30) >> 4),
-		  ((in[1] & 0x0f) << 4) | ((in[2] & 0x3c) >> 2),
-		  ((in[2] & 0x03) << 6) | ((in[3] & 0x3f)));
+	  out += sprintf (out, "\\U%08x",
+		  ((out[0] & 0x07) << 18) |
+		  ((out[1] & 0x3f) << 12) |
+		  ((out[2] & 0x3f) << 6) |
+		  ((out[3] & 0x3f)) );
 	  break;
 	default:
 	  /* URG.  */
diff --git a/binutils/readelf.c b/binutils/readelf.c
index 8162cbb7003..2ed40f073fb 100644
--- a/binutils/readelf.c
+++ b/binutils/readelf.c
@@ -847,11 +847,11 @@  print_symbol_name (signed int width, const char * symbol)
 		case 4:
 		  if (width_remaining < 8)
 		    break;
-		  printf ("\\u%02x%02x%02x",
-			  ((bytes[0] & 0x07) << 2) | ((bytes[1] & 0x30) >> 4),
-			  ((bytes[1] & 0x0f) << 4) | ((bytes[2] & 0x3c) >> 2),
-			  ((bytes[2] & 0x03) << 6) | (bytes[3] & 0x3f));
-
+		  printf ("\\U%08x",
+			  ((bytes[0] & 0x07) << 18) |
+			  ((bytes[1] & 0x3f) << 12) |
+			  ((bytes[2] & 0x3f) << 6) |
+			  ((bytes[3] & 0x3f)) );
 		  break;
 		default:
 		  /* URG.  */
diff --git a/binutils/strings.c b/binutils/strings.c
index 8132b2ec1ae..4bf8b19d6e0 100644
--- a/binutils/strings.c
+++ b/binutils/strings.c
@@ -757,7 +757,7 @@  display_utf8_char (const unsigned char * buffer)
 	  break;
 
 	case 4:
-	  printf ("\\u%06x",
+	  printf ("\\U%08x",
 		  ((buffer[0] & 0x07) << 18) |
 		  ((buffer[1] & 0x3f) << 12) |
 		  ((buffer[2] & 0x3f) << 6) |