[03/11] gas: multi-byte warning adjustments

Message ID f6b9dc9f-db9d-4299-91b5-cde13702684b@suse.com
State Superseded
Headers
Series gas: scrubber adjustments |

Commit Message

Jan Beulich June 28, 2024, 1:18 p.m. UTC
  First, the scan_for_multibyte_characters() invocation in
input_scrub_next_buffer() was wrong, leading to input only being checked
from the last newline till the end of the current buffer. Correcting the
invocation, however, leads to duplicate checking
unless -f (or the #NO_APP equivalent thereof) is in effect. Move the
invocation to input_file_give_next_buffer(), to restrict it accordingly.

Then, when macros contain multi-byte characters, warning about them
again in every expansion isn't useful. Suppress such warnings from
sb_scrub_and_add_sb().
---
With this change at the latest, the 3-way as_warn() in
scan_for_multibyte_characters() becomes questionable: as_warn() is going
to prefix the warning with data obtained from as_where() anyway. This
means the 2nd one really is no more useful than the 1st one. Whereas the
3rd one now comes into play only for sufficiently large inputs, where
the line number would already be non-zero when fetching subsequent parts
of the input. Saying "at or near line" only then is bogus, though: When
the line number is still zero, we could as well say "at or near line 1".
Bumping the reported line number by 1 is going to get closer even in the
common case, as on average we're in the middle of a buffer rather than
at its start. Alternatively / additionally we'd also get things more
correct if we didn't uniformly bump the number, but said "at or after
line" instead: The report is never going to be for a past line.

Why is input re-scrubbed from input_scrub_include_sb() (via
sb_scrub_and_add_sb())? I can kind of see that there may be a need to
collapse successive whitespace, which may have formed in the course of
macro expansion. But I'm pretty sure comments which may have "formed"
should be left alone, for whatever meaning that may have. Other more
special handling may be equally inapplicable; the handling of double
quoted strings in particular may cause issues here as well, considering
especially that elsewhere in gas we have two different ways of escaping
double quotes in such strings.
  

Patch

--- a/gas/app.c
+++ b/gas/app.c
@@ -412,7 +412,8 @@  scan_for_multibyte_characters (const uns
    This is the way the old code used to work.  */
 
 size_t
-do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen)
+do_scrub_chars (size_t (*get) (char *, size_t), char *tostart, size_t tolen,
+		bool check_multibyte)
 {
   char *to = tostart;
   char *toend = tostart + tolen;
@@ -515,7 +516,7 @@  do_scrub_chars (size_t (*get) (char *, s
       from = input_buffer;
       fromend = from + fromlen;
 
-      if (multibyte_handling == multibyte_warn)
+      if (check_multibyte)
 	(void) scan_for_multibyte_characters ((const unsigned char *) from,
 					      (const unsigned char* ) fromend,
 					      true /* Generate warnings.  */);
--- a/gas/as.h
+++ b/gas/as.h
@@ -496,7 +496,7 @@  void   input_scrub_insert_line (const ch
 void   input_scrub_insert_file (char *);
 char * input_scrub_new_file (const char *);
 char * input_scrub_next_buffer (char **bufp);
-size_t do_scrub_chars (size_t (*get) (char *, size_t), char *, size_t);
+size_t do_scrub_chars (size_t (*get) (char *, size_t), char *, size_t, bool);
 size_t do_scrub_pending (void);
 bool   scan_for_multibyte_characters (const unsigned char *, const unsigned char *, bool);
 int    gen_to_words (LITTLENUM_TYPE *, int, long);
--- a/gas/input-file.c
+++ b/gas/input-file.c
@@ -240,9 +240,20 @@  input_file_give_next_buffer (char *where
      Since the assembler shouldn't do any output to stdout, we
      don't bother to synch output and input.  */
   if (preprocess)
-    size = do_scrub_chars (input_file_get, where, BUFFER_SIZE);
+    size = do_scrub_chars (input_file_get, where, BUFFER_SIZE,
+                           multibyte_handling == multibyte_warn);
   else
-    size = input_file_get (where, BUFFER_SIZE);
+    {
+      size = input_file_get (where, BUFFER_SIZE);
+
+      if (multibyte_handling == multibyte_warn)
+	{
+	  const unsigned char *start = (const unsigned char *) where;
+
+	  (void) scan_for_multibyte_characters (start, start + size,
+						true /* Generate warnings */);
+	}
+    }
 
   if (size)
     return_value = where + size;
--- a/gas/input-scrub.c
+++ b/gas/input-scrub.c
@@ -386,11 +386,6 @@  input_scrub_next_buffer (char **bufp)
 	  ++p;
 	}
 
-      if (multibyte_handling == multibyte_warn)
-	(void) scan_for_multibyte_characters ((const unsigned char *) p,
-					      (const unsigned char *) limit,
-					      true /* Generate warnings */);
-
       /* We found a newline in the newly read chars.  */
       partial_where = p;
       partial_size = limit - p;
--- a/gas/sb.c
+++ b/gas/sb.c
@@ -124,7 +124,7 @@  sb_scrub_and_add_sb (sb *ptr, sb *s)
 	break;
       sb_check (ptr, copy);
       ptr->len += do_scrub_chars (scrub_from_sb, ptr->ptr + ptr->len,
-				  ptr->max - ptr->len);
+				  ptr->max - ptr->len, false);
     }
 
   sb_to_scrub = 0;
--- a/gas/testsuite/gas/all/gas.exp
+++ b/gas/testsuite/gas/all/gas.exp
@@ -532,3 +532,5 @@  run_dump_test "pr27384"
 run_dump_test "pr27381"
 run_dump_test "multibyte1"
 run_dump_test "multibyte2"
+run_list_test "multibyte3" "--multibyte-handling=warn"
+run_list_test "multibyte3" "-f --multibyte-handling=warn"
--- /dev/null
+++ b/gas/testsuite/gas/all/multibyte3.l
@@ -0,0 +1,10 @@ 
+[^:]*: Assembler messages:
+[^:]*: Warning: multibyte character \(.*\) encountered in .*
+[^:]*: Warning: multibyte character \(.*\) encountered in .*
+[^:]*: Warning: multibyte character \(.*\) encountered in .*
+[^:]*: Warning: multibyte character \(.*\) encountered in .*
+[^:]*: Warning: multibyte character \(.*\) encountered in .*
+[^:]*: Warning: multibyte character \(.*\) encountered in .*
+[^:]*: Warning: multibyte character \(.*\) encountered in .*
+[^:]*: Warning: multibyte character \(.*\) encountered in .*
+[^:]*:[0-9]+: Warning: end of input
--- /dev/null
+++ b/gas/testsuite/gas/all/multibyte3.s
@@ -0,0 +1,11 @@ 
+	.macro m
+UmlautÜ\@:
+	.endm
+
+UmlautÄ:
+	.irpc c,szß
+UmlautÖ\@\c\():
+	m
+	.endr
+
+	.warning "end of input"