[v2,01/65] gas: consolidate whitespace recognition

Message ID b5e24935-d240-4fa6-a3bc-425c856e6263@suse.com
State New
Headers
Series gas: whitespace handling |

Commit Message

Jan Beulich Jan. 27, 2025, 3:26 p.m. UTC
  Let's extend lex_type[] to also cover whitespace, then having a simple
macro to uniformly recognize both blanks and tabs (and \r when it's not
EOL) as such.

In macro.c use sb_skip_white() as appropriate, instead of open-coding
it.
---
There's one place where \f is also checked for. I'm inclined to also
include that in the set, but for starters I didn't want to change
behavior in this regard.

Further, what about \v?

Is there any reason to retain PERMIT_WHITESPACE? It's always defined
in read.h, without any overrides. And even in compiler output some
"unnecessary" whitespace will often appear, simply for readability
reasons.
---
v2: Also replace ISSPACE(). Respect CR_EOL. Re-base.
  

Comments

Hans-Peter Nilsson Jan. 31, 2025, 6:02 a.m. UTC | #1
On Mon, 27 Jan 2025, Jan Beulich wrote:
> --- a/gas/read.c
> +++ b/gas/read.c
> @@ -112,9 +112,13 @@ die horribly;
>  
>  /* Used by is_... macros. our ctype[].  */
>  char lex_type[256] = {
> -  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* @ABCDEFGHIJKLMNO */
> +#ifndef CR_EOL
> +  0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 8, 0, 0,	/* @ABCDEFGHIJKLMNO */
> +#else
> +  0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,	/* @ABCDEFGHIJKLMNO */
> +#endif
>    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* PQRSTUVWXYZ[\]^_ */
> -  0, 0, 0, LEX_HASH, LEX_DOLLAR, LEX_PCT, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, /* _!"#$%&'()*+,-./ */
> +  8, 0, 0, LEX_HASH, LEX_DOLLAR, LEX_PCT, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, /* _!"#$%&'()*+,-./ */
>    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, LEX_QM,	/* 0123456789:;<=>? */
>    LEX_AT, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,	/* @ABCDEFGHIJKLMNO */
>    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, LEX_BR, 0, LEX_BR, 0, 3, /* PQRSTUVWXYZ[\]^_ */

Please use LEX_WHITE, not 8.  (Columns not lining up is not a 
valid excuse for naked literals, if that's the claimed reason.)

brgds, H-P
  
Jan Beulich Jan. 31, 2025, 8:36 a.m. UTC | #2
On 31.01.2025 07:02, Hans-Peter Nilsson wrote:
> On Mon, 27 Jan 2025, Jan Beulich wrote:
>> --- a/gas/read.c
>> +++ b/gas/read.c
>> @@ -112,9 +112,13 @@ die horribly;
>>  
>>  /* Used by is_... macros. our ctype[].  */
>>  char lex_type[256] = {
>> -  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* @ABCDEFGHIJKLMNO */
>> +#ifndef CR_EOL
>> +  0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 8, 0, 0,	/* @ABCDEFGHIJKLMNO */
>> +#else
>> +  0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,	/* @ABCDEFGHIJKLMNO */
>> +#endif
>>    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* PQRSTUVWXYZ[\]^_ */
>> -  0, 0, 0, LEX_HASH, LEX_DOLLAR, LEX_PCT, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, /* _!"#$%&'()*+,-./ */
>> +  8, 0, 0, LEX_HASH, LEX_DOLLAR, LEX_PCT, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, /* _!"#$%&'()*+,-./ */
>>    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, LEX_QM,	/* 0123456789:;<=>? */
>>    LEX_AT, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,	/* @ABCDEFGHIJKLMNO */
>>    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, LEX_BR, 0, LEX_BR, 0, 3, /* PQRSTUVWXYZ[\]^_ */
> 
> Please use LEX_WHITE, not 8.  (Columns not lining up is not a 
> valid excuse for naked literals, if that's the claimed reason.)

That's the presumed reason I derived, yes. The model appears to
be to use LEX_* here only when they're overridable by targets.
See the many 1 and 3 entries in particular. (I certainly agree
using such literal numbers isn't very nice. Yet when making
these changes - and there's another set to follow - it seemed
more reasonable to me to stay consistent.)

Jan
  

Patch

--- a/gas/cond.c
+++ b/gas/cond.c
@@ -250,7 +250,7 @@  get_mri_string (int terminator, int *len
 	     && ! is_end_of_line[(unsigned char) *input_line_pointer])
 	++input_line_pointer;
       s = input_line_pointer;
-      while (s > ret && (s[-1] == ' ' || s[-1] == '\t'))
+      while (s > ret && is_whitespace (s[-1]))
 	--s;
     }
 
--- a/gas/expr.c
+++ b/gas/expr.c
@@ -1436,7 +1436,7 @@  operand (expressionS *expressionP, enum
      created.  Doing it here saves lines of code.  */
   clean_up_expression (expressionP);
   SKIP_ALL_WHITESPACE ();		/* -> 1st char after operand.  */
-  know (*input_line_pointer != ' ');
+  know (!is_whitespace (*input_line_pointer));
 
   /* The PA port needs this information.  */
   if (expressionP->X_add_symbol)
@@ -1858,7 +1858,7 @@  expr (int rankarg,		/* Larger # is highe
   retval = operand (resultP, mode);
 
   /* operand () gobbles spaces.  */
-  know (*input_line_pointer != ' ');
+  know (!is_whitespace (*input_line_pointer));
 
   op_left = operatorf (&op_chars);
   while (op_left != O_illegal && op_rank[(int) op_left] > rank)
@@ -1880,7 +1880,7 @@  expr (int rankarg,		/* Larger # is highe
 	  right.X_op_symbol = NULL;
 	}
 
-      know (*input_line_pointer != ' ');
+      know (!is_whitespace (*input_line_pointer));
 
       if (op_left == O_index)
 	{
--- a/gas/listing.c
+++ b/gas/listing.c
@@ -1152,7 +1152,7 @@  debugging_pseudo (list_info_type *list A
   in_debug = false;
 #endif
 
-  while (ISSPACE (*line))
+  while (is_whitespace (*line))
     line++;
 
   if (*line != '.')
--- a/gas/macro.c
+++ b/gas/macro.c
@@ -29,10 +29,8 @@ 
 /* The routines in this file handle macro definition and expansion.
    They are called by gas.  */
 
-#define ISWHITE(x) ((x) == ' ' || (x) == '\t')
-
 #define ISSEP(x) \
- ((x) == ' ' || (x) == '\t' || (x) == ',' || (x) == '"' || (x) == ';' \
+ (is_whitespace (x) || (x) == ',' || (x) == '"' || (x) == ';' \
   || (x) == ')' || (x) == '(' \
   || ((flag_macro_alternate || flag_mri) && ((x) == '<' || (x) == '>')))
 
@@ -139,8 +137,7 @@  buffer_and_nest (const char *from, const
       if (! LABELS_WITHOUT_COLONS)
 	{
 	  /* Skip leading whitespace.  */
-	  while (i < ptr->len && ISWHITE (ptr->ptr[i]))
-	    i++;
+	  i = sb_skip_white (i, ptr);
 	}
 
       for (;;)
@@ -154,8 +151,7 @@  buffer_and_nest (const char *from, const
 	  if (i < ptr->len && is_name_ender (ptr->ptr[i]))
 	    i++;
 	  /* Skip whitespace.  */
-	  while (i < ptr->len && ISWHITE (ptr->ptr[i]))
-	    i++;
+	  i = sb_skip_white (i, ptr);
 	  /* Check for the colon.  */
 	  if (i >= ptr->len || ptr->ptr[i] != ':')
 	    {
@@ -174,8 +170,7 @@  buffer_and_nest (const char *from, const
 	}
 
       /* Skip trailing whitespace.  */
-      while (i < ptr->len && ISWHITE (ptr->ptr[i]))
-	i++;
+      i = sb_skip_white (i, ptr);
 
       if (i < ptr->len && (ptr->ptr[i] == '.'
 			   || NO_PSEUDO_DOT
@@ -424,9 +419,7 @@  get_any_string (size_t idx, sb *in, sb *
 
 	  *in_br = '\0';
 	  while (idx < in->len
-		 && (*in_br
-		     || (in->ptr[idx] != ' '
-			 && in->ptr[idx] != '\t'))
+		 && (*in_br || !is_whitespace (in->ptr[idx]))
 		 && in->ptr[idx] != ','
 		 && (in->ptr[idx] != '<'
 		     || (! flag_macro_alternate && ! flag_mri)))
@@ -916,7 +909,7 @@  macro_expand_body (sb *in, sb *out, form
 	  if (! macro
 	      || src + 5 >= in->len
 	      || strncasecmp (in->ptr + src, "LOCAL", 5) != 0
-	      || ! ISWHITE (in->ptr[src + 5])
+	      || ! is_whitespace (in->ptr[src + 5])
 	      /* PR 11507: Skip keyword LOCAL if it is found inside a quoted string.  */
 	      || inquote)
 	    {
@@ -1069,9 +1062,7 @@  macro_expand (size_t idx, sb *in, macro_
 	  /* The Microtec assembler ignores this if followed by a white space.
 	     (Macro invocation with empty extension) */
 	  idx++;
-	  if (    idx < in->len
-		  && in->ptr[idx] != ' '
-		  && in->ptr[idx] != '\t')
+	  if (idx < in->len && !is_whitespace (in->ptr[idx]))
 	    {
 	      formal_entry *n = new_formal ();
 
@@ -1192,7 +1183,7 @@  macro_expand (size_t idx, sb *in, macro_
 	{
 	  if (idx < in->len && in->ptr[idx] == ',')
 	    ++idx;
-	  if (idx < in->len && ISWHITE (in->ptr[idx]))
+	  if (idx < in->len && is_whitespace (in->ptr[idx]))
 	    break;
 	}
     }
--- a/gas/read.c
+++ b/gas/read.c
@@ -112,9 +112,13 @@  die horribly;
 
 /* Used by is_... macros. our ctype[].  */
 char lex_type[256] = {
-  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* @ABCDEFGHIJKLMNO */
+#ifndef CR_EOL
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 8, 0, 0,	/* @ABCDEFGHIJKLMNO */
+#else
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0,	/* @ABCDEFGHIJKLMNO */
+#endif
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* PQRSTUVWXYZ[\]^_ */
-  0, 0, 0, LEX_HASH, LEX_DOLLAR, LEX_PCT, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, /* _!"#$%&'()*+,-./ */
+  8, 0, 0, LEX_HASH, LEX_DOLLAR, LEX_PCT, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, /* _!"#$%&'()*+,-./ */
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, LEX_QM,	/* 0123456789:;<=>? */
   LEX_AT, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,	/* @ABCDEFGHIJKLMNO */
   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, LEX_BR, 0, LEX_BR, 0, 3, /* PQRSTUVWXYZ[\]^_ */
@@ -1068,11 +1072,11 @@  read_a_source_file (const char *name)
 
 			  if (*rest == ':')
 			    ++rest;
-			  if (*rest == ' ' || *rest == '\t')
+			  if (is_whitespace (*rest))
 			    ++rest;
 			  if ((strncasecmp (rest, "EQU", 3) == 0
 			       || strncasecmp (rest, "SET", 3) == 0)
-			      && (rest[3] == ' ' || rest[3] == '\t'))
+			      && is_whitespace (rest[3]))
 			    {
 			      input_line_pointer = rest + 3;
 			      equals (line_start,
@@ -1080,8 +1084,7 @@  read_a_source_file (const char *name)
 			      continue;
 			    }
 			  if (strncasecmp (rest, "MACRO", 5) == 0
-			      && (rest[5] == ' '
-				  || rest[5] == '\t'
+			      && (is_whitespace (rest[5])
 				  || is_end_of_line[(unsigned char) rest[5]]))
 			    mri_line_macro = 1;
 			}
@@ -1117,7 +1120,7 @@  read_a_source_file (const char *name)
 	     level.  */
 	  do
 	    nul_char = next_char = *input_line_pointer++;
-	  while (next_char == '\t' || next_char == ' ' || next_char == '\f');
+	  while (is_whitespace (next_char) || next_char == '\f');
 
 	  /* C is the 1st significant character.
 	     Input_line_pointer points after that character.  */
@@ -1146,12 +1149,12 @@  read_a_source_file (const char *name)
 		      if (*rest == ':')
 			++rest;
 
-		      if (*rest == ' ' || *rest == '\t')
+		      if (is_whitespace (*rest))
 			++rest;
 
 		      if ((strncasecmp (rest, "EQU", 3) == 0
 			   || strncasecmp (rest, "SET", 3) == 0)
-			  && (rest[3] == ' ' || rest[3] == '\t'))
+			  && is_whitespace (rest[3]))
 			{
 			  input_line_pointer = rest + 3;
 			  equals (s, 1);
@@ -1169,7 +1172,7 @@  read_a_source_file (const char *name)
 		  SKIP_WHITESPACE ();
 		}
 	      else if ((next_char == '=' && *rest == '=')
-		       || ((next_char == ' ' || next_char == '\t')
+		       || (is_whitespace (next_char)
 			   && rest[0] == '='
 			   && rest[1] == '='))
 		{
@@ -1177,7 +1180,7 @@  read_a_source_file (const char *name)
 		  demand_empty_rest_of_line ();
 		}
 	      else if ((next_char == '='
-		       || ((next_char == ' ' || next_char == '\t')
+		       || (is_whitespace (next_char)
 			    && *rest == '='))
 #ifdef TC_EQUAL_IN_INSN
 			   && !TC_EQUAL_IN_INSN (next_char, s)
@@ -1284,7 +1287,7 @@  read_a_source_file (const char *name)
 		      /* The following skip of whitespace is compulsory.
 			 A well shaped space is sometimes all that separates
 			 keyword from operands.  */
-		      if (next_char == ' ' || next_char == '\t')
+		      if (is_whitespace (next_char))
 			input_line_pointer++;
 
 		      /* Input_line is restored.
@@ -1501,7 +1504,7 @@  mri_comment_field (char *stopcp)
   know (flag_m68k_mri);
 
   for (s = input_line_pointer;
-       ((!is_end_of_line[(unsigned char) *s] && *s != ' ' && *s != '\t')
+       ((!is_end_of_line[(unsigned char) *s] && !is_whitespace (*s))
 	|| inquote);
        s++)
     {
@@ -6326,7 +6329,7 @@  equals (char *sym_name, int reassign)
   if (reassign < 0 && *input_line_pointer == '=')
     input_line_pointer++;
 
-  while (*input_line_pointer == ' ' || *input_line_pointer == '\t')
+  while (is_whitespace (*input_line_pointer))
     input_line_pointer++;
 
   if (flag_mri)
@@ -6500,8 +6503,7 @@  s_include (int arg ATTRIBUTE_UNUSED)
       SKIP_WHITESPACE ();
       i = 0;
       while (!is_end_of_line[(unsigned char) *input_line_pointer]
-	     && *input_line_pointer != ' '
-	     && *input_line_pointer != '\t')
+	     && !is_whitespace (*input_line_pointer))
 	{
 	  obstack_1grow (&notes, *input_line_pointer);
 	  ++input_line_pointer;
--- a/gas/read.h
+++ b/gas/read.h
@@ -29,17 +29,18 @@  extern bool input_from_string;
 
 #ifdef PERMIT_WHITESPACE
 #define SKIP_WHITESPACE()			\
-  ((*input_line_pointer == ' ') ? ++input_line_pointer : 0)
+  (is_whitespace (*input_line_pointer) ? ++input_line_pointer : 0)
 #define SKIP_ALL_WHITESPACE()			\
-  while (*input_line_pointer == ' ') ++input_line_pointer
+  while (is_whitespace (*input_line_pointer)) ++input_line_pointer
 #else
-#define SKIP_WHITESPACE() know (*input_line_pointer != ' ' )
+#define SKIP_WHITESPACE() know (!is_whitespace (*input_line_pointer))
 #define SKIP_ALL_WHITESPACE() SKIP_WHITESPACE()
 #endif
 
-#define	LEX_NAME	(1)	/* may continue a name */
+#define LEX_NAME	(1)	/* may continue a name */
 #define LEX_BEGIN_NAME	(2)	/* may begin a name */
 #define LEX_END_NAME	(4)	/* ends a name */
+#define LEX_WHITE	(8)	/* whitespace */
 
 #define is_name_beginner(c) \
   ( lex_type[(unsigned char) (c)] & LEX_BEGIN_NAME )
@@ -47,6 +48,8 @@  extern bool input_from_string;
   ( lex_type[(unsigned char) (c)] & LEX_NAME       )
 #define is_name_ender(c) \
   ( lex_type[(unsigned char) (c)] & LEX_END_NAME   )
+#define is_whitespace(c) \
+  ( lex_type[(unsigned char) (c)] & LEX_WHITE      )
 
 /* The distinction of "line" and "statement" sadly is blurred by unhelpful
    naming of e.g. the underlying array.  Most users really mean "end of
--- a/gas/sb.c
+++ b/gas/sb.c
@@ -215,9 +215,7 @@  sb_terminate (sb *in)
 size_t
 sb_skip_white (size_t idx, sb *ptr)
 {
-  while (idx < ptr->len
-	 && (ptr->ptr[idx] == ' '
-	     || ptr->ptr[idx] == '\t'))
+  while (idx < ptr->len && is_whitespace (ptr->ptr[idx]))
     idx++;
   return idx;
 }
@@ -229,18 +227,14 @@  sb_skip_white (size_t idx, sb *ptr)
 size_t
 sb_skip_comma (size_t idx, sb *ptr)
 {
-  while (idx < ptr->len
-	 && (ptr->ptr[idx] == ' '
-	     || ptr->ptr[idx] == '\t'))
+  while (idx < ptr->len && is_whitespace (ptr->ptr[idx]))
     idx++;
 
   if (idx < ptr->len
       && ptr->ptr[idx] == ',')
     idx++;
 
-  while (idx < ptr->len
-	 && (ptr->ptr[idx] == ' '
-	     || ptr->ptr[idx] == '\t'))
+  while (idx < ptr->len && is_whitespace (ptr->ptr[idx]))
     idx++;
 
   return idx;