[v2,01/65] gas: consolidate whitespace recognition
Commit Message
Let's extend lex_type[] to also cover whitespace, then having a simple
macro to uniformly recognize both blanks and tabs (and \r when it's not
EOL) as such.
In macro.c use sb_skip_white() as appropriate, instead of open-coding
it.
---
There's one place where \f is also checked for. I'm inclined to also
include that in the set, but for starters I didn't want to change
behavior in this regard.
Further, what about \v?
Is there any reason to retain PERMIT_WHITESPACE? It's always defined
in read.h, without any overrides. And even in compiler output some
"unnecessary" whitespace will often appear, simply for readability
reasons.
---
v2: Also replace ISSPACE(). Respect CR_EOL. Re-base.
Comments
On Mon, 27 Jan 2025, Jan Beulich wrote:
> --- a/gas/read.c
> +++ b/gas/read.c
> @@ -112,9 +112,13 @@ die horribly;
>
> /* Used by is_... macros. our ctype[]. */
> char lex_type[256] = {
> - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* @ABCDEFGHIJKLMNO */
> +#ifndef CR_EOL
> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 8, 0, 0, /* @ABCDEFGHIJKLMNO */
> +#else
> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, /* @ABCDEFGHIJKLMNO */
> +#endif
> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* PQRSTUVWXYZ[\]^_ */
> - 0, 0, 0, LEX_HASH, LEX_DOLLAR, LEX_PCT, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, /* _!"#$%&'()*+,-./ */
> + 8, 0, 0, LEX_HASH, LEX_DOLLAR, LEX_PCT, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, /* _!"#$%&'()*+,-./ */
> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, LEX_QM, /* 0123456789:;<=>? */
> LEX_AT, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* @ABCDEFGHIJKLMNO */
> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, LEX_BR, 0, LEX_BR, 0, 3, /* PQRSTUVWXYZ[\]^_ */
Please use LEX_WHITE, not 8. (Columns not lining up is not a
valid excuse for naked literals, if that's the claimed reason.)
brgds, H-P
On 31.01.2025 07:02, Hans-Peter Nilsson wrote:
> On Mon, 27 Jan 2025, Jan Beulich wrote:
>> --- a/gas/read.c
>> +++ b/gas/read.c
>> @@ -112,9 +112,13 @@ die horribly;
>>
>> /* Used by is_... macros. our ctype[]. */
>> char lex_type[256] = {
>> - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* @ABCDEFGHIJKLMNO */
>> +#ifndef CR_EOL
>> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 8, 0, 0, /* @ABCDEFGHIJKLMNO */
>> +#else
>> + 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, /* @ABCDEFGHIJKLMNO */
>> +#endif
>> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* PQRSTUVWXYZ[\]^_ */
>> - 0, 0, 0, LEX_HASH, LEX_DOLLAR, LEX_PCT, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, /* _!"#$%&'()*+,-./ */
>> + 8, 0, 0, LEX_HASH, LEX_DOLLAR, LEX_PCT, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, /* _!"#$%&'()*+,-./ */
>> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, LEX_QM, /* 0123456789:;<=>? */
>> LEX_AT, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* @ABCDEFGHIJKLMNO */
>> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, LEX_BR, 0, LEX_BR, 0, 3, /* PQRSTUVWXYZ[\]^_ */
>
> Please use LEX_WHITE, not 8. (Columns not lining up is not a
> valid excuse for naked literals, if that's the claimed reason.)
That's the presumed reason I derived, yes. The model appears to
be to use LEX_* here only when they're overridable by targets.
See the many 1 and 3 entries in particular. (I certainly agree
using such literal numbers isn't very nice. Yet when making
these changes - and there's another set to follow - it seemed
more reasonable to me to stay consistent.)
Jan
@@ -250,7 +250,7 @@ get_mri_string (int terminator, int *len
&& ! is_end_of_line[(unsigned char) *input_line_pointer])
++input_line_pointer;
s = input_line_pointer;
- while (s > ret && (s[-1] == ' ' || s[-1] == '\t'))
+ while (s > ret && is_whitespace (s[-1]))
--s;
}
@@ -1436,7 +1436,7 @@ operand (expressionS *expressionP, enum
created. Doing it here saves lines of code. */
clean_up_expression (expressionP);
SKIP_ALL_WHITESPACE (); /* -> 1st char after operand. */
- know (*input_line_pointer != ' ');
+ know (!is_whitespace (*input_line_pointer));
/* The PA port needs this information. */
if (expressionP->X_add_symbol)
@@ -1858,7 +1858,7 @@ expr (int rankarg, /* Larger # is highe
retval = operand (resultP, mode);
/* operand () gobbles spaces. */
- know (*input_line_pointer != ' ');
+ know (!is_whitespace (*input_line_pointer));
op_left = operatorf (&op_chars);
while (op_left != O_illegal && op_rank[(int) op_left] > rank)
@@ -1880,7 +1880,7 @@ expr (int rankarg, /* Larger # is highe
right.X_op_symbol = NULL;
}
- know (*input_line_pointer != ' ');
+ know (!is_whitespace (*input_line_pointer));
if (op_left == O_index)
{
@@ -1152,7 +1152,7 @@ debugging_pseudo (list_info_type *list A
in_debug = false;
#endif
- while (ISSPACE (*line))
+ while (is_whitespace (*line))
line++;
if (*line != '.')
@@ -29,10 +29,8 @@
/* The routines in this file handle macro definition and expansion.
They are called by gas. */
-#define ISWHITE(x) ((x) == ' ' || (x) == '\t')
-
#define ISSEP(x) \
- ((x) == ' ' || (x) == '\t' || (x) == ',' || (x) == '"' || (x) == ';' \
+ (is_whitespace (x) || (x) == ',' || (x) == '"' || (x) == ';' \
|| (x) == ')' || (x) == '(' \
|| ((flag_macro_alternate || flag_mri) && ((x) == '<' || (x) == '>')))
@@ -139,8 +137,7 @@ buffer_and_nest (const char *from, const
if (! LABELS_WITHOUT_COLONS)
{
/* Skip leading whitespace. */
- while (i < ptr->len && ISWHITE (ptr->ptr[i]))
- i++;
+ i = sb_skip_white (i, ptr);
}
for (;;)
@@ -154,8 +151,7 @@ buffer_and_nest (const char *from, const
if (i < ptr->len && is_name_ender (ptr->ptr[i]))
i++;
/* Skip whitespace. */
- while (i < ptr->len && ISWHITE (ptr->ptr[i]))
- i++;
+ i = sb_skip_white (i, ptr);
/* Check for the colon. */
if (i >= ptr->len || ptr->ptr[i] != ':')
{
@@ -174,8 +170,7 @@ buffer_and_nest (const char *from, const
}
/* Skip trailing whitespace. */
- while (i < ptr->len && ISWHITE (ptr->ptr[i]))
- i++;
+ i = sb_skip_white (i, ptr);
if (i < ptr->len && (ptr->ptr[i] == '.'
|| NO_PSEUDO_DOT
@@ -424,9 +419,7 @@ get_any_string (size_t idx, sb *in, sb *
*in_br = '\0';
while (idx < in->len
- && (*in_br
- || (in->ptr[idx] != ' '
- && in->ptr[idx] != '\t'))
+ && (*in_br || !is_whitespace (in->ptr[idx]))
&& in->ptr[idx] != ','
&& (in->ptr[idx] != '<'
|| (! flag_macro_alternate && ! flag_mri)))
@@ -916,7 +909,7 @@ macro_expand_body (sb *in, sb *out, form
if (! macro
|| src + 5 >= in->len
|| strncasecmp (in->ptr + src, "LOCAL", 5) != 0
- || ! ISWHITE (in->ptr[src + 5])
+ || ! is_whitespace (in->ptr[src + 5])
/* PR 11507: Skip keyword LOCAL if it is found inside a quoted string. */
|| inquote)
{
@@ -1069,9 +1062,7 @@ macro_expand (size_t idx, sb *in, macro_
/* The Microtec assembler ignores this if followed by a white space.
(Macro invocation with empty extension) */
idx++;
- if ( idx < in->len
- && in->ptr[idx] != ' '
- && in->ptr[idx] != '\t')
+ if (idx < in->len && !is_whitespace (in->ptr[idx]))
{
formal_entry *n = new_formal ();
@@ -1192,7 +1183,7 @@ macro_expand (size_t idx, sb *in, macro_
{
if (idx < in->len && in->ptr[idx] == ',')
++idx;
- if (idx < in->len && ISWHITE (in->ptr[idx]))
+ if (idx < in->len && is_whitespace (in->ptr[idx]))
break;
}
}
@@ -112,9 +112,13 @@ die horribly;
/* Used by is_... macros. our ctype[]. */
char lex_type[256] = {
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* @ABCDEFGHIJKLMNO */
+#ifndef CR_EOL
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 8, 0, 0, /* @ABCDEFGHIJKLMNO */
+#else
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, /* @ABCDEFGHIJKLMNO */
+#endif
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* PQRSTUVWXYZ[\]^_ */
- 0, 0, 0, LEX_HASH, LEX_DOLLAR, LEX_PCT, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, /* _!"#$%&'()*+,-./ */
+ 8, 0, 0, LEX_HASH, LEX_DOLLAR, LEX_PCT, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, /* _!"#$%&'()*+,-./ */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, LEX_QM, /* 0123456789:;<=>? */
LEX_AT, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* @ABCDEFGHIJKLMNO */
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, LEX_BR, 0, LEX_BR, 0, 3, /* PQRSTUVWXYZ[\]^_ */
@@ -1068,11 +1072,11 @@ read_a_source_file (const char *name)
if (*rest == ':')
++rest;
- if (*rest == ' ' || *rest == '\t')
+ if (is_whitespace (*rest))
++rest;
if ((strncasecmp (rest, "EQU", 3) == 0
|| strncasecmp (rest, "SET", 3) == 0)
- && (rest[3] == ' ' || rest[3] == '\t'))
+ && is_whitespace (rest[3]))
{
input_line_pointer = rest + 3;
equals (line_start,
@@ -1080,8 +1084,7 @@ read_a_source_file (const char *name)
continue;
}
if (strncasecmp (rest, "MACRO", 5) == 0
- && (rest[5] == ' '
- || rest[5] == '\t'
+ && (is_whitespace (rest[5])
|| is_end_of_line[(unsigned char) rest[5]]))
mri_line_macro = 1;
}
@@ -1117,7 +1120,7 @@ read_a_source_file (const char *name)
level. */
do
nul_char = next_char = *input_line_pointer++;
- while (next_char == '\t' || next_char == ' ' || next_char == '\f');
+ while (is_whitespace (next_char) || next_char == '\f');
/* C is the 1st significant character.
Input_line_pointer points after that character. */
@@ -1146,12 +1149,12 @@ read_a_source_file (const char *name)
if (*rest == ':')
++rest;
- if (*rest == ' ' || *rest == '\t')
+ if (is_whitespace (*rest))
++rest;
if ((strncasecmp (rest, "EQU", 3) == 0
|| strncasecmp (rest, "SET", 3) == 0)
- && (rest[3] == ' ' || rest[3] == '\t'))
+ && is_whitespace (rest[3]))
{
input_line_pointer = rest + 3;
equals (s, 1);
@@ -1169,7 +1172,7 @@ read_a_source_file (const char *name)
SKIP_WHITESPACE ();
}
else if ((next_char == '=' && *rest == '=')
- || ((next_char == ' ' || next_char == '\t')
+ || (is_whitespace (next_char)
&& rest[0] == '='
&& rest[1] == '='))
{
@@ -1177,7 +1180,7 @@ read_a_source_file (const char *name)
demand_empty_rest_of_line ();
}
else if ((next_char == '='
- || ((next_char == ' ' || next_char == '\t')
+ || (is_whitespace (next_char)
&& *rest == '='))
#ifdef TC_EQUAL_IN_INSN
&& !TC_EQUAL_IN_INSN (next_char, s)
@@ -1284,7 +1287,7 @@ read_a_source_file (const char *name)
/* The following skip of whitespace is compulsory.
A well shaped space is sometimes all that separates
keyword from operands. */
- if (next_char == ' ' || next_char == '\t')
+ if (is_whitespace (next_char))
input_line_pointer++;
/* Input_line is restored.
@@ -1501,7 +1504,7 @@ mri_comment_field (char *stopcp)
know (flag_m68k_mri);
for (s = input_line_pointer;
- ((!is_end_of_line[(unsigned char) *s] && *s != ' ' && *s != '\t')
+ ((!is_end_of_line[(unsigned char) *s] && !is_whitespace (*s))
|| inquote);
s++)
{
@@ -6326,7 +6329,7 @@ equals (char *sym_name, int reassign)
if (reassign < 0 && *input_line_pointer == '=')
input_line_pointer++;
- while (*input_line_pointer == ' ' || *input_line_pointer == '\t')
+ while (is_whitespace (*input_line_pointer))
input_line_pointer++;
if (flag_mri)
@@ -6500,8 +6503,7 @@ s_include (int arg ATTRIBUTE_UNUSED)
SKIP_WHITESPACE ();
i = 0;
while (!is_end_of_line[(unsigned char) *input_line_pointer]
- && *input_line_pointer != ' '
- && *input_line_pointer != '\t')
+ && !is_whitespace (*input_line_pointer))
{
obstack_1grow (¬es, *input_line_pointer);
++input_line_pointer;
@@ -29,17 +29,18 @@ extern bool input_from_string;
#ifdef PERMIT_WHITESPACE
#define SKIP_WHITESPACE() \
- ((*input_line_pointer == ' ') ? ++input_line_pointer : 0)
+ (is_whitespace (*input_line_pointer) ? ++input_line_pointer : 0)
#define SKIP_ALL_WHITESPACE() \
- while (*input_line_pointer == ' ') ++input_line_pointer
+ while (is_whitespace (*input_line_pointer)) ++input_line_pointer
#else
-#define SKIP_WHITESPACE() know (*input_line_pointer != ' ' )
+#define SKIP_WHITESPACE() know (!is_whitespace (*input_line_pointer))
#define SKIP_ALL_WHITESPACE() SKIP_WHITESPACE()
#endif
-#define LEX_NAME (1) /* may continue a name */
+#define LEX_NAME (1) /* may continue a name */
#define LEX_BEGIN_NAME (2) /* may begin a name */
#define LEX_END_NAME (4) /* ends a name */
+#define LEX_WHITE (8) /* whitespace */
#define is_name_beginner(c) \
( lex_type[(unsigned char) (c)] & LEX_BEGIN_NAME )
@@ -47,6 +48,8 @@ extern bool input_from_string;
( lex_type[(unsigned char) (c)] & LEX_NAME )
#define is_name_ender(c) \
( lex_type[(unsigned char) (c)] & LEX_END_NAME )
+#define is_whitespace(c) \
+ ( lex_type[(unsigned char) (c)] & LEX_WHITE )
/* The distinction of "line" and "statement" sadly is blurred by unhelpful
naming of e.g. the underlying array. Most users really mean "end of
@@ -215,9 +215,7 @@ sb_terminate (sb *in)
size_t
sb_skip_white (size_t idx, sb *ptr)
{
- while (idx < ptr->len
- && (ptr->ptr[idx] == ' '
- || ptr->ptr[idx] == '\t'))
+ while (idx < ptr->len && is_whitespace (ptr->ptr[idx]))
idx++;
return idx;
}
@@ -229,18 +227,14 @@ sb_skip_white (size_t idx, sb *ptr)
size_t
sb_skip_comma (size_t idx, sb *ptr)
{
- while (idx < ptr->len
- && (ptr->ptr[idx] == ' '
- || ptr->ptr[idx] == '\t'))
+ while (idx < ptr->len && is_whitespace (ptr->ptr[idx]))
idx++;
if (idx < ptr->len
&& ptr->ptr[idx] == ',')
idx++;
- while (idx < ptr->len
- && (ptr->ptr[idx] == ' '
- || ptr->ptr[idx] == '\t'))
+ while (idx < ptr->len && is_whitespace (ptr->ptr[idx]))
idx++;
return idx;