AArch64: Cleanup alignment macros

Message ID PAWPR08MB8982939F83368DC921ACC09C83362@PAWPR08MB8982.eurprd08.prod.outlook.com
State New
Series AArch64: Cleanup alignment macros

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm fail Patch failed to apply
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 fail Patch failed to apply

Commit Message

Wilco Dijkstra Dec. 3, 2024, 2:08 p.m. UTC
Change the AARCH64_EXPAND_ALIGNMENT macro into proper function calls to make
future changes easier.  Use the existing alignment settings, but avoid
overaligning small arrays or structs to 64 bits when there is no benefit.
This gives a small reduction in data and stack size.

Passes regress & bootstrap, OK for commit?

gcc/ChangeLog:

	* config/aarch64/aarch64.h (AARCH64_EXPAND_ALIGNMENT): Remove.
	(DATA_ALIGNMENT): Use aarch64_data_alignment.
	(LOCAL_ALIGNMENT): Use aarch64_stack_alignment.
	* config/aarch64/aarch64.cc (aarch64_data_alignment): New function.
	(aarch64_stack_alignment): Likewise.
	* config/aarch64/aarch64-protos.h (aarch64_data_alignment): New prototype.
	(aarch64_stack_alignment): Likewise.

---
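
For illustration, a minimal sketch (not part of the patch) of what the new
rules compute for a few global definitions at -O2 (alignments in bits; the
old AARCH64_EXPAND_ALIGNMENT macro raised every aggregate with smaller
alignment to 64):

	char a[3];      /* old: 64, new: 32 (size <= 32 bits)  */
	char b[4];      /* old: 64, new: 32                    */
	char c[6];      /* old: 64, new: 64 (size > 32 bits)   */
	char big[100];  /* old: 64, new: 64                    */
	short s;        /* scalar, not an aggregate: unchanged */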
  

Comments

Richard Sandiford Dec. 6, 2024, 4:57 p.m. UTC | #1
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Change the AARCH64_EXPAND_ALIGNMENT macro into proper function calls to make
> future changes easier.  Use the existing alignment settings, but avoid
> overaligning small arrays or structs to 64 bits when there is no benefit.
> This gives a small reduction in data and stack size.

So just to be sure I understand: we still want to align (say) an array
of 4 chars to 32 bits so that the LDR & STR are aligned, and an array of
3 chars to 32 bits so that the LDRH & STRH for the leading two bytes are
aligned?  Is that right?  We don't seem to take advantage of the padding
and do an LDR & STR for the 3-byte case, either for globals or on the stack.

If so, what's the advantage of aligning (say) a 6-byte array to 64 bits
rather than 32 bits, given that we don't use a 64-bit LDR & STR?
Could we save more with size < 64 instead of size <= 32?

Thanks,
Richard

> Passes regress & bootstrap, OK for commit?
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64.h (AARCH64_EXPAND_ALIGNMENT): Remove.
>         (DATA_ALIGNMENT): Use aarch64_data_alignment.
>         (LOCAL_ALIGNMENT): Use aarch64_stack_alignment.
>         * config/aarch64/aarch64.cc (aarch64_data_alignment): New function.
>         (aarch64_stack_alignment): Likewise.
>         * config/aarch64/aarch64-protos.h (aarch64_data_alignment): New prototype.
>         (aarch64_stack_alignment): Likewise.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 6da81556110c978a9de6f6fad5775c9d77771b10..4133a47693b24abca071a7f77fcdbb91d3dc261a 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -1207,4 +1207,7 @@ extern void aarch64_adjust_reg_alloc_order ();
>  bool aarch64_optimize_mode_switching (aarch64_mode_entity);
>  void aarch64_restore_za (rtx);
>
> +extern unsigned aarch64_data_alignment (const_tree exp, unsigned align);
> +extern unsigned aarch64_stack_alignment (const_tree exp, unsigned align);
> +
>  #endif /* GCC_AARCH64_PROTOS_H */
> diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
> index f07b2c49f0d9abd3309afb98499ab7eebcff05bd..64f55f6f94e37bffa6b1e7403274ec5f5d906095 100644
> --- a/gcc/config/aarch64/aarch64.h
> +++ b/gcc/config/aarch64/aarch64.h
> @@ -121,24 +121,11 @@
>     of LSE instructions.  */
>  #define TARGET_OUTLINE_ATOMICS (aarch64_flag_outline_atomics)
>
> -/* Align definitions of arrays, unions and structures so that
> -   initializations and copies can be made more efficient.  This is not
> -   ABI-changing, so it only affects places where we can see the
> -   definition.  Increasing the alignment tends to introduce padding,
> -   so don't do this when optimizing for size/conserving stack space.  */
> -#define AARCH64_EXPAND_ALIGNMENT(COND, EXP, ALIGN)                     \
> -  (((COND) && ((ALIGN) < BITS_PER_WORD)                                        \
> -    && (TREE_CODE (EXP) == ARRAY_TYPE                                  \
> -       || TREE_CODE (EXP) == UNION_TYPE                                \
> -       || TREE_CODE (EXP) == RECORD_TYPE)) ? BITS_PER_WORD : (ALIGN))
> -
> -/* Align global data.  */
> -#define DATA_ALIGNMENT(EXP, ALIGN)                     \
> -  AARCH64_EXPAND_ALIGNMENT (!optimize_size, EXP, ALIGN)
> -
> -/* Similarly, make sure that objects on the stack are sensibly aligned.  */
> -#define LOCAL_ALIGNMENT(EXP, ALIGN)                            \
> -  AARCH64_EXPAND_ALIGNMENT (!flag_conserve_stack, EXP, ALIGN)
> +/* Align global data as an optimization.  */
> +#define DATA_ALIGNMENT(EXP, ALIGN) aarch64_data_alignment (EXP, ALIGN)
> +
> +/* Align stack data as an optimization.  */
> +#define LOCAL_ALIGNMENT(EXP, ALIGN) aarch64_stack_alignment (EXP, ALIGN)
>
>  #define STRUCTURE_SIZE_BOUNDARY                8
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index c78845fc27e6d6a8a1631b487b19fb3143a231ac..5369129d4a405afe5a760081149da1347e7b8842 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -2651,6 +2651,60 @@ aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
>    return align;
>  }
>
> +/* Align definitions of arrays, unions and structures so that
> +   initializations and copies can be made more efficient.  This is not
> +   ABI-changing, so it only affects places where we can see the
> +   definition.  Increasing the alignment tends to introduce padding,
> +   so don't do this when optimizing for size/conserving stack space.  */
> +
> +unsigned
> +aarch64_data_alignment (const_tree type, unsigned align)
> +{
> +  if (optimize_size)
> +    return align;
> +
> +  if (AGGREGATE_TYPE_P (type))
> +    {
> +      unsigned HOST_WIDE_INT size = 0;
> +
> +      if (TYPE_SIZE (type) && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
> +         && tree_fits_uhwi_p (TYPE_SIZE (type)))
> +       size = tree_to_uhwi (TYPE_SIZE (type));
> +
> +      /* Align small structs/arrays to 32 bits, or 64 bits if larger.  */
> +      if (align < 32 && size <= 32)
> +       align = 32;
> +      else if (align < 64)
> +       align = 64;
> +    }
> +
> +  return align;
> +}
> +
> +unsigned
> +aarch64_stack_alignment (const_tree type, unsigned align)
> +{
> +  if (flag_conserve_stack)
> +    return align;
> +
> +  if (AGGREGATE_TYPE_P (type))
> +    {
> +      unsigned HOST_WIDE_INT size = 0;
> +
> +      if (TYPE_SIZE (type) && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
> +         && tree_fits_uhwi_p (TYPE_SIZE (type)))
> +       size = tree_to_uhwi (TYPE_SIZE (type));
> +
> +      /* Align small structs/arrays to 32 bits, or 64 bits if larger.  */
> +      if (align < 32 && size <= 32)
> +       align = 32;
> +      else if (align < 64)
> +       align = 64;
> +    }
> +
> +  return align;
> +}
> +
>  /* Return true if calls to DECL should be treated as
>     long-calls (ie called via a register).  */
>  static bool
  
Wilco Dijkstra Dec. 6, 2024, 5:43 p.m. UTC | #2
Hi Richard,

> So just to be sure I understand: we still want to align (say) an array
> of 4 chars to 32 bits so that the LDR & STR are aligned, and an array of
> 3 chars to 32 bits so that the LDRH & STRH for the leading two bytes are
> aligned?  Is that right?  We don't seem to take advantage of the padding
> and do an LDR & STR for the 3-byte case, either for globals or on the stack.

Taking advantage of padding is possible within the compilation unit for
data that is defined locally (and not interposable), and always with LTO.
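
As a hypothetical illustration: the 3-byte array below is padded to 4 bytes
by the 32-bit alignment, so for a non-interposable definition like this a
compiler could legally copy or initialize it with a single 4-byte LDR/STR:

	static char tag[3] = { 'a', 'b', 'c' };	/* 32-bit aligned, 1 byte of padding */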

> If so, what's the advantage of aligning (say) a 6-byte array to 64 bits
> rather than 32 bits, given that we don't use a 64-bit LDR & STR?
> Could we save more with size < 64 instead of size <= 32?

A common case is a constant string which is compared against some
argument. Most string functions work on 8 or 16-byte quantities. If we
ensure the whole array fits in one aligned load, we save time in the
string function.

Runtime data collected for strlen calls shows that 97+% have 8-byte alignment
or higher; this kind of overalignment helps achieve that.

There are likely some further tweaks we could do in the future: 1- and 2-byte
objects are unlikely to benefit even from 4-byte alignment. And large
objects may benefit from higher alignment (allowing 16-byte aligned
LDP for loading values or faster memcpy of whole structs).

Cheers,
Wilco
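
As an illustrative sketch of the constant-string case (names hypothetical):
under the new rules the 8-byte array below gets 64-bit alignment, so it can
never straddle a 16-byte boundary, and the first aligned 8/16-byte load of a
string routine covers it entirely:

	#include <string.h>

	static const char kw[] = "enable!";	/* 8 bytes including the NUL */

	int matches (const char *arg)
	{
	  return strcmp (arg, kw) == 0;
	}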
  
Richard Sandiford Dec. 6, 2024, 5:50 p.m. UTC | #3
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Hi Richard,
>
>> So just to be sure I understand: we still want to align (say) an array
>> of 4 chars to 32 bits so that the LDR & STR are aligned, and an array of
>> 3 chars to 32 bits so that the LDRH & STRH for the leading two bytes are
>> aligned?  Is that right?  We don't seem to take advantage of the padding
>> and do an LDR & STR for the 3-byte case, either for globals or on the stack.
>
> Taking advantage of padding is possible within the compilation unit for
> data that is defined locally (and not interposable), and always with LTO.
>
>> If so, what's the advantage of aligning (say) a 6-byte array to 64 bits
>> rather than 32 bits, given that we don't use a 64-bit LDR & STR?
>> Could we save more with size < 64 instead of size <= 32?
>
> A common case is a constant string which is compared against some
> argument. Most string functions work on 8 or 16-byte quantities. If we
> ensure the whole array fits in one aligned load, we save time in the
> string function.
>
> Runtime data collected for strlen calls shows that 97+% have 8-byte alignment
> or higher; this kind of overalignment helps achieve that.

Ah, ok.  But aren't we then losing that advantage for 4-byte arrays?
Or are you assuming a 4-byte path too?  Or is strlen just very unlikely
for such small data?

> There are likely some further tweaks we could do in the future: 1- and 2-byte
> objects are unlikely to benefit even from 4-byte alignment.

Yeah, was wondering about that too (but realised it was outside the
intended scope of the patch).

Thanks,
Richard

> And large objects may benefit from higher alignment (allowing 16-byte
> aligned LDP for loading values or faster memcpy of whole structs).
>
> Cheers,
> Wilco
  
Wilco Dijkstra Dec. 6, 2024, 7:45 p.m. UTC | #4
Hi Richard,

>> A common case is a constant string which is compared against some
>> argument. Most string functions work on 8 or 16-byte quantities. If we
>> ensure the whole array fits in one aligned load, we save time in the
>> string function.
>>
>> Runtime data collected for strlen calls shows that 97+% have 8-byte alignment
>> or higher; this kind of overalignment helps achieve that.
>
> Ah, ok.  But aren't we then losing that advantage for 4-byte arrays?
> Or are you assuming a 4-byte path too?  Or is strlen just very unlikely
> for such small data?

The advantage comes from being aligned enough. E.g. a strlen implementation
may start like this:

	bic	src, srcin, 15			// round srcin down to 16-byte alignment
	ld1	{vdata.16b}, [src]		// 16-byte aligned load
	cmeq	vhas_nul.16b, vdata.16b, 0	// check for NUL byte

It always does a 16-byte aligned load and tests for the end of the string. So we want
to ensure that small strings fully fit inside the first 16-byte load (if not, it takes almost
twice the number of instructions even if the string is only 4 bytes). 4-byte alignment
is enough to ensure this.

Another approach is to always load the first 16 bytes from the start of the string
(if not close to the end of a page). That is often an unaligned load, and then the
difference between 4- and 8-byte alignment is negligible.

Cheers,
Wilco
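
A rough sketch (illustrative, not actual libc code) of the safety check
behind that alternative: reading 16 bytes starting at s, aligned or not, is
safe as long as the load cannot cross into the next page (assuming 4 KiB
pages):

	#include <stdint.h>

	static int
	safe_to_read_16 (const char *s)
	{
	  /* True if bytes s..s+15 stay within the current 4 KiB page.  */
	  return ((uintptr_t) s & 4095) <= 4096 - 16;
	}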
  
Richard Sandiford Dec. 10, 2024, 10:30 a.m. UTC | #5
Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
> Hi Richard,
>
>>> A common case is a constant string which is compared against some
>>> argument. Most string functions work on 8 or 16-byte quantities. If we
>>> ensure the whole array fits in one aligned load, we save time in the
>>> string function.
>>>
>>> Runtime data collected for strlen calls shows that 97+% have 8-byte alignment
>>> or higher; this kind of overalignment helps achieve that.
>>
>> Ah, ok.  But aren't we then losing that advantage for 4-byte arrays?
>> Or are you assuming a 4-byte path too?  Or is strlen just very unlikely
>> for such small data?
>
> The advantage comes from being aligned enough. E.g. a strlen implementation
> may start like this:
>
> 	bic	src, srcin, 15			// round srcin down to 16-byte alignment
> 	ld1	{vdata.16b}, [src]		// 16-byte aligned load
> 	cmeq	vhas_nul.16b, vdata.16b, 0	// check for NUL byte
>
> It always does a 16-byte aligned load and tests for the end of the string. So we want
> to ensure that small strings fully fit inside the first 16-byte load (if not, it takes almost
> twice the number of instructions even if the string is only 4 bytes). 4-byte alignment
> is enough to ensure this.

Ah, I see.  Can you add a summary of these explanations as a comment,
so that someone reading it later will understand the rationale?

OK with that change.

Thanks,
Richard

>
> Another approach is to always load the first 16 bytes from the start of the string
> (if not close to the end of a page). That is often an unaligned load, and then the
> difference between 4- and 8-byte alignment is negligible.
>
> Cheers,
> Wilco
  
Richard Sandiford Dec. 30, 2024, 3:25 p.m. UTC | #6
Richard Sandiford <richard.sandiford@arm.com> writes:
> Wilco Dijkstra <Wilco.Dijkstra@arm.com> writes:
>> Hi Richard,
>>
>>>> A common case is a constant string which is compared against some
>>>> argument. Most string functions work on 8 or 16-byte quantities. If we
>>>> ensure the whole array fits in one aligned load, we save time in the
>>>> string function.
>>>>
>>>> Runtime data collected for strlen calls shows that 97+% have 8-byte alignment
>>>> or higher; this kind of overalignment helps achieve that.
>>>
>>> Ah, ok.  But aren't we then losing that advantage for 4-byte arrays?
>>> Or are you assuming a 4-byte path too?  Or is strlen just very unlikely
>>> for such small data?
>>
>> The advantage comes from being aligned enough. E.g. a strlen implementation
>> may start like this:
>>
>> 	bic	src, srcin, 15			// round srcin down to 16-byte alignment
>> 	ld1	{vdata.16b}, [src]		// 16-byte aligned load
>> 	cmeq	vhas_nul.16b, vdata.16b, 0	// check for NUL byte
>>
>> It always does a 16-byte aligned load and tests for the end of the string. So we want
>> to ensure that small strings fully fit inside the first 16-byte load (if not, it takes almost
>> twice the number of instructions even if the string is only 4 bytes). 4-byte alignment
>> is enough to ensure this.
>
> Ah, I see.  Can you add a summary of these explanations as a comment,
> so that someone reading it later will understand the rationale?
>
> OK with that change.

It looks like you committed the original version instead, with no extra
explanation.  I suppose I should have asked for another review round
instead.

Richard

> Thanks,
> Richard
>
>>
>> Another approach is to always load the first 16 bytes from the start of the string
>> (if not close to the end of a page). That is often an unaligned load, and then the
>> difference between 4- and 8-byte alignment is negligible.
>>
>> Cheers,
>> Wilco
  
Wilco Dijkstra Jan. 10, 2025, 7:55 p.m. UTC | #7
Hi Richard,

> It looks like you committed the original version instead, with no extra
> explanation.  I suppose I should have asked for another review round
> instead.

Did you check the commit log?

    Change the AARCH64_EXPAND_ALIGNMENT macro into proper function calls to make
    future changes easier.  Use the existing alignment settings, but avoid
    overaligning small arrays or structs to 64 bits when there is no benefit.
    The lower alignment gives a small reduction in data and stack size.
    Using 32-bit alignment for small char arrays still improves performance of
    string functions since they can be loaded in full by the first 8/16-byte load.

That should be enough without it becoming a tutorial on how to align data for
performance...

Cheers,
Wilco
  

Patch

diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 6da81556110c978a9de6f6fad5775c9d77771b10..4133a47693b24abca071a7f77fcdbb91d3dc261a 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -1207,4 +1207,7 @@  extern void aarch64_adjust_reg_alloc_order ();
 bool aarch64_optimize_mode_switching (aarch64_mode_entity);
 void aarch64_restore_za (rtx);
 
+extern unsigned aarch64_data_alignment (const_tree exp, unsigned align);
+extern unsigned aarch64_stack_alignment (const_tree exp, unsigned align);
+
 #endif /* GCC_AARCH64_PROTOS_H */
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index f07b2c49f0d9abd3309afb98499ab7eebcff05bd..64f55f6f94e37bffa6b1e7403274ec5f5d906095 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -121,24 +121,11 @@ 
    of LSE instructions.  */
 #define TARGET_OUTLINE_ATOMICS (aarch64_flag_outline_atomics)
 
-/* Align definitions of arrays, unions and structures so that
-   initializations and copies can be made more efficient.  This is not
-   ABI-changing, so it only affects places where we can see the
-   definition.  Increasing the alignment tends to introduce padding,
-   so don't do this when optimizing for size/conserving stack space.  */
-#define AARCH64_EXPAND_ALIGNMENT(COND, EXP, ALIGN)			\
-  (((COND) && ((ALIGN) < BITS_PER_WORD)					\
-    && (TREE_CODE (EXP) == ARRAY_TYPE					\
-	|| TREE_CODE (EXP) == UNION_TYPE				\
-	|| TREE_CODE (EXP) == RECORD_TYPE)) ? BITS_PER_WORD : (ALIGN))
-
-/* Align global data.  */
-#define DATA_ALIGNMENT(EXP, ALIGN)			\
-  AARCH64_EXPAND_ALIGNMENT (!optimize_size, EXP, ALIGN)
-
-/* Similarly, make sure that objects on the stack are sensibly aligned.  */
-#define LOCAL_ALIGNMENT(EXP, ALIGN)				\
-  AARCH64_EXPAND_ALIGNMENT (!flag_conserve_stack, EXP, ALIGN)
+/* Align global data as an optimization.  */
+#define DATA_ALIGNMENT(EXP, ALIGN) aarch64_data_alignment (EXP, ALIGN)
+
+/* Align stack data as an optimization.  */
+#define LOCAL_ALIGNMENT(EXP, ALIGN) aarch64_stack_alignment (EXP, ALIGN)
 
 #define STRUCTURE_SIZE_BOUNDARY		8
 
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index c78845fc27e6d6a8a1631b487b19fb3143a231ac..5369129d4a405afe5a760081149da1347e7b8842 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -2651,6 +2651,60 @@  aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
   return align;
 }
 
+/* Align definitions of arrays, unions and structures so that
+   initializations and copies can be made more efficient.  This is not
+   ABI-changing, so it only affects places where we can see the
+   definition.  Increasing the alignment tends to introduce padding,
+   so don't do this when optimizing for size/conserving stack space.  */
+
+unsigned
+aarch64_data_alignment (const_tree type, unsigned align)
+{
+  if (optimize_size)
+    return align;
+
+  if (AGGREGATE_TYPE_P (type))
+    {
+      unsigned HOST_WIDE_INT size = 0;
+
+      if (TYPE_SIZE (type) && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
+	  && tree_fits_uhwi_p (TYPE_SIZE (type)))
+	size = tree_to_uhwi (TYPE_SIZE (type));
+
+      /* Align small structs/arrays to 32 bits, or 64 bits if larger.  */
+      if (align < 32 && size <= 32)
+	align = 32;
+      else if (align < 64)
+	align = 64;
+    }
+
+  return align;
+}
+
+unsigned
+aarch64_stack_alignment (const_tree type, unsigned align)
+{
+  if (flag_conserve_stack)
+    return align;
+
+  if (AGGREGATE_TYPE_P (type))
+    {
+      unsigned HOST_WIDE_INT size = 0;
+
+      if (TYPE_SIZE (type) && TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
+	  && tree_fits_uhwi_p (TYPE_SIZE (type)))
+	size = tree_to_uhwi (TYPE_SIZE (type));
+
+      /* Align small structs/arrays to 32 bits, or 64 bits if larger.  */
+      if (align < 32 && size <= 32)
+	align = 32;
+      else if (align < 64)
+	align = 64;
+    }
+
+  return align;
+}
+
 /* Return true if calls to DECL should be treated as
    long-calls (ie called via a register).  */
 static bool