[v2,2/2] x86-64: Only define used SSE/AVX/AVX512 run-time resolvers

Message ID 20220627211858.2807553-2-hjl.tools@gmail.com
State Committed
Commit cfdc4df66ce1464611e1b508f7a5a8f38afd5337
Headers
Series [v2,1/2] x86: Move CPU_FEATURE{S}_{USABLE|ARCH}_P to isa-level.h |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

H.J. Lu June 27, 2022, 9:18 p.m. UTC
  When glibc is built with x86-64 ISA level v3, SSE run-time resolvers
aren't used.  For x86-64 ISA level v4 build, both SSE and AVX resolvers
are unused.  Check the minimum x86-64 ISA level to exclude the unused
run-time resolvers.
---
 sysdeps/x86/isa-level.h        |  2 ++
 sysdeps/x86_64/dl-machine.h    | 12 ++++---
 sysdeps/x86_64/dl-trampoline.S | 59 ++++++++++++++++++----------------
 3 files changed, 42 insertions(+), 31 deletions(-)
  

Comments

Noah Goldstein June 27, 2022, 9:47 p.m. UTC | #1
On Mon, Jun 27, 2022 at 2:19 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> When glibc is built with x86-64 ISA level v3, SSE run-time resolvers
> aren't used.  For x86-64 ISA level v4 build, both SSE and AVX resolvers
> are unused.  Check the minimum x86-64 ISA level to exclude the unused
> run-time resolvers.
> ---
>  sysdeps/x86/isa-level.h        |  2 ++
>  sysdeps/x86_64/dl-machine.h    | 12 ++++---
>  sysdeps/x86_64/dl-trampoline.S | 59 ++++++++++++++++++----------------
>  3 files changed, 42 insertions(+), 31 deletions(-)
>
> diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
> index c6156e7f7a..f293aea906 100644
> --- a/sysdeps/x86/isa-level.h
> +++ b/sysdeps/x86/isa-level.h
> @@ -68,10 +68,12 @@
>     compile-time constant.. */
>
>  /* ISA level >= 4 guaranteed includes.  */
> +#define AVX512F_X86_ISA_LEVEL 4
>  #define AVX512VL_X86_ISA_LEVEL 4
>  #define AVX512BW_X86_ISA_LEVEL 4
>
>  /* ISA level >= 3 guaranteed includes.  */
> +#define AVX_X86_ISA_LEVEL 3
>  #define AVX2_X86_ISA_LEVEL 3
>  #define BMI2_X86_ISA_LEVEL 3
>
> diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> index 34766325ae..005d089501 100644
> --- a/sysdeps/x86_64/dl-machine.h
> +++ b/sysdeps/x86_64/dl-machine.h
> @@ -28,6 +28,7 @@
>  #include <dl-tlsdesc.h>
>  #include <dl-static-tls.h>
>  #include <dl-machine-rel.h>
> +#include <isa-level.h>
>
>  /* Return nonzero iff ELF header is compatible with the running host.  */
>  static inline int __attribute__ ((unused))
> @@ -86,6 +87,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>        /* Identify this shared object.  */
>        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
>
> +      const struct cpu_features* cpu_features = __get_cpu_features ();
> +
>        /* The got[2] entry contains the address of a function which gets
>          called to get the address of a so far unresolved function and
>          jump to it.  The profiling extension of the dynamic linker allows
> @@ -94,9 +97,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>          end in this function.  */
>        if (__glibc_unlikely (profile))
>         {
> -         if (CPU_FEATURE_USABLE (AVX512F))
> +         if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
>             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
> -         else if (CPU_FEATURE_USABLE (AVX))
> +         else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
>             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
>           else
>             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
> @@ -112,9 +115,10 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>           /* This function will get called to fix up the GOT entry
>              indicated by the offset on the stack, and then jump to
>              the resolved address.  */
> -         if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> +         if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> +             || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
>             *(ElfW(Addr) *) (got + 2)
> -             = (CPU_FEATURE_USABLE (XSAVEC)
> +             = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
>                  ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
>                  : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
>           else
> diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> index 831a654713..f669805ac5 100644
> --- a/sysdeps/x86_64/dl-trampoline.S
> +++ b/sysdeps/x86_64/dl-trampoline.S
> @@ -20,6 +20,7 @@
>  #include <sysdep.h>
>  #include <cpu-features-offsets.h>
>  #include <link-defines.h>
> +#include <isa-level.h>
>
>  #ifndef DL_STACK_ALIGNMENT
>  /* Due to GCC bug:
> @@ -62,35 +63,39 @@
>  #undef VMOVA
>  #undef VEC_SIZE
>
> -#define VEC_SIZE               32
> -#define VMOVA                  vmovdqa
> -#define VEC(i)                 ymm##i
> -#define _dl_runtime_profile    _dl_runtime_profile_avx
> -#include "dl-trampoline.h"
> -#undef _dl_runtime_profile
> -#undef VEC
> -#undef VMOVA
> -#undef VEC_SIZE
> +#if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL
> +# define VEC_SIZE              32
> +# define VMOVA                 vmovdqa
> +# define VEC(i)                        ymm##i
> +# define _dl_runtime_profile   _dl_runtime_profile_avx
> +# include "dl-trampoline.h"
> +# undef _dl_runtime_profile
> +# undef VEC
> +# undef VMOVA
> +# undef VEC_SIZE
> +#endif
>
> +#if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL
>  /* movaps/movups is 1-byte shorter.  */
> -#define VEC_SIZE               16
> -#define VMOVA                  movaps
> -#define VEC(i)                 xmm##i
> -#define _dl_runtime_profile    _dl_runtime_profile_sse
> -#undef RESTORE_AVX
> -#include "dl-trampoline.h"
> -#undef _dl_runtime_profile
> -#undef VEC
> -#undef VMOVA
> -#undef VEC_SIZE
> -
> -#define USE_FXSAVE
> -#define STATE_SAVE_ALIGNMENT   16
> -#define _dl_runtime_resolve    _dl_runtime_resolve_fxsave
> -#include "dl-trampoline.h"
> -#undef _dl_runtime_resolve
> -#undef USE_FXSAVE
> -#undef STATE_SAVE_ALIGNMENT
> +# define VEC_SIZE              16
> +# define VMOVA                 movaps
> +# define VEC(i)                        xmm##i
> +# define _dl_runtime_profile   _dl_runtime_profile_sse
> +# undef RESTORE_AVX
> +# include "dl-trampoline.h"
> +# undef _dl_runtime_profile
> +# undef VEC
> +# undef VMOVA
> +# undef VEC_SIZE
> +
> +# define USE_FXSAVE
> +# define STATE_SAVE_ALIGNMENT  16
> +# define _dl_runtime_resolve   _dl_runtime_resolve_fxsave
> +# include "dl-trampoline.h"
> +# undef _dl_runtime_resolve
> +# undef USE_FXSAVE
> +# undef STATE_SAVE_ALIGNMENT
> +#endif
>
>  #define USE_XSAVE
>  #define STATE_SAVE_ALIGNMENT   64
> --
> 2.36.1
>

Sorry missed this earlier. But can you update the text section in
dl-trampoline for the corresponding
implementations?

I also have a patch to do some misc cleanup in that file so I can do
it if you don't mind
splitting the patch.
  
H.J. Lu June 27, 2022, 11:27 p.m. UTC | #2
On Mon, Jun 27, 2022 at 2:47 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Jun 27, 2022 at 2:19 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > When glibc is built with x86-64 ISA level v3, SSE run-time resolvers
> > aren't used.  For x86-64 ISA level v4 build, both SSE and AVX resolvers
> > are unused.  Check the minimum x86-64 ISA level to exclude the unused
> > run-time resolvers.
> > ---
> >  sysdeps/x86/isa-level.h        |  2 ++
> >  sysdeps/x86_64/dl-machine.h    | 12 ++++---
> >  sysdeps/x86_64/dl-trampoline.S | 59 ++++++++++++++++++----------------
> >  3 files changed, 42 insertions(+), 31 deletions(-)
> >
> > diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
> > index c6156e7f7a..f293aea906 100644
> > --- a/sysdeps/x86/isa-level.h
> > +++ b/sysdeps/x86/isa-level.h
> > @@ -68,10 +68,12 @@
> >     compile-time constant.. */
> >
> >  /* ISA level >= 4 guaranteed includes.  */
> > +#define AVX512F_X86_ISA_LEVEL 4
> >  #define AVX512VL_X86_ISA_LEVEL 4
> >  #define AVX512BW_X86_ISA_LEVEL 4
> >
> >  /* ISA level >= 3 guaranteed includes.  */
> > +#define AVX_X86_ISA_LEVEL 3
> >  #define AVX2_X86_ISA_LEVEL 3
> >  #define BMI2_X86_ISA_LEVEL 3
> >
> > diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> > index 34766325ae..005d089501 100644
> > --- a/sysdeps/x86_64/dl-machine.h
> > +++ b/sysdeps/x86_64/dl-machine.h
> > @@ -28,6 +28,7 @@
> >  #include <dl-tlsdesc.h>
> >  #include <dl-static-tls.h>
> >  #include <dl-machine-rel.h>
> > +#include <isa-level.h>
> >
> >  /* Return nonzero iff ELF header is compatible with the running host.  */
> >  static inline int __attribute__ ((unused))
> > @@ -86,6 +87,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> >        /* Identify this shared object.  */
> >        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
> >
> > +      const struct cpu_features* cpu_features = __get_cpu_features ();
> > +
> >        /* The got[2] entry contains the address of a function which gets
> >          called to get the address of a so far unresolved function and
> >          jump to it.  The profiling extension of the dynamic linker allows
> > @@ -94,9 +97,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> >          end in this function.  */
> >        if (__glibc_unlikely (profile))
> >         {
> > -         if (CPU_FEATURE_USABLE (AVX512F))
> > +         if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
> >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
> > -         else if (CPU_FEATURE_USABLE (AVX))
> > +         else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
> >           else
> >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
> > @@ -112,9 +115,10 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> >           /* This function will get called to fix up the GOT entry
> >              indicated by the offset on the stack, and then jump to
> >              the resolved address.  */
> > -         if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > +         if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> > +             || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> >             *(ElfW(Addr) *) (got + 2)
> > -             = (CPU_FEATURE_USABLE (XSAVEC)
> > +             = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> >                  ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> >                  : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> >           else
> > diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> > index 831a654713..f669805ac5 100644
> > --- a/sysdeps/x86_64/dl-trampoline.S
> > +++ b/sysdeps/x86_64/dl-trampoline.S
> > @@ -20,6 +20,7 @@
> >  #include <sysdep.h>
> >  #include <cpu-features-offsets.h>
> >  #include <link-defines.h>
> > +#include <isa-level.h>
> >
> >  #ifndef DL_STACK_ALIGNMENT
> >  /* Due to GCC bug:
> > @@ -62,35 +63,39 @@
> >  #undef VMOVA
> >  #undef VEC_SIZE
> >
> > -#define VEC_SIZE               32
> > -#define VMOVA                  vmovdqa
> > -#define VEC(i)                 ymm##i
> > -#define _dl_runtime_profile    _dl_runtime_profile_avx
> > -#include "dl-trampoline.h"
> > -#undef _dl_runtime_profile
> > -#undef VEC
> > -#undef VMOVA
> > -#undef VEC_SIZE
> > +#if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL
> > +# define VEC_SIZE              32
> > +# define VMOVA                 vmovdqa
> > +# define VEC(i)                        ymm##i
> > +# define _dl_runtime_profile   _dl_runtime_profile_avx
> > +# include "dl-trampoline.h"
> > +# undef _dl_runtime_profile
> > +# undef VEC
> > +# undef VMOVA
> > +# undef VEC_SIZE
> > +#endif
> >
> > +#if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL
> >  /* movaps/movups is 1-byte shorter.  */
> > -#define VEC_SIZE               16
> > -#define VMOVA                  movaps
> > -#define VEC(i)                 xmm##i
> > -#define _dl_runtime_profile    _dl_runtime_profile_sse
> > -#undef RESTORE_AVX
> > -#include "dl-trampoline.h"
> > -#undef _dl_runtime_profile
> > -#undef VEC
> > -#undef VMOVA
> > -#undef VEC_SIZE
> > -
> > -#define USE_FXSAVE
> > -#define STATE_SAVE_ALIGNMENT   16
> > -#define _dl_runtime_resolve    _dl_runtime_resolve_fxsave
> > -#include "dl-trampoline.h"
> > -#undef _dl_runtime_resolve
> > -#undef USE_FXSAVE
> > -#undef STATE_SAVE_ALIGNMENT
> > +# define VEC_SIZE              16
> > +# define VMOVA                 movaps
> > +# define VEC(i)                        xmm##i
> > +# define _dl_runtime_profile   _dl_runtime_profile_sse
> > +# undef RESTORE_AVX
> > +# include "dl-trampoline.h"
> > +# undef _dl_runtime_profile
> > +# undef VEC
> > +# undef VMOVA
> > +# undef VEC_SIZE
> > +
> > +# define USE_FXSAVE
> > +# define STATE_SAVE_ALIGNMENT  16
> > +# define _dl_runtime_resolve   _dl_runtime_resolve_fxsave
> > +# include "dl-trampoline.h"
> > +# undef _dl_runtime_resolve
> > +# undef USE_FXSAVE
> > +# undef STATE_SAVE_ALIGNMENT
> > +#endif
> >
> >  #define USE_XSAVE
> >  #define STATE_SAVE_ALIGNMENT   64
> > --
> > 2.36.1
> >
>
> Sorry missed this earlier. But can you update the text section in
> dl-trampoline for the corresponding
> implementations?

Did you mean a separate patch to change the text section name?

> I also have a patch to do some misc cleanup in that file so I can do
> it if you don't mind
> splitting the patch.
  

Patch

diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
index c6156e7f7a..f293aea906 100644
--- a/sysdeps/x86/isa-level.h
+++ b/sysdeps/x86/isa-level.h
@@ -68,10 +68,12 @@ 
    compile-time constant.. */
 
 /* ISA level >= 4 guaranteed includes.  */
+#define AVX512F_X86_ISA_LEVEL 4
 #define AVX512VL_X86_ISA_LEVEL 4
 #define AVX512BW_X86_ISA_LEVEL 4
 
 /* ISA level >= 3 guaranteed includes.  */
+#define AVX_X86_ISA_LEVEL 3
 #define AVX2_X86_ISA_LEVEL 3
 #define BMI2_X86_ISA_LEVEL 3
 
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index 34766325ae..005d089501 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -28,6 +28,7 @@ 
 #include <dl-tlsdesc.h>
 #include <dl-static-tls.h>
 #include <dl-machine-rel.h>
+#include <isa-level.h>
 
 /* Return nonzero iff ELF header is compatible with the running host.  */
 static inline int __attribute__ ((unused))
@@ -86,6 +87,8 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
       /* Identify this shared object.  */
       *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
 
+      const struct cpu_features* cpu_features = __get_cpu_features ();
+
       /* The got[2] entry contains the address of a function which gets
 	 called to get the address of a so far unresolved function and
 	 jump to it.  The profiling extension of the dynamic linker allows
@@ -94,9 +97,9 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
 	 end in this function.  */
       if (__glibc_unlikely (profile))
 	{
-	  if (CPU_FEATURE_USABLE (AVX512F))
+	  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
 	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
-	  else if (CPU_FEATURE_USABLE (AVX))
+	  else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
 	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
 	  else
 	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
@@ -112,9 +115,10 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
 	  /* This function will get called to fix up the GOT entry
 	     indicated by the offset on the stack, and then jump to
 	     the resolved address.  */
-	  if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
+	  if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
+	      || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
 	    *(ElfW(Addr) *) (got + 2)
-	      = (CPU_FEATURE_USABLE (XSAVEC)
+	      = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
 		 ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
 		 : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
 	  else
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 831a654713..f669805ac5 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -20,6 +20,7 @@ 
 #include <sysdep.h>
 #include <cpu-features-offsets.h>
 #include <link-defines.h>
+#include <isa-level.h>
 
 #ifndef DL_STACK_ALIGNMENT
 /* Due to GCC bug:
@@ -62,35 +63,39 @@ 
 #undef VMOVA
 #undef VEC_SIZE
 
-#define VEC_SIZE		32
-#define VMOVA			vmovdqa
-#define VEC(i)			ymm##i
-#define _dl_runtime_profile	_dl_runtime_profile_avx
-#include "dl-trampoline.h"
-#undef _dl_runtime_profile
-#undef VEC
-#undef VMOVA
-#undef VEC_SIZE
+#if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL
+# define VEC_SIZE		32
+# define VMOVA			vmovdqa
+# define VEC(i)			ymm##i
+# define _dl_runtime_profile	_dl_runtime_profile_avx
+# include "dl-trampoline.h"
+# undef _dl_runtime_profile
+# undef VEC
+# undef VMOVA
+# undef VEC_SIZE
+#endif
 
+#if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL
 /* movaps/movups is 1-byte shorter.  */
-#define VEC_SIZE		16
-#define VMOVA			movaps
-#define VEC(i)			xmm##i
-#define _dl_runtime_profile	_dl_runtime_profile_sse
-#undef RESTORE_AVX
-#include "dl-trampoline.h"
-#undef _dl_runtime_profile
-#undef VEC
-#undef VMOVA
-#undef VEC_SIZE
-
-#define USE_FXSAVE
-#define STATE_SAVE_ALIGNMENT	16
-#define _dl_runtime_resolve	_dl_runtime_resolve_fxsave
-#include "dl-trampoline.h"
-#undef _dl_runtime_resolve
-#undef USE_FXSAVE
-#undef STATE_SAVE_ALIGNMENT
+# define VEC_SIZE		16
+# define VMOVA			movaps
+# define VEC(i)			xmm##i
+# define _dl_runtime_profile	_dl_runtime_profile_sse
+# undef RESTORE_AVX
+# include "dl-trampoline.h"
+# undef _dl_runtime_profile
+# undef VEC
+# undef VMOVA
+# undef VEC_SIZE
+
+# define USE_FXSAVE
+# define STATE_SAVE_ALIGNMENT	16
+# define _dl_runtime_resolve	_dl_runtime_resolve_fxsave
+# include "dl-trampoline.h"
+# undef _dl_runtime_resolve
+# undef USE_FXSAVE
+# undef STATE_SAVE_ALIGNMENT
+#endif
 
 #define USE_XSAVE
 #define STATE_SAVE_ALIGNMENT	64