x86-64: Only define used SSE/AVX/AVX512 run-time resolvers

Message ID 20220627190659.831144-1-hjl.tools@gmail.com
State Superseded
Headers
Series x86-64: Only define used SSE/AVX/AVX512 run-time resolvers |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

H.J. Lu June 27, 2022, 7:06 p.m. UTC
  When glibc is built with x86-64 ISA level v3, SSE run-time resolvers
aren't used.  For x86-64 ISA level v4 build, both SSE and AVX resolvers
are unused.

1. Move X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P to
isa-level.h.
2. Check the minimum x86-64 ISA level to exclude the unused run-time
resolvers.
---
 sysdeps/x86/isa-ifunc-macros.h | 27 ----------------
 sysdeps/x86/isa-level.h        | 26 +++++++++++++++
 sysdeps/x86_64/dl-machine.h    | 12 ++++---
 sysdeps/x86_64/dl-trampoline.S | 59 ++++++++++++++++++----------------
 4 files changed, 66 insertions(+), 58 deletions(-)
  

Comments

Noah Goldstein June 27, 2022, 7:29 p.m. UTC | #1
Forgot to reply-all, readding libc-alpha

On Mon, Jun 27, 2022 at 12:27 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Jun 27, 2022 at 12:23 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Mon, Jun 27, 2022 at 12:07 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > When glibc is built with x86-64 ISA level v3, SSE run-time resolvers
> > > aren't used.  For x86-64 ISA level v4 build, both SSE and AVX resolvers
> > > are unused.
> > >
> > > 1. Move X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P to
> > > isa-level.h.
> > > 2. Check the minimum x86-64 ISA level to exclude the unused run-time
> > > resolvers.
> > > ---
> > >  sysdeps/x86/isa-ifunc-macros.h | 27 ----------------
> > >  sysdeps/x86/isa-level.h        | 26 +++++++++++++++
> > >  sysdeps/x86_64/dl-machine.h    | 12 ++++---
> > >  sysdeps/x86_64/dl-trampoline.S | 59 ++++++++++++++++++----------------
> > >  4 files changed, 66 insertions(+), 58 deletions(-)
> > >
> > > diff --git a/sysdeps/x86/isa-ifunc-macros.h b/sysdeps/x86/isa-ifunc-macros.h
> > > index d69905689b..f967a1bec6 100644
> > > --- a/sysdeps/x86/isa-ifunc-macros.h
> > > +++ b/sysdeps/x86/isa-ifunc-macros.h
> > > @@ -56,31 +56,4 @@
> > >  # define X86_IFUNC_IMPL_ADD_V1(...)
> > >  #endif
> > >
> > > -/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
>
> Should this be a separate commit?
> > > -   macros are wrappers for the the respective
> > > -   CPU_FEATURE{S}_{USABLE|ARCH}_P runtime checks.  They differ in two
> > > -   ways.
> > > -
> > > -    1.  The USABLE_P version is evaluated to true when the feature
> > > -        is enabled.
> > > -
> > > -    2.  The ARCH_P version has a third argument `not`.  The `not`
> > > -        argument can either be '!' or empty.  If the feature is
> > > -        enabled above an ISA level, the third argument should be empty
> > > -        and the expression is evaluated to true when the feature is
> > > -        enabled.  If the feature is disabled above an ISA level, the
> > > -        third argument should be `!` and the expression is evaluated
> > > -        to true when the feature is disabled.
> > > - */
> > > -
> > > -#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
> > > -  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > -   || CPU_FEATURE_USABLE_P (ptr, name))
> > > -
> > > -
> > > -#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
> > > -  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > -   || not CPU_FEATURES_ARCH_P (ptr, name))
> > > -
> > > -
> > >  #endif
> > > diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
> > > index 075e7c6ee1..f293aea906 100644
> > > --- a/sysdeps/x86/isa-level.h
> > > +++ b/sysdeps/x86/isa-level.h
> > > @@ -68,10 +68,12 @@
> > >     compile-time constant.. */
> > >
> > >  /* ISA level >= 4 guaranteed includes.  */
> > > +#define AVX512F_X86_ISA_LEVEL 4
> > >  #define AVX512VL_X86_ISA_LEVEL 4
> > >  #define AVX512BW_X86_ISA_LEVEL 4
> > >
> > >  /* ISA level >= 3 guaranteed includes.  */
> > > +#define AVX_X86_ISA_LEVEL 3
> > >  #define AVX2_X86_ISA_LEVEL 3
> > >  #define BMI2_X86_ISA_LEVEL 3
> > >
> > > @@ -87,6 +89,30 @@
> > >     when ISA level < 3.  */
> > >  #define Prefer_No_VZEROUPPER_X86_ISA_LEVEL 3
> > >
> > > +/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
> > > +   macros are wrappers for the respective CPU_FEATURE{S}_{USABLE|ARCH}_P
> > > +   runtime checks.  They differ in two ways.
> > > +
> > > +    1.  The USABLE_P version is evaluated to true when the feature
> > > +        is enabled.
> > > +
> > > +    2.  The ARCH_P version has a third argument `not`.  The `not`
> > > +        argument can either be `!` or empty.  If the feature is
> > > +        enabled above an ISA level, the third argument should be empty
> > > +        and the expression is evaluated to true when the feature is
> > > +        enabled.  If the feature is disabled above an ISA level, the
> > > +        third argument should be `!` and the expression is evaluated
> > > +        to true when the feature is disabled.
> > > + */
> > > +
> > > +#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
> > > +  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > +   || CPU_FEATURE_USABLE_P (ptr, name))
> > > +
> > > +#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
> > > +  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > +   || not CPU_FEATURES_ARCH_P (ptr, name))
> > > +
> > >  #define ISA_SHOULD_BUILD(isa_build_level)                              \
> > >    (MINIMUM_X86_ISA_LEVEL <= (isa_build_level) && IS_IN (libc))         \
> > >     || defined ISA_DEFAULT_IMPL
> > > diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> > > index 34766325ae..005d089501 100644
> > > --- a/sysdeps/x86_64/dl-machine.h
> > > +++ b/sysdeps/x86_64/dl-machine.h
> > > @@ -28,6 +28,7 @@
> > >  #include <dl-tlsdesc.h>
> > >  #include <dl-static-tls.h>
> > >  #include <dl-machine-rel.h>
> > > +#include <isa-level.h>
> > >
> > >  /* Return nonzero iff ELF header is compatible with the running host.  */
> > >  static inline int __attribute__ ((unused))
> > > @@ -86,6 +87,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > >        /* Identify this shared object.  */
> > >        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
> > >
> > > +      const struct cpu_features* cpu_features = __get_cpu_features ();
> > > +
> > >        /* The got[2] entry contains the address of a function which gets
> > >          called to get the address of a so far unresolved function and
> > >          jump to it.  The profiling extension of the dynamic linker allows
> > > @@ -94,9 +97,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > >          end in this function.  */
> > >        if (__glibc_unlikely (profile))
> > >         {
> > > -         if (CPU_FEATURE_USABLE (AVX512F))
> > > +         if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
> > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
> > > -         else if (CPU_FEATURE_USABLE (AVX))
> > > +         else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
> > >           else
> > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
> > > @@ -112,9 +115,10 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > >           /* This function will get called to fix up the GOT entry
> > >              indicated by the offset on the stack, and then jump to
> > >              the resolved address.  */
> > > -         if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > > +         if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> > > +             || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > >             *(ElfW(Addr) *) (got + 2)
> > > -             = (CPU_FEATURE_USABLE (XSAVEC)
> > > +             = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> > >                  ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> > >                  : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> > >           else
> > > diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> > > index 831a654713..f669805ac5 100644
> > > --- a/sysdeps/x86_64/dl-trampoline.S
> > > +++ b/sysdeps/x86_64/dl-trampoline.S
> > > @@ -20,6 +20,7 @@
> > >  #include <sysdep.h>
> > >  #include <cpu-features-offsets.h>
> > >  #include <link-defines.h>
> > > +#include <isa-level.h>
> > >
> > >  #ifndef DL_STACK_ALIGNMENT
> > >  /* Due to GCC bug:
> > > @@ -62,35 +63,39 @@
> >
> > Can you also add a guard for isa level v4 on the avx512 version?
> > >  #undef VMOVA
> > >  #undef VEC_SIZE
> > >
> > > -#define VEC_SIZE               32
> > > -#define VMOVA                  vmovdqa
> > > -#define VEC(i)                 ymm##i
> > > -#define _dl_runtime_profile    _dl_runtime_profile_avx
> > > -#include "dl-trampoline.h"
> > > -#undef _dl_runtime_profile
> > > -#undef VEC
> > > -#undef VMOVA
> > > -#undef VEC_SIZE
> > > +#if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL
> > Can this be <= __X86_ISA_LEVEL_V3
> >
> > > +# define VEC_SIZE              32
> > > +# define VMOVA                 vmovdqa
> > > +# define VEC(i)                        ymm##i
> > > +# define _dl_runtime_profile   _dl_runtime_profile_avx
> > > +# include "dl-trampoline.h"
> > > +# undef _dl_runtime_profile
> > > +# undef VEC
> > > +# undef VMOVA
> > > +# undef VEC_SIZE
> > > +#endif
> > >
> > > +#if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL
> >
> > Can this be <= __X86_ISA_LEVEL_V2?
> > >  /* movaps/movups is 1-byte shorter.  */
> > > -#define VEC_SIZE               16
> > > -#define VMOVA                  movaps
> > > -#define VEC(i)                 xmm##i
> > > -#define _dl_runtime_profile    _dl_runtime_profile_sse
> > > -#undef RESTORE_AVX
> > > -#include "dl-trampoline.h"
> > > -#undef _dl_runtime_profile
> > > -#undef VEC
> > > -#undef VMOVA
> > > -#undef VEC_SIZE
> > > -
> > > -#define USE_FXSAVE
> > > -#define STATE_SAVE_ALIGNMENT   16
> > > -#define _dl_runtime_resolve    _dl_runtime_resolve_fxsave
> > > -#include "dl-trampoline.h"
> > > -#undef _dl_runtime_resolve
> > > -#undef USE_FXSAVE
> > > -#undef STATE_SAVE_ALIGNMENT
> > > +# define VEC_SIZE              16
> > > +# define VMOVA                 movaps
> > > +# define VEC(i)                        xmm##i
> > > +# define _dl_runtime_profile   _dl_runtime_profile_sse
> > > +# undef RESTORE_AVX
> > > +# include "dl-trampoline.h"
> > > +# undef _dl_runtime_profile
> > > +# undef VEC
> > > +# undef VMOVA
> > > +# undef VEC_SIZE
> > > +
> > > +# define USE_FXSAVE
> > > +# define STATE_SAVE_ALIGNMENT  16
> > > +# define _dl_runtime_resolve   _dl_runtime_resolve_fxsave
> > > +# include "dl-trampoline.h"
> > > +# undef _dl_runtime_resolve
> > > +# undef USE_FXSAVE
> > > +# undef STATE_SAVE_ALIGNMENT
> > > +#endif
> > >
> > >  #define USE_XSAVE
> > >  #define STATE_SAVE_ALIGNMENT   64
> > > --
> > > 2.36.1
> > >
> >
> > There are cases of unconditional `movaps` usage in dl-trampoline.h (L222)
> >
> > This appears to be on ISA level v3 / v4 path.
> >
> > For ISA level >= 3 builds can those become `vmovdqa`?
  
H.J. Lu June 27, 2022, 7:43 p.m. UTC | #2
On Mon, Jun 27, 2022 at 12:23 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Jun 27, 2022 at 12:07 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > When glibc is built with x86-64 ISA level v3, SSE run-time resolvers
> > aren't used.  For x86-64 ISA level v4 build, both SSE and AVX resolvers
> > are unused.
> >
> > 1. Move X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P to
> > isa-level.h.
> > 2. Check the minimum x86-64 ISA level to exclude the unused run-time
> > resolvers.
> > ---
> >  sysdeps/x86/isa-ifunc-macros.h | 27 ----------------
> >  sysdeps/x86/isa-level.h        | 26 +++++++++++++++
> >  sysdeps/x86_64/dl-machine.h    | 12 ++++---
> >  sysdeps/x86_64/dl-trampoline.S | 59 ++++++++++++++++++----------------
> >  4 files changed, 66 insertions(+), 58 deletions(-)
> >
> > diff --git a/sysdeps/x86/isa-ifunc-macros.h b/sysdeps/x86/isa-ifunc-macros.h
> > index d69905689b..f967a1bec6 100644
> > --- a/sysdeps/x86/isa-ifunc-macros.h
> > +++ b/sysdeps/x86/isa-ifunc-macros.h
> > @@ -56,31 +56,4 @@
> >  # define X86_IFUNC_IMPL_ADD_V1(...)
> >  #endif
> >
> > -/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
> > -   macros are wrappers for the the respective
> > -   CPU_FEATURE{S}_{USABLE|ARCH}_P runtime checks.  They differ in two
> > -   ways.
> > -
> > -    1.  The USABLE_P version is evaluated to true when the feature
> > -        is enabled.
> > -
> > -    2.  The ARCH_P version has a third argument `not`.  The `not`
> > -        argument can either be '!' or empty.  If the feature is
> > -        enabled above an ISA level, the third argument should be empty
> > -        and the expression is evaluated to true when the feature is
> > -        enabled.  If the feature is disabled above an ISA level, the
> > -        third argument should be `!` and the expression is evaluated
> > -        to true when the feature is disabled.
> > - */
> > -
> > -#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
> > -  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > -   || CPU_FEATURE_USABLE_P (ptr, name))
> > -
> > -
> > -#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
> > -  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > -   || not CPU_FEATURES_ARCH_P (ptr, name))
> > -
> > -
> >  #endif
> > diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
> > index 075e7c6ee1..f293aea906 100644
> > --- a/sysdeps/x86/isa-level.h
> > +++ b/sysdeps/x86/isa-level.h
> > @@ -68,10 +68,12 @@
> >     compile-time constant.. */
> >
> >  /* ISA level >= 4 guaranteed includes.  */
> > +#define AVX512F_X86_ISA_LEVEL 4
> >  #define AVX512VL_X86_ISA_LEVEL 4
> >  #define AVX512BW_X86_ISA_LEVEL 4
> >
> >  /* ISA level >= 3 guaranteed includes.  */
> > +#define AVX_X86_ISA_LEVEL 3
> >  #define AVX2_X86_ISA_LEVEL 3
> >  #define BMI2_X86_ISA_LEVEL 3
> >
> > @@ -87,6 +89,30 @@
> >     when ISA level < 3.  */
> >  #define Prefer_No_VZEROUPPER_X86_ISA_LEVEL 3
> >
> > +/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
> > +   macros are wrappers for the respective CPU_FEATURE{S}_{USABLE|ARCH}_P
> > +   runtime checks.  They differ in two ways.
> > +
> > +    1.  The USABLE_P version is evaluated to true when the feature
> > +        is enabled.
> > +
> > +    2.  The ARCH_P version has a third argument `not`.  The `not`
> > +        argument can either be `!` or empty.  If the feature is
> > +        enabled above an ISA level, the third argument should be empty
> > +        and the expression is evaluated to true when the feature is
> > +        enabled.  If the feature is disabled above an ISA level, the
> > +        third argument should be `!` and the expression is evaluated
> > +        to true when the feature is disabled.
> > + */
> > +
> > +#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
> > +  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > +   || CPU_FEATURE_USABLE_P (ptr, name))
> > +
> > +#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
> > +  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > +   || not CPU_FEATURES_ARCH_P (ptr, name))
> > +
> >  #define ISA_SHOULD_BUILD(isa_build_level)                              \
> >    (MINIMUM_X86_ISA_LEVEL <= (isa_build_level) && IS_IN (libc))         \
> >     || defined ISA_DEFAULT_IMPL
> > diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> > index 34766325ae..005d089501 100644
> > --- a/sysdeps/x86_64/dl-machine.h
> > +++ b/sysdeps/x86_64/dl-machine.h
> > @@ -28,6 +28,7 @@
> >  #include <dl-tlsdesc.h>
> >  #include <dl-static-tls.h>
> >  #include <dl-machine-rel.h>
> > +#include <isa-level.h>
> >
> >  /* Return nonzero iff ELF header is compatible with the running host.  */
> >  static inline int __attribute__ ((unused))
> > @@ -86,6 +87,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> >        /* Identify this shared object.  */
> >        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
> >
> > +      const struct cpu_features* cpu_features = __get_cpu_features ();
> > +
> >        /* The got[2] entry contains the address of a function which gets
> >          called to get the address of a so far unresolved function and
> >          jump to it.  The profiling extension of the dynamic linker allows
> > @@ -94,9 +97,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> >          end in this function.  */
> >        if (__glibc_unlikely (profile))
> >         {
> > -         if (CPU_FEATURE_USABLE (AVX512F))
> > +         if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
> >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
> > -         else if (CPU_FEATURE_USABLE (AVX))
> > +         else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
> >           else
> >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
> > @@ -112,9 +115,10 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> >           /* This function will get called to fix up the GOT entry
> >              indicated by the offset on the stack, and then jump to
> >              the resolved address.  */
> > -         if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > +         if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> > +             || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> >             *(ElfW(Addr) *) (got + 2)
> > -             = (CPU_FEATURE_USABLE (XSAVEC)
> > +             = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> >                  ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> >                  : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> >           else
> > diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> > index 831a654713..f669805ac5 100644
> > --- a/sysdeps/x86_64/dl-trampoline.S
> > +++ b/sysdeps/x86_64/dl-trampoline.S
> > @@ -20,6 +20,7 @@
> >  #include <sysdep.h>
> >  #include <cpu-features-offsets.h>
> >  #include <link-defines.h>
> > +#include <isa-level.h>
> >
> >  #ifndef DL_STACK_ALIGNMENT
> >  /* Due to GCC bug:
> > @@ -62,35 +63,39 @@
>
> Can you also add a guard for isa level v4 on the avx512 version?

It is needed only when we need to save and restore new registers
in _dl_runtime_profile, which requires a new _dl_runtime_profile.
We can add the check then.

> >  #undef VMOVA
> >  #undef VEC_SIZE
> >
> > -#define VEC_SIZE               32
> > -#define VMOVA                  vmovdqa
> > -#define VEC(i)                 ymm##i
> > -#define _dl_runtime_profile    _dl_runtime_profile_avx
> > -#include "dl-trampoline.h"
> > -#undef _dl_runtime_profile
> > -#undef VEC
> > -#undef VMOVA
> > -#undef VEC_SIZE
> > +#if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL
> Can this be <= __X86_ISA_LEVEL_V3

No, since __X86_ISA_LEVEL_VN is defined either 0 or 1.

> > +# define VEC_SIZE              32
> > +# define VMOVA                 vmovdqa
> > +# define VEC(i)                        ymm##i
> > +# define _dl_runtime_profile   _dl_runtime_profile_avx
> > +# include "dl-trampoline.h"
> > +# undef _dl_runtime_profile
> > +# undef VEC
> > +# undef VMOVA
> > +# undef VEC_SIZE
> > +#endif
> >
> > +#if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL
>
> Can this be <= __X86_ISA_LEVEL_V2?
> >  /* movaps/movups is 1-byte shorter.  */
> > -#define VEC_SIZE               16
> > -#define VMOVA                  movaps
> > -#define VEC(i)                 xmm##i
> > -#define _dl_runtime_profile    _dl_runtime_profile_sse
> > -#undef RESTORE_AVX
> > -#include "dl-trampoline.h"
> > -#undef _dl_runtime_profile
> > -#undef VEC
> > -#undef VMOVA
> > -#undef VEC_SIZE
> > -
> > -#define USE_FXSAVE
> > -#define STATE_SAVE_ALIGNMENT   16
> > -#define _dl_runtime_resolve    _dl_runtime_resolve_fxsave
> > -#include "dl-trampoline.h"
> > -#undef _dl_runtime_resolve
> > -#undef USE_FXSAVE
> > -#undef STATE_SAVE_ALIGNMENT
> > +# define VEC_SIZE              16
> > +# define VMOVA                 movaps
> > +# define VEC(i)                        xmm##i
> > +# define _dl_runtime_profile   _dl_runtime_profile_sse
> > +# undef RESTORE_AVX
> > +# include "dl-trampoline.h"
> > +# undef _dl_runtime_profile
> > +# undef VEC
> > +# undef VMOVA
> > +# undef VEC_SIZE
> > +
> > +# define USE_FXSAVE
> > +# define STATE_SAVE_ALIGNMENT  16
> > +# define _dl_runtime_resolve   _dl_runtime_resolve_fxsave
> > +# include "dl-trampoline.h"
> > +# undef _dl_runtime_resolve
> > +# undef USE_FXSAVE
> > +# undef STATE_SAVE_ALIGNMENT
> > +#endif
> >
> >  #define USE_XSAVE
> >  #define STATE_SAVE_ALIGNMENT   64
> > --
> > 2.36.1
> >
>
> There are cases of unconditional `movaps` usage in dl-trampoline.h (L222)
>
> This appears to be on ISA level v3 / v4 path.
>
> For ISA level >= 3 builds can those become `vmovdqa`?

It may be replaced with VMOVA in a separate patch.
  
H.J. Lu June 27, 2022, 7:44 p.m. UTC | #3
On Mon, Jun 27, 2022 at 12:27 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Jun 27, 2022 at 12:23 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Mon, Jun 27, 2022 at 12:07 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > When glibc is built with x86-64 ISA level v3, SSE run-time resolvers
> > > aren't used.  For x86-64 ISA level v4 build, both SSE and AVX resolvers
> > > are unused.
> > >
> > > 1. Move X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P to
> > > isa-level.h.
> > > 2. Check the minimum x86-64 ISA level to exclude the unused run-time
> > > resolvers.
> > > ---
> > >  sysdeps/x86/isa-ifunc-macros.h | 27 ----------------
> > >  sysdeps/x86/isa-level.h        | 26 +++++++++++++++
> > >  sysdeps/x86_64/dl-machine.h    | 12 ++++---
> > >  sysdeps/x86_64/dl-trampoline.S | 59 ++++++++++++++++++----------------
> > >  4 files changed, 66 insertions(+), 58 deletions(-)
> > >
> > > diff --git a/sysdeps/x86/isa-ifunc-macros.h b/sysdeps/x86/isa-ifunc-macros.h
> > > index d69905689b..f967a1bec6 100644
> > > --- a/sysdeps/x86/isa-ifunc-macros.h
> > > +++ b/sysdeps/x86/isa-ifunc-macros.h
> > > @@ -56,31 +56,4 @@
> > >  # define X86_IFUNC_IMPL_ADD_V1(...)
> > >  #endif
> > >
> > > -/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
>
> Should this be a separate commit?

I can do it.


> > > -   macros are wrappers for the the respective
> > > -   CPU_FEATURE{S}_{USABLE|ARCH}_P runtime checks.  They differ in two
> > > -   ways.
> > > -
> > > -    1.  The USABLE_P version is evaluated to true when the feature
> > > -        is enabled.
> > > -
> > > -    2.  The ARCH_P version has a third argument `not`.  The `not`
> > > -        argument can either be '!' or empty.  If the feature is
> > > -        enabled above an ISA level, the third argument should be empty
> > > -        and the expression is evaluated to true when the feature is
> > > -        enabled.  If the feature is disabled above an ISA level, the
> > > -        third argument should be `!` and the expression is evaluated
> > > -        to true when the feature is disabled.
> > > - */
> > > -
> > > -#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
> > > -  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > -   || CPU_FEATURE_USABLE_P (ptr, name))
> > > -
> > > -
> > > -#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
> > > -  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > -   || not CPU_FEATURES_ARCH_P (ptr, name))
> > > -
> > > -
> > >  #endif
> > > diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
> > > index 075e7c6ee1..f293aea906 100644
> > > --- a/sysdeps/x86/isa-level.h
> > > +++ b/sysdeps/x86/isa-level.h
> > > @@ -68,10 +68,12 @@
> > >     compile-time constant.. */
> > >
> > >  /* ISA level >= 4 guaranteed includes.  */
> > > +#define AVX512F_X86_ISA_LEVEL 4
> > >  #define AVX512VL_X86_ISA_LEVEL 4
> > >  #define AVX512BW_X86_ISA_LEVEL 4
> > >
> > >  /* ISA level >= 3 guaranteed includes.  */
> > > +#define AVX_X86_ISA_LEVEL 3
> > >  #define AVX2_X86_ISA_LEVEL 3
> > >  #define BMI2_X86_ISA_LEVEL 3
> > >
> > > @@ -87,6 +89,30 @@
> > >     when ISA level < 3.  */
> > >  #define Prefer_No_VZEROUPPER_X86_ISA_LEVEL 3
> > >
> > > +/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
> > > +   macros are wrappers for the respective CPU_FEATURE{S}_{USABLE|ARCH}_P
> > > +   runtime checks.  They differ in two ways.
> > > +
> > > +    1.  The USABLE_P version is evaluated to true when the feature
> > > +        is enabled.
> > > +
> > > +    2.  The ARCH_P version has a third argument `not`.  The `not`
> > > +        argument can either be `!` or empty.  If the feature is
> > > +        enabled above an ISA level, the third argument should be empty
> > > +        and the expression is evaluated to true when the feature is
> > > +        enabled.  If the feature is disabled above an ISA level, the
> > > +        third argument should be `!` and the expression is evaluated
> > > +        to true when the feature is disabled.
> > > + */
> > > +
> > > +#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
> > > +  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > +   || CPU_FEATURE_USABLE_P (ptr, name))
> > > +
> > > +#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
> > > +  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > +   || not CPU_FEATURES_ARCH_P (ptr, name))
> > > +
> > >  #define ISA_SHOULD_BUILD(isa_build_level)                              \
> > >    (MINIMUM_X86_ISA_LEVEL <= (isa_build_level) && IS_IN (libc))         \
> > >     || defined ISA_DEFAULT_IMPL
> > > diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> > > index 34766325ae..005d089501 100644
> > > --- a/sysdeps/x86_64/dl-machine.h
> > > +++ b/sysdeps/x86_64/dl-machine.h
> > > @@ -28,6 +28,7 @@
> > >  #include <dl-tlsdesc.h>
> > >  #include <dl-static-tls.h>
> > >  #include <dl-machine-rel.h>
> > > +#include <isa-level.h>
> > >
> > >  /* Return nonzero iff ELF header is compatible with the running host.  */
> > >  static inline int __attribute__ ((unused))
> > > @@ -86,6 +87,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > >        /* Identify this shared object.  */
> > >        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
> > >
> > > +      const struct cpu_features* cpu_features = __get_cpu_features ();
> > > +
> > >        /* The got[2] entry contains the address of a function which gets
> > >          called to get the address of a so far unresolved function and
> > >          jump to it.  The profiling extension of the dynamic linker allows
> > > @@ -94,9 +97,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > >          end in this function.  */
> > >        if (__glibc_unlikely (profile))
> > >         {
> > > -         if (CPU_FEATURE_USABLE (AVX512F))
> > > +         if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
> > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
> > > -         else if (CPU_FEATURE_USABLE (AVX))
> > > +         else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
> > >           else
> > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
> > > @@ -112,9 +115,10 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > >           /* This function will get called to fix up the GOT entry
> > >              indicated by the offset on the stack, and then jump to
> > >              the resolved address.  */
> > > -         if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > > +         if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> > > +             || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > >             *(ElfW(Addr) *) (got + 2)
> > > -             = (CPU_FEATURE_USABLE (XSAVEC)
> > > +             = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> > >                  ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> > >                  : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> > >           else
> > > diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> > > index 831a654713..f669805ac5 100644
> > > --- a/sysdeps/x86_64/dl-trampoline.S
> > > +++ b/sysdeps/x86_64/dl-trampoline.S
> > > @@ -20,6 +20,7 @@
> > >  #include <sysdep.h>
> > >  #include <cpu-features-offsets.h>
> > >  #include <link-defines.h>
> > > +#include <isa-level.h>
> > >
> > >  #ifndef DL_STACK_ALIGNMENT
> > >  /* Due to GCC bug:
> > > @@ -62,35 +63,39 @@
> >
> > Can you also add a guard for isa level v4 on the avx512 version?
> > >  #undef VMOVA
> > >  #undef VEC_SIZE
> > >
> > > -#define VEC_SIZE               32
> > > -#define VMOVA                  vmovdqa
> > > -#define VEC(i)                 ymm##i
> > > -#define _dl_runtime_profile    _dl_runtime_profile_avx
> > > -#include "dl-trampoline.h"
> > > -#undef _dl_runtime_profile
> > > -#undef VEC
> > > -#undef VMOVA
> > > -#undef VEC_SIZE
> > > +#if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL
> > Can this be <= __X86_ISA_LEVEL_V3
> >
> > > +# define VEC_SIZE              32
> > > +# define VMOVA                 vmovdqa
> > > +# define VEC(i)                        ymm##i
> > > +# define _dl_runtime_profile   _dl_runtime_profile_avx
> > > +# include "dl-trampoline.h"
> > > +# undef _dl_runtime_profile
> > > +# undef VEC
> > > +# undef VMOVA
> > > +# undef VEC_SIZE
> > > +#endif
> > >
> > > +#if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL
> >
> > Can this be <= __X86_ISA_LEVEL_V2?
> > >  /* movaps/movups is 1-byte shorter.  */
> > > -#define VEC_SIZE               16
> > > -#define VMOVA                  movaps
> > > -#define VEC(i)                 xmm##i
> > > -#define _dl_runtime_profile    _dl_runtime_profile_sse
> > > -#undef RESTORE_AVX
> > > -#include "dl-trampoline.h"
> > > -#undef _dl_runtime_profile
> > > -#undef VEC
> > > -#undef VMOVA
> > > -#undef VEC_SIZE
> > > -
> > > -#define USE_FXSAVE
> > > -#define STATE_SAVE_ALIGNMENT   16
> > > -#define _dl_runtime_resolve    _dl_runtime_resolve_fxsave
> > > -#include "dl-trampoline.h"
> > > -#undef _dl_runtime_resolve
> > > -#undef USE_FXSAVE
> > > -#undef STATE_SAVE_ALIGNMENT
> > > +# define VEC_SIZE              16
> > > +# define VMOVA                 movaps
> > > +# define VEC(i)                        xmm##i
> > > +# define _dl_runtime_profile   _dl_runtime_profile_sse
> > > +# undef RESTORE_AVX
> > > +# include "dl-trampoline.h"
> > > +# undef _dl_runtime_profile
> > > +# undef VEC
> > > +# undef VMOVA
> > > +# undef VEC_SIZE
> > > +
> > > +# define USE_FXSAVE
> > > +# define STATE_SAVE_ALIGNMENT  16
> > > +# define _dl_runtime_resolve   _dl_runtime_resolve_fxsave
> > > +# include "dl-trampoline.h"
> > > +# undef _dl_runtime_resolve
> > > +# undef USE_FXSAVE
> > > +# undef STATE_SAVE_ALIGNMENT
> > > +#endif
> > >
> > >  #define USE_XSAVE
> > >  #define STATE_SAVE_ALIGNMENT   64
> > > --
> > > 2.36.1
> > >
> >
> > There are cases of unconditional `movaps` usage in dl-trampoline.h (L222)
> >
> > This appears to be on ISA level v3 / v4 path.
> >
> > For ISA level >= 3 builds can those become `vmovdqa`?



--
H.J.
  
Noah Goldstein June 27, 2022, 7:51 p.m. UTC | #4
On Mon, Jun 27, 2022 at 12:44 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Jun 27, 2022 at 12:23 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Mon, Jun 27, 2022 at 12:07 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > When glibc is built with x86-64 ISA level v3, SSE run-time resolvers
> > > aren't used.  For x86-64 ISA level v4 build, both SSE and AVX resolvers
> > > are unused.
> > >
> > > 1. Move X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P to
> > > isa-level.h.
> > > 2. Check the minimum x86-64 ISA level to exclude the unused run-time
> > > resolvers.
> > > ---
> > >  sysdeps/x86/isa-ifunc-macros.h | 27 ----------------
> > >  sysdeps/x86/isa-level.h        | 26 +++++++++++++++
> > >  sysdeps/x86_64/dl-machine.h    | 12 ++++---
> > >  sysdeps/x86_64/dl-trampoline.S | 59 ++++++++++++++++++----------------
> > >  4 files changed, 66 insertions(+), 58 deletions(-)
> > >
> > > diff --git a/sysdeps/x86/isa-ifunc-macros.h b/sysdeps/x86/isa-ifunc-macros.h
> > > index d69905689b..f967a1bec6 100644
> > > --- a/sysdeps/x86/isa-ifunc-macros.h
> > > +++ b/sysdeps/x86/isa-ifunc-macros.h
> > > @@ -56,31 +56,4 @@
> > >  # define X86_IFUNC_IMPL_ADD_V1(...)
> > >  #endif
> > >
> > > -/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
> > > -   macros are wrappers for the the respective
> > > -   CPU_FEATURE{S}_{USABLE|ARCH}_P runtime checks.  They differ in two
> > > -   ways.
> > > -
> > > -    1.  The USABLE_P version is evaluated to true when the feature
> > > -        is enabled.
> > > -
> > > -    2.  The ARCH_P version has a third argument `not`.  The `not`
> > > -        argument can either be '!' or empty.  If the feature is
> > > -        enabled above an ISA level, the third argument should be empty
> > > -        and the expression is evaluated to true when the feature is
> > > -        enabled.  If the feature is disabled above an ISA level, the
> > > -        third argument should be `!` and the expression is evaluated
> > > -        to true when the feature is disabled.
> > > - */
> > > -
> > > -#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
> > > -  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > -   || CPU_FEATURE_USABLE_P (ptr, name))
> > > -
> > > -
> > > -#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
> > > -  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > -   || not CPU_FEATURES_ARCH_P (ptr, name))
> > > -
> > > -
> > >  #endif
> > > diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
> > > index 075e7c6ee1..f293aea906 100644
> > > --- a/sysdeps/x86/isa-level.h
> > > +++ b/sysdeps/x86/isa-level.h
> > > @@ -68,10 +68,12 @@
> > >     compile-time constant.. */
> > >
> > >  /* ISA level >= 4 guaranteed includes.  */
> > > +#define AVX512F_X86_ISA_LEVEL 4
> > >  #define AVX512VL_X86_ISA_LEVEL 4
> > >  #define AVX512BW_X86_ISA_LEVEL 4
> > >
> > >  /* ISA level >= 3 guaranteed includes.  */
> > > +#define AVX_X86_ISA_LEVEL 3
> > >  #define AVX2_X86_ISA_LEVEL 3
> > >  #define BMI2_X86_ISA_LEVEL 3
> > >
> > > @@ -87,6 +89,30 @@
> > >     when ISA level < 3.  */
> > >  #define Prefer_No_VZEROUPPER_X86_ISA_LEVEL 3
> > >
> > > +/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
> > > +   macros are wrappers for the respective CPU_FEATURE{S}_{USABLE|ARCH}_P
> > > +   runtime checks.  They differ in two ways.
> > > +
> > > +    1.  The USABLE_P version is evaluated to true when the feature
> > > +        is enabled.
> > > +
> > > +    2.  The ARCH_P version has a third argument `not`.  The `not`
> > > +        argument can either be `!` or empty.  If the feature is
> > > +        enabled above an ISA level, the third argument should be empty
> > > +        and the expression is evaluated to true when the feature is
> > > +        enabled.  If the feature is disabled above an ISA level, the
> > > +        third argument should be `!` and the expression is evaluated
> > > +        to true when the feature is disabled.
> > > + */
> > > +
> > > +#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
> > > +  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > +   || CPU_FEATURE_USABLE_P (ptr, name))
> > > +
> > > +#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
> > > +  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > +   || not CPU_FEATURES_ARCH_P (ptr, name))
> > > +
> > >  #define ISA_SHOULD_BUILD(isa_build_level)                              \
> > >    (MINIMUM_X86_ISA_LEVEL <= (isa_build_level) && IS_IN (libc))         \
> > >     || defined ISA_DEFAULT_IMPL
> > > diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> > > index 34766325ae..005d089501 100644
> > > --- a/sysdeps/x86_64/dl-machine.h
> > > +++ b/sysdeps/x86_64/dl-machine.h
> > > @@ -28,6 +28,7 @@
> > >  #include <dl-tlsdesc.h>
> > >  #include <dl-static-tls.h>
> > >  #include <dl-machine-rel.h>
> > > +#include <isa-level.h>
> > >
> > >  /* Return nonzero iff ELF header is compatible with the running host.  */
> > >  static inline int __attribute__ ((unused))
> > > @@ -86,6 +87,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > >        /* Identify this shared object.  */
> > >        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
> > >
> > > +      const struct cpu_features* cpu_features = __get_cpu_features ();
> > > +
> > >        /* The got[2] entry contains the address of a function which gets
> > >          called to get the address of a so far unresolved function and
> > >          jump to it.  The profiling extension of the dynamic linker allows
> > > @@ -94,9 +97,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > >          end in this function.  */
> > >        if (__glibc_unlikely (profile))
> > >         {
> > > -         if (CPU_FEATURE_USABLE (AVX512F))
> > > +         if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
> > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
> > > -         else if (CPU_FEATURE_USABLE (AVX))
> > > +         else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
> > >           else
> > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
> > > @@ -112,9 +115,10 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > >           /* This function will get called to fix up the GOT entry
> > >              indicated by the offset on the stack, and then jump to
> > >              the resolved address.  */
> > > -         if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > > +         if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> > > +             || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > >             *(ElfW(Addr) *) (got + 2)
> > > -             = (CPU_FEATURE_USABLE (XSAVEC)
> > > +             = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> > >                  ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> > >                  : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> > >           else
> > > diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> > > index 831a654713..f669805ac5 100644
> > > --- a/sysdeps/x86_64/dl-trampoline.S
> > > +++ b/sysdeps/x86_64/dl-trampoline.S
> > > @@ -20,6 +20,7 @@
> > >  #include <sysdep.h>
> > >  #include <cpu-features-offsets.h>
> > >  #include <link-defines.h>
> > > +#include <isa-level.h>
> > >
> > >  #ifndef DL_STACK_ALIGNMENT
> > >  /* Due to GCC bug:
> > > @@ -62,35 +63,39 @@
> >
> > Can you also add a guard for isa level v4 on the avx512 version?
>
> It is needed only when we need to save and restore new registers
> in _dl_runtime_profile, which requires a new _dl_runtime_profile.
> We can add the check then.

?
Why does the prohibit the check?
>
> > >  #undef VMOVA
> > >  #undef VEC_SIZE
> > >
> > > -#define VEC_SIZE               32
> > > -#define VMOVA                  vmovdqa
> > > -#define VEC(i)                 ymm##i
> > > -#define _dl_runtime_profile    _dl_runtime_profile_avx
> > > -#include "dl-trampoline.h"
> > > -#undef _dl_runtime_profile
> > > -#undef VEC
> > > -#undef VMOVA
> > > -#undef VEC_SIZE
> > > +#if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL
> > Can this be <= __X86_ISA_LEVEL_V3
>
> No, since __X86_ISA_LEVEL_VN is defined either 0 or 1.

Then <= 3 / <= 2?
>
> > > +# define VEC_SIZE              32
> > > +# define VMOVA                 vmovdqa
> > > +# define VEC(i)                        ymm##i
> > > +# define _dl_runtime_profile   _dl_runtime_profile_avx
> > > +# include "dl-trampoline.h"
> > > +# undef _dl_runtime_profile
> > > +# undef VEC
> > > +# undef VMOVA
> > > +# undef VEC_SIZE
> > > +#endif
> > >
> > > +#if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL
> >
> > Can this be <= __X86_ISA_LEVEL_V2?
> > >  /* movaps/movups is 1-byte shorter.  */
> > > -#define VEC_SIZE               16
> > > -#define VMOVA                  movaps
> > > -#define VEC(i)                 xmm##i
> > > -#define _dl_runtime_profile    _dl_runtime_profile_sse
> > > -#undef RESTORE_AVX
> > > -#include "dl-trampoline.h"
> > > -#undef _dl_runtime_profile
> > > -#undef VEC
> > > -#undef VMOVA
> > > -#undef VEC_SIZE
> > > -
> > > -#define USE_FXSAVE
> > > -#define STATE_SAVE_ALIGNMENT   16
> > > -#define _dl_runtime_resolve    _dl_runtime_resolve_fxsave
> > > -#include "dl-trampoline.h"
> > > -#undef _dl_runtime_resolve
> > > -#undef USE_FXSAVE
> > > -#undef STATE_SAVE_ALIGNMENT
> > > +# define VEC_SIZE              16
> > > +# define VMOVA                 movaps
> > > +# define VEC(i)                        xmm##i
> > > +# define _dl_runtime_profile   _dl_runtime_profile_sse
> > > +# undef RESTORE_AVX
> > > +# include "dl-trampoline.h"
> > > +# undef _dl_runtime_profile
> > > +# undef VEC
> > > +# undef VMOVA
> > > +# undef VEC_SIZE
> > > +
> > > +# define USE_FXSAVE
> > > +# define STATE_SAVE_ALIGNMENT  16
> > > +# define _dl_runtime_resolve   _dl_runtime_resolve_fxsave
> > > +# include "dl-trampoline.h"
> > > +# undef _dl_runtime_resolve
> > > +# undef USE_FXSAVE
> > > +# undef STATE_SAVE_ALIGNMENT
> > > +#endif
> > >
> > >  #define USE_XSAVE
> > >  #define STATE_SAVE_ALIGNMENT   64
> > > --
> > > 2.36.1
> > >
> >
> > There are cases of unconditional `movaps` usage in dl-trampoline.h (L222)
> >
> > This appears to be on ISA level v3 / v4 path.
> >
> > For ISA level >= 3 builds can those become `vmovdqa`?
>
> It may be replaced with VMOVA in a separate patch.

Kk.
>
> --
> H.J.
  
H.J. Lu June 27, 2022, 8 p.m. UTC | #5
On Mon, Jun 27, 2022 at 12:51 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Jun 27, 2022 at 12:44 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Jun 27, 2022 at 12:23 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Mon, Jun 27, 2022 at 12:07 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > When glibc is built with x86-64 ISA level v3, SSE run-time resolvers
> > > > aren't used.  For x86-64 ISA level v4 build, both SSE and AVX resolvers
> > > > are unused.
> > > >
> > > > 1. Move X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P to
> > > > isa-level.h.
> > > > 2. Check the minimum x86-64 ISA level to exclude the unused run-time
> > > > resolvers.
> > > > ---
> > > >  sysdeps/x86/isa-ifunc-macros.h | 27 ----------------
> > > >  sysdeps/x86/isa-level.h        | 26 +++++++++++++++
> > > >  sysdeps/x86_64/dl-machine.h    | 12 ++++---
> > > >  sysdeps/x86_64/dl-trampoline.S | 59 ++++++++++++++++++----------------
> > > >  4 files changed, 66 insertions(+), 58 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86/isa-ifunc-macros.h b/sysdeps/x86/isa-ifunc-macros.h
> > > > index d69905689b..f967a1bec6 100644
> > > > --- a/sysdeps/x86/isa-ifunc-macros.h
> > > > +++ b/sysdeps/x86/isa-ifunc-macros.h
> > > > @@ -56,31 +56,4 @@
> > > >  # define X86_IFUNC_IMPL_ADD_V1(...)
> > > >  #endif
> > > >
> > > > -/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
> > > > -   macros are wrappers for the the respective
> > > > -   CPU_FEATURE{S}_{USABLE|ARCH}_P runtime checks.  They differ in two
> > > > -   ways.
> > > > -
> > > > -    1.  The USABLE_P version is evaluated to true when the feature
> > > > -        is enabled.
> > > > -
> > > > -    2.  The ARCH_P version has a third argument `not`.  The `not`
> > > > -        argument can either be '!' or empty.  If the feature is
> > > > -        enabled above an ISA level, the third argument should be empty
> > > > -        and the expression is evaluated to true when the feature is
> > > > -        enabled.  If the feature is disabled above an ISA level, the
> > > > -        third argument should be `!` and the expression is evaluated
> > > > -        to true when the feature is disabled.
> > > > - */
> > > > -
> > > > -#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
> > > > -  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > -   || CPU_FEATURE_USABLE_P (ptr, name))
> > > > -
> > > > -
> > > > -#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
> > > > -  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > -   || not CPU_FEATURES_ARCH_P (ptr, name))
> > > > -
> > > > -
> > > >  #endif
> > > > diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
> > > > index 075e7c6ee1..f293aea906 100644
> > > > --- a/sysdeps/x86/isa-level.h
> > > > +++ b/sysdeps/x86/isa-level.h
> > > > @@ -68,10 +68,12 @@
> > > >     compile-time constant.. */
> > > >
> > > >  /* ISA level >= 4 guaranteed includes.  */
> > > > +#define AVX512F_X86_ISA_LEVEL 4
> > > >  #define AVX512VL_X86_ISA_LEVEL 4
> > > >  #define AVX512BW_X86_ISA_LEVEL 4
> > > >
> > > >  /* ISA level >= 3 guaranteed includes.  */
> > > > +#define AVX_X86_ISA_LEVEL 3
> > > >  #define AVX2_X86_ISA_LEVEL 3
> > > >  #define BMI2_X86_ISA_LEVEL 3
> > > >
> > > > @@ -87,6 +89,30 @@
> > > >     when ISA level < 3.  */
> > > >  #define Prefer_No_VZEROUPPER_X86_ISA_LEVEL 3
> > > >
> > > > +/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
> > > > +   macros are wrappers for the respective CPU_FEATURE{S}_{USABLE|ARCH}_P
> > > > +   runtime checks.  They differ in two ways.
> > > > +
> > > > +    1.  The USABLE_P version is evaluated to true when the feature
> > > > +        is enabled.
> > > > +
> > > > +    2.  The ARCH_P version has a third argument `not`.  The `not`
> > > > +        argument can either be `!` or empty.  If the feature is
> > > > +        enabled above an ISA level, the third argument should be empty
> > > > +        and the expression is evaluated to true when the feature is
> > > > +        enabled.  If the feature is disabled above an ISA level, the
> > > > +        third argument should be `!` and the expression is evaluated
> > > > +        to true when the feature is disabled.
> > > > + */
> > > > +
> > > > +#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
> > > > +  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > +   || CPU_FEATURE_USABLE_P (ptr, name))
> > > > +
> > > > +#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
> > > > +  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > +   || not CPU_FEATURES_ARCH_P (ptr, name))
> > > > +
> > > >  #define ISA_SHOULD_BUILD(isa_build_level)                              \
> > > >    (MINIMUM_X86_ISA_LEVEL <= (isa_build_level) && IS_IN (libc))         \
> > > >     || defined ISA_DEFAULT_IMPL
> > > > diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> > > > index 34766325ae..005d089501 100644
> > > > --- a/sysdeps/x86_64/dl-machine.h
> > > > +++ b/sysdeps/x86_64/dl-machine.h
> > > > @@ -28,6 +28,7 @@
> > > >  #include <dl-tlsdesc.h>
> > > >  #include <dl-static-tls.h>
> > > >  #include <dl-machine-rel.h>
> > > > +#include <isa-level.h>
> > > >
> > > >  /* Return nonzero iff ELF header is compatible with the running host.  */
> > > >  static inline int __attribute__ ((unused))
> > > > @@ -86,6 +87,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > > >        /* Identify this shared object.  */
> > > >        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
> > > >
> > > > +      const struct cpu_features* cpu_features = __get_cpu_features ();
> > > > +
> > > >        /* The got[2] entry contains the address of a function which gets
> > > >          called to get the address of a so far unresolved function and
> > > >          jump to it.  The profiling extension of the dynamic linker allows
> > > > @@ -94,9 +97,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > > >          end in this function.  */
> > > >        if (__glibc_unlikely (profile))
> > > >         {
> > > > -         if (CPU_FEATURE_USABLE (AVX512F))
> > > > +         if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
> > > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
> > > > -         else if (CPU_FEATURE_USABLE (AVX))
> > > > +         else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> > > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
> > > >           else
> > > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
> > > > @@ -112,9 +115,10 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > > >           /* This function will get called to fix up the GOT entry
> > > >              indicated by the offset on the stack, and then jump to
> > > >              the resolved address.  */
> > > > -         if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > > > +         if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> > > > +             || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > > >             *(ElfW(Addr) *) (got + 2)
> > > > -             = (CPU_FEATURE_USABLE (XSAVEC)
> > > > +             = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> > > >                  ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> > > >                  : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> > > >           else
> > > > diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> > > > index 831a654713..f669805ac5 100644
> > > > --- a/sysdeps/x86_64/dl-trampoline.S
> > > > +++ b/sysdeps/x86_64/dl-trampoline.S
> > > > @@ -20,6 +20,7 @@
> > > >  #include <sysdep.h>
> > > >  #include <cpu-features-offsets.h>
> > > >  #include <link-defines.h>
> > > > +#include <isa-level.h>
> > > >
> > > >  #ifndef DL_STACK_ALIGNMENT
> > > >  /* Due to GCC bug:
> > > > @@ -62,35 +63,39 @@
> > >
> > > Can you also add a guard for isa level v4 on the avx512 version?
> >
> > It is needed only when we need to save and restore new registers
> > in _dl_runtime_profile, which requires a new _dl_runtime_profile.
> > We can add the check then.
>
> ?
> Why does the prohibit the check?

I don't think we need another _dl_runtime_XXX any time soon.
Adding a check may indicate otherwise.

> >
> > > >  #undef VMOVA
> > > >  #undef VEC_SIZE
> > > >
> > > > -#define VEC_SIZE               32
> > > > -#define VMOVA                  vmovdqa
> > > > -#define VEC(i)                 ymm##i
> > > > -#define _dl_runtime_profile    _dl_runtime_profile_avx
> > > > -#include "dl-trampoline.h"
> > > > -#undef _dl_runtime_profile
> > > > -#undef VEC
> > > > -#undef VMOVA
> > > > -#undef VEC_SIZE
> > > > +#if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL
> > > Can this be <= __X86_ISA_LEVEL_V3
> >
> > No, since __X86_ISA_LEVEL_VN is defined either 0 or 1.
>
> Then <= 3 / <= 2?

Since AVX and AVX512F are the features which _dl_runtime_XXX
supports, AVX_X86_ISA_LEVEL is more appropriate than a number.

> >
> > > > +# define VEC_SIZE              32
> > > > +# define VMOVA                 vmovdqa
> > > > +# define VEC(i)                        ymm##i
> > > > +# define _dl_runtime_profile   _dl_runtime_profile_avx
> > > > +# include "dl-trampoline.h"
> > > > +# undef _dl_runtime_profile
> > > > +# undef VEC
> > > > +# undef VMOVA
> > > > +# undef VEC_SIZE
> > > > +#endif
> > > >
> > > > +#if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL
> > >
> > > Can this be <= __X86_ISA_LEVEL_V2?
> > > >  /* movaps/movups is 1-byte shorter.  */
> > > > -#define VEC_SIZE               16
> > > > -#define VMOVA                  movaps
> > > > -#define VEC(i)                 xmm##i
> > > > -#define _dl_runtime_profile    _dl_runtime_profile_sse
> > > > -#undef RESTORE_AVX
> > > > -#include "dl-trampoline.h"
> > > > -#undef _dl_runtime_profile
> > > > -#undef VEC
> > > > -#undef VMOVA
> > > > -#undef VEC_SIZE
> > > > -
> > > > -#define USE_FXSAVE
> > > > -#define STATE_SAVE_ALIGNMENT   16
> > > > -#define _dl_runtime_resolve    _dl_runtime_resolve_fxsave
> > > > -#include "dl-trampoline.h"
> > > > -#undef _dl_runtime_resolve
> > > > -#undef USE_FXSAVE
> > > > -#undef STATE_SAVE_ALIGNMENT
> > > > +# define VEC_SIZE              16
> > > > +# define VMOVA                 movaps
> > > > +# define VEC(i)                        xmm##i
> > > > +# define _dl_runtime_profile   _dl_runtime_profile_sse
> > > > +# undef RESTORE_AVX
> > > > +# include "dl-trampoline.h"
> > > > +# undef _dl_runtime_profile
> > > > +# undef VEC
> > > > +# undef VMOVA
> > > > +# undef VEC_SIZE
> > > > +
> > > > +# define USE_FXSAVE
> > > > +# define STATE_SAVE_ALIGNMENT  16
> > > > +# define _dl_runtime_resolve   _dl_runtime_resolve_fxsave
> > > > +# include "dl-trampoline.h"
> > > > +# undef _dl_runtime_resolve
> > > > +# undef USE_FXSAVE
> > > > +# undef STATE_SAVE_ALIGNMENT
> > > > +#endif
> > > >
> > > >  #define USE_XSAVE
> > > >  #define STATE_SAVE_ALIGNMENT   64
> > > > --
> > > > 2.36.1
> > > >
> > >
> > > There are cases of unconditional `movaps` usage in dl-trampoline.h (L222)
> > >
> > > This appears to be on ISA level v3 / v4 path.
> > >
> > > For ISA level >= 3 builds can those become `vmovdqa`?
> >
> > It may be replaced with VMOVA in a separate patch.
>
> Kk.
> >
> > --
> > H.J.
  
Noah Goldstein June 27, 2022, 8:09 p.m. UTC | #6
On Mon, Jun 27, 2022 at 1:00 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Jun 27, 2022 at 12:51 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Mon, Jun 27, 2022 at 12:44 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Mon, Jun 27, 2022 at 12:23 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > On Mon, Jun 27, 2022 at 12:07 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > When glibc is built with x86-64 ISA level v3, SSE run-time resolvers
> > > > > aren't used.  For x86-64 ISA level v4 build, both SSE and AVX resolvers
> > > > > are unused.
> > > > >
> > > > > 1. Move X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P to
> > > > > isa-level.h.
> > > > > 2. Check the minimum x86-64 ISA level to exclude the unused run-time
> > > > > resolvers.
> > > > > ---
> > > > >  sysdeps/x86/isa-ifunc-macros.h | 27 ----------------
> > > > >  sysdeps/x86/isa-level.h        | 26 +++++++++++++++
> > > > >  sysdeps/x86_64/dl-machine.h    | 12 ++++---
> > > > >  sysdeps/x86_64/dl-trampoline.S | 59 ++++++++++++++++++----------------
> > > > >  4 files changed, 66 insertions(+), 58 deletions(-)
> > > > >
> > > > > diff --git a/sysdeps/x86/isa-ifunc-macros.h b/sysdeps/x86/isa-ifunc-macros.h
> > > > > index d69905689b..f967a1bec6 100644
> > > > > --- a/sysdeps/x86/isa-ifunc-macros.h
> > > > > +++ b/sysdeps/x86/isa-ifunc-macros.h
> > > > > @@ -56,31 +56,4 @@
> > > > >  # define X86_IFUNC_IMPL_ADD_V1(...)
> > > > >  #endif
> > > > >
> > > > > -/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
> > > > > -   macros are wrappers for the the respective
> > > > > -   CPU_FEATURE{S}_{USABLE|ARCH}_P runtime checks.  They differ in two
> > > > > -   ways.
> > > > > -
> > > > > -    1.  The USABLE_P version is evaluated to true when the feature
> > > > > -        is enabled.
> > > > > -
> > > > > -    2.  The ARCH_P version has a third argument `not`.  The `not`
> > > > > -        argument can either be '!' or empty.  If the feature is
> > > > > -        enabled above an ISA level, the third argument should be empty
> > > > > -        and the expression is evaluated to true when the feature is
> > > > > -        enabled.  If the feature is disabled above an ISA level, the
> > > > > -        third argument should be `!` and the expression is evaluated
> > > > > -        to true when the feature is disabled.
> > > > > - */
> > > > > -
> > > > > -#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
> > > > > -  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > > -   || CPU_FEATURE_USABLE_P (ptr, name))
> > > > > -
> > > > > -
> > > > > -#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
> > > > > -  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > > -   || not CPU_FEATURES_ARCH_P (ptr, name))
> > > > > -
> > > > > -
> > > > >  #endif
> > > > > diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
> > > > > index 075e7c6ee1..f293aea906 100644
> > > > > --- a/sysdeps/x86/isa-level.h
> > > > > +++ b/sysdeps/x86/isa-level.h
> > > > > @@ -68,10 +68,12 @@
> > > > >     compile-time constant.. */
> > > > >
> > > > >  /* ISA level >= 4 guaranteed includes.  */
> > > > > +#define AVX512F_X86_ISA_LEVEL 4
> > > > >  #define AVX512VL_X86_ISA_LEVEL 4
> > > > >  #define AVX512BW_X86_ISA_LEVEL 4
> > > > >
> > > > >  /* ISA level >= 3 guaranteed includes.  */
> > > > > +#define AVX_X86_ISA_LEVEL 3
> > > > >  #define AVX2_X86_ISA_LEVEL 3
> > > > >  #define BMI2_X86_ISA_LEVEL 3
> > > > >
> > > > > @@ -87,6 +89,30 @@
> > > > >     when ISA level < 3.  */
> > > > >  #define Prefer_No_VZEROUPPER_X86_ISA_LEVEL 3
> > > > >
> > > > > +/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
> > > > > +   macros are wrappers for the respective CPU_FEATURE{S}_{USABLE|ARCH}_P
> > > > > +   runtime checks.  They differ in two ways.
> > > > > +
> > > > > +    1.  The USABLE_P version is evaluated to true when the feature
> > > > > +        is enabled.
> > > > > +
> > > > > +    2.  The ARCH_P version has a third argument `not`.  The `not`
> > > > > +        argument can either be `!` or empty.  If the feature is
> > > > > +        enabled above an ISA level, the third argument should be empty
> > > > > +        and the expression is evaluated to true when the feature is
> > > > > +        enabled.  If the feature is disabled above an ISA level, the
> > > > > +        third argument should be `!` and the expression is evaluated
> > > > > +        to true when the feature is disabled.
> > > > > + */
> > > > > +
> > > > > +#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
> > > > > +  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > > +   || CPU_FEATURE_USABLE_P (ptr, name))
> > > > > +
> > > > > +#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
> > > > > +  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > > +   || not CPU_FEATURES_ARCH_P (ptr, name))
> > > > > +
> > > > >  #define ISA_SHOULD_BUILD(isa_build_level)                              \
> > > > >    (MINIMUM_X86_ISA_LEVEL <= (isa_build_level) && IS_IN (libc))         \
> > > > >     || defined ISA_DEFAULT_IMPL
> > > > > diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> > > > > index 34766325ae..005d089501 100644
> > > > > --- a/sysdeps/x86_64/dl-machine.h
> > > > > +++ b/sysdeps/x86_64/dl-machine.h
> > > > > @@ -28,6 +28,7 @@
> > > > >  #include <dl-tlsdesc.h>
> > > > >  #include <dl-static-tls.h>
> > > > >  #include <dl-machine-rel.h>
> > > > > +#include <isa-level.h>
> > > > >
> > > > >  /* Return nonzero iff ELF header is compatible with the running host.  */
> > > > >  static inline int __attribute__ ((unused))
> > > > > @@ -86,6 +87,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > > > >        /* Identify this shared object.  */
> > > > >        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
> > > > >
> > > > > +      const struct cpu_features* cpu_features = __get_cpu_features ();
> > > > > +
> > > > >        /* The got[2] entry contains the address of a function which gets
> > > > >          called to get the address of a so far unresolved function and
> > > > >          jump to it.  The profiling extension of the dynamic linker allows
> > > > > @@ -94,9 +97,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > > > >          end in this function.  */
> > > > >        if (__glibc_unlikely (profile))
> > > > >         {
> > > > > -         if (CPU_FEATURE_USABLE (AVX512F))
> > > > > +         if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
> > > > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
> > > > > -         else if (CPU_FEATURE_USABLE (AVX))
> > > > > +         else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> > > > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
> > > > >           else
> > > > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
> > > > > @@ -112,9 +115,10 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > > > >           /* This function will get called to fix up the GOT entry
> > > > >              indicated by the offset on the stack, and then jump to
> > > > >              the resolved address.  */
> > > > > -         if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > > > > +         if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> > > > > +             || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > > > >             *(ElfW(Addr) *) (got + 2)
> > > > > -             = (CPU_FEATURE_USABLE (XSAVEC)
> > > > > +             = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> > > > >                  ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> > > > >                  : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> > > > >           else
> > > > > diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> > > > > index 831a654713..f669805ac5 100644
> > > > > --- a/sysdeps/x86_64/dl-trampoline.S
> > > > > +++ b/sysdeps/x86_64/dl-trampoline.S
> > > > > @@ -20,6 +20,7 @@
> > > > >  #include <sysdep.h>
> > > > >  #include <cpu-features-offsets.h>
> > > > >  #include <link-defines.h>
> > > > > +#include <isa-level.h>
> > > > >
> > > > >  #ifndef DL_STACK_ALIGNMENT
> > > > >  /* Due to GCC bug:
> > > > > @@ -62,35 +63,39 @@
> > > >
> > > > Can you also add a guard for isa level v4 on the avx512 version?
> > >
> > > It is needed only when we need to save and restore new registers
> > > in _dl_runtime_profile, which requires a new _dl_runtime_profile.
> > > We can add the check then.
> >
> > ?
> > Why does the prohibit the check?
>
> I don't think we need another _dl_runtime_XXX any time soon.
> Adding a check may indicate otherwise.

Guess my thought here is everywhere else we are adding for all
ISA levels. Just seems like an inconsistency that we will potentially
miss sometime in the future.

>
> > >
> > > > >  #undef VMOVA
> > > > >  #undef VEC_SIZE
> > > > >
> > > > > -#define VEC_SIZE               32
> > > > > -#define VMOVA                  vmovdqa
> > > > > -#define VEC(i)                 ymm##i
> > > > > -#define _dl_runtime_profile    _dl_runtime_profile_avx
> > > > > -#include "dl-trampoline.h"
> > > > > -#undef _dl_runtime_profile
> > > > > -#undef VEC
> > > > > -#undef VMOVA
> > > > > -#undef VEC_SIZE
> > > > > +#if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL
> > > > Can this be <= __X86_ISA_LEVEL_V3
> > >
> > > No, since __X86_ISA_LEVEL_VN is defined either 0 or 1.
> >
> > Then <= 3 / <= 2?
>
> Since AVX and AVX512F are the features which _dl_runtime_XXX
> supports, AVX_X86_ISA_LEVEL is more appropriate than a number.
kk
>
> > >
> > > > > +# define VEC_SIZE              32
> > > > > +# define VMOVA                 vmovdqa
> > > > > +# define VEC(i)                        ymm##i
> > > > > +# define _dl_runtime_profile   _dl_runtime_profile_avx
> > > > > +# include "dl-trampoline.h"
> > > > > +# undef _dl_runtime_profile
> > > > > +# undef VEC
> > > > > +# undef VMOVA
> > > > > +# undef VEC_SIZE
> > > > > +#endif
> > > > >
> > > > > +#if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL
> > > >
> > > > Can this be <= __X86_ISA_LEVEL_V2?
> > > > >  /* movaps/movups is 1-byte shorter.  */
> > > > > -#define VEC_SIZE               16
> > > > > -#define VMOVA                  movaps
> > > > > -#define VEC(i)                 xmm##i
> > > > > -#define _dl_runtime_profile    _dl_runtime_profile_sse
> > > > > -#undef RESTORE_AVX
> > > > > -#include "dl-trampoline.h"
> > > > > -#undef _dl_runtime_profile
> > > > > -#undef VEC
> > > > > -#undef VMOVA
> > > > > -#undef VEC_SIZE
> > > > > -
> > > > > -#define USE_FXSAVE
> > > > > -#define STATE_SAVE_ALIGNMENT   16
> > > > > -#define _dl_runtime_resolve    _dl_runtime_resolve_fxsave
> > > > > -#include "dl-trampoline.h"
> > > > > -#undef _dl_runtime_resolve
> > > > > -#undef USE_FXSAVE
> > > > > -#undef STATE_SAVE_ALIGNMENT
> > > > > +# define VEC_SIZE              16
> > > > > +# define VMOVA                 movaps
> > > > > +# define VEC(i)                        xmm##i
> > > > > +# define _dl_runtime_profile   _dl_runtime_profile_sse
> > > > > +# undef RESTORE_AVX
> > > > > +# include "dl-trampoline.h"
> > > > > +# undef _dl_runtime_profile
> > > > > +# undef VEC
> > > > > +# undef VMOVA
> > > > > +# undef VEC_SIZE
> > > > > +
> > > > > +# define USE_FXSAVE
> > > > > +# define STATE_SAVE_ALIGNMENT  16
> > > > > +# define _dl_runtime_resolve   _dl_runtime_resolve_fxsave
> > > > > +# include "dl-trampoline.h"
> > > > > +# undef _dl_runtime_resolve
> > > > > +# undef USE_FXSAVE
> > > > > +# undef STATE_SAVE_ALIGNMENT
> > > > > +#endif
> > > > >
> > > > >  #define USE_XSAVE
> > > > >  #define STATE_SAVE_ALIGNMENT   64
> > > > > --
> > > > > 2.36.1
> > > > >
> > > >
> > > > There are cases of unconditional `movaps` usage in dl-trampoline.h (L222)
> > > >
> > > > This appears to be on ISA level v3 / v4 path.
> > > >
> > > > For ISA level >= 3 builds can those become `vmovdqa`?
> > >
> > > It may be replaced with VMOVA in a separate patch.
> >
> > Kk.
> > >
> > > --
> > > H.J.
>
>
>
> --
> H.J.
  
H.J. Lu June 27, 2022, 8:19 p.m. UTC | #7
On Mon, Jun 27, 2022 at 1:09 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Jun 27, 2022 at 1:00 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Jun 27, 2022 at 12:51 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Mon, Jun 27, 2022 at 12:44 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Mon, Jun 27, 2022 at 12:23 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > >
> > > > > On Mon, Jun 27, 2022 at 12:07 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > When glibc is built with x86-64 ISA level v3, SSE run-time resolvers
> > > > > > aren't used.  For x86-64 ISA level v4 build, both SSE and AVX resolvers
> > > > > > are unused.
> > > > > >
> > > > > > 1. Move X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P to
> > > > > > isa-level.h.
> > > > > > 2. Check the minimum x86-64 ISA level to exclude the unused run-time
> > > > > > resolvers.
> > > > > > ---
> > > > > >  sysdeps/x86/isa-ifunc-macros.h | 27 ----------------
> > > > > >  sysdeps/x86/isa-level.h        | 26 +++++++++++++++
> > > > > >  sysdeps/x86_64/dl-machine.h    | 12 ++++---
> > > > > >  sysdeps/x86_64/dl-trampoline.S | 59 ++++++++++++++++++----------------
> > > > > >  4 files changed, 66 insertions(+), 58 deletions(-)
> > > > > >
> > > > > > diff --git a/sysdeps/x86/isa-ifunc-macros.h b/sysdeps/x86/isa-ifunc-macros.h
> > > > > > index d69905689b..f967a1bec6 100644
> > > > > > --- a/sysdeps/x86/isa-ifunc-macros.h
> > > > > > +++ b/sysdeps/x86/isa-ifunc-macros.h
> > > > > > @@ -56,31 +56,4 @@
> > > > > >  # define X86_IFUNC_IMPL_ADD_V1(...)
> > > > > >  #endif
> > > > > >
> > > > > > -/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
> > > > > > -   macros are wrappers for the the respective
> > > > > > -   CPU_FEATURE{S}_{USABLE|ARCH}_P runtime checks.  They differ in two
> > > > > > -   ways.
> > > > > > -
> > > > > > -    1.  The USABLE_P version is evaluated to true when the feature
> > > > > > -        is enabled.
> > > > > > -
> > > > > > -    2.  The ARCH_P version has a third argument `not`.  The `not`
> > > > > > -        argument can either be '!' or empty.  If the feature is
> > > > > > -        enabled above an ISA level, the third argument should be empty
> > > > > > -        and the expression is evaluated to true when the feature is
> > > > > > -        enabled.  If the feature is disabled above an ISA level, the
> > > > > > -        third argument should be `!` and the expression is evaluated
> > > > > > -        to true when the feature is disabled.
> > > > > > - */
> > > > > > -
> > > > > > -#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
> > > > > > -  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > > > -   || CPU_FEATURE_USABLE_P (ptr, name))
> > > > > > -
> > > > > > -
> > > > > > -#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
> > > > > > -  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > > > -   || not CPU_FEATURES_ARCH_P (ptr, name))
> > > > > > -
> > > > > > -
> > > > > >  #endif
> > > > > > diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
> > > > > > index 075e7c6ee1..f293aea906 100644
> > > > > > --- a/sysdeps/x86/isa-level.h
> > > > > > +++ b/sysdeps/x86/isa-level.h
> > > > > > @@ -68,10 +68,12 @@
> > > > > >     compile-time constant.. */
> > > > > >
> > > > > >  /* ISA level >= 4 guaranteed includes.  */
> > > > > > +#define AVX512F_X86_ISA_LEVEL 4
> > > > > >  #define AVX512VL_X86_ISA_LEVEL 4
> > > > > >  #define AVX512BW_X86_ISA_LEVEL 4
> > > > > >
> > > > > >  /* ISA level >= 3 guaranteed includes.  */
> > > > > > +#define AVX_X86_ISA_LEVEL 3
> > > > > >  #define AVX2_X86_ISA_LEVEL 3
> > > > > >  #define BMI2_X86_ISA_LEVEL 3
> > > > > >
> > > > > > @@ -87,6 +89,30 @@
> > > > > >     when ISA level < 3.  */
> > > > > >  #define Prefer_No_VZEROUPPER_X86_ISA_LEVEL 3
> > > > > >
> > > > > > +/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
> > > > > > +   macros are wrappers for the respective CPU_FEATURE{S}_{USABLE|ARCH}_P
> > > > > > +   runtime checks.  They differ in two ways.
> > > > > > +
> > > > > > +    1.  The USABLE_P version is evaluated to true when the feature
> > > > > > +        is enabled.
> > > > > > +
> > > > > > +    2.  The ARCH_P version has a third argument `not`.  The `not`
> > > > > > +        argument can either be `!` or empty.  If the feature is
> > > > > > +        enabled above an ISA level, the third argument should be empty
> > > > > > +        and the expression is evaluated to true when the feature is
> > > > > > +        enabled.  If the feature is disabled above an ISA level, the
> > > > > > +        third argument should be `!` and the expression is evaluated
> > > > > > +        to true when the feature is disabled.
> > > > > > + */
> > > > > > +
> > > > > > +#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
> > > > > > +  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > > > +   || CPU_FEATURE_USABLE_P (ptr, name))
> > > > > > +
> > > > > > +#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
> > > > > > +  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > > > +   || not CPU_FEATURES_ARCH_P (ptr, name))
> > > > > > +
> > > > > >  #define ISA_SHOULD_BUILD(isa_build_level)                              \
> > > > > >    (MINIMUM_X86_ISA_LEVEL <= (isa_build_level) && IS_IN (libc))         \
> > > > > >     || defined ISA_DEFAULT_IMPL
> > > > > > diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> > > > > > index 34766325ae..005d089501 100644
> > > > > > --- a/sysdeps/x86_64/dl-machine.h
> > > > > > +++ b/sysdeps/x86_64/dl-machine.h
> > > > > > @@ -28,6 +28,7 @@
> > > > > >  #include <dl-tlsdesc.h>
> > > > > >  #include <dl-static-tls.h>
> > > > > >  #include <dl-machine-rel.h>
> > > > > > +#include <isa-level.h>
> > > > > >
> > > > > >  /* Return nonzero iff ELF header is compatible with the running host.  */
> > > > > >  static inline int __attribute__ ((unused))
> > > > > > @@ -86,6 +87,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > > > > >        /* Identify this shared object.  */
> > > > > >        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
> > > > > >
> > > > > > +      const struct cpu_features* cpu_features = __get_cpu_features ();
> > > > > > +
> > > > > >        /* The got[2] entry contains the address of a function which gets
> > > > > >          called to get the address of a so far unresolved function and
> > > > > >          jump to it.  The profiling extension of the dynamic linker allows
> > > > > > @@ -94,9 +97,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > > > > >          end in this function.  */
> > > > > >        if (__glibc_unlikely (profile))
> > > > > >         {
> > > > > > -         if (CPU_FEATURE_USABLE (AVX512F))
> > > > > > +         if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
> > > > > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
> > > > > > -         else if (CPU_FEATURE_USABLE (AVX))
> > > > > > +         else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> > > > > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
> > > > > >           else
> > > > > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
> > > > > > @@ -112,9 +115,10 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > > > > >           /* This function will get called to fix up the GOT entry
> > > > > >              indicated by the offset on the stack, and then jump to
> > > > > >              the resolved address.  */
> > > > > > -         if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > > > > > +         if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> > > > > > +             || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > > > > >             *(ElfW(Addr) *) (got + 2)
> > > > > > -             = (CPU_FEATURE_USABLE (XSAVEC)
> > > > > > +             = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> > > > > >                  ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> > > > > >                  : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> > > > > >           else
> > > > > > diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> > > > > > index 831a654713..f669805ac5 100644
> > > > > > --- a/sysdeps/x86_64/dl-trampoline.S
> > > > > > +++ b/sysdeps/x86_64/dl-trampoline.S
> > > > > > @@ -20,6 +20,7 @@
> > > > > >  #include <sysdep.h>
> > > > > >  #include <cpu-features-offsets.h>
> > > > > >  #include <link-defines.h>
> > > > > > +#include <isa-level.h>
> > > > > >
> > > > > >  #ifndef DL_STACK_ALIGNMENT
> > > > > >  /* Due to GCC bug:
> > > > > > @@ -62,35 +63,39 @@
> > > > >
> > > > > Can you also add a guard for isa level v4 on the avx512 version?
> > > >
> > > > It is needed only when we need to save and restore new registers
> > > > in _dl_runtime_profile, which requires a new _dl_runtime_profile.
> > > > We can add the check then.
> > >
> > > ?
> > > Why does the prohibit the check?
> >
> > I don't think we need another _dl_runtime_XXX any time soon.
> > Adding a check may indicate otherwise.
>
> Guess my thought here is everywhere else we are adding for all
> ISA levels. Just seems like an inconsistency that we will potentially
> miss sometime in the future.

For this one, a check may lead to the question when the check will
be false.

>
> >
> > > >
> > > > > >  #undef VMOVA
> > > > > >  #undef VEC_SIZE
> > > > > >
> > > > > > -#define VEC_SIZE               32
> > > > > > -#define VMOVA                  vmovdqa
> > > > > > -#define VEC(i)                 ymm##i
> > > > > > -#define _dl_runtime_profile    _dl_runtime_profile_avx
> > > > > > -#include "dl-trampoline.h"
> > > > > > -#undef _dl_runtime_profile
> > > > > > -#undef VEC
> > > > > > -#undef VMOVA
> > > > > > -#undef VEC_SIZE
> > > > > > +#if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL
> > > > > Can this be <= __X86_ISA_LEVEL_V3
> > > >
> > > > No, since __X86_ISA_LEVEL_VN is defined either 0 or 1.
> > >
> > > Then <= 3 / <= 2?
> >
> > Since AVX and AVX512F are the features which _dl_runtime_XXX
> > supports, AVX_X86_ISA_LEVEL is more appropriate than a number.
> kk
> >
> > > >
> > > > > > +# define VEC_SIZE              32
> > > > > > +# define VMOVA                 vmovdqa
> > > > > > +# define VEC(i)                        ymm##i
> > > > > > +# define _dl_runtime_profile   _dl_runtime_profile_avx
> > > > > > +# include "dl-trampoline.h"
> > > > > > +# undef _dl_runtime_profile
> > > > > > +# undef VEC
> > > > > > +# undef VMOVA
> > > > > > +# undef VEC_SIZE
> > > > > > +#endif
> > > > > >
> > > > > > +#if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL
> > > > >
> > > > > Can this be <= __X86_ISA_LEVEL_V2?
> > > > > >  /* movaps/movups is 1-byte shorter.  */
> > > > > > -#define VEC_SIZE               16
> > > > > > -#define VMOVA                  movaps
> > > > > > -#define VEC(i)                 xmm##i
> > > > > > -#define _dl_runtime_profile    _dl_runtime_profile_sse
> > > > > > -#undef RESTORE_AVX
> > > > > > -#include "dl-trampoline.h"
> > > > > > -#undef _dl_runtime_profile
> > > > > > -#undef VEC
> > > > > > -#undef VMOVA
> > > > > > -#undef VEC_SIZE
> > > > > > -
> > > > > > -#define USE_FXSAVE
> > > > > > -#define STATE_SAVE_ALIGNMENT   16
> > > > > > -#define _dl_runtime_resolve    _dl_runtime_resolve_fxsave
> > > > > > -#include "dl-trampoline.h"
> > > > > > -#undef _dl_runtime_resolve
> > > > > > -#undef USE_FXSAVE
> > > > > > -#undef STATE_SAVE_ALIGNMENT
> > > > > > +# define VEC_SIZE              16
> > > > > > +# define VMOVA                 movaps
> > > > > > +# define VEC(i)                        xmm##i
> > > > > > +# define _dl_runtime_profile   _dl_runtime_profile_sse
> > > > > > +# undef RESTORE_AVX
> > > > > > +# include "dl-trampoline.h"
> > > > > > +# undef _dl_runtime_profile
> > > > > > +# undef VEC
> > > > > > +# undef VMOVA
> > > > > > +# undef VEC_SIZE
> > > > > > +
> > > > > > +# define USE_FXSAVE
> > > > > > +# define STATE_SAVE_ALIGNMENT  16
> > > > > > +# define _dl_runtime_resolve   _dl_runtime_resolve_fxsave
> > > > > > +# include "dl-trampoline.h"
> > > > > > +# undef _dl_runtime_resolve
> > > > > > +# undef USE_FXSAVE
> > > > > > +# undef STATE_SAVE_ALIGNMENT
> > > > > > +#endif
> > > > > >
> > > > > >  #define USE_XSAVE
> > > > > >  #define STATE_SAVE_ALIGNMENT   64
> > > > > > --
> > > > > > 2.36.1
> > > > > >
> > > > >
> > > > > There are cases of unconditional `movaps` usage in dl-trampoline.h (L222)
> > > > >
> > > > > This appears to be on ISA level v3 / v4 path.
> > > > >
> > > > > For ISA level >= 3 builds can those become `vmovdqa`?
> > > >
> > > > It may be replaced with VMOVA in a separate patch.
> > >
> > > Kk.
> > > >
> > > > --
> > > > H.J.
> >
> >
> >
> > --
> > H.J.
  
Noah Goldstein June 27, 2022, 8:25 p.m. UTC | #8
On Mon, Jun 27, 2022 at 1:20 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Jun 27, 2022 at 1:09 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Mon, Jun 27, 2022 at 1:00 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Mon, Jun 27, 2022 at 12:51 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > On Mon, Jun 27, 2022 at 12:44 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Mon, Jun 27, 2022 at 12:23 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > >
> > > > > > On Mon, Jun 27, 2022 at 12:07 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > >
> > > > > > > When glibc is built with x86-64 ISA level v3, SSE run-time resolvers
> > > > > > > aren't used.  For x86-64 ISA level v4 build, both SSE and AVX resolvers
> > > > > > > are unused.
> > > > > > >
> > > > > > > 1. Move X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P to
> > > > > > > isa-level.h.
> > > > > > > 2. Check the minimum x86-64 ISA level to exclude the unused run-time
> > > > > > > resolvers.
> > > > > > > ---
> > > > > > >  sysdeps/x86/isa-ifunc-macros.h | 27 ----------------
> > > > > > >  sysdeps/x86/isa-level.h        | 26 +++++++++++++++
> > > > > > >  sysdeps/x86_64/dl-machine.h    | 12 ++++---
> > > > > > >  sysdeps/x86_64/dl-trampoline.S | 59 ++++++++++++++++++----------------
> > > > > > >  4 files changed, 66 insertions(+), 58 deletions(-)
> > > > > > >
> > > > > > > diff --git a/sysdeps/x86/isa-ifunc-macros.h b/sysdeps/x86/isa-ifunc-macros.h
> > > > > > > index d69905689b..f967a1bec6 100644
> > > > > > > --- a/sysdeps/x86/isa-ifunc-macros.h
> > > > > > > +++ b/sysdeps/x86/isa-ifunc-macros.h
> > > > > > > @@ -56,31 +56,4 @@
> > > > > > >  # define X86_IFUNC_IMPL_ADD_V1(...)
> > > > > > >  #endif
> > > > > > >
> > > > > > > -/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
> > > > > > > -   macros are wrappers for the the respective
> > > > > > > -   CPU_FEATURE{S}_{USABLE|ARCH}_P runtime checks.  They differ in two
> > > > > > > -   ways.
> > > > > > > -
> > > > > > > -    1.  The USABLE_P version is evaluated to true when the feature
> > > > > > > -        is enabled.
> > > > > > > -
> > > > > > > -    2.  The ARCH_P version has a third argument `not`.  The `not`
> > > > > > > -        argument can either be '!' or empty.  If the feature is
> > > > > > > -        enabled above an ISA level, the third argument should be empty
> > > > > > > -        and the expression is evaluated to true when the feature is
> > > > > > > -        enabled.  If the feature is disabled above an ISA level, the
> > > > > > > -        third argument should be `!` and the expression is evaluated
> > > > > > > -        to true when the feature is disabled.
> > > > > > > - */
> > > > > > > -
> > > > > > > -#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
> > > > > > > -  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > > > > -   || CPU_FEATURE_USABLE_P (ptr, name))
> > > > > > > -
> > > > > > > -
> > > > > > > -#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
> > > > > > > -  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > > > > -   || not CPU_FEATURES_ARCH_P (ptr, name))
> > > > > > > -
> > > > > > > -
> > > > > > >  #endif
> > > > > > > diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
> > > > > > > index 075e7c6ee1..f293aea906 100644
> > > > > > > --- a/sysdeps/x86/isa-level.h
> > > > > > > +++ b/sysdeps/x86/isa-level.h
> > > > > > > @@ -68,10 +68,12 @@
> > > > > > >     compile-time constant.. */
> > > > > > >
> > > > > > >  /* ISA level >= 4 guaranteed includes.  */
> > > > > > > +#define AVX512F_X86_ISA_LEVEL 4
> > > > > > >  #define AVX512VL_X86_ISA_LEVEL 4
> > > > > > >  #define AVX512BW_X86_ISA_LEVEL 4
> > > > > > >
> > > > > > >  /* ISA level >= 3 guaranteed includes.  */
> > > > > > > +#define AVX_X86_ISA_LEVEL 3
> > > > > > >  #define AVX2_X86_ISA_LEVEL 3
> > > > > > >  #define BMI2_X86_ISA_LEVEL 3
> > > > > > >
> > > > > > > @@ -87,6 +89,30 @@
> > > > > > >     when ISA level < 3.  */
> > > > > > >  #define Prefer_No_VZEROUPPER_X86_ISA_LEVEL 3
> > > > > > >
> > > > > > > +/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
> > > > > > > +   macros are wrappers for the respective CPU_FEATURE{S}_{USABLE|ARCH}_P
> > > > > > > +   runtime checks.  They differ in two ways.
> > > > > > > +
> > > > > > > +    1.  The USABLE_P version is evaluated to true when the feature
> > > > > > > +        is enabled.
> > > > > > > +
> > > > > > > +    2.  The ARCH_P version has a third argument `not`.  The `not`
> > > > > > > +        argument can either be `!` or empty.  If the feature is
> > > > > > > +        enabled above an ISA level, the third argument should be empty
> > > > > > > +        and the expression is evaluated to true when the feature is
> > > > > > > +        enabled.  If the feature is disabled above an ISA level, the
> > > > > > > +        third argument should be `!` and the expression is evaluated
> > > > > > > +        to true when the feature is disabled.
> > > > > > > + */
> > > > > > > +
> > > > > > > +#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
> > > > > > > +  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > > > > +   || CPU_FEATURE_USABLE_P (ptr, name))
> > > > > > > +
> > > > > > > +#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
> > > > > > > +  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > > > > +   || not CPU_FEATURES_ARCH_P (ptr, name))
> > > > > > > +
> > > > > > >  #define ISA_SHOULD_BUILD(isa_build_level)                              \
> > > > > > >    (MINIMUM_X86_ISA_LEVEL <= (isa_build_level) && IS_IN (libc))         \
> > > > > > >     || defined ISA_DEFAULT_IMPL
> > > > > > > diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> > > > > > > index 34766325ae..005d089501 100644
> > > > > > > --- a/sysdeps/x86_64/dl-machine.h
> > > > > > > +++ b/sysdeps/x86_64/dl-machine.h
> > > > > > > @@ -28,6 +28,7 @@
> > > > > > >  #include <dl-tlsdesc.h>
> > > > > > >  #include <dl-static-tls.h>
> > > > > > >  #include <dl-machine-rel.h>
> > > > > > > +#include <isa-level.h>
> > > > > > >
> > > > > > >  /* Return nonzero iff ELF header is compatible with the running host.  */
> > > > > > >  static inline int __attribute__ ((unused))
> > > > > > > @@ -86,6 +87,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > > > > > >        /* Identify this shared object.  */
> > > > > > >        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
> > > > > > >
> > > > > > > +      const struct cpu_features* cpu_features = __get_cpu_features ();
> > > > > > > +
> > > > > > >        /* The got[2] entry contains the address of a function which gets
> > > > > > >          called to get the address of a so far unresolved function and
> > > > > > >          jump to it.  The profiling extension of the dynamic linker allows
> > > > > > > @@ -94,9 +97,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > > > > > >          end in this function.  */
> > > > > > >        if (__glibc_unlikely (profile))
> > > > > > >         {
> > > > > > > -         if (CPU_FEATURE_USABLE (AVX512F))
> > > > > > > +         if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
> > > > > > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
> > > > > > > -         else if (CPU_FEATURE_USABLE (AVX))
> > > > > > > +         else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> > > > > > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
> > > > > > >           else
> > > > > > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
> > > > > > > @@ -112,9 +115,10 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > > > > > >           /* This function will get called to fix up the GOT entry
> > > > > > >              indicated by the offset on the stack, and then jump to
> > > > > > >              the resolved address.  */
> > > > > > > -         if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > > > > > > +         if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> > > > > > > +             || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > > > > > >             *(ElfW(Addr) *) (got + 2)
> > > > > > > -             = (CPU_FEATURE_USABLE (XSAVEC)
> > > > > > > +             = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> > > > > > >                  ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> > > > > > >                  : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> > > > > > >           else
> > > > > > > diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> > > > > > > index 831a654713..f669805ac5 100644
> > > > > > > --- a/sysdeps/x86_64/dl-trampoline.S
> > > > > > > +++ b/sysdeps/x86_64/dl-trampoline.S
> > > > > > > @@ -20,6 +20,7 @@
> > > > > > >  #include <sysdep.h>
> > > > > > >  #include <cpu-features-offsets.h>
> > > > > > >  #include <link-defines.h>
> > > > > > > +#include <isa-level.h>
> > > > > > >
> > > > > > >  #ifndef DL_STACK_ALIGNMENT
> > > > > > >  /* Due to GCC bug:
> > > > > > > @@ -62,35 +63,39 @@
> > > > > >
> > > > > > Can you also add a guard for isa level v4 on the avx512 version?
> > > > >
> > > > > It is needed only when we need to save and restore new registers
> > > > > in _dl_runtime_profile, which requires a new _dl_runtime_profile.
> > > > > We can add the check then.
> > > >
> > > > ?
> > > > Why does the prohibit the check?
> > >
> > > I don't think we need another _dl_runtime_XXX any time soon.
> > > Adding a check may indicate otherwise.
> >
> > Guess my thought here is everywhere else we are adding for all
> > ISA levels. Just seems like an inconsistency that we will potentially
> > miss sometime in the future.
>
> For this one, a check may lead to the question when the check will
> be false.

Think a comment would do then. Seems like the kind of thing that
otherwise would be easily forgotten.

Think if we do full coverage for the ISA raising stuff, it will be much
easier to manage in the future than if we have random holes that
we need to keep track of.

>
> >
> > >
> > > > >
> > > > > > >  #undef VMOVA
> > > > > > >  #undef VEC_SIZE
> > > > > > >
> > > > > > > -#define VEC_SIZE               32
> > > > > > > -#define VMOVA                  vmovdqa
> > > > > > > -#define VEC(i)                 ymm##i
> > > > > > > -#define _dl_runtime_profile    _dl_runtime_profile_avx
> > > > > > > -#include "dl-trampoline.h"
> > > > > > > -#undef _dl_runtime_profile
> > > > > > > -#undef VEC
> > > > > > > -#undef VMOVA
> > > > > > > -#undef VEC_SIZE
> > > > > > > +#if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL
> > > > > > Can this be <= __X86_ISA_LEVEL_V3
> > > > >
> > > > > No, since __X86_ISA_LEVEL_VN is defined either 0 or 1.
> > > >
> > > > Then <= 3 / <= 2?
> > >
> > > Since AVX and AVX512F are the features which _dl_runtime_XXX
> > > supports, AVX_X86_ISA_LEVEL is more appropriate than a number.
> > kk
> > >
> > > > >
> > > > > > > +# define VEC_SIZE              32
> > > > > > > +# define VMOVA                 vmovdqa
> > > > > > > +# define VEC(i)                        ymm##i
> > > > > > > +# define _dl_runtime_profile   _dl_runtime_profile_avx
> > > > > > > +# include "dl-trampoline.h"
> > > > > > > +# undef _dl_runtime_profile
> > > > > > > +# undef VEC
> > > > > > > +# undef VMOVA
> > > > > > > +# undef VEC_SIZE
> > > > > > > +#endif
> > > > > > >
> > > > > > > +#if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL
> > > > > >
> > > > > > Can this be <= __X86_ISA_LEVEL_V2?
> > > > > > >  /* movaps/movups is 1-byte shorter.  */
> > > > > > > -#define VEC_SIZE               16
> > > > > > > -#define VMOVA                  movaps
> > > > > > > -#define VEC(i)                 xmm##i
> > > > > > > -#define _dl_runtime_profile    _dl_runtime_profile_sse
> > > > > > > -#undef RESTORE_AVX
> > > > > > > -#include "dl-trampoline.h"
> > > > > > > -#undef _dl_runtime_profile
> > > > > > > -#undef VEC
> > > > > > > -#undef VMOVA
> > > > > > > -#undef VEC_SIZE
> > > > > > > -
> > > > > > > -#define USE_FXSAVE
> > > > > > > -#define STATE_SAVE_ALIGNMENT   16
> > > > > > > -#define _dl_runtime_resolve    _dl_runtime_resolve_fxsave
> > > > > > > -#include "dl-trampoline.h"
> > > > > > > -#undef _dl_runtime_resolve
> > > > > > > -#undef USE_FXSAVE
> > > > > > > -#undef STATE_SAVE_ALIGNMENT
> > > > > > > +# define VEC_SIZE              16
> > > > > > > +# define VMOVA                 movaps
> > > > > > > +# define VEC(i)                        xmm##i
> > > > > > > +# define _dl_runtime_profile   _dl_runtime_profile_sse
> > > > > > > +# undef RESTORE_AVX
> > > > > > > +# include "dl-trampoline.h"
> > > > > > > +# undef _dl_runtime_profile
> > > > > > > +# undef VEC
> > > > > > > +# undef VMOVA
> > > > > > > +# undef VEC_SIZE
> > > > > > > +
> > > > > > > +# define USE_FXSAVE
> > > > > > > +# define STATE_SAVE_ALIGNMENT  16
> > > > > > > +# define _dl_runtime_resolve   _dl_runtime_resolve_fxsave
> > > > > > > +# include "dl-trampoline.h"
> > > > > > > +# undef _dl_runtime_resolve
> > > > > > > +# undef USE_FXSAVE
> > > > > > > +# undef STATE_SAVE_ALIGNMENT
> > > > > > > +#endif
> > > > > > >
> > > > > > >  #define USE_XSAVE
> > > > > > >  #define STATE_SAVE_ALIGNMENT   64
> > > > > > > --
> > > > > > > 2.36.1
> > > > > > >
> > > > > >
> > > > > > There are cases of unconditional `movaps` usage in dl-trampoline.h (L222)
> > > > > >
> > > > > > This appears to be on ISA level v3 / v4 path.
> > > > > >
> > > > > > For ISA level >= 3 builds can those become `vmovdqa`?
> > > > >
> > > > > It may be replaced with VMOVA in a separate patch.
> > > >
> > > > Kk.
> > > > >
> > > > > --
> > > > > H.J.
> > >
> > >
> > >
> > > --
> > > H.J.
>
>
>
> --
> H.J.
  
H.J. Lu June 27, 2022, 8:45 p.m. UTC | #9
On Mon, Jun 27, 2022 at 1:26 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, Jun 27, 2022 at 1:20 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Jun 27, 2022 at 1:09 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Mon, Jun 27, 2022 at 1:00 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Mon, Jun 27, 2022 at 12:51 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > >
> > > > > On Mon, Jun 27, 2022 at 12:44 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > >
> > > > > > On Mon, Jun 27, 2022 at 12:23 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > > > >
> > > > > > > On Mon, Jun 27, 2022 at 12:07 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > > > > >
> > > > > > > > When glibc is built with x86-64 ISA level v3, SSE run-time resolvers
> > > > > > > > aren't used.  For x86-64 ISA level v4 build, both SSE and AVX resolvers
> > > > > > > > are unused.
> > > > > > > >
> > > > > > > > 1. Move X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P to
> > > > > > > > isa-level.h.
> > > > > > > > 2. Check the minimum x86-64 ISA level to exclude the unused run-time
> > > > > > > > resolvers.
> > > > > > > > ---
> > > > > > > >  sysdeps/x86/isa-ifunc-macros.h | 27 ----------------
> > > > > > > >  sysdeps/x86/isa-level.h        | 26 +++++++++++++++
> > > > > > > >  sysdeps/x86_64/dl-machine.h    | 12 ++++---
> > > > > > > >  sysdeps/x86_64/dl-trampoline.S | 59 ++++++++++++++++++----------------
> > > > > > > >  4 files changed, 66 insertions(+), 58 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/sysdeps/x86/isa-ifunc-macros.h b/sysdeps/x86/isa-ifunc-macros.h
> > > > > > > > index d69905689b..f967a1bec6 100644
> > > > > > > > --- a/sysdeps/x86/isa-ifunc-macros.h
> > > > > > > > +++ b/sysdeps/x86/isa-ifunc-macros.h
> > > > > > > > @@ -56,31 +56,4 @@
> > > > > > > >  # define X86_IFUNC_IMPL_ADD_V1(...)
> > > > > > > >  #endif
> > > > > > > >
> > > > > > > > -/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
> > > > > > > > -   macros are wrappers for the the respective
> > > > > > > > -   CPU_FEATURE{S}_{USABLE|ARCH}_P runtime checks.  They differ in two
> > > > > > > > -   ways.
> > > > > > > > -
> > > > > > > > -    1.  The USABLE_P version is evaluated to true when the feature
> > > > > > > > -        is enabled.
> > > > > > > > -
> > > > > > > > -    2.  The ARCH_P version has a third argument `not`.  The `not`
> > > > > > > > -        argument can either be '!' or empty.  If the feature is
> > > > > > > > -        enabled above an ISA level, the third argument should be empty
> > > > > > > > -        and the expression is evaluated to true when the feature is
> > > > > > > > -        enabled.  If the feature is disabled above an ISA level, the
> > > > > > > > -        third argument should be `!` and the expression is evaluated
> > > > > > > > -        to true when the feature is disabled.
> > > > > > > > - */
> > > > > > > > -
> > > > > > > > -#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
> > > > > > > > -  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > > > > > -   || CPU_FEATURE_USABLE_P (ptr, name))
> > > > > > > > -
> > > > > > > > -
> > > > > > > > -#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
> > > > > > > > -  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > > > > > -   || not CPU_FEATURES_ARCH_P (ptr, name))
> > > > > > > > -
> > > > > > > > -
> > > > > > > >  #endif
> > > > > > > > diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
> > > > > > > > index 075e7c6ee1..f293aea906 100644
> > > > > > > > --- a/sysdeps/x86/isa-level.h
> > > > > > > > +++ b/sysdeps/x86/isa-level.h
> > > > > > > > @@ -68,10 +68,12 @@
> > > > > > > >     compile-time constant.. */
> > > > > > > >
> > > > > > > >  /* ISA level >= 4 guaranteed includes.  */
> > > > > > > > +#define AVX512F_X86_ISA_LEVEL 4
> > > > > > > >  #define AVX512VL_X86_ISA_LEVEL 4
> > > > > > > >  #define AVX512BW_X86_ISA_LEVEL 4
> > > > > > > >
> > > > > > > >  /* ISA level >= 3 guaranteed includes.  */
> > > > > > > > +#define AVX_X86_ISA_LEVEL 3
> > > > > > > >  #define AVX2_X86_ISA_LEVEL 3
> > > > > > > >  #define BMI2_X86_ISA_LEVEL 3
> > > > > > > >
> > > > > > > > @@ -87,6 +89,30 @@
> > > > > > > >     when ISA level < 3.  */
> > > > > > > >  #define Prefer_No_VZEROUPPER_X86_ISA_LEVEL 3
> > > > > > > >
> > > > > > > > +/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
> > > > > > > > +   macros are wrappers for the respective CPU_FEATURE{S}_{USABLE|ARCH}_P
> > > > > > > > +   runtime checks.  They differ in two ways.
> > > > > > > > +
> > > > > > > > +    1.  The USABLE_P version is evaluated to true when the feature
> > > > > > > > +        is enabled.
> > > > > > > > +
> > > > > > > > +    2.  The ARCH_P version has a third argument `not`.  The `not`
> > > > > > > > +        argument can either be `!` or empty.  If the feature is
> > > > > > > > +        enabled above an ISA level, the third argument should be empty
> > > > > > > > +        and the expression is evaluated to true when the feature is
> > > > > > > > +        enabled.  If the feature is disabled above an ISA level, the
> > > > > > > > +        third argument should be `!` and the expression is evaluated
> > > > > > > > +        to true when the feature is disabled.
> > > > > > > > + */
> > > > > > > > +
> > > > > > > > +#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
> > > > > > > > +  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > > > > > +   || CPU_FEATURE_USABLE_P (ptr, name))
> > > > > > > > +
> > > > > > > > +#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
> > > > > > > > +  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
> > > > > > > > +   || not CPU_FEATURES_ARCH_P (ptr, name))
> > > > > > > > +
> > > > > > > >  #define ISA_SHOULD_BUILD(isa_build_level)                              \
> > > > > > > >    (MINIMUM_X86_ISA_LEVEL <= (isa_build_level) && IS_IN (libc))         \
> > > > > > > >     || defined ISA_DEFAULT_IMPL
> > > > > > > > diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> > > > > > > > index 34766325ae..005d089501 100644
> > > > > > > > --- a/sysdeps/x86_64/dl-machine.h
> > > > > > > > +++ b/sysdeps/x86_64/dl-machine.h
> > > > > > > > @@ -28,6 +28,7 @@
> > > > > > > >  #include <dl-tlsdesc.h>
> > > > > > > >  #include <dl-static-tls.h>
> > > > > > > >  #include <dl-machine-rel.h>
> > > > > > > > +#include <isa-level.h>
> > > > > > > >
> > > > > > > >  /* Return nonzero iff ELF header is compatible with the running host.  */
> > > > > > > >  static inline int __attribute__ ((unused))
> > > > > > > > @@ -86,6 +87,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > > > > > > >        /* Identify this shared object.  */
> > > > > > > >        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
> > > > > > > >
> > > > > > > > +      const struct cpu_features* cpu_features = __get_cpu_features ();
> > > > > > > > +
> > > > > > > >        /* The got[2] entry contains the address of a function which gets
> > > > > > > >          called to get the address of a so far unresolved function and
> > > > > > > >          jump to it.  The profiling extension of the dynamic linker allows
> > > > > > > > @@ -94,9 +97,9 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > > > > > > >          end in this function.  */
> > > > > > > >        if (__glibc_unlikely (profile))
> > > > > > > >         {
> > > > > > > > -         if (CPU_FEATURE_USABLE (AVX512F))
> > > > > > > > +         if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
> > > > > > > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
> > > > > > > > -         else if (CPU_FEATURE_USABLE (AVX))
> > > > > > > > +         else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> > > > > > > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
> > > > > > > >           else
> > > > > > > >             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
> > > > > > > > @@ -112,9 +115,10 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> > > > > > > >           /* This function will get called to fix up the GOT entry
> > > > > > > >              indicated by the offset on the stack, and then jump to
> > > > > > > >              the resolved address.  */
> > > > > > > > -         if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > > > > > > > +         if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> > > > > > > > +             || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> > > > > > > >             *(ElfW(Addr) *) (got + 2)
> > > > > > > > -             = (CPU_FEATURE_USABLE (XSAVEC)
> > > > > > > > +             = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> > > > > > > >                  ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> > > > > > > >                  : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> > > > > > > >           else
> > > > > > > > diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> > > > > > > > index 831a654713..f669805ac5 100644
> > > > > > > > --- a/sysdeps/x86_64/dl-trampoline.S
> > > > > > > > +++ b/sysdeps/x86_64/dl-trampoline.S
> > > > > > > > @@ -20,6 +20,7 @@
> > > > > > > >  #include <sysdep.h>
> > > > > > > >  #include <cpu-features-offsets.h>
> > > > > > > >  #include <link-defines.h>
> > > > > > > > +#include <isa-level.h>
> > > > > > > >
> > > > > > > >  #ifndef DL_STACK_ALIGNMENT
> > > > > > > >  /* Due to GCC bug:
> > > > > > > > @@ -62,35 +63,39 @@
> > > > > > >
> > > > > > > Can you also add a guard for isa level v4 on the avx512 version?
> > > > > >
> > > > > > It is needed only when we need to save and restore new registers
> > > > > > in _dl_runtime_profile, which requires a new _dl_runtime_profile.
> > > > > > We can add the check then.
> > > > >
> > > > > ?
> > > > > Why does the prohibit the check?
> > > >
> > > > I don't think we need another _dl_runtime_XXX any time soon.
> > > > Adding a check may indicate otherwise.
> > >
> > > Guess my thought here is everywhere else we are adding for all
> > > ISA levels. Just seems like an inconsistency that we will potentially
> > > miss sometime in the future.
> >
> > For this one, a check may lead to the question when the check will
> > be false.
>
> Think a comment would do then. Seems like the kind of thing that
> otherwise would be easily forgotten.
>
> Think if we do full coverage for the ISA raising stuff, it will be much
> easier to manage in the future than if we have random holes that
> we need to keep track of.

It is the opposite.   What check should it be? With

#if MINIMUM_X86_ISA_LEVEL <= AVX512F_X86_ISA_LEVEL

if we add a new ISA level, v5, with some additional ISAs,  when
the glibc is built with ISA level v5, the check becomes false.

> >
> > >
> > > >
> > > > > >
> > > > > > > >  #undef VMOVA
> > > > > > > >  #undef VEC_SIZE
> > > > > > > >
> > > > > > > > -#define VEC_SIZE               32
> > > > > > > > -#define VMOVA                  vmovdqa
> > > > > > > > -#define VEC(i)                 ymm##i
> > > > > > > > -#define _dl_runtime_profile    _dl_runtime_profile_avx
> > > > > > > > -#include "dl-trampoline.h"
> > > > > > > > -#undef _dl_runtime_profile
> > > > > > > > -#undef VEC
> > > > > > > > -#undef VMOVA
> > > > > > > > -#undef VEC_SIZE
> > > > > > > > +#if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL
> > > > > > > Can this be <= __X86_ISA_LEVEL_V3
> > > > > >
> > > > > > No, since __X86_ISA_LEVEL_VN is defined either 0 or 1.
> > > > >
> > > > > Then <= 3 / <= 2?
> > > >
> > > > Since AVX and AVX512F are the features which _dl_runtime_XXX
> > > > supports, AVX_X86_ISA_LEVEL is more appropriate than a number.
> > > kk
> > > >
> > > > > >
> > > > > > > > +# define VEC_SIZE              32
> > > > > > > > +# define VMOVA                 vmovdqa
> > > > > > > > +# define VEC(i)                        ymm##i
> > > > > > > > +# define _dl_runtime_profile   _dl_runtime_profile_avx
> > > > > > > > +# include "dl-trampoline.h"
> > > > > > > > +# undef _dl_runtime_profile
> > > > > > > > +# undef VEC
> > > > > > > > +# undef VMOVA
> > > > > > > > +# undef VEC_SIZE
> > > > > > > > +#endif
> > > > > > > >
> > > > > > > > +#if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL
> > > > > > >
> > > > > > > Can this be <= __X86_ISA_LEVEL_V2?
> > > > > > > >  /* movaps/movups is 1-byte shorter.  */
> > > > > > > > -#define VEC_SIZE               16
> > > > > > > > -#define VMOVA                  movaps
> > > > > > > > -#define VEC(i)                 xmm##i
> > > > > > > > -#define _dl_runtime_profile    _dl_runtime_profile_sse
> > > > > > > > -#undef RESTORE_AVX
> > > > > > > > -#include "dl-trampoline.h"
> > > > > > > > -#undef _dl_runtime_profile
> > > > > > > > -#undef VEC
> > > > > > > > -#undef VMOVA
> > > > > > > > -#undef VEC_SIZE
> > > > > > > > -
> > > > > > > > -#define USE_FXSAVE
> > > > > > > > -#define STATE_SAVE_ALIGNMENT   16
> > > > > > > > -#define _dl_runtime_resolve    _dl_runtime_resolve_fxsave
> > > > > > > > -#include "dl-trampoline.h"
> > > > > > > > -#undef _dl_runtime_resolve
> > > > > > > > -#undef USE_FXSAVE
> > > > > > > > -#undef STATE_SAVE_ALIGNMENT
> > > > > > > > +# define VEC_SIZE              16
> > > > > > > > +# define VMOVA                 movaps
> > > > > > > > +# define VEC(i)                        xmm##i
> > > > > > > > +# define _dl_runtime_profile   _dl_runtime_profile_sse
> > > > > > > > +# undef RESTORE_AVX
> > > > > > > > +# include "dl-trampoline.h"
> > > > > > > > +# undef _dl_runtime_profile
> > > > > > > > +# undef VEC
> > > > > > > > +# undef VMOVA
> > > > > > > > +# undef VEC_SIZE
> > > > > > > > +
> > > > > > > > +# define USE_FXSAVE
> > > > > > > > +# define STATE_SAVE_ALIGNMENT  16
> > > > > > > > +# define _dl_runtime_resolve   _dl_runtime_resolve_fxsave
> > > > > > > > +# include "dl-trampoline.h"
> > > > > > > > +# undef _dl_runtime_resolve
> > > > > > > > +# undef USE_FXSAVE
> > > > > > > > +# undef STATE_SAVE_ALIGNMENT
> > > > > > > > +#endif
> > > > > > > >
> > > > > > > >  #define USE_XSAVE
> > > > > > > >  #define STATE_SAVE_ALIGNMENT   64
> > > > > > > > --
> > > > > > > > 2.36.1
> > > > > > > >
> > > > > > >
> > > > > > > There are cases of unconditional `movaps` usage in dl-trampoline.h (L222)
> > > > > > >
> > > > > > > This appears to be on ISA level v3 / v4 path.
> > > > > > >
> > > > > > > For ISA level >= 3 builds can those become `vmovdqa`?
> > > > > >
> > > > > > It may be replaced with VMOVA in a separate patch.
> > > > >
> > > > > Kk.
> > > > > >
> > > > > > --
> > > > > > H.J.
> > > >
> > > >
> > > >
> > > > --
> > > > H.J.
> >
> >
> >
> > --
> > H.J.
  

Patch

diff --git a/sysdeps/x86/isa-ifunc-macros.h b/sysdeps/x86/isa-ifunc-macros.h
index d69905689b..f967a1bec6 100644
--- a/sysdeps/x86/isa-ifunc-macros.h
+++ b/sysdeps/x86/isa-ifunc-macros.h
@@ -56,31 +56,4 @@ 
 # define X86_IFUNC_IMPL_ADD_V1(...)
 #endif
 
-/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
-   macros are wrappers for the the respective
-   CPU_FEATURE{S}_{USABLE|ARCH}_P runtime checks.  They differ in two
-   ways.
-
-    1.  The USABLE_P version is evaluated to true when the feature
-        is enabled.
-
-    2.  The ARCH_P version has a third argument `not`.  The `not`
-        argument can either be '!' or empty.  If the feature is
-        enabled above an ISA level, the third argument should be empty
-        and the expression is evaluated to true when the feature is
-        enabled.  If the feature is disabled above an ISA level, the
-        third argument should be `!` and the expression is evaluated
-        to true when the feature is disabled.
- */
-
-#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
-  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
-   || CPU_FEATURE_USABLE_P (ptr, name))
-
-
-#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
-  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
-   || not CPU_FEATURES_ARCH_P (ptr, name))
-
-
 #endif
diff --git a/sysdeps/x86/isa-level.h b/sysdeps/x86/isa-level.h
index 075e7c6ee1..f293aea906 100644
--- a/sysdeps/x86/isa-level.h
+++ b/sysdeps/x86/isa-level.h
@@ -68,10 +68,12 @@ 
    compile-time constant.. */
 
 /* ISA level >= 4 guaranteed includes.  */
+#define AVX512F_X86_ISA_LEVEL 4
 #define AVX512VL_X86_ISA_LEVEL 4
 #define AVX512BW_X86_ISA_LEVEL 4
 
 /* ISA level >= 3 guaranteed includes.  */
+#define AVX_X86_ISA_LEVEL 3
 #define AVX2_X86_ISA_LEVEL 3
 #define BMI2_X86_ISA_LEVEL 3
 
@@ -87,6 +89,30 @@ 
    when ISA level < 3.  */
 #define Prefer_No_VZEROUPPER_X86_ISA_LEVEL 3
 
+/* Both X86_ISA_CPU_FEATURE_USABLE_P and X86_ISA_CPU_FEATURES_ARCH_P
+   macros are wrappers for the respective CPU_FEATURE{S}_{USABLE|ARCH}_P
+   runtime checks.  They differ in two ways.
+
+    1.  The USABLE_P version is evaluated to true when the feature
+        is enabled.
+
+    2.  The ARCH_P version has a third argument `not`.  The `not`
+        argument can either be `!` or empty.  If the feature is
+        enabled above an ISA level, the third argument should be empty
+        and the expression is evaluated to true when the feature is
+        enabled.  If the feature is disabled above an ISA level, the
+        third argument should be `!` and the expression is evaluated
+        to true when the feature is disabled.
+ */
+
+#define X86_ISA_CPU_FEATURE_USABLE_P(ptr, name)                        \
+  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
+   || CPU_FEATURE_USABLE_P (ptr, name))
+
+#define X86_ISA_CPU_FEATURES_ARCH_P(ptr, name, not)                    \
+  (((name##_X86_ISA_LEVEL) <= MINIMUM_X86_ISA_LEVEL)                   \
+   || not CPU_FEATURES_ARCH_P (ptr, name))
+
 #define ISA_SHOULD_BUILD(isa_build_level)                              \
   (MINIMUM_X86_ISA_LEVEL <= (isa_build_level) && IS_IN (libc))         \
    || defined ISA_DEFAULT_IMPL
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index 34766325ae..005d089501 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -28,6 +28,7 @@ 
 #include <dl-tlsdesc.h>
 #include <dl-static-tls.h>
 #include <dl-machine-rel.h>
+#include <isa-level.h>
 
 /* Return nonzero iff ELF header is compatible with the running host.  */
 static inline int __attribute__ ((unused))
@@ -86,6 +87,8 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
       /* Identify this shared object.  */
       *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
 
+      const struct cpu_features* cpu_features = __get_cpu_features ();
+
       /* The got[2] entry contains the address of a function which gets
 	 called to get the address of a so far unresolved function and
 	 jump to it.  The profiling extension of the dynamic linker allows
@@ -94,9 +97,9 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
 	 end in this function.  */
       if (__glibc_unlikely (profile))
 	{
-	  if (CPU_FEATURE_USABLE (AVX512F))
+	  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
 	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
-	  else if (CPU_FEATURE_USABLE (AVX))
+	  else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
 	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
 	  else
 	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
@@ -112,9 +115,10 @@  elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
 	  /* This function will get called to fix up the GOT entry
 	     indicated by the offset on the stack, and then jump to
 	     the resolved address.  */
-	  if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
+	  if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
+	      || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
 	    *(ElfW(Addr) *) (got + 2)
-	      = (CPU_FEATURE_USABLE (XSAVEC)
+	      = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
 		 ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
 		 : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
 	  else
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 831a654713..f669805ac5 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -20,6 +20,7 @@ 
 #include <sysdep.h>
 #include <cpu-features-offsets.h>
 #include <link-defines.h>
+#include <isa-level.h>
 
 #ifndef DL_STACK_ALIGNMENT
 /* Due to GCC bug:
@@ -62,35 +63,39 @@ 
 #undef VMOVA
 #undef VEC_SIZE
 
-#define VEC_SIZE		32
-#define VMOVA			vmovdqa
-#define VEC(i)			ymm##i
-#define _dl_runtime_profile	_dl_runtime_profile_avx
-#include "dl-trampoline.h"
-#undef _dl_runtime_profile
-#undef VEC
-#undef VMOVA
-#undef VEC_SIZE
+#if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL
+# define VEC_SIZE		32
+# define VMOVA			vmovdqa
+# define VEC(i)			ymm##i
+# define _dl_runtime_profile	_dl_runtime_profile_avx
+# include "dl-trampoline.h"
+# undef _dl_runtime_profile
+# undef VEC
+# undef VMOVA
+# undef VEC_SIZE
+#endif
 
+#if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL
 /* movaps/movups is 1-byte shorter.  */
-#define VEC_SIZE		16
-#define VMOVA			movaps
-#define VEC(i)			xmm##i
-#define _dl_runtime_profile	_dl_runtime_profile_sse
-#undef RESTORE_AVX
-#include "dl-trampoline.h"
-#undef _dl_runtime_profile
-#undef VEC
-#undef VMOVA
-#undef VEC_SIZE
-
-#define USE_FXSAVE
-#define STATE_SAVE_ALIGNMENT	16
-#define _dl_runtime_resolve	_dl_runtime_resolve_fxsave
-#include "dl-trampoline.h"
-#undef _dl_runtime_resolve
-#undef USE_FXSAVE
-#undef STATE_SAVE_ALIGNMENT
+# define VEC_SIZE		16
+# define VMOVA			movaps
+# define VEC(i)			xmm##i
+# define _dl_runtime_profile	_dl_runtime_profile_sse
+# undef RESTORE_AVX
+# include "dl-trampoline.h"
+# undef _dl_runtime_profile
+# undef VEC
+# undef VMOVA
+# undef VEC_SIZE
+
+# define USE_FXSAVE
+# define STATE_SAVE_ALIGNMENT	16
+# define _dl_runtime_resolve	_dl_runtime_resolve_fxsave
+# include "dl-trampoline.h"
+# undef _dl_runtime_resolve
+# undef USE_FXSAVE
+# undef STATE_SAVE_ALIGNMENT
+#endif
 
 #define USE_XSAVE
 #define STATE_SAVE_ALIGNMENT	64