[v1,2/3] x86: Move and slightly improve memset_erms

Message ID 20220628152735.17863-2-goldstein.w.n@gmail.com
State Superseded
Series [v1,1/3] x86: Add definition for __wmemset_chk AVX2 RTM in ifunc impl list

Checks

Context | Check | Description
dj/TryBot-apply_patch | success | Patch applied to master at the time it was sent

Commit Message

Noah Goldstein June 28, 2022, 3:27 p.m. UTC
  Implementation wise:
    1. Remove the VZEROUPPER as memset_{impl}_unaligned_erms does not
       use the L(stosb) label that was previously defined.

    2. Don't give the hotpath (fallthrough) to zero size.

Code positioning wise:

Move L(memset_{chk}_erms) to its own file.  Leaving it in between the
memset_{impl}_unaligned both adds unnecessary complexity to the
file and wastes space in a relatively hot cache section.
---
 .../multiarch/memset-vec-unaligned-erms.S     | 54 ++++++++-----------
 1 file changed, 23 insertions(+), 31 deletions(-)
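
For context on the routine being moved: rep stosb stores AL into the byte at
(%rdi), RCX times, incrementing RDI as it goes, so that single instruction is
essentially the whole body of __memset_erms.  The deleted VZEROUPPER_RETURN was
there because the VEC_SIZE != 16 branch also assembled this tail into the
AVX2/EVEX variants; the relocated copy is built only when VEC_SIZE == 16 (the
SSE2 build), which never touches the upper YMM registers, so it can end in a
plain ret.  As a rough sketch of the post-patch behavior (illustrative C, not
glibc code; memset_erms_model is a made-up name):

#include <stddef.h>

/* Illustrative C model of __memset_erms after this patch.  The inline
   asm mirrors the register usage in the .S file: RDI = destination,
   RCX = count, AL = fill byte; the original destination pointer is
   returned, as memset requires.  */
void *
memset_erms_model (void *dst, int c, size_t len)
{
  void *d = dst;

  /* Change #2 from the commit message: zero length branches out
     instead of being the fallthrough, so the hot (nonzero) path is
     straight-line code.  */
  if (len == 0)
    return dst;

  /* rep stosb stores AL to (%rdi), RCX times.  "+D"/"+c" mark RDI and
     RCX as read and clobbered; "a" loads the fill byte into AL.  */
  __asm__ volatile ("rep stosb"
                    : "+D" (d), "+c" (len)
                    : "a" ((unsigned char) c)
                    : "memory");
  return dst;
}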
  

Comments

H.J. Lu June 29, 2022, 7:25 p.m. UTC | #1
On Tue, Jun 28, 2022 at 8:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Implementation wise:
>     1. Remove the VZEROUPPER as memset_{impl}_unaligned_erms does not
>        use the L(stosb) label that was previously defined.
>
>     2. Don't give the hotpath (fallthrough) to zero size.
>
> Code positioning wise:
>
> Move L(memset_{chk}_erms) to its own file.  Leaving it in between the

It is ENTRY, not L.  Did you mean to move them to the end of the file?

> memset_{impl}_unaligned both adds unnecessary complexity to the
> file and wastes space in a relatively hot cache section.
  
Noah Goldstein June 29, 2022, 7:32 p.m. UTC | #2
On Wed, Jun 29, 2022 at 12:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Jun 28, 2022 at 8:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Implementation wise:
> >     1. Remove the VZEROUPPER as memset_{impl}_unaligned_erms does not
> >        use the L(stosb) label that was previously defined.
> >
> >     2. Don't give the hotpath (fallthrough) to zero size.
> >
> > Code positioning wise:
> >
> > Move L(memset_{chk}_erms) to its own file.  Leaving it in between the
>
>  It is ENTRY, not L.  Did you mean to move them to the end of the file?

Will fix L -> ENTRY for V2.

Yes, it should be moved to a new file in this patch; that was a rebase mistake.
The file change is in the ISA-raising patch. Will fix both for v2.
  
Noah Goldstein June 29, 2022, 10:11 p.m. UTC | #3
On Wed, Jun 29, 2022 at 12:32 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Jun 29, 2022 at 12:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Jun 28, 2022 at 8:27 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > Implementation wise:
> > >     1. Remove the VZEROUPPER as memset_{impl}_unaligned_erms does not
> > >        use the L(stosb) label that was previously defined.
> > >
> > >     2. Don't give the hotpath (fallthrough) to zero size.
> > >
> > > Code positioning wise:
> > >
> > > Move L(memset_{chk}_erms) to its own file.  Leaving it in between the
> >
> >  It is ENTRY, not L.  Did you mean to move them to the end of the file?
>
> Will fix L -> ENTRY for V2.

Fixed in V2.
>
> Yes, it should be moved to a new file in this patch; that was a rebase mistake.
> The file change is in the ISA-raising patch. Will fix both for v2.
  

Patch

diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index abc12d9cda..d98c613651 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -156,37 +156,6 @@  L(entry_from_wmemset):
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMSET_SYMBOL (__memset, unaligned))
 
-# if VEC_SIZE == 16
-ENTRY (__memset_chk_erms)
-	cmp	%RDX_LP, %RCX_LP
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (__memset_chk_erms)
-
-/* Only used to measure performance of REP STOSB.  */
-ENTRY (__memset_erms)
-	/* Skip zero length.  */
-	test	%RDX_LP, %RDX_LP
-	jnz	 L(stosb)
-	movq	%rdi, %rax
-	ret
-# else
-/* Provide a hidden symbol to debugger.  */
-	.hidden	MEMSET_SYMBOL (__memset, erms)
-ENTRY (MEMSET_SYMBOL (__memset, erms))
-# endif
-L(stosb):
-	mov	%RDX_LP, %RCX_LP
-	movzbl	%sil, %eax
-	mov	%RDI_LP, %RDX_LP
-	rep stosb
-	mov	%RDX_LP, %RAX_LP
-	VZEROUPPER_RETURN
-# if VEC_SIZE == 16
-END (__memset_erms)
-# else
-END (MEMSET_SYMBOL (__memset, erms))
-# endif
-
 # if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 	cmp	%RDX_LP, %RCX_LP
@@ -461,3 +430,26 @@  L(between_2_3):
 #endif
 	ret
 END (MEMSET_SYMBOL (__memset, unaligned_erms))
+
+#if defined USE_MULTIARCH && IS_IN (libc) && VEC_SIZE == 16
+ENTRY (__memset_chk_erms)
+	cmp	%RDX_LP, %RCX_LP
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_erms)
+
+/* Only used to measure performance of REP STOSB.  */
+ENTRY (__memset_erms)
+	/* Skip zero length.  */
+	test	%RDX_LP, %RDX_LP
+	jz	 L(stosb_return_zero)
+	mov	%RDX_LP, %RCX_LP
+	movzbl	%sil, %eax
+	mov	%RDI_LP, %RDX_LP
+	rep stosb
+	mov	%RDX_LP, %RAX_LP
+	ret
+L(stosb_return_zero):
+	movq	%rdi, %rax
+	ret
+END (__memset_erms)
+#endif
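
One detail of the relocated block worth spelling out: __memset_chk_erms has no
ret of its own.  When the cmp/jb bounds check passes, execution falls through
into __memset_erms, which is placed immediately after it.  In the fortified
calling convention RCX holds the known size of the destination object and RDX
the requested fill length, so jb takes the __chk_fail path exactly when the
fill would overflow the buffer.  A rough C model of the pair (illustrative
names; it reuses the memset_erms_model sketch above, and __chk_fail is glibc's
fortify failure hook):

#include <stddef.h>

extern void __chk_fail (void) __attribute__ ((__noreturn__));
/* Sketch from earlier on this page, not a real glibc symbol.  */
extern void *memset_erms_model (void *dst, int c, size_t len);

/* Illustrative C model of __memset_chk_erms: validate the length
   against the destination size, then do the plain erms memset.  */
void *
memset_chk_erms_model (void *dst, int c, size_t len, size_t dstlen)
{
  /* cmp %RDX_LP, %RCX_LP; jb HIDDEN_JUMPTARGET (__chk_fail): abort
     when the destination object is smaller than the fill length.  */
  if (dstlen < len)
    __chk_fail ();
  return memset_erms_model (dst, c, len);
}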