[v1] x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S

Message ID 20211023052647.535991-1-goldstein.w.n@gmail.com
State Committed
Commit bad852b61b79503fcb3c5fc379c70f768df3e1fb
Series [v1] x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S

Checks

Context                  Check     Description
dj/TryBot-apply_patch    success   Patch applied to master at the time it was sent
dj/TryBot-32bit          success   Build for i686

Commit Message

Noah Goldstein Oct. 23, 2021, 5:26 a.m. UTC
  This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'.

It could be dangerous to use SSE2 if this function is ever called
without 'vzeroupper' having been executed beforehand. While compilers
appear to emit 'vzeroupper' before function calls once AVX2 has been
used, relying on that makes the SSE2 usage here brittle. Since it is
not strictly necessary, it should be avoided.

It costs 2 extra bytes, but the extra bytes should only eat into
alignment padding.
---
 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
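
For illustration, a minimal hypothetical sketch (not taken from the patch) of the transition the change guards against: a caller that has executed AVX2 code and not issued 'vzeroupper' leaves the upper halves of the YMM registers dirty, so a legacy-SSE load in this memcmp variant would pay an SSE/AVX transition penalty (or a false dependency on newer Intel cores), while the VEX-encoded load does not.

	vmovdqu	(%rax), %ymm0	/* Caller's AVX2 code dirties the upper YMM state.  */
	/* ... caller then calls this memcmp variant without vzeroupper ...  */
	movups	(%rsi), %xmm2	/* Legacy-SSE load: hits the transition penalty.  */
	vmovdqu	(%rsi), %xmm2	/* VEX-encoded load: avoids it.  */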
  

Comments

H.J. Lu Oct. 23, 2021, 1:22 p.m. UTC | #1
On Fri, Oct 22, 2021 at 10:26 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'.
>
> It could be dangerous to use SSE2 if this function is ever called
> without 'vzeroupper' having been executed beforehand. While compilers
> appear to emit 'vzeroupper' before function calls once AVX2 has been
> used, relying on that makes the SSE2 usage here brittle. Since it is
> not strictly necessary, it should be avoided.
>
> It costs 2 extra bytes, but the extra bytes should only eat into
> alignment padding.
> ---
>  sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
> index 2761b54f2e..640f6757fa 100644
> --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
> +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
> @@ -561,13 +561,13 @@ L(between_16_31):
>         /* From 16 to 31 bytes.  No branch when size == 16.  */
>
>         /* Use movups to save code size.  */
> -       movups  (%rsi), %xmm2
> +       vmovdqu (%rsi), %xmm2
>         VPCMP   $4, (%rdi), %xmm2, %k1
>         kmovd   %k1, %eax
>         testl   %eax, %eax
>         jnz     L(return_vec_0_lv)
>         /* Use overlapping loads to avoid branches.  */
> -       movups  -16(%rsi, %rdx, CHAR_SIZE), %xmm2
> +       vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2
>         VPCMP   $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
>         addl    $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
>         kmovd   %k1, %eax
> --
> 2.29.2
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
  
Sunil Pandey April 23, 2022, 1:22 a.m. UTC | #2
On Sat, Oct 23, 2021 at 6:22 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Fri, Oct 22, 2021 at 10:26 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'.
> >
> > It could be dangerous to use SSE2 if this function is ever called
> > without 'vzeroupper' having been executed beforehand. While compilers
> > appear to emit 'vzeroupper' before function calls once AVX2 has been
> > used, relying on that makes the SSE2 usage here brittle. Since it is
> > not strictly necessary, it should be avoided.
> >
> > It costs 2 extra bytes, but the extra bytes should only eat into
> > alignment padding.
> > ---
> >  sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
> > index 2761b54f2e..640f6757fa 100644
> > --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
> > +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
> > @@ -561,13 +561,13 @@ L(between_16_31):
> >         /* From 16 to 31 bytes.  No branch when size == 16.  */
> >
> >         /* Use movups to save code size.  */
> > -       movups  (%rsi), %xmm2
> > +       vmovdqu (%rsi), %xmm2
> >         VPCMP   $4, (%rdi), %xmm2, %k1
> >         kmovd   %k1, %eax
> >         testl   %eax, %eax
> >         jnz     L(return_vec_0_lv)
> >         /* Use overlapping loads to avoid branches.  */
> > -       movups  -16(%rsi, %rdx, CHAR_SIZE), %xmm2
> > +       vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2
> >         VPCMP   $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
> >         addl    $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
> >         kmovd   %k1, %eax
> > --
> > 2.29.2
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
  

Patch

diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 2761b54f2e..640f6757fa 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -561,13 +561,13 @@  L(between_16_31):
 	/* From 16 to 31 bytes.  No branch when size == 16.  */
 
 	/* Use movups to save code size.  */
-	movups	(%rsi), %xmm2
+	vmovdqu	(%rsi), %xmm2
 	VPCMP	$4, (%rdi), %xmm2, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_vec_0_lv)
 	/* Use overlapping loads to avoid branches.  */
-	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+	vmovdqu	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
 	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
 	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
 	kmovd	%k1, %eax
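
The 2-byte cost noted in the commit message follows from the encodings. A sketch of the likely byte sequences (assuming GAS emits the short disp8 and two-byte-VEX forms, and taking CHAR_SIZE as 1 as in the plain memcmp build): the two-byte VEX prefix 'C5 xx' of 'vmovdqu' replaces the single '0F' escape byte of 'movups', so each of the two loads grows by one byte.

	movups	(%rsi), %xmm2			/* 0F 10 16            3 bytes  */
	vmovdqu	(%rsi), %xmm2			/* C5 FA 6F 16         4 bytes  */
	movups	-16(%rsi, %rdx), %xmm2		/* 0F 10 54 16 F0      5 bytes  */
	vmovdqu	-16(%rsi, %rdx), %xmm2		/* C5 FA 6F 54 16 F0   6 bytes  */

The second, overlapping load at -16(%rsi, %rdx, CHAR_SIZE) reads bytes n-16..n-1, so for any length n from 16 to 31 the two 16-byte loads cover the whole buffer, which is why no length-dependent branch is needed in this path.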