x86_64: Improved V1TImode rotations by non-constant amounts.

Message ID 005501d7e460$9acba670$d062f350$@nextmovesoftware.com
State Committed
Commit a5d269f0c1cda545a86da960e8989bea862dd75e
Headers
Series x86_64: Improved V1TImode rotations by non-constant amounts. |

Commit Message

Roger Sayle Nov. 28, 2021, 2:02 p.m. UTC
  This patch builds on the recent improvements to TImode rotations (and
Jakub's fixes to shldq/shrdq patterns).  Now that expanding a TImode
rotation can never fail, it is safe to allow general_operand predicates
on the QImode shift amounts in the rotlv1ti3 and rotrv1ti3 patterns.
I've also made an additional tweak to ix86_expand_v1ti_to_ti to use
vec_extract via V2DImode, which avoids using memory and takes advantage
of vpextrq on recent hardware.

For the following test case:

typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
uv1ti rotr(uv1ti x, unsigned int i) { return (x >> i) | (x << (128-i)); }

GCC with -O2 -mavx2 would previously generate:

rotr:   vmovdqa %xmm0, -24(%rsp)
        movq    -16(%rsp), %rdx
        movl    %edi, %ecx
        xorl    %esi, %esi
        movq    -24(%rsp), %rax
        shrdq   %rdx, %rax
        shrq    %cl, %rdx
        testb   $64, %dil
        cmovne  %rdx, %rax
        cmovne  %rsi, %rdx
        negl    %ecx
        xorl    %edi, %edi
        andl    $127, %ecx
        vmovq   %rax, %xmm2
        movq    -24(%rsp), %rax
        vpinsrq $1, %rdx, %xmm2, %xmm1
        movq    -16(%rsp), %rdx
        shldq   %rax, %rdx
        salq    %cl, %rax
        testb   $64, %cl
        cmovne  %rax, %rdx
        cmovne  %rdi, %rax
        vmovq   %rax, %xmm3
        vpinsrq $1, %rdx, %xmm3, %xmm0
        vpor    %xmm1, %xmm0, %xmm0
        ret

with this patch, we now generate:

rotr:   movl    %edi, %ecx
        vpextrq $1, %xmm0, %rax
        vmovq   %xmm0, %rdx
        shrdq   %rax, %rdx
        vmovq   %xmm0, %rsi
        shrdq   %rsi, %rax
        andl    $64, %ecx
        movq    %rdx, %rsi
        cmovne  %rax, %rsi
        cmove   %rax, %rdx
        vmovq   %rsi, %xmm0
        vpinsrq $1, %rdx, %xmm0, %xmm0
        ret


This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check with no new failures.  Ok for mainline?


2021-11-28  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
	* config/i386/i386-expand.c (ix86_expand_v1ti_to_ti): Perform the
	conversion via V2DImode using vec_extractv2didi on TARGET_SSE2.
	* config/i386/sse.md (rotlv1ti3, rotrv1ti3): Change constraint
	on QImode shift amounts from const_int_operand to general_operand.

gcc/testsuite/ChangeLog
	* gcc.target/i386/sse2-v1ti-rotate.c: New test case.


Thanks in advance,
Roger
--
  

Comments

Uros Bizjak Nov. 28, 2021, 7:15 p.m. UTC | #1
On Sun, Nov 28, 2021 at 3:02 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch builds on the recent improvements to TImode rotations (and
> Jakub's fixes to shldq/shrdq patterns).  Now that expanding a TImode
> rotation can never fail, it is safe to allow general_operand constraints
> on the QImode shift amounts in rotlv1ti3 and rotrv1ti3 patterns.
> I've also made an additional tweak to ix86_expand_v1ti_to_ti to use
> vec_extract via V2DImode, which avoid using memory and takes advantage
> vpextrq on recent hardware.
>
> For the following test case:
>
> typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
> uv1ti rotr(uv1ti x, unsigned int i) { return (x >> i) | (x << (128-i)); }
>
> GCC with -O2 -mavx2 would previously generate:
>
> rotr:   vmovdqa %xmm0, -24(%rsp)
>         movq    -16(%rsp), %rdx
>         movl    %edi, %ecx
>         xorl    %esi, %esi
>         movq    -24(%rsp), %rax
>         shrdq   %rdx, %rax
>         shrq    %cl, %rdx
>         testb   $64, %dil
>         cmovne  %rdx, %rax
>         cmovne  %rsi, %rdx
>         negl    %ecx
>         xorl    %edi, %edi
>         andl    $127, %ecx
>         vmovq   %rax, %xmm2
>         movq    -24(%rsp), %rax
>         vpinsrq $1, %rdx, %xmm2, %xmm1
>         movq    -16(%rsp), %rdx
>         shldq   %rax, %rdx
>         salq    %cl, %rax
>         testb   $64, %cl
>         cmovne  %rax, %rdx
>         cmovne  %rdi, %rax
>         vmovq   %rax, %xmm3
>         vpinsrq $1, %rdx, %xmm3, %xmm0
>         vpor    %xmm1, %xmm0, %xmm0
>         ret
>
> with this patch, we now generate:
>
> rotr:   movl    %edi, %ecx
>         vpextrq $1, %xmm0, %rax
>         vmovq   %xmm0, %rdx
>         shrdq   %rax, %rdx
>         vmovq   %xmm0, %rsi
>         shrdq   %rsi, %rax
>         andl    $64, %ecx
>         movq    %rdx, %rsi
>         cmovne  %rax, %rsi
>         cmove   %rax, %rdx
>         vmovq   %rsi, %xmm0
>         vpinsrq $1, %rdx, %xmm0, %xmm0
>         ret
>
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check with no new failures.  Ok for mainline?
>
>
> 2021-11-28  Roger Sayle  <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
>         * config/i386/i386-expand.c (ix86_expand_v1ti_to_ti): Perform the
>         conversion via V2DImode using vec_extractv2didi on TARGET_SSE2.
>         * config/i386/sse.md (rotlv1ti3, rotrv1ti3): Change constraint
>         on QImode shift amounts from const_int_operand to general_operand.
>
> gcc/testsuite/ChangeLog
>         * gcc.target/i386/sse2-v1ti-rotate.c: New test case.

OK.

Thanks,
Uros.

>
>
> Thanks in advance,
> Roger
> --
>
  

Patch

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 088e6af..1e9734b 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -6162,7 +6162,17 @@  static rtx
 ix86_expand_v1ti_to_ti (rtx x)
 {
   rtx result = gen_reg_rtx (TImode);
-  emit_move_insn (result, gen_lowpart (TImode, x));
+  if (TARGET_SSE2)
+    {
+      rtx temp = gen_reg_rtx (V2DImode);
+      emit_move_insn (temp, gen_lowpart (V2DImode, x));
+      rtx lo = gen_lowpart (DImode, result);
+      emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
+      rtx hi = gen_highpart (DImode, result);
+      emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
+    }
+  else
+    emit_move_insn (result, gen_lowpart (TImode, x));
   return result;
 }
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2764a25..459eec9 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -15169,7 +15169,7 @@ 
   [(set (match_operand:V1TI 0 "register_operand")
 	(rotate:V1TI
 	 (match_operand:V1TI 1 "register_operand")
-	 (match_operand:QI 2 "const_int_operand")))]
+	 (match_operand:QI 2 "general_operand")))]
   "TARGET_SSE2 && TARGET_64BIT"
 {
   ix86_expand_v1ti_rotate (ROTATE, operands);
@@ -15180,7 +15180,7 @@ 
   [(set (match_operand:V1TI 0 "register_operand")
 	(rotatert:V1TI
 	 (match_operand:V1TI 1 "register_operand")
-	 (match_operand:QI 2 "const_int_operand")))]
+	 (match_operand:QI 2 "general_operand")))]
   "TARGET_SSE2 && TARGET_64BIT"
 {
   ix86_expand_v1ti_rotate (ROTATERT, operands);
diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-rotate.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-rotate.c
new file mode 100644
index 0000000..b4b2814
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-rotate.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+
+typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
+
+uv1ti rotr(uv1ti x, unsigned int i) { return (x >> i) | (x << (128-i)); }
+uv1ti rotl(uv1ti x, unsigned int i) { return (x << i) | (x >> (128-i)); }
+
+/* { dg-final { scan-assembler-not "shrq" } } */
+/* { dg-final { scan-assembler-not "salq" } } */