x86_64: Improved V1TImode rotations by non-constant amounts.
Commit Message
This patch builds on the recent improvements to TImode rotations (and
Jakub's fixes to shldq/shrdq patterns). Now that expanding a TImode
rotation can never fail, it is safe to allow general_operand constraints
on the QImode shift amounts in rotlv1ti3 and rotrv1ti3 patterns.
I've also made an additional tweak to ix86_expand_v1ti_to_ti to use
vec_extract via V2DImode, which avoids using memory and takes advantage
of vpextrq on recent hardware.
For the following test case:
typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
uv1ti rotr(uv1ti x, unsigned int i) { return (x >> i) | (x << (128-i)); }
GCC with -O2 -mavx2 would previously generate:
rotr: vmovdqa %xmm0, -24(%rsp)
movq -16(%rsp), %rdx
movl %edi, %ecx
xorl %esi, %esi
movq -24(%rsp), %rax
shrdq %rdx, %rax
shrq %cl, %rdx
testb $64, %dil
cmovne %rdx, %rax
cmovne %rsi, %rdx
negl %ecx
xorl %edi, %edi
andl $127, %ecx
vmovq %rax, %xmm2
movq -24(%rsp), %rax
vpinsrq $1, %rdx, %xmm2, %xmm1
movq -16(%rsp), %rdx
shldq %rax, %rdx
salq %cl, %rax
testb $64, %cl
cmovne %rax, %rdx
cmovne %rdi, %rax
vmovq %rax, %xmm3
vpinsrq $1, %rdx, %xmm3, %xmm0
vpor %xmm1, %xmm0, %xmm0
ret
with this patch, we now generate:
rotr: movl %edi, %ecx
vpextrq $1, %xmm0, %rax
vmovq %xmm0, %rdx
shrdq %rax, %rdx
vmovq %xmm0, %rsi
shrdq %rsi, %rax
andl $64, %ecx
movq %rdx, %rsi
cmovne %rax, %rsi
cmove %rax, %rdx
vmovq %rsi, %xmm0
vpinsrq $1, %rdx, %xmm0, %xmm0
ret
This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check with no new failures. Ok for mainline?
2021-11-28 Roger Sayle <roger@nextmovesoftware.com>
gcc/ChangeLog
* config/i386/i386-expand.c (ix86_expand_v1ti_to_ti): Perform the
conversion via V2DImode using vec_extractv2didi on TARGET_SSE2.
* config/i386/sse.md (rotlv1ti3, rotrv1ti3): Change constraint
on QImode shift amounts from const_int_operand to general_operand.
gcc/testsuite/ChangeLog
* gcc.target/i386/sse2-v1ti-rotate.c: New test case.
Thanks in advance,
Roger
--
Comments
On Sun, Nov 28, 2021 at 3:02 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch builds on the recent improvements to TImode rotations (and
> Jakub's fixes to shldq/shrdq patterns). Now that expanding a TImode
> rotation can never fail, it is safe to allow general_operand constraints
> on the QImode shift amounts in rotlv1ti3 and rotrv1ti3 patterns.
> I've also made an additional tweak to ix86_expand_v1ti_to_ti to use
> vec_extract via V2DImode, which avoids using memory and takes advantage
> of vpextrq on recent hardware.
>
> For the following test case:
>
> typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
> uv1ti rotr(uv1ti x, unsigned int i) { return (x >> i) | (x << (128-i)); }
>
> GCC with -O2 -mavx2 would previously generate:
>
> rotr: vmovdqa %xmm0, -24(%rsp)
> movq -16(%rsp), %rdx
> movl %edi, %ecx
> xorl %esi, %esi
> movq -24(%rsp), %rax
> shrdq %rdx, %rax
> shrq %cl, %rdx
> testb $64, %dil
> cmovne %rdx, %rax
> cmovne %rsi, %rdx
> negl %ecx
> xorl %edi, %edi
> andl $127, %ecx
> vmovq %rax, %xmm2
> movq -24(%rsp), %rax
> vpinsrq $1, %rdx, %xmm2, %xmm1
> movq -16(%rsp), %rdx
> shldq %rax, %rdx
> salq %cl, %rax
> testb $64, %cl
> cmovne %rax, %rdx
> cmovne %rdi, %rax
> vmovq %rax, %xmm3
> vpinsrq $1, %rdx, %xmm3, %xmm0
> vpor %xmm1, %xmm0, %xmm0
> ret
>
> with this patch, we now generate:
>
> rotr: movl %edi, %ecx
> vpextrq $1, %xmm0, %rax
> vmovq %xmm0, %rdx
> shrdq %rax, %rdx
> vmovq %xmm0, %rsi
> shrdq %rsi, %rax
> andl $64, %ecx
> movq %rdx, %rsi
> cmovne %rax, %rsi
> cmove %rax, %rdx
> vmovq %rsi, %xmm0
> vpinsrq $1, %rdx, %xmm0, %xmm0
> ret
>
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check with no new failures. Ok for mainline?
>
>
> 2021-11-28 Roger Sayle <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
> * config/i386/i386-expand.c (ix86_expand_v1ti_to_ti): Perform the
> conversion via V2DImode using vec_extractv2didi on TARGET_SSE2.
> * config/i386/sse.md (rotlv1ti3, rotrv1ti3): Change constraint
> on QImode shift amounts from const_int_operand to general_operand.
>
> gcc/testsuite/ChangeLog
> * gcc.target/i386/sse2-v1ti-rotate.c: New test case.
OK.
Thanks,
Uros.
>
>
> Thanks in advance,
> Roger
> --
>
@@ -6162,7 +6162,17 @@ static rtx
ix86_expand_v1ti_to_ti (rtx x)
{
rtx result = gen_reg_rtx (TImode);
- emit_move_insn (result, gen_lowpart (TImode, x));
+ if (TARGET_SSE2)
+ {
+ rtx temp = gen_reg_rtx (V2DImode);
+ emit_move_insn (temp, gen_lowpart (V2DImode, x));
+ rtx lo = gen_lowpart (DImode, result);
+ emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
+ rtx hi = gen_highpart (DImode, result);
+ emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
+ }
+ else
+ emit_move_insn (result, gen_lowpart (TImode, x));
return result;
}
@@ -15169,7 +15169,7 @@
[(set (match_operand:V1TI 0 "register_operand")
(rotate:V1TI
(match_operand:V1TI 1 "register_operand")
- (match_operand:QI 2 "const_int_operand")))]
+ (match_operand:QI 2 "general_operand")))]
"TARGET_SSE2 && TARGET_64BIT"
{
ix86_expand_v1ti_rotate (ROTATE, operands);
@@ -15180,7 +15180,7 @@
[(set (match_operand:V1TI 0 "register_operand")
(rotatert:V1TI
(match_operand:V1TI 1 "register_operand")
- (match_operand:QI 2 "const_int_operand")))]
+ (match_operand:QI 2 "general_operand")))]
"TARGET_SSE2 && TARGET_64BIT"
{
ix86_expand_v1ti_rotate (ROTATERT, operands);
new file mode 100644
@@ -0,0 +1,11 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+
+typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
+
+uv1ti rotr(uv1ti x, unsigned int i) { return (x >> i) | (x << (128-i)); }
+uv1ti rotl(uv1ti x, unsigned int i) { return (x << i) | (x >> (128-i)); }
+
+/* { dg-final { scan-assembler-not "shrq" } } */
+/* { dg-final { scan-assembler-not "salq" } } */