[x86_64] Implement __imag__ of float _Complex using shufps.

Message ID 022001d8894d$aa3a6370$feaf2a50$@nextmovesoftware.com
State Committed
Commit 64d4f27a0ce47e97867512bda7fa5683acf8a134
Headers
Series [x86_64] Implement __imag__ of float _Complex using shufps. |

Commit Message

Roger Sayle June 26, 2022, 11:12 a.m. UTC
  This patch is a follow-up improvement to my recent patch for
PR rtl-optimization/7061.  That patch added the test case
gcc.target/i386/pr7061-2.c:

float im(float _Complex a) { return __imag__ a; }

For which GCC on x86_64 currently generates:

        movq    %xmm0, %rax
        shrq    $32, %rax
        movd    %eax, %xmm0
        ret

but with this patch we now generate (the same as LLVM):

        shufps  $85, %xmm0, %xmm0
        ret

This is achieved by providing a define_insn_and_split that allows
truncated lshiftrt:DI by 32 to be performed on either SSE or general
regs, where if the register allocator prefers to use SSE, we split
to a shufps_v4si, or if not, we use a regular shrq.

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, with no new failures.  Ok for mainline?


2022-06-26  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
	PR rtl-optimization/7061
	* config/i386/i386.md (*highpartdisi2): New define_insn_and_split.

gcc/testsuite/ChangeLog
	PR rtl-optimization/7061
	* gcc.target/i386/pr7061-2.c: Update to look for shufps.


Roger
--
  

Comments

Uros Bizjak June 26, 2022, 4:58 p.m. UTC | #1
On Sun, Jun 26, 2022 at 1:12 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch is a follow-up improvement to my recent patch for
> PR rtl-optimization/7061.  That patch added the test case
> gcc.target/i386/pr7061-2.c:
>
> float im(float _Complex a) { return __imag__ a; }
>
> For which GCC on x86_64 currently generates:
>
>         movq    %xmm0, %rax
>         shrq    $32, %rax
>         movd    %eax, %xmm0
>         ret
>
> but with this patch we now generate (the same as LLVM):
>
>         shufps  $85, %xmm0, %xmm0
>         ret
>
> This is achieved by providing a define_insn_and_split that allows
> truncated lshiftrt:DI by 32 to be performed on either SSE or general
> regs, where if the register allocator prefers to use SSE, we split
> to a shufps_v4si, or if not, we use a regular shrq.
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check, with no new failures.  Ok for mainline?
>
>
> 2022-06-26  Roger Sayle  <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
>         PR rtl-optimization/7061
>         * config/i386/i386.md (*highpartdisi2): New define_insn_and_split.
>
> gcc/testsuite/ChangeLog
>         PR rtl-optimization/7061
>         * gcc.target/i386/pr7061-2.c: Update to look for shufps.

OK.

Thanks,
Uros.

>
>
> Roger
> --
>
  

Patch

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 5b53841..709598c 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -13234,6 +13234,31 @@ 
        (const_string "*")))
    (set_attr "mode" "<MODE>")])
 
+;; Specialization of *lshr<mode>3_1 below, extracting the SImode
+;; highpart of a DImode value, but allowing it to be clobbered.
+(define_insn_and_split "*highpartdisi2"
+  [(set (subreg:DI (match_operand:SI 0 "register_operand" "=r,x,?k") 0)
+        (lshiftrt:DI (match_operand:DI 1 "register_operand" "0,0,k")
+		     (const_int 32)))
+   (clobber (reg:CC FLAGS_REG))]
+  "TARGET_64BIT"
+  "#"
+  "&& reload_completed"
+  [(parallel
+    [(set (match_dup 0) (lshiftrt:DI (match_dup 1) (const_int 32)))
+     (clobber (reg:CC FLAGS_REG))])]
+{
+  if (SSE_REG_P (operands[0]))
+    {
+      rtx tmp = gen_rtx_REG (V4SImode, REGNO (operands[0]));
+      emit_insn (gen_sse_shufps_v4si (tmp, tmp, tmp,
+				      const1_rtx, const1_rtx,
+				      GEN_INT (5), GEN_INT (5)));
+      DONE;
+    }
+  operands[0] = gen_rtx_REG (DImode, REGNO (operands[0]));
+})
+
 (define_insn "*lshr<mode>3_1"
   [(set (match_operand:SWI48 0 "nonimmediate_operand" "=rm,r,?k")
 	(lshiftrt:SWI48
diff --git a/gcc/testsuite/gcc.target/i386/pr7061-2.c b/gcc/testsuite/gcc.target/i386/pr7061-2.c
index ac33340..837cd83 100644
--- a/gcc/testsuite/gcc.target/i386/pr7061-2.c
+++ b/gcc/testsuite/gcc.target/i386/pr7061-2.c
@@ -1,5 +1,9 @@ 
 /* { dg-do compile { target { ! ia32 } } } */
 /* { dg-options "-O2" } */
 float im(float _Complex a) { return __imag__ a; }
+/* { dg-final { scan-assembler "shufps" } } */
+/* { dg-final { scan-assembler-not "movd" } } */
+/* { dg-final { scan-assembler-not "movq" } } */
 /* { dg-final { scan-assembler-not "movss" } } */
 /* { dg-final { scan-assembler-not "rsp" } } */
+/* { dg-final { scan-assembler-not "shr" } } */