AArch64: Optimize right shift rounding narrowing
Commit Message
Hi All,
This optimizes rounding right shift narrow instructions into rounding add
narrow high instructions where one operand is a zero vector, when the shift
amount is half the bit width of the original input element type.
i.e.
uint32x4_t foo (uint64x2_t a, uint64x2_t b)
{
return vrshrn_high_n_u64 (vrshrn_n_u64 (a, 32), b, 32);
}
now generates:
foo:
movi v3.4s, 0
raddhn v0.2s, v2.2d, v3.2d
raddhn2 v0.4s, v2.2d, v3.2d
instead of:
foo:
rshrn v0.2s, v0.2d, 32
rshrn2 v0.4s, v1.2d, 32
ret
On Arm cores this is an improvement in both latency and throughput.
Because a vector zero is needed, I created a new function
aarch64_gen_shareable_zero that creates the zero in V4SImode and then takes a
subreg of it in the desired mode.  This allows CSE to share all the zero
constants.
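
For reference, the rewrite relies on the identity that a rounding shift right
narrow by the full width of the narrow element produces the same value as a
rounding add returning the high narrow half when the second addend is zero.
A small standalone sanity check of that identity (illustrative only, not part
of the patch; it assumes an AArch64 target with <arm_neon.h> and uses a few
arbitrary test values) could look like:

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint64_t inputs[] = { 0, 1, 0x7fffffffULL, 0x80000000ULL,
                        0xdeadbeefcafef00dULL, UINT64_MAX };

  for (unsigned i = 0; i < sizeof inputs / sizeof inputs[0]; i++)
    {
      uint64x2_t a = vdupq_n_u64 (inputs[i]);

      /* Rounding shift right narrow: (x + (1 << 31)) >> 32 per lane,
         truncated to 32 bits.  */
      uint32x2_t shifted = vrshrn_n_u64 (a, 32);

      /* Rounding add narrow high with a zero operand: the high 32 bits of
         x + 0 + (1 << 31) per lane, i.e. the same value.  */
      uint32x2_t added = vraddhn_u64 (a, vdupq_n_u64 (0));

      assert (vget_lane_u32 (shifted, 0) == vget_lane_u32 (added, 0));
      assert (vget_lane_u32 (shifted, 1) == vget_lane_u32 (added, 1));
    }
  return 0;
}
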
Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
Ok for master?
Thanks,
Tamar
gcc/ChangeLog:
* config/aarch64/aarch64-protos.h (aarch64_gen_shareable_zero): New.
* config/aarch64/aarch64-simd.md (aarch64_rshrn<mode>,
aarch64_rshrn2<mode>):
* config/aarch64/aarch64.c (aarch64_gen_shareable_zero): New.
gcc/testsuite/ChangeLog:
* gcc.target/aarch64/advsimd-intrinsics/shrn-1.c: New test.
* gcc.target/aarch64/advsimd-intrinsics/shrn-2.c: New test.
* gcc.target/aarch64/advsimd-intrinsics/shrn-3.c: New test.
* gcc.target/aarch64/advsimd-intrinsics/shrn-4.c: New test.
--- inline copy of patch --
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index f7887d06139f01c1591c4e755538d94e5e608a52..f7f5cae82bc9198e54d0298f25f7c0f5902d5fb1 100644
--
Comments
Adding ML back in. ☹
> -----Original Message-----
> From: Tamar Christina <Tamar.Christina@arm.com>
> Sent: Tuesday, November 23, 2021 3:17 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: Richard Earnshaw <Richard.Earnshaw@arm.com>; nd <nd@arm.com>;
> Richard Sandiford <Richard.Sandiford@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>; Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
> Subject: RE: [PATCH]AArch64 Optimize right shift rounding narrowing
>
> Ping.
>
Tamar Christina <tamar.christina@arm.com> writes:
> Hi All,
>
> This optimizes right shift rounding narrow instructions to
> rounding add narrow high where one vector is 0 when the shift amount is half
> that of the original input type.
>
> i.e.
>
> uint32x4_t foo (uint64x2_t a, uint64x2_t b)
> {
> return vrshrn_high_n_u64 (vrshrn_n_u64 (a, 32), b, 32);
> }
>
> now generates:
>
> foo:
> movi v3.4s, 0
> raddhn v0.2s, v2.2d, v3.2d
> raddhn2 v0.4s, v2.2d, v3.2d
>
> instead of:
>
> foo:
> rshrn v0.2s, v0.2d, 32
> rshrn2 v0.4s, v1.2d, 32
> ret
>
> On Arm cores this is an improvement in both latency and throughput.
> Because a vector zero is needed I created a new method
> aarch64_gen_shareable_zero that creates zeros using V4SI and then takes a subreg
> of the zero to the desired type. This allows CSE to share all the zero
> constants.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
LGTM. Just a couple of nits:
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * config/aarch64/aarch64-protos.h (aarch64_gen_shareable_zero): New.
> * config/aarch64/aarch64-simd.md (aarch64_rshrn<mode>,
> aarch64_rshrn2<mode>):
Missing description.
> * config/aarch64/aarch64.c (aarch64_gen_shareable_zero): New.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/aarch64/advsimd-intrinsics/shrn-1.c: New test.
> * gcc.target/aarch64/advsimd-intrinsics/shrn-2.c: New test.
> * gcc.target/aarch64/advsimd-intrinsics/shrn-3.c: New test.
> * gcc.target/aarch64/advsimd-intrinsics/shrn-4.c: New test.
>
> --- inline copy of patch --
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index f7887d06139f01c1591c4e755538d94e5e608a52..f7f5cae82bc9198e54d0298f25f7c0f5902d5fb1 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -846,6 +846,7 @@ const char *aarch64_output_move_struct (rtx *operands);
> rtx aarch64_return_addr_rtx (void);
> rtx aarch64_return_addr (int, rtx);
> rtx aarch64_simd_gen_const_vector_dup (machine_mode, HOST_WIDE_INT);
> +rtx aarch64_gen_shareable_zero (machine_mode);
> bool aarch64_simd_mem_operand_p (rtx);
> bool aarch64_sve_ld1r_operand_p (rtx);
> bool aarch64_sve_ld1rq_operand_p (rtx);
> diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
> index c71658e2bf52b26bf9fc9fa702dd5446447f4d43..d7f8694add540e32628893a7b7471c08de6f760f 100644
> --- a/gcc/config/aarch64/aarch64-simd.md
> +++ b/gcc/config/aarch64/aarch64-simd.md
> @@ -1956,20 +1956,32 @@ (define_expand "aarch64_rshrn<mode>"
> (match_operand:SI 2 "aarch64_simd_shift_imm_offset_<vn_mode>")]
> "TARGET_SIMD"
> {
> - operands[2] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
> - INTVAL (operands[2]));
> - rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
> - if (BYTES_BIG_ENDIAN)
> - emit_insn (gen_aarch64_rshrn<mode>_insn_be (tmp, operands[1],
> - operands[2], CONST0_RTX (<VNARROWQ>mode)));
> + if (INTVAL (operands[2]) == GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode))
> + {
> + rtx tmp0 = aarch64_gen_shareable_zero (<MODE>mode);
> + emit_insn (gen_aarch64_raddhn<mode> (operands[0], operands[1], tmp0));
> + }
> else
> - emit_insn (gen_aarch64_rshrn<mode>_insn_le (tmp, operands[1],
> - operands[2], CONST0_RTX (<VNARROWQ>mode)));
> -
> - /* The intrinsic expects a narrow result, so emit a subreg that will get
> - optimized away as appropriate. */
> - emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
> - <VNARROWQ2>mode));
> + {
> + rtx tmp = gen_reg_rtx (<VNARROWQ2>mode);
> + operands[2] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
> + INTVAL (operands[2]));
> + if (BYTES_BIG_ENDIAN)
> + emit_insn (
> + gen_aarch64_rshrn<mode>_insn_be (tmp, operands[1],
> + operands[2],
> + CONST0_RTX (<VNARROWQ>mode)));
> + else
> + emit_insn (
> + gen_aarch64_rshrn<mode>_insn_le (tmp, operands[1],
> + operands[2],
> + CONST0_RTX (<VNARROWQ>mode)));
> +
> + /* The intrinsic expects a narrow result, so emit a subreg that will
> + get optimized away as appropriate. */
> + emit_move_insn (operands[0], lowpart_subreg (<VNARROWQ>mode, tmp,
> + <VNARROWQ2>mode));
> + }
> DONE;
> }
> )
> @@ -2049,14 +2061,27 @@ (define_expand "aarch64_rshrn2<mode>"
> (match_operand:SI 3 "aarch64_simd_shift_imm_offset_<vn_mode>")]
> "TARGET_SIMD"
> {
> - operands[3] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
> - INTVAL (operands[3]));
> - if (BYTES_BIG_ENDIAN)
> - emit_insn (gen_aarch64_rshrn2<mode>_insn_be (operands[0], operands[1],
> - operands[2], operands[3]));
> + if (INTVAL (operands[3]) == GET_MODE_UNIT_BITSIZE (<VNARROWQ2>mode))
> + {
> + rtx tmp = aarch64_gen_shareable_zero (<MODE>mode);
> + emit_insn (gen_aarch64_raddhn2<mode> (operands[0], operands[1],
> + operands[2], tmp));
> + }
> else
> - emit_insn (gen_aarch64_rshrn2<mode>_insn_le (operands[0], operands[1],
> - operands[2], operands[3]));
> + {
> + operands[3] = aarch64_simd_gen_const_vector_dup (<MODE>mode,
> + INTVAL (operands[3]));
> + if (BYTES_BIG_ENDIAN)
> + emit_insn (gen_aarch64_rshrn2<mode>_insn_be (operands[0],
> + operands[1],
> + operands[2],
> + operands[3]));
> + else
> + emit_insn (gen_aarch64_rshrn2<mode>_insn_le (operands[0],
> + operands[1],
> + operands[2],
> + operands[3]));
> + }
> DONE;
> }
> )
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index fdf05505846721b02059df494d6395ae9423a8ef..11201ea3498beb270c0a7f8da5f5009d710535ee 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -20397,6 +20397,18 @@ aarch64_mov_operand_p (rtx x, machine_mode mode)
> == SYMBOL_TINY_ABSOLUTE;
> }
>
> +/* Create a 0 constant that is based of V4SI to allow CSE to optimally share
based on
OK otherwise, thanks. I think long-term we should create shareable
zeros in all contexts, a bit like we do for PTRUEs, but I realise
that isn't late stage 1 material.
Richard
> + the constant creation. */
> +
> +rtx
> +aarch64_gen_shareable_zero (machine_mode mode)
> +{
> + machine_mode zmode = V4SImode;
> + rtx tmp = gen_reg_rtx (zmode);
> + emit_move_insn (tmp, CONST0_RTX (zmode));
> + return lowpart_subreg (mode, tmp, zmode);
> +}
> +
> /* Return a const_int vector of VAL. */
> rtx
> aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-1.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..4bc3aa9563ee7d0dc46557d30d9a29149706229d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-1.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile { target { aarch64*-*-* } } } */
> +/* { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
> +
> +#include <arm_neon.h>
> +
> +uint8x16_t foo (uint32x4_t a, uint32x4_t b)
> +{
> + uint16x4_t a1 = vrshrn_n_u32 (a, 16);
> + uint16x8_t b1 = vrshrn_high_n_u32 (a1, b, 16);
> + return vrshrn_high_n_u16 (vrshrn_n_u16 (b1, 8), b1, 8);
> +}
> +
> +/* { dg-final { scan-assembler-times {\tmovi\t} 1 } } */
> +/* { dg-final { scan-assembler-times {\traddhn\t} 2 } } */
> +/* { dg-final { scan-assembler-times {\traddhn2\t} 2 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-2.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..09d913e85524f06367c1c2cf51dda0f57578e9ae
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-2.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile { target { aarch64*-*-* } } } */
> +
> +#include <arm_neon.h>
> +
> +uint32x4_t foo (uint64x2_t a, uint64x2_t b)
> +{
> + return vrshrn_high_n_u64 (vrshrn_n_u64 (a, 32), b, 32);
> +}
> +
> +/* { dg-final { scan-assembler-times {\traddhn\t} 1 } } */
> +/* { dg-final { scan-assembler-times {\traddhn2\t} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-3.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..bdccbb3410f049d7e45aabdcc3d2964fbabca807
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-3.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile { target { aarch64*-*-* } } } */
> +
> +#include <arm_neon.h>
> +
> +uint16x8_t foo (uint32x4_t a, uint32x4_t b)
> +{
> + return vrshrn_high_n_u32 (vrshrn_n_u32 (a, 16), b, 16);
> +}
> +
> +/* { dg-final { scan-assembler-times {\traddhn\t} 1 } } */
> +/* { dg-final { scan-assembler-times {\traddhn2\t} 1 } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-4.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-4.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..4b23eddb85891975b8e122060e2a9ebfe56d842c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/shrn-4.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile { target { aarch64*-*-* } } } */
> +
> +#include <arm_neon.h>
> +
> +uint8x16_t foo (uint16x8_t a, uint16x8_t b)
> +{
> + return vrshrn_high_n_u16 (vrshrn_n_u16 (a, 8), b, 8);
> +}
> +
> +/* { dg-final { scan-assembler-times {\traddhn\t} 1 } } */
> +/* { dg-final { scan-assembler-times {\traddhn2\t} 1 } } */