[v2] s390x: Optimize vector permute with constant indexes
Checks
Commit Message
Loop vectorizer can generate vector permutes with constant indexes
where all indexes are equal. Optimize this case to use vector
replicate instead of vector permute.
gcc/ChangeLog:
* config/s390/s390.cc (expand_perm_as_replicate): Implement.
(vectorize_vec_perm_const_1): Call new function.
* config/s390/vx-builtins.md (vec_splat<mode>): Change to...
(@vec_splat<mode>): ...this.
gcc/testsuite/ChangeLog:
* gcc.target/s390/vector/vec-expand-replicate.c: New test.
Bootstrapped and regtested on s390x. Ok for trunk?
Signed-off-by: Juergen Christ <jchrist@linux.ibm.com>
---
gcc/config/s390/s390.cc | 33 ++++++++++
gcc/config/s390/vx-builtins.md | 2 +-
.../s390/vector/vec-expand-replicate.c | 60 +++++++++++++++++++
3 files changed, 94 insertions(+), 1 deletion(-)
create mode 100644 gcc/testsuite/gcc.target/s390/vector/vec-expand-replicate.c
Comments
On 4/9/24 16:31, Juergen Christ wrote:
> Loop vectorizer can generate vector permutes with constant indexes
> where all indexes are equal. Optimize this case to use vector
> replicate instead of vector permute.
>
> gcc/ChangeLog:
>
> * config/s390/s390.cc (expand_perm_as_replicate): Implement.
> (vectorize_vec_perm_const_1): Call new function.
> * config/s390/vx-builtins.md (vec_splat<mode>): Change to...
> (@vec_splat<mode>): ...this.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/s390/vector/vec-expand-replicate.c: New test.
>
> Bootstrapped and regtested on s390x. Ok for trunk?
Does this also work when using the vec_perm intrinsic or would we need to define a matching RTX for
that?
Ok. Thanks!
Andreas
Am Tue, Apr 09, 2024 at 05:01:18PM +0200 schrieb Andreas Krebbel:
> On 4/9/24 16:31, Juergen Christ wrote:
> > Loop vectorizer can generate vector permutes with constant indexes
> > where all indexes are equal. Optimize this case to use vector
> > replicate instead of vector permute.
> >
> > gcc/ChangeLog:
> >
> > * config/s390/s390.cc (expand_perm_as_replicate): Implement.
> > (vectorize_vec_perm_const_1): Call new function.
> > * config/s390/vx-builtins.md (vec_splat<mode>): Change to...
> > (@vec_splat<mode>): ...this.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/s390/vector/vec-expand-replicate.c: New test.
> >
> > Bootstrapped and regtested on s390x. Ok for trunk?
>
> Does this also work when using the vec_perm intrinsic or would we need to define a matching RTX for
> that?
Unfortunately, it does not work with vec_perm.
> Ok. Thanks!
Pushed.
Juergen
@@ -17923,6 +17923,36 @@ expand_perm_as_a_vlbr_vstbr_candidate (const struct expand_vec_perm_d &d)
return false;
}
+static bool
+expand_perm_as_replicate (const struct expand_vec_perm_d &d) /* Try to expand the permute D as a replicate of one element; return true on success.  */
+{
+ unsigned char i;
+ unsigned char elem; /* The index that every entry of d.perm must share.  */
+ rtx base = d.op0; /* Vector operand handed to the splat; switched to d.op1 below if needed.  */
+ rtx insn;
+ /* Needed to silence maybe-uninitialized warning. */
+ gcc_assert (d.nelt > 0);
+ elem = d.perm[0];
+ for (i = 1; i < d.nelt; ++i) /* Give up unless all selector entries equal the first one.  */
+ if (d.perm[i] != elem)
+ return false;
+ if (!d.testing_p)
+ {
+ if (elem >= d.nelt) /* An index of d.nelt or more is taken from d.op1 instead of d.op0.  */
+ {
+ base = d.op1;
+ elem -= d.nelt; /* Rebase the index so that it is relative to d.op1.  */
+ }
+ insn = maybe_gen_vec_splat (d.vmode, d.target, base, GEN_INT (elem));
+ if (insn == NULL_RTX) /* NOTE(review): presumably means no vec_splat pattern for d.vmode -- confirm.  */
+ return false;
+ emit_insn (insn);
+ return true;
+ }
+ else /* Testing only: emit nothing, just report whether an insn code is found for d.vmode.  */
+ return maybe_code_for_vec_splat (d.vmode) != CODE_FOR_nothing;
+}
+
/* Try to find the best sequence for the vector permute operation
described by D. Return true if the operation could be
expanded. */
@@ -17941,6 +17971,9 @@ vectorize_vec_perm_const_1 (const struct expand_vec_perm_d &d)
if (expand_perm_as_a_vlbr_vstbr_candidate (d))
return true;
+ if (expand_perm_as_replicate (d))
+ return true;
+
return false;
}
@@ -424,7 +424,7 @@
; Replicate from vector element
-(define_expand "vec_splat<mode>"
+(define_expand "@vec_splat<mode>"
[(set (match_operand:V_HW 0 "register_operand" "")
(vec_duplicate:V_HW (vec_select:<non_vec>
(match_operand:V_HW 1 "register_operand" "")
new file mode 100644
@@ -0,0 +1,60 @@
+/* Check that the vectorize_vec_perm_const expander correctly deals with
+ replication. Extracted from spec "nab". */
+
+/* { dg-do compile } */
+/* { dg-options "-O3 -mzarch -march=z13 -fvect-cost-model=unlimited" } */
+
+typedef double POINT_T[3]; /* Array of three doubles. */
+typedef double MATRIX_T[][4]; /* Array of unspecified length whose rows hold four doubles. */
+typedef struct {
+ POINT_T a_pos; /* The only field kept in the reduced testcase. */
+} ATOM_T;
+typedef struct {
+ ATOM_T *r_atoms; /* Indexed by add_he2o3transformmol_a in add_he2o3transformmol. */
+} RESIDUE_T;
+typedef struct strand_t {
+ RESIDUE_T *s_residues; /* Indexed by add_he2o3transformmol_r in add_he2o3transformmol. */
+} STRAND_T;
+typedef struct strand_t MOLECULE_T;
+double xfm_xyz_oxyz4[4]; /* Four-element scratch vector filled in by xfm_xyz. */
+MOLECULE_T add_he2o3transformmol_mol, add_he2o3transformmol_sp;
+RESIDUE_T add_he2o3transformmol_res;
+int add_he2o3transformmol_r, add_he2o3transformmol_a, add_he2o3transformmol_i; /* Loop counters kept at file scope. */
+ATOM_T *add_he2o3transformmol_ap;
+POINT_T add_he2o3transformmol_xyz, add_he2o3transformmol_nxyz; /* Input and output points passed to xfm_xyz. */
+static void xfm_xyz(POINT_T oxyz, MATRIX_T mat, POINT_T nxyz) { /* Store in nxyz the first three components of the row vector (oxyz, 1.0) times mat. */
+ int i, j;
+ double nxyz4[4]; /* Full four-component product; only the first three are copied out. */
+ for (i = 0; i < 3; i++)
+ xfm_xyz_oxyz4[i] = oxyz[i];
+ xfm_xyz_oxyz4[3] = 1.0; /* Fourth input component is fixed at 1.0. */
+ for (i = 0; i < 4; i++) {
+ nxyz4[i] = 0.0;
+ for (j = 0; j < 4; j++)
+ nxyz4[i] += xfm_xyz_oxyz4[j] * mat[j][i]; /* Accumulate column i of mat weighted by the input vector. */
+ }
+ for (i = 0; i < 3; i++)
+ nxyz[i] = nxyz4[i];
+}
+void add_he2o3transformmol(MATRIX_T mat, int n) { /* Run each of the first n atom positions of a residue through xfm_xyz with mat, in place. */
+ for (add_he2o3transformmol_sp = add_he2o3transformmol_mol;;) /* No exit condition or increment in this reduced testcase. */
+ for (add_he2o3transformmol_r = 0;;) { /* Likewise no exit condition or increment. */
+ add_he2o3transformmol_res =
+ add_he2o3transformmol_sp.s_residues[add_he2o3transformmol_r];
+ for (add_he2o3transformmol_a = 0; add_he2o3transformmol_a < n; add_he2o3transformmol_a++) {
+ add_he2o3transformmol_ap =
+ &add_he2o3transformmol_res.r_atoms[add_he2o3transformmol_a];
+ for (add_he2o3transformmol_i = 0; add_he2o3transformmol_i < 3;
+ add_he2o3transformmol_i++) /* Copy the atom position into the global input point. */
+ add_he2o3transformmol_xyz[add_he2o3transformmol_i] =
+ add_he2o3transformmol_ap->a_pos[add_he2o3transformmol_i];
+ xfm_xyz(add_he2o3transformmol_xyz, mat, add_he2o3transformmol_nxyz);
+ for (add_he2o3transformmol_i = 0; add_he2o3transformmol_i < 3;
+ add_he2o3transformmol_i++) /* Write the transformed point back into the atom. */
+ add_he2o3transformmol_ap->a_pos[add_he2o3transformmol_i] =
+ add_he2o3transformmol_nxyz[add_he2o3transformmol_i];
+ }
+ }
+}
+
+/* { dg-final { scan-assembler-not "vperm" } } */