Strip down a vector load which is only used partially.
Commit Message
Optimize
_1 = *srcp_3(D);
_4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
_5 = BIT_FIELD_REF <_4, 128, 0>;
to
_1 = *srcp_3(D);
_5 = BIT_FIELD_REF <_1, 128, 128>;
The above will eventually be optimized to
_5 = BIT_FIELD_REF <*srcp_3(D), 128, 128>;
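
For reference, source code of roughly the following shape (a hypothetical function modeled on the new testcase; the exact permute mask depends on the vectorizer) is the kind of input that ends up as the GIMPLE above:

  typedef int v8si __attribute__((vector_size(32)));
  typedef float v4sf __attribute__((vector_size(16)));

  v4sf upper_half (v8si *srcp)
  {
    v8si src = *srcp;                 /* _1 = *srcp_3(D);  */
    /* Only elements 4..7 of the 256-bit load are used, which shows up as a
       VEC_PERM_EXPR selecting the upper half followed by a BIT_FIELD_REF
       of the low 128 bits of the permute result.  */
    v4sf res = { (float)src[4], (float)src[5], (float)src[6], (float)src[7] };
    return res;
  }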
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?
gcc/ChangeLog:
PR tree-optimization/102583
* gimple.h (gate_optimize_vector_load): Declare.
* match.pd: Simplify (BIT_FIELD_REF (vec_perm *p *p { 4, 5, 6,
7, 4, 5, 6, 7 }) 128 0) to (BIT_FIELD_REF *p 128 128).
* tree-ssa-forwprop.cc (gate_optimize_vector_load): New
function.
(pass_forwprop::execute): Move the condition checks into the new
function above.
gcc/testsuite/ChangeLog:
* gcc.target/i386/pr102583.c: New test.
---
gcc/gimple.h | 1 +
gcc/match.pd | 56 ++++++++++++++++++++++++
gcc/testsuite/gcc.target/i386/pr102583.c | 30 +++++++++++++
gcc/tree-ssa-forwprop.cc | 32 +++++++++-----
4 files changed, 109 insertions(+), 10 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr102583.c
Comments
On Thu, May 5, 2022 at 7:04 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> Optimize
>
> _1 = *srcp_3(D);
> _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
> _5 = BIT_FIELD_REF <_4, 128, 0>;
>
> to
>
> _1 = *srcp_3(D);
> _5 = BIT_FIELD_REF <_1, 128, 128>;
>
> the upper will finally be optimized to
>
> _5 = BIT_FIELD_REF <*srcp_3(D), 128, 128>;
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{m32,}.
> Ok for trunk?
Hmm, tree-ssa-forwprop.cc:simplify_bitfield_ref should already
handle this in the
if (code == VEC_PERM_EXPR
&& constant_multiple_p (bit_field_offset (op), size, &idx))
{
part of the code - maybe that needs to be enhanced to cover
a contiguous stride in the VEC_PERM_EXPR. I see
we have
size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
if (maybe_ne (bit_field_size (op), size))
return false;
where it will currently bail, so adjust that to check for a
constant multiple. I also think we should only handle the
case where the new bit_field_offset alignment is not
worse than the original one.
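A rough, self-contained sketch of the kind of selector check being described (plain C rather than GCC internals; all names here are invented for the sketch):

  #include <stdbool.h>
  #include <stddef.h>

  /* SEL holds the permute indices covered by the BIT_FIELD_REF window,
     ELT_BITS is the vector element width, REF_BITS the width of the
     extraction and ORIG_BIT_POS its original bit position.  Accept only a
     contiguous run whose new bit position is at least as well aligned with
     respect to the access size as the original position.  */
  static bool
  contiguous_and_aligned (const unsigned sel[], size_t n, unsigned elt_bits,
                          unsigned ref_bits, unsigned orig_bit_pos)
  {
    for (size_t i = 1; i < n; ++i)
      if (sel[i] != sel[i - 1] + 1)    /* not a contiguous slice */
        return false;
    unsigned new_bit_pos = sel[0] * elt_bits;
    if (orig_bit_pos % ref_bits == 0 && new_bit_pos % ref_bits != 0)
      return false;                    /* alignment would get worse */
    return true;
  }

For the motivating case sel = {4, 5, 6, 7}, elt_bits = 32, ref_bits = 128 and orig_bit_pos = 0, giving a new bit position of 128, which passes both checks.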
That said, I'd prefer if you integrate this transform with
simplify_bitfield_ref.
Richard.
>
> gcc/ChangeLog:
>
> PR tree-optimization/102583
> * gimple.h (gate_optimize_vector_load): Declare.
> * match.pd: Simplify (BIT_FIELD_REF (vec_perm *p *p { 4, 5, 6,
> 7, 4, 5, 6, 7 }) 128 0) to (BIT_FIELD_REF *p 128 128).
> * tree-ssa-forwprop.cc (gate_optimize_vector_load): New
> function.
> (pass_forwprop::execute): Put condition codes in the upper new
> function.
>
> gcc/testsuite/ChangeLog:
>
> * gcc.target/i386/pr102583.c: New test.
> ---
> gcc/gimple.h | 1 +
> gcc/match.pd | 56 ++++++++++++++++++++++++
> gcc/testsuite/gcc.target/i386/pr102583.c | 30 +++++++++++++
> gcc/tree-ssa-forwprop.cc | 32 +++++++++-----
> 4 files changed, 109 insertions(+), 10 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr102583.c
>
> diff --git a/gcc/gimple.h b/gcc/gimple.h
> index 6b1e89ad74e..1747dae1193 100644
> --- a/gcc/gimple.h
> +++ b/gcc/gimple.h
> @@ -1638,6 +1638,7 @@ extern void maybe_remove_unused_call_args (struct function *, gimple *);
> extern bool gimple_inexpensive_call_p (gcall *);
> extern bool stmt_can_terminate_bb_p (gimple *);
> extern location_t gimple_or_expr_nonartificial_location (gimple *, tree);
> +extern bool gate_optimize_vector_load (gimple *);
>
> /* Return the disposition for a warning (or all warnings by default)
> for a statement. */
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 6d691d302b3..ac214310251 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -6832,6 +6832,62 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> }
> (cmp @0 { res; })))))))))
>
> +#if GIMPLE
> +/* Simplify partial vector access, transform
> +
> + V8SI A;
> + V4SI B;
> + A = *PA;
> + B = VEC_PERM_EXPR (A, A, { 4, 5, 6, 7, 4, 5, 6, 7 });
> + C = BIT_FIELD_REF (B, 128, 0)
> +
> +to
> +
> + A = *PA;
> + C = BIT_FIELD_REF (A, 128, 128);
> +
> +optimize_vector_load will eventually optimize the above to
> +
> + C = BIT_FIELD_REF (*PA, 128, 128); */
> +
> +(simplify
> + (BIT_FIELD_REF (vec_perm@2 SSA_NAME@0 @0 VECTOR_CST@1) @rsize @rpos)
> + (if (VECTOR_TYPE_P (type)
> + && TYPE_MODE (type) != BLKmode
> + && single_use (@2)
> + && gate_optimize_vector_load (SSA_NAME_DEF_STMT (@0))
> + && types_match (TREE_TYPE (type), TREE_TYPE (TREE_TYPE (@0))))
> + (with
> + {
> + unsigned HOST_WIDE_INT nelts = -1;
> + if (!VECTOR_CST_NELTS (@1).is_constant (&nelts))
> + return NULL_TREE;
> + tree inner_type = TREE_TYPE (type);
> + unsigned HOST_WIDE_INT elt_w = tree_to_uhwi (TYPE_SIZE (inner_type));
> + unsigned HOST_WIDE_INT pos = tree_to_uhwi (@rpos);
> + unsigned HOST_WIDE_INT size = tree_to_uhwi (@rsize);
> + unsigned HOST_WIDE_INT start
> + = tree_to_uhwi (vector_cst_elt (@1, pos / elt_w));
> +
> + for (unsigned HOST_WIDE_INT i = pos / elt_w + 1; i != size / elt_w; i++)
> + {
> + /* Contiguous area. */
> + if (tree_to_uhwi (vector_cst_elt (@1, i)) - 1
> + != tree_to_uhwi (vector_cst_elt (@1, i - 1)))
> + return NULL_TREE;
> + }
> +
> + /* Aligned or support movmisalign_optab. */
> + unsigned HOST_WIDE_INT dest_align = tree_to_uhwi (TYPE_SIZE (type));
> + if ((TYPE_ALIGN (TREE_TYPE (@0)) % dest_align
> + || start * elt_w % dest_align)
> + && (optab_handler (movmisalign_optab, TYPE_MODE (type))
> + == CODE_FOR_nothing))
> + return NULL_TREE;
> + }
> + (BIT_FIELD_REF @0 @rsize { bitsize_int (start * elt_w); }))))
> +#endif
> +
> /* Canonicalizations of BIT_FIELD_REFs. */
>
> (simplify
> diff --git a/gcc/testsuite/gcc.target/i386/pr102583.c b/gcc/testsuite/gcc.target/i386/pr102583.c
> new file mode 100644
> index 00000000000..ff2ffb5e671
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr102583.c
> @@ -0,0 +1,30 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512f -O2" } */
> +/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+20\(%.*%ymm} 1 } } */
> +/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+8\(%.*%xmm} 1 } } */
> +/* { dg-final { scan-assembler-times {(?n)vmovq[ \t]+16\(%.*%xmm} 1 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-not {(?n)vpermd[ \t]+.*%zmm} } } */
> +
> +typedef int v16si __attribute__((vector_size(64)));
> +typedef float v8sf __attribute__((vector_size(32)));
> +typedef float v4sf __attribute__((vector_size(16)));
> +typedef float v2sf __attribute__((vector_size(8)));
> +
> +v8sf part (v16si *srcp)
> +{
> + v16si src = *srcp;
> + return (v8sf) { (float)src[5], (float)src[6], (float)src[7], (float)src[8],
> + (float)src[9], (float)src[10], (float)src[11], (float)src[12] };
> +}
> +
> +v4sf part1 (v16si *srcp)
> +{
> + v16si src = *srcp;
> + return (v4sf) { (float)src[2], (float)src[3], (float)src[4], (float)src[5] };
> +}
> +
> +v2sf part2 (v16si *srcp)
> +{
> + v16si src = *srcp;
> + return (v2sf) { (float)src[4], (float)src[5] };
> +}
> diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
> index 484491fa1c5..2c8d8bc6dce 100644
> --- a/gcc/tree-ssa-forwprop.cc
> +++ b/gcc/tree-ssa-forwprop.cc
> @@ -3074,6 +3074,27 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi)
> return true;
> }
>
> +/* Gate for optimize_vector_load. */
> +bool
> +gate_optimize_vector_load (gimple* stmt)
> +{
> + if (!is_gimple_assign (stmt))
> + return false;
> +
> + tree lhs = gimple_assign_lhs (stmt);
> + tree rhs = gimple_assign_rhs1 (stmt);
> + return (cfun
> + && TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE
> + && (TYPE_MODE (TREE_TYPE (lhs)) == BLKmode
> + /* After vector lowering rewrite all loads, but
> + initially do not since this conflicts with
> + vector CONSTRUCTOR to shuffle optimization. */
> + || (cfun->curr_properties & PROP_gimple_lvec))
> + && gimple_assign_load_p (stmt)
> + && !gimple_has_volatile_ops (stmt)
> + && !stmt_can_throw_internal (cfun, stmt)
> + && (!VAR_P (rhs) || !DECL_HARD_REGISTER (rhs)));
> +}
>
> /* Rewrite the vector load at *GSI to component-wise loads if the load
> is only used in BIT_FIELD_REF extractions with eventual intermediate
> @@ -3500,16 +3521,7 @@ pass_forwprop::execute (function *fun)
> else
> gsi_next (&gsi);
> }
> - else if (TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE
> - && (TYPE_MODE (TREE_TYPE (lhs)) == BLKmode
> - /* After vector lowering rewrite all loads, but
> - initially do not since this conflicts with
> - vector CONSTRUCTOR to shuffle optimization. */
> - || (fun->curr_properties & PROP_gimple_lvec))
> - && gimple_assign_load_p (stmt)
> - && !gimple_has_volatile_ops (stmt)
> - && !stmt_can_throw_internal (cfun, stmt)
> - && (!VAR_P (rhs) || !DECL_HARD_REGISTER (rhs)))
> + else if (gate_optimize_vector_load (stmt))
> optimize_vector_load (&gsi);
>
> else if (code == COMPLEX_EXPR)
> --
> 2.18.1
>
On 5/5/2022 2:26 AM, Richard Biener via Gcc-patches wrote:
> On Thu, May 5, 2022 at 7:04 AM liuhongt <hongtao.liu@intel.com> wrote:
>> Optimize
>>
>> _1 = *srcp_3(D);
>> _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
>> _5 = BIT_FIELD_REF <_4, 128, 0>;
>>
>> to
>>
>> _1 = *srcp_3(D);
>> _5 = BIT_FIELD_REF <_1, 128, 128>;
>>
>> the upper will finally be optimized to
>>
>> _5 = BIT_FIELD_REF <*srcp_3(D), 128, 128>;
>>
>> Bootstrapped and regtested on x86_64-pc-linux-gnu{m32,}.
>> Ok for trunk?
> Hmm, tree-ssa-forwprop.cc:simplify_bitfield_ref should already
> handle this in the
>
> if (code == VEC_PERM_EXPR
> && constant_multiple_p (bit_field_offset (op), size, &idx))
> {
>
> part of the code - maybe that needs to be enhanced to cover
> a contiguous stride in the VEC_PERM_EXPR. I see
> we have
>
> size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
> if (maybe_ne (bit_field_size (op), size))
> return false;
>
> where it will currently bail, so adjust that to check for a
> constant multiple. I also think we should only handle the
> case where the new bit_field_offset alignment is not
> worse than the original one.
>
> That said, I'd prefer if you integrate this transform with
> simplify_bitfield_ref.
I've got a hack here that tries to do something similar, but it's trying
to catch the case where a CONSTRUCTOR feeds the BIT_FIELD_REF. It
walks the CONSTRUCTOR elements to see if an element has the right
offset/size to satisfy the BIT_FIELD_REF. For x264 we're often able to
eliminate the VEC_PERMUTE entirely and just forward operands into the
BIT_FIELD_REF.
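At the source level the situation looks roughly like the following hypothetical example (whether the intermediate CONSTRUCTOR and BIT_FIELD_REF survive to forwprop depends on how the vectors were built):

  typedef float v4sf __attribute__((vector_size(16)));

  float pick (float a, float b, float c, float d)
  {
    /* The vector is built from scalars (a CONSTRUCTOR in GIMPLE)...  */
    v4sf v = { a, b, c, d };
    /* ...and only one lane is read back (a BIT_FIELD_REF).  Forwarding the
       matching CONSTRUCTOR element reduces this to just "return c;" with no
       vector code at all.  */
    return v[2];
  }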
I was leaning towards moving those bits into match.pd before submitting,
but if you'd prefer them in tree-ssa-forwprop, that's even easier.
Jeff
On Tue, May 10, 2022 at 12:58 AM Jeff Law via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
>
>
> On 5/5/2022 2:26 AM, Richard Biener via Gcc-patches wrote:
> > On Thu, May 5, 2022 at 7:04 AM liuhongt <hongtao.liu@intel.com> wrote:
> >> Optimize
> >>
> >> _1 = *srcp_3(D);
> >> _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
> >> _5 = BIT_FIELD_REF <_4, 128, 0>;
> >>
> >> to
> >>
> >> _1 = *srcp_3(D);
> >> _5 = BIT_FIELD_REF <_1, 128, 128>;
> >>
> >> the upper will finally be optimized to
> >>
> >> _5 = BIT_FIELD_REF <*srcp_3(D), 128, 128>;
> >>
> >> Bootstrapped and regtested on x86_64-pc-linux-gnu{m32,}.
> >> Ok for trunk?
> > Hmm, tree-ssa-forwprop.cc:simplify_bitfield_ref should already
> > handle this in the
> >
> > if (code == VEC_PERM_EXPR
> > && constant_multiple_p (bit_field_offset (op), size, &idx))
> > {
> >
> > part of the code - maybe that needs to be enhanced to cover
> > a contiguous stride in the VEC_PERM_EXPR. I see
> > we have
> >
> > size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
> > if (maybe_ne (bit_field_size (op), size))
> > return false;
> >
> > where it will currently bail, so adjust that to check for a
> > constant multiple. I also think we should only handle the
> > case where the new bit_field_offset alignment is not
> > worse than the original one.
> >
> > That said, I'd prefer if you integrate this transform with
> > simplify_bitfield_ref.
> I've got a hack here that tries to do something similar, but it's trying
> to catch the case where we CONSTRUCTOR feeds the BIT_FIELD_REF. It
> walks the CONSTRUCTOR elements to see if an element has the right
> offset/size to satisify the BIT_FIELD_REF. For x264 we're often able to
> eliminate the VEC_PERMUTE entirely and just forward operands into the
> BIT_FIELD_REF.
>
> I was leaning towards moving those bits into match.pd before submitting,
> but if you'd prefer them in tree-ssa-forwprop, that's even easier.
I think when deciding where to put things it's important to look where related
transforms reside. We already have a (simplify (BIT_FIELD_REF CONSTRUCTOR@ ...))
pattern which should handle your case as well. So instead of adding something
new it would be nice to figure out why it doesn't handle the case you are
interested in and eventually just adjust the existing pattern.
In the case of the above patch there isn't a match.pd pattern for this yet but
forwprop already has code to match bit-field-refs with vec-perms, so that's the
reason I preferred extending that. But of course the whole thing could live in
match.pd as well.
Richard.
> Jeff
>
>
On 5/10/2022 12:30 AM, Richard Biener wrote:
> On Tue, May 10, 2022 at 12:58 AM Jeff Law via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>>
>> On 5/5/2022 2:26 AM, Richard Biener via Gcc-patches wrote:
>>> On Thu, May 5, 2022 at 7:04 AM liuhongt <hongtao.liu@intel.com> wrote:
>>>> Optimize
>>>>
>>>> _1 = *srcp_3(D);
>>>> _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>;
>>>> _5 = BIT_FIELD_REF <_4, 128, 0>;
>>>>
>>>> to
>>>>
>>>> _1 = *srcp_3(D);
>>>> _5 = BIT_FIELD_REF <_1, 128, 128>;
>>>>
>>>> the upper will finally be optimized to
>>>>
>>>> _5 = BIT_FIELD_REF <*srcp_3(D), 128, 128>;
>>>>
>>>> Bootstrapped and regtested on x86_64-pc-linux-gnu{m32,}.
>>>> Ok for trunk?
>>> Hmm, tree-ssa-forwprop.cc:simplify_bitfield_ref should already
>>> handle this in the
>>>
>>> if (code == VEC_PERM_EXPR
>>> && constant_multiple_p (bit_field_offset (op), size, &idx))
>>> {
>>>
>>> part of the code - maybe that needs to be enhanced to cover
>>> a contiguous stride in the VEC_PERM_EXPR. I see
>>> we have
>>>
>>> size = TREE_INT_CST_LOW (TYPE_SIZE (elem_type));
>>> if (maybe_ne (bit_field_size (op), size))
>>> return false;
>>>
>>> where it will currently bail, so adjust that to check for a
>>> constant multiple. I also think we should only handle the
>>> case where the new bit_field_offset alignment is not
>>> worse than the original one.
>>>
>>> That said, I'd prefer if you integrate this transform with
>>> simplify_bitfield_ref.
>> I've got a hack here that tries to do something similar, but it's trying
>> to catch the case where we CONSTRUCTOR feeds the BIT_FIELD_REF. It
>> walks the CONSTRUCTOR elements to see if an element has the right
>> offset/size to satisify the BIT_FIELD_REF. For x264 we're often able to
>> eliminate the VEC_PERMUTE entirely and just forward operands into the
>> BIT_FIELD_REF.
>>
>> I was leaning towards moving those bits into match.pd before submitting,
>> but if you'd prefer them in tree-ssa-forwprop, that's even easier.
> I think when deciding where to put things it's important to look where related
> transforms reside. We already do have a (simplify (BIT_FIELD_REF
> CONSTRUCTOR@ ...))
> pattern which should also handle your case already. So instead of
> adding something
> new it would be nice to figure why it doesn't handle the case you are
> interested in and
> eventually just adjust the existing pattern.
I'm aware of that pattern. I've found it painfully inadequate in every
case I've looked at. In general I've found tree-ssa-forwprop is a
reasonable place to prototype a lot of stuff to see how it works in
practice, but I think match.pd is better for most of the transformations
in the long term.
It sounds like you'd prefer this particular case to move into match.pd.
Fine. That's what I'd originally planned to do. It's pretty simple
support code, so doing it in match.pd shouldn't be too hard.
Jeff