[pass_if_conversion] Extend is_cond_scalar_reduction to handle bit_and/bit_xor/bit_ior.

Message ID 20211109020940.108983-1-hongtao.liu@intel.com
State New
Headers
Series [pass_if_conversion] Extend is_cond_scalar_reduction to handle bit_and/bit_xor/bit_ior. |

Commit Message

Liu, Hongtao Nov. 9, 2021, 2:09 a.m. UTC
  This will enable transformation like

-  # sum1_50 = PHI <prephitmp_64(13), 0(4)>
-  # sum2_52 = PHI <sum2_21(13), 0(4)>
+  # sum1_50 = PHI <_87(13), 0(4)>
+  # sum2_52 = PHI <_89(13), 0(4)>
   # ivtmp_62 = PHI <ivtmp_61(13), 64(4)>
   i.2_7 = (long unsigned int) i_49;
   _8 = i.2_7 * 8;
...
   vec1_i_38 = vec1_29 >> _10;
   vec2_i_39 = vec2_31 >> _10;
   _11 = vec1_i_38 & 1;
-  _63 = tmp_37 ^ sum1_50;
-  prephitmp_64 = _11 == 0 ? sum1_50 : _63;
+  _ifc__86 = _11 != 0 ? tmp_37 : 0;
+  _87 = sum1_50 ^ _ifc__86;
   _12 = vec2_i_39 & 1;
:

so that the vectorizer won't fail due to

  /* If this isn't a nested cycle or if the nested cycle reduction value
     is used ouside of the inner loop we cannot handle uses of the reduction
     value.  */
  if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "reduction used in loop.\n");
      return NULL;
    }

Bootstrap and regtest on x86_64-pc-linux-gnu{-m32,}
Ok for trunk?

gcc/ChangeLog:

	PR tree-optimization/103126
	* tree-if-conv.c (is_cond_scalar_reduction): Handle
	BIT_XOR_EXPR/BIT_IOR_EXPR/BIT_AND_EXPR.
	(convert_scalar_cond_reduction): Ditto.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/ifcvt-reduction-logic-op.c: New test.
---
 .../i386/ifcvt-reduction-logic-op.c           | 80 +++++++++++++++++++
 gcc/tree-if-conv.c                            | 19 +++--
 2 files changed, 92 insertions(+), 7 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
  

Comments

Richard Biener Nov. 9, 2021, 10:19 a.m. UTC | #1
On Tue, Nov 9, 2021 at 3:09 AM liuhongt <hongtao.liu@intel.com> wrote:
>
> This will enable transformation like
>
> -  # sum1_50 = PHI <prephitmp_64(13), 0(4)>
> -  # sum2_52 = PHI <sum2_21(13), 0(4)>
> +  # sum1_50 = PHI <_87(13), 0(4)>
> +  # sum2_52 = PHI <_89(13), 0(4)>
>    # ivtmp_62 = PHI <ivtmp_61(13), 64(4)>
>    i.2_7 = (long unsigned int) i_49;
>    _8 = i.2_7 * 8;
> ...
>    vec1_i_38 = vec1_29 >> _10;
>    vec2_i_39 = vec2_31 >> _10;
>    _11 = vec1_i_38 & 1;
> -  _63 = tmp_37 ^ sum1_50;
> -  prephitmp_64 = _11 == 0 ? sum1_50 : _63;
> +  _ifc__86 = _11 != 0 ? tmp_37 : 0;
> +  _87 = sum1_50 ^ _ifc__86;
>    _12 = vec2_i_39 & 1;
> :
>
> so that vectorizer won't failed due to
>
>   /* If this isn't a nested cycle or if the nested cycle reduction value
>      is used ouside of the inner loop we cannot handle uses of the reduction
>      value.  */
>   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
>     {
>       if (dump_enabled_p ())
>         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>                          "reduction used in loop.\n");
>       return NULL;
>     }
>
> Bootstrap and regtest on x86_64-pc-linux-gnu{-m32,}
> Ok for trunk?
>
> gcc/ChangeLog:
>
>         PR tree-optimization/103126
>         * tree-if-conv.c (is_cond_scalar_reduction): Handle
>         BIT_XOR_EXPR/BIT_IOR_EXPR/BIT_AND_EXPR.
>         (convert_scalar_cond_reduction): Ditto.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/ifcvt-reduction-logic-op.c: New test.
> ---
>  .../i386/ifcvt-reduction-logic-op.c           | 80 +++++++++++++++++++
>  gcc/tree-if-conv.c                            | 19 +++--
>  2 files changed, 92 insertions(+), 7 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
>
> diff --git a/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
> new file mode 100644
> index 00000000000..eeb822d5d43
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
> @@ -0,0 +1,80 @@
> +/* PR tree-optimization/103126.  */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -mavx2 -ftree-vectorize -fdump-tree-vect-details" } */
> +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 3 "vect" } } */
> +#include<stdint.h>
> +
> +void xor_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
> +    int64_t n)
> +{
> +  int64_t i;
> +  uint64_t vec1, sum1;
> +  uint64_t vec2, sum2;
> +
> +  while (n > 0) {
> +    sum1 = 0;
> +    vec1 = a[n];
> +    sum2 = 0;
> +    vec2 = b[n];
> +
> +    for (i = 0; i < 64; i++) {
> +      uint64_t tmp = mat[i];
> +      uint64_t vec1_i = (vec1 >> i);
> +      uint64_t vec2_i = (vec2 >> i);
> +      sum1 ^= (vec1_i & 1) ? tmp : 0;
> +      if (vec2_i&1) sum2 ^= tmp;
> +    }
> +    *ans++ ^= sum1;  n--;
> +    *ans++ ^= sum2;  n--;
> +  }
> +}
> +
> +void ior_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
> +    int64_t n)
> +{
> +  int64_t i;
> +  uint64_t vec1, sum1;
> +  uint64_t vec2, sum2;
> +
> +  while (n > 0) {
> +    sum1 = 0;
> +    vec1 = a[n];
> +    sum2 = 0;
> +    vec2 = b[n];
> +
> +    for (i = 0; i < 64; i++) {
> +      uint64_t tmp = mat[i];
> +      uint64_t vec1_i = (vec1 >> i);
> +      uint64_t vec2_i = (vec2 >> i);
> +      sum1 |= (vec1_i & 1) ? tmp : 0;
> +      if (vec2_i&1) sum2 |= tmp;
> +    }
> +    *ans++ |= sum1;  n--;
> +    *ans++ |= sum2;  n--;
> +  }
> +}
> +
> +void and_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
> +    int64_t n)
> +{
> +  int64_t i;
> +  uint64_t vec1, sum1;
> +  uint64_t vec2, sum2;
> +
> +  while (n > 0) {
> +    sum1 = -1;
> +    vec1 = a[n];
> +    sum2 = 0;
> +    vec2 = b[n];
> +
> +    for (i = 0; i < 64; i++) {
> +      uint64_t tmp = mat[i];
> +      uint64_t vec1_i = (vec1 >> i);
> +      uint64_t vec2_i = (vec2 >> i);
> +      sum1 &= (vec1_i & 1) ? tmp : -1;
> +      if (vec2_i&1) sum2 &= tmp;
> +    }
> +    *ans++ &= sum1;  n--;
> +    *ans++ &= sum2;  n--;
> +  }
> +}
> diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
> index b165dc0c17f..7df1103ff89 100644
> --- a/gcc/tree-if-conv.c
> +++ b/gcc/tree-if-conv.c
> @@ -1732,7 +1732,9 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
>        reduction_op = gimple_assign_rhs_code (stmt);
>      }
>
> -  if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR)
> +  if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR
> +     && reduction_op != BIT_IOR_EXPR && reduction_op != BIT_XOR_EXPR
> +     && reduction_op != BIT_AND_EXPR)

Please put each && on a separate line

>      return false;
>    r_op1 = gimple_assign_rhs1 (stmt);
>    r_op2 = gimple_assign_rhs2 (stmt);
> @@ -1742,7 +1744,7 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
>
>    /* Make R_OP1 to hold reduction variable.  */
>    if (r_nop2 == PHI_RESULT (header_phi)
> -      && reduction_op == PLUS_EXPR)
> +      && commutative_tree_code (reduction_op))
>      {
>        std::swap (r_op1, r_op2);
>        std::swap (r_nop1, r_nop2);
> @@ -1811,7 +1813,10 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>    tree rhs1 = gimple_assign_rhs1 (reduc);
>    tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
>    tree c;
> -  tree zero = build_zero_cst (TREE_TYPE (rhs1));
> +  enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
> +  tree op_nochange = reduction_op != BIT_AND_EXPR
> +    ? build_zero_cst (TREE_TYPE (rhs1))
> +    : build_minus_one_cst (TREE_TYPE (rhs1));

maybe export neutral_op_for_reduction and use it here (supply NULL
initial_value)?

Otherwise looks OK.

Thanks,
Richard.

>    gimple_seq stmts = NULL;
>
>    if (dump_file && (dump_flags & TDF_DETAILS))
> @@ -1824,14 +1829,14 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
>       of reduction rhs.  */
>    c = fold_build_cond_expr (TREE_TYPE (rhs1),
>                             unshare_expr (cond),
> -                           swap ? zero : op1,
> -                           swap ? op1 : zero);
> +                           swap ? op_nochange : op1,
> +                           swap ? op1 : op_nochange);
>
>    /* Create assignment stmt and insert it at GSI.  */
>    new_assign = gimple_build_assign (tmp, c);
>    gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> -  /* Build rhs for unconditional increment/decrement.  */
> -  rhs = gimple_build (&stmts, gimple_assign_rhs_code (reduc),
> +  /* Build rhs for unconditional increment/decrement/logic_operation.  */
> +  rhs = gimple_build (&stmts, reduction_op,
>                       TREE_TYPE (rhs1), op0, tmp);
>
>    if (has_nop)
> --
> 2.18.1
>
  
Hongtao Liu Nov. 10, 2021, 7:26 a.m. UTC | #2
On Tue, Nov 9, 2021 at 6:22 PM Richard Biener via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> On Tue, Nov 9, 2021 at 3:09 AM liuhongt <hongtao.liu@intel.com> wrote:
> >
> > This will enable transformation like
> >
> > -  # sum1_50 = PHI <prephitmp_64(13), 0(4)>
> > -  # sum2_52 = PHI <sum2_21(13), 0(4)>
> > +  # sum1_50 = PHI <_87(13), 0(4)>
> > +  # sum2_52 = PHI <_89(13), 0(4)>
> >    # ivtmp_62 = PHI <ivtmp_61(13), 64(4)>
> >    i.2_7 = (long unsigned int) i_49;
> >    _8 = i.2_7 * 8;
> > ...
> >    vec1_i_38 = vec1_29 >> _10;
> >    vec2_i_39 = vec2_31 >> _10;
> >    _11 = vec1_i_38 & 1;
> > -  _63 = tmp_37 ^ sum1_50;
> > -  prephitmp_64 = _11 == 0 ? sum1_50 : _63;
> > +  _ifc__86 = _11 != 0 ? tmp_37 : 0;
> > +  _87 = sum1_50 ^ _ifc__86;
> >    _12 = vec2_i_39 & 1;
> > :
> >
> > so that vectorizer won't failed due to
> >
> >   /* If this isn't a nested cycle or if the nested cycle reduction value
> >      is used ouside of the inner loop we cannot handle uses of the reduction
> >      value.  */
> >   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
> >     {
> >       if (dump_enabled_p ())
> >         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> >                          "reduction used in loop.\n");
> >       return NULL;
> >     }
> >
> > Bootstrap and regtest on x86_64-pc-linux-gnu{-m32,}
> > Ok for trunk?
> >
> > gcc/ChangeLog:
> >
> >         PR tree-optimization/103126
> >         * tree-if-conv.c (is_cond_scalar_reduction): Handle
> >         BIT_XOR_EXPR/BIT_IOR_EXPR/BIT_AND_EXPR.
> >         (convert_scalar_cond_reduction): Ditto.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/ifcvt-reduction-logic-op.c: New test.
> > ---
> >  .../i386/ifcvt-reduction-logic-op.c           | 80 +++++++++++++++++++
> >  gcc/tree-if-conv.c                            | 19 +++--
> >  2 files changed, 92 insertions(+), 7 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
> >
> > diff --git a/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
> > new file mode 100644
> > index 00000000000..eeb822d5d43
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
> > @@ -0,0 +1,80 @@
> > +/* PR tree-optimization/103126.  */
> > +/* { dg-do compile } */
> > +/* { dg-options "-Ofast -mavx2 -ftree-vectorize -fdump-tree-vect-details" } */
> > +/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 3 "vect" } } */
> > +#include<stdint.h>
> > +
> > +void xor_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
> > +    int64_t n)
> > +{
> > +  int64_t i;
> > +  uint64_t vec1, sum1;
> > +  uint64_t vec2, sum2;
> > +
> > +  while (n > 0) {
> > +    sum1 = 0;
> > +    vec1 = a[n];
> > +    sum2 = 0;
> > +    vec2 = b[n];
> > +
> > +    for (i = 0; i < 64; i++) {
> > +      uint64_t tmp = mat[i];
> > +      uint64_t vec1_i = (vec1 >> i);
> > +      uint64_t vec2_i = (vec2 >> i);
> > +      sum1 ^= (vec1_i & 1) ? tmp : 0;
> > +      if (vec2_i&1) sum2 ^= tmp;
> > +    }
> > +    *ans++ ^= sum1;  n--;
> > +    *ans++ ^= sum2;  n--;
> > +  }
> > +}
> > +
> > +void ior_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
> > +    int64_t n)
> > +{
> > +  int64_t i;
> > +  uint64_t vec1, sum1;
> > +  uint64_t vec2, sum2;
> > +
> > +  while (n > 0) {
> > +    sum1 = 0;
> > +    vec1 = a[n];
> > +    sum2 = 0;
> > +    vec2 = b[n];
> > +
> > +    for (i = 0; i < 64; i++) {
> > +      uint64_t tmp = mat[i];
> > +      uint64_t vec1_i = (vec1 >> i);
> > +      uint64_t vec2_i = (vec2 >> i);
> > +      sum1 |= (vec1_i & 1) ? tmp : 0;
> > +      if (vec2_i&1) sum2 |= tmp;
> > +    }
> > +    *ans++ |= sum1;  n--;
> > +    *ans++ |= sum2;  n--;
> > +  }
> > +}
> > +
> > +void and_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
> > +    int64_t n)
> > +{
> > +  int64_t i;
> > +  uint64_t vec1, sum1;
> > +  uint64_t vec2, sum2;
> > +
> > +  while (n > 0) {
> > +    sum1 = -1;
> > +    vec1 = a[n];
> > +    sum2 = 0;
> > +    vec2 = b[n];
> > +
> > +    for (i = 0; i < 64; i++) {
> > +      uint64_t tmp = mat[i];
> > +      uint64_t vec1_i = (vec1 >> i);
> > +      uint64_t vec2_i = (vec2 >> i);
> > +      sum1 &= (vec1_i & 1) ? tmp : -1;
> > +      if (vec2_i&1) sum2 &= tmp;
> > +    }
> > +    *ans++ &= sum1;  n--;
> > +    *ans++ &= sum2;  n--;
> > +  }
> > +}
> > diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
> > index b165dc0c17f..7df1103ff89 100644
> > --- a/gcc/tree-if-conv.c
> > +++ b/gcc/tree-if-conv.c
> > @@ -1732,7 +1732,9 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
> >        reduction_op = gimple_assign_rhs_code (stmt);
> >      }
> >
> > -  if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR)
> > +  if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR
> > +     && reduction_op != BIT_IOR_EXPR && reduction_op != BIT_XOR_EXPR
> > +     && reduction_op != BIT_AND_EXPR)
>
> Please put each && on a separate line
Changed.
>
> >      return false;
> >    r_op1 = gimple_assign_rhs1 (stmt);
> >    r_op2 = gimple_assign_rhs2 (stmt);
> > @@ -1742,7 +1744,7 @@ is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
> >
> >    /* Make R_OP1 to hold reduction variable.  */
> >    if (r_nop2 == PHI_RESULT (header_phi)
> > -      && reduction_op == PLUS_EXPR)
> > +      && commutative_tree_code (reduction_op))
> >      {
> >        std::swap (r_op1, r_op2);
> >        std::swap (r_nop1, r_nop2);
> > @@ -1811,7 +1813,10 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
> >    tree rhs1 = gimple_assign_rhs1 (reduc);
> >    tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
> >    tree c;
> > -  tree zero = build_zero_cst (TREE_TYPE (rhs1));
> > +  enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
> > +  tree op_nochange = reduction_op != BIT_AND_EXPR
> > +    ? build_zero_cst (TREE_TYPE (rhs1))
> > +    : build_minus_one_cst (TREE_TYPE (rhs1));
>
> maybe export neutral_op_for_reduction and use it here (supply NULL
> initial_value)?
Changed (I didn't know there was such a function).
>
> Otherwise looks OK.
>
> Thanks,
> Richard.
>
> >    gimple_seq stmts = NULL;
> >
> >    if (dump_file && (dump_flags & TDF_DETAILS))
> > @@ -1824,14 +1829,14 @@ convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
> >       of reduction rhs.  */
> >    c = fold_build_cond_expr (TREE_TYPE (rhs1),
> >                             unshare_expr (cond),
> > -                           swap ? zero : op1,
> > -                           swap ? op1 : zero);
> > +                           swap ? op_nochange : op1,
> > +                           swap ? op1 : op_nochange);
> >
> >    /* Create assignment stmt and insert it at GSI.  */
> >    new_assign = gimple_build_assign (tmp, c);
> >    gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
> > -  /* Build rhs for unconditional increment/decrement.  */
> > -  rhs = gimple_build (&stmts, gimple_assign_rhs_code (reduc),
> > +  /* Build rhs for unconditional increment/decrement/logic_operation.  */
> > +  rhs = gimple_build (&stmts, reduction_op,
> >                       TREE_TYPE (rhs1), op0, tmp);
> >
> >    if (has_nop)
> > --
> > 2.18.1
> >


Here's the patch I'm going to check in.
  

Patch

diff --git a/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
new file mode 100644
index 00000000000..eeb822d5d43
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/ifcvt-reduction-logic-op.c
@@ -0,0 +1,80 @@ 
+/* PR tree-optimization/103126.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -mavx2 -ftree-vectorize -fdump-tree-vect-details" } */
+/* { dg-final { scan-tree-dump-times "vectorized \[1-3] loops" 3 "vect" } } */
+#include<stdint.h>
+
+void xor_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
+    int64_t n)
+{
+  int64_t i;
+  uint64_t vec1, sum1;
+  uint64_t vec2, sum2;
+
+  while (n > 0) {
+    sum1 = 0;
+    vec1 = a[n];
+    sum2 = 0;
+    vec2 = b[n];
+
+    for (i = 0; i < 64; i++) {
+      uint64_t tmp = mat[i];
+      uint64_t vec1_i = (vec1 >> i);
+      uint64_t vec2_i = (vec2 >> i);
+      sum1 ^= (vec1_i & 1) ? tmp : 0;
+      if (vec2_i&1) sum2 ^= tmp;
+    }
+    *ans++ ^= sum1;  n--;
+    *ans++ ^= sum2;  n--;
+  }
+}
+
+void ior_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
+    int64_t n)
+{
+  int64_t i;
+  uint64_t vec1, sum1;
+  uint64_t vec2, sum2;
+
+  while (n > 0) {
+    sum1 = 0;
+    vec1 = a[n];
+    sum2 = 0;
+    vec2 = b[n];
+
+    for (i = 0; i < 64; i++) {
+      uint64_t tmp = mat[i];
+      uint64_t vec1_i = (vec1 >> i);
+      uint64_t vec2_i = (vec2 >> i);
+      sum1 |= (vec1_i & 1) ? tmp : 0;
+      if (vec2_i&1) sum2 |= tmp;
+    }
+    *ans++ |= sum1;  n--;
+    *ans++ |= sum2;  n--;
+  }
+}
+
+void and_bit_arr_nolcd (uint64_t *__restrict mat, uint64_t* a,uint64_t* b, uint64_t *__restrict ans,
+    int64_t n)
+{
+  int64_t i;
+  uint64_t vec1, sum1;
+  uint64_t vec2, sum2;
+
+  while (n > 0) {
+    sum1 = -1;
+    vec1 = a[n];
+    sum2 = 0;
+    vec2 = b[n];
+
+    for (i = 0; i < 64; i++) {
+      uint64_t tmp = mat[i];
+      uint64_t vec1_i = (vec1 >> i);
+      uint64_t vec2_i = (vec2 >> i);
+      sum1 &= (vec1_i & 1) ? tmp : -1;
+      if (vec2_i&1) sum2 &= tmp;
+    }
+    *ans++ &= sum1;  n--;
+    *ans++ &= sum2;  n--;
+  }
+}
diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
index b165dc0c17f..7df1103ff89 100644
--- a/gcc/tree-if-conv.c
+++ b/gcc/tree-if-conv.c
@@ -1732,7 +1732,9 @@  is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
       reduction_op = gimple_assign_rhs_code (stmt);
     }
 
-  if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR)
+  if (reduction_op != PLUS_EXPR && reduction_op != MINUS_EXPR
+     && reduction_op != BIT_IOR_EXPR && reduction_op != BIT_XOR_EXPR
+     && reduction_op != BIT_AND_EXPR)
     return false;
   r_op1 = gimple_assign_rhs1 (stmt);
   r_op2 = gimple_assign_rhs2 (stmt);
@@ -1742,7 +1744,7 @@  is_cond_scalar_reduction (gimple *phi, gimple **reduc, tree arg_0, tree arg_1,
 
   /* Make R_OP1 to hold reduction variable.  */
   if (r_nop2 == PHI_RESULT (header_phi)
-      && reduction_op == PLUS_EXPR)
+      && commutative_tree_code (reduction_op))
     {
       std::swap (r_op1, r_op2);
       std::swap (r_nop1, r_nop2);
@@ -1811,7 +1813,10 @@  convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
   tree rhs1 = gimple_assign_rhs1 (reduc);
   tree tmp = make_temp_ssa_name (TREE_TYPE (rhs1), NULL, "_ifc_");
   tree c;
-  tree zero = build_zero_cst (TREE_TYPE (rhs1));
+  enum tree_code reduction_op  = gimple_assign_rhs_code (reduc);
+  tree op_nochange = reduction_op != BIT_AND_EXPR
+    ? build_zero_cst (TREE_TYPE (rhs1))
+    : build_minus_one_cst (TREE_TYPE (rhs1));
   gimple_seq stmts = NULL;
 
   if (dump_file && (dump_flags & TDF_DETAILS))
@@ -1824,14 +1829,14 @@  convert_scalar_cond_reduction (gimple *reduc, gimple_stmt_iterator *gsi,
      of reduction rhs.  */
   c = fold_build_cond_expr (TREE_TYPE (rhs1),
 			    unshare_expr (cond),
-			    swap ? zero : op1,
-			    swap ? op1 : zero);
+			    swap ? op_nochange : op1,
+			    swap ? op1 : op_nochange);
 
   /* Create assignment stmt and insert it at GSI.  */
   new_assign = gimple_build_assign (tmp, c);
   gsi_insert_before (gsi, new_assign, GSI_SAME_STMT);
-  /* Build rhs for unconditional increment/decrement.  */
-  rhs = gimple_build (&stmts, gimple_assign_rhs_code (reduc),
+  /* Build rhs for unconditional increment/decrement/logic_operation.  */
+  rhs = gimple_build (&stmts, reduction_op,
 		      TREE_TYPE (rhs1), op0, tmp);
 
   if (has_nop)