middle-end simplify complex if expressions where comparisons are inverse of one another.

Message ID patch-15679-tamar@arm.com
State Deferred
Series middle-end simplify complex if expressions where comparisons are inverse of one another.

Commit Message

Tamar Christina June 16, 2022, 1:31 p.m. UTC
  Hi All,

This optimizes the following sequence

  ((a < b) & c) | ((a >= b) & d)

into

  (a < b ? c : d) & 1

for scalars. For vectors we can omit the & 1.
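
As a quick illustration (a sketch only, not part of the patch): since a
scalar comparison evaluates to 0 or 1, each AND keeps at most bit 0 of c
or d, which is why the scalar form needs the trailing & 1; vector
comparisons produce all-ones/all-zeros lane masks instead, so no masking
is needed there.

  /* Illustration only: exhaustively check the scalar equivalence over a
     small range.  (a < b) and (a >= b) are 0 or 1, so each AND keeps at
     most bit 0 of c or d.  */
  #include <assert.h>

  static int original (int a, int b, int c, int d)
  {
    return ((a < b) & c) | ((a >= b) & d);
  }

  static int rewritten (int a, int b, int c, int d)
  {
    return (a < b ? c : d) & 1;
  }

  int main (void)
  {
    for (int a = -2; a <= 2; a++)
      for (int b = -2; b <= 2; b++)
        for (int c = -4; c <= 4; c++)
          for (int d = -4; d <= 4; d++)
            assert (original (a, b, c, d) == rewritten (a, b, c, d));
    return 0;
  }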

This changes the code generation from

zoo2:
	cmp     w0, w1
	cset    w0, lt
	cset    w1, ge
	and     w0, w0, w2
	and     w1, w1, w3
	orr     w0, w0, w1
	ret

into

	cmp	w0, w1
	csel	w0, w2, w3, lt
	and	w0, w0, 1
	ret

and significantly reduces the number of selects we have to do in the vector
code.

Bootstrapped Regtested on aarch64-none-linux-gnu,
x86_64-pc-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* fold-const.cc (inverse_conditions_p): Traverse if SSA_NAME.
	* match.pd: Add new rule.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/if-compare_1.c: New test.
	* gcc.target/aarch64/if-compare_2.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 39a5a52958d87497f301826e706886b290771a2d..f180599b90150acd3ed895a64280aa3255061256 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -2833,15 +2833,38 @@ compcode_to_comparison (enum comparison_code code)
 bool
 inverse_conditions_p (const_tree cond1, const_tree cond2)
 {
-  return (COMPARISON_CLASS_P (cond1)
-	  && COMPARISON_CLASS_P (cond2)
-	  && (invert_tree_comparison
-	      (TREE_CODE (cond1),
-	       HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE (cond2))
-	  && operand_equal_p (TREE_OPERAND (cond1, 0),
-			      TREE_OPERAND (cond2, 0), 0)
-	  && operand_equal_p (TREE_OPERAND (cond1, 1),
-			      TREE_OPERAND (cond2, 1), 0));
+  if (COMPARISON_CLASS_P (cond1)
+      && COMPARISON_CLASS_P (cond2)
+      && (invert_tree_comparison
+	   (TREE_CODE (cond1),
+	    HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE (cond2))
+      && operand_equal_p (TREE_OPERAND (cond1, 0),
+			  TREE_OPERAND (cond2, 0), 0)
+      && operand_equal_p (TREE_OPERAND (cond1, 1),
+			  TREE_OPERAND (cond2, 1), 0))
+    return true;
+
+  if (TREE_CODE (cond1) == SSA_NAME
+      && TREE_CODE (cond2) == SSA_NAME)
+    {
+      gimple *gcond1 = SSA_NAME_DEF_STMT (cond1);
+      gimple *gcond2 = SSA_NAME_DEF_STMT (cond2);
+      if (!is_gimple_assign (gcond1) || !is_gimple_assign (gcond2))
+	return false;
+
+      tree_code code1 = gimple_assign_rhs_code (gcond1);
+      tree_code code2 = gimple_assign_rhs_code (gcond2);
+      return TREE_CODE_CLASS (code1) == tcc_comparison
+	     && TREE_CODE_CLASS (code2) == tcc_comparison
+	     && invert_tree_comparison (code1,
+		  HONOR_NANS (gimple_arg (gcond1, 0))) == code2
+	     && operand_equal_p (gimple_arg (gcond1, 0),
+				 gimple_arg (gcond2, 0), 0)
+	     && operand_equal_p (gimple_arg (gcond1, 1),
+				 gimple_arg (gcond2, 1), 0);
+    }
+
+  return false;
 }
 
 /* Return a tree for the comparison which is the combination of
diff --git a/gcc/match.pd b/gcc/match.pd
index 6d691d302b339c0e4556b40af158b5208c12d08f..bad49dd348add751d9ec1e3023e34d9ac123194f 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1160,6 +1160,32 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
      (convert (bit_and (negate (convert:utype { pmop[0]; }))
 		       (convert:utype @1)))))))
 
+/* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.  */
+(simplify
+ (bit_ior
+  (bit_and:c (convert? @0) @2)
+  (bit_and:c (convert? @1) @3))
+   (if (inverse_conditions_p (@0, @1)
+	/* The scalar version has to be canonicalized after vectorization
+	   because it makes unconditional loads conditional ones, which
+	   means we lose vectorization because the loads may trap.  */
+	&& canonicalize_math_after_vectorization_p ())
+    (bit_and (cond @0 @2 @3) { build_each_one_cst (type); })))
+(simplify
+ (bit_ior
+  (bit_and:c (convert? (vec_cond:s @0 @4 integer_zerop)) @2)
+  (bit_and:c (convert? (vec_cond:s @1 @4 integer_zerop)) @3))
+   (if (inverse_conditions_p (@0, @1)
+	&& integer_onep (@4))
+    (bit_and (vec_cond @0 @2 @3) @4)))
+/* Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d.  */
+(simplify
+ (bit_ior
+  (bit_and:c (convert? (vec_cond:s @0 integer_minus_onep integer_zerop)) @2)
+  (bit_and:c (convert? (vec_cond:s @1 integer_minus_onep integer_zerop)) @3))
+   (if (inverse_conditions_p (@0, @1))
+    (vec_cond @0 @2 @3)))
+
 /* X % Y is smaller than Y.  */
 (for cmp (lt ge)
  (simplify
diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_1.c b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..05a1292fa90c70b14a7985122f43711f55d047ea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+/*
+**zoo2:
+**	cmp	w0, w1
+**	csel	w0, w2, w3, lt
+**	and	w0, w0, 1
+**	ret
+*/
+int zoo2 (int a, int b, int c, int d)
+{
+   return ((a < b) & c) | ((a >= b) & d);
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_2.c b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..34bc65f5db10eae81b8dee3316dfb7d12bf471c8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+**foo:
+**	cmgt	v0.4s, v1.4s, v0.4s
+**	bsl	v0.16b, v2.16b, v3.16b
+**	ret
+*/
+v4si foo (v4si a, v4si b, v4si c, v4si d) {
+    return ((a < b) & c) | ((a >= b) & d);
+}
+
+
+/**
+**bar:
+**...
+**	cmge	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**	bsl	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**	and	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**...
+*/
+void bar (int * restrict a, int * restrict b, int * restrict c,
+	  int * restrict d, int * restrict res, int n)
+{
+  for (int i = 0; i < (n & -4); i++)
+    res[i] = ((a[i] < b[i]) & c[i]) | ((a[i] >= b[i]) & d[i]);
+}
+
  

Comments

Richard Biener June 20, 2022, 8:56 a.m. UTC | #1
On Thu, 16 Jun 2022, Tamar Christina wrote:

> Hi All,
> 
> This optimizes the following sequence
> 
>   ((a < b) & c) | ((a >= b) & d)
> 
> into
> 
>   (a < b ? c : d) & 1
> 
> for scalar. On vector we can omit the & 1.
>
> This changes the code generation from
> 
> zoo2:
> 	cmp     w0, w1
> 	cset    w0, lt
> 	cset    w1, ge
> 	and     w0, w0, w2
> 	and     w1, w1, w3
> 	orr     w0, w0, w1
> 	ret
> 
> into
> 
> 	cmp	w0, w1
> 	csel	w0, w2, w3, lt
> 	and	w0, w0, 1
> 	ret
> 
> and significantly reduces the number of selects we have to do in the vector
> code.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu,
> x86_64-pc-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* fold-const.cc (inverse_conditions_p): Traverse if SSA_NAME.
> 	* match.pd: Add new rule.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/aarch64/if-compare_1.c: New test.
> 	* gcc.target/aarch64/if-compare_2.c: New test.
> 
> --- inline copy of patch -- 
> diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> index 39a5a52958d87497f301826e706886b290771a2d..f180599b90150acd3ed895a64280aa3255061256 100644
> --- a/gcc/fold-const.cc
> +++ b/gcc/fold-const.cc
> @@ -2833,15 +2833,38 @@ compcode_to_comparison (enum comparison_code code)
>  bool
>  inverse_conditions_p (const_tree cond1, const_tree cond2)
>  {
> -  return (COMPARISON_CLASS_P (cond1)
> -	  && COMPARISON_CLASS_P (cond2)
> -	  && (invert_tree_comparison
> -	      (TREE_CODE (cond1),
> -	       HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE (cond2))
> -	  && operand_equal_p (TREE_OPERAND (cond1, 0),
> -			      TREE_OPERAND (cond2, 0), 0)
> -	  && operand_equal_p (TREE_OPERAND (cond1, 1),
> -			      TREE_OPERAND (cond2, 1), 0));
> +  if (COMPARISON_CLASS_P (cond1)
> +      && COMPARISON_CLASS_P (cond2)
> +      && (invert_tree_comparison
> +	   (TREE_CODE (cond1),
> +	    HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE (cond2))
> +      && operand_equal_p (TREE_OPERAND (cond1, 0),
> +			  TREE_OPERAND (cond2, 0), 0)
> +      && operand_equal_p (TREE_OPERAND (cond1, 1),
> +			  TREE_OPERAND (cond2, 1), 0))
> +    return true;
> +
> +  if (TREE_CODE (cond1) == SSA_NAME
> +      && TREE_CODE (cond2) == SSA_NAME)
> +    {
> +      gimple *gcond1 = SSA_NAME_DEF_STMT (cond1);
> +      gimple *gcond2 = SSA_NAME_DEF_STMT (cond2);
> +      if (!is_gimple_assign (gcond1) || !is_gimple_assign (gcond2))
> +	return false;
> +
> +      tree_code code1 = gimple_assign_rhs_code (gcond1);
> +      tree_code code2 = gimple_assign_rhs_code (gcond2);
> +      return TREE_CODE_CLASS (code1) == tcc_comparison
> +	     && TREE_CODE_CLASS (code2) == tcc_comparison
> +	     && invert_tree_comparison (code1,
> +		  HONOR_NANS (gimple_arg (gcond1, 0))) == code2
> +	     && operand_equal_p (gimple_arg (gcond1, 0),
> +				 gimple_arg (gcond2, 0), 0)
> +	     && operand_equal_p (gimple_arg (gcond1, 1),
> +				 gimple_arg (gcond2, 1), 0);
> +    }
> +
> +  return false;

if we do extend inverse_condition_p please add an overload like

bool
inverse_condition_p (enum tree_code, tree op00, tree op01,
                     enum tree_code, tree op10, tree op11)

so you can avoid some code duplication here.

>  }
>  
>  /* Return a tree for the comparison which is the combination of
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 6d691d302b339c0e4556b40af158b5208c12d08f..bad49dd348add751d9ec1e3023e34d9ac123194f 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -1160,6 +1160,32 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>       (convert (bit_and (negate (convert:utype { pmop[0]; }))
>  		       (convert:utype @1)))))))
>  
> +/* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.  */
> +(simplify
> + (bit_ior
> +  (bit_and:c (convert? @0) @2)
> +  (bit_and:c (convert? @1) @3))

in case the comparison returns a signed bool this might turn out wrong.
Maybe simply use zero_one_valued_p@0 here instead of (convert? @0)?
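
A hypothetical sketch of that concern (illustration only, assuming a signed
boolean whose true value widens to -1 rather than 1): the matched expression
then keeps every bit of the selected operand, while the proposed replacement
masks it with 1, so the two are not equivalent without a zero/one guarantee.

  /* Hypothetical illustration: t plays the role of a signed-bool "true"
     widened to int (-1, all bits set), f its inverse.  */
  int matched_form (int c, int d)
  {
    int t = -1;
    int f = 0;
    return (t & c) | (f & d);   /* evaluates to c, all bits kept */
  }

  int replacement_form (int c, int d)
  {
    return (-1 ? c : d) & 1;    /* evaluates to c & 1, only bit 0 */
  }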

> +   (if (inverse_conditions_p (@0, @1)
> +	/* The scalar version has to be canonicalized after vectorization
> +	   because it makes unconditional loads conditional ones, which
> +	   means we lose vectorization because the loads may trap.  */
> +	&& canonicalize_math_after_vectorization_p ())
> +    (bit_and (cond @0 @2 @3) { build_each_one_cst (type); })))

I think you should restrict this to INTEGRAL_TYPE_P and use
build_one_cst (type) (also see below).

you can do inverse_conditions_p with a lock-step (for ...) over
tcc_comparison and inverted_tcc_comparison{,_with_nans} (see existing
examples).

> +(simplify
> + (bit_ior
> +  (bit_and:c (convert? (vec_cond:s @0 @4 integer_zerop)) @2)
> +  (bit_and:c (convert? (vec_cond:s @1 @4 integer_zerop)) @3))
> +   (if (inverse_conditions_p (@0, @1)
> +	&& integer_onep (@4))
> +    (bit_and (vec_cond @0 @2 @3) @4)))
> +/* Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d.  */
> +(simplify
> + (bit_ior
> +  (bit_and:c (convert? (vec_cond:s @0 integer_minus_onep integer_zerop)) @2)
> +  (bit_and:c (convert? (vec_cond:s @1 integer_minus_onep integer_zerop)) @3))
> +   (if (inverse_conditions_p (@0, @1))
> +    (vec_cond @0 @2 @3)))

I think the duplication is premature optimization - we should get the
(bit_and (..) integer_minus_onep) simplified.  Also didn't we have
(vec_cond @0 -1 0) -> (view_convert @0) when the modes match?
We might want to add (match zero_minus_one_valued_p) or use
truth_valued_p (with appropriate vector semantics, plus extend it).
Why do you need (convert? ...) for the vector case btw?

I guess the integral type and vector type cases are similar enough
that the patterns can be merged with conditionalizing only the
replacement.

> +
>  /* X % Y is smaller than Y.  */
>  (for cmp (lt ge)
>   (simplify
> diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_1.c b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..05a1292fa90c70b14a7985122f43711f55d047ea
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +/*
> +**zoo2:
> +**	cmp	w0, w1
> +**	csel	w0, w2, w3, lt
> +**	and	w0, w0, 1
> +**	ret
> +*/
> +int zoo2 (int a, int b, int c, int d)
> +{
> +   return ((a < b) & c) | ((a >= b) & d);
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_2.c b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..34bc65f5db10eae81b8dee3316dfb7d12bf471c8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
> @@ -0,0 +1,32 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -std=c99" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +
> +/*
> +**foo:
> +**	cmgt	v0.4s, v1.4s, v0.4s
> +**	bsl	v0.16b, v2.16b, v3.16b
> +**	ret
> +*/
> +v4si foo (v4si a, v4si b, v4si c, v4si d) {
> +    return ((a < b) & c) | ((a >= b) & d);
> +}
> +
> +
> +/**
> +**bar:
> +**...
> +**	cmge	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**	bsl	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> +**	and	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> +**...
> +*/
> +void bar (int * restrict a, int * restrict b, int * restrict c,
> +	  int * restrict d, int * restrict res, int n)
> +{
> +  for (int i = 0; i < (n & -4); i++)
> +    res[i] = ((a[i] < b[i]) & c[i]) | ((a[i] >= b[i]) & d[i]);
> +}
> +
> 
> 
> 
> 
>
  
Tamar Christina July 5, 2022, 3:15 p.m. UTC | #2
> -----Original Message-----
> From: Richard Biener <rguenther@suse.de>
> Sent: Monday, June 20, 2022 9:57 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
> Subject: Re: [PATCH]middle-end simplify complex if expressions where
> comparisons are inverse of one another.
> 
> On Thu, 16 Jun 2022, Tamar Christina wrote:
> 
> > Hi All,
> >
> > This optimizes the following sequence
> >
> >   ((a < b) & c) | ((a >= b) & d)
> >
> > into
> >
> >   (a < b ? c : d) & 1
> >
> > for scalar. On vector we can omit the & 1.
> >
> > This changes the code generation from
> >
> > zoo2:
> > 	cmp     w0, w1
> > 	cset    w0, lt
> > 	cset    w1, ge
> > 	and     w0, w0, w2
> > 	and     w1, w1, w3
> > 	orr     w0, w0, w1
> > 	ret
> >
> > into
> >
> > 	cmp	w0, w1
> > 	csel	w0, w2, w3, lt
> > 	and	w0, w0, 1
> > 	ret
> >
> > and significantly reduces the number of selects we have to do in the
> > vector code.
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> > 	* fold-const.cc (inverse_conditions_p): Traverse if SSA_NAME.
> > 	* match.pd: Add new rule.
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	* gcc.target/aarch64/if-compare_1.c: New test.
> > 	* gcc.target/aarch64/if-compare_2.c: New test.
> >
> > --- inline copy of patch --
> > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc index
> >
> 39a5a52958d87497f301826e706886b290771a2d..f180599b90150acd3ed895a64
> 280
> > aa3255061256 100644
> > --- a/gcc/fold-const.cc
> > +++ b/gcc/fold-const.cc
> > @@ -2833,15 +2833,38 @@ compcode_to_comparison (enum
> comparison_code
> > code)  bool  inverse_conditions_p (const_tree cond1, const_tree cond2)
> > {
> > -  return (COMPARISON_CLASS_P (cond1)
> > -	  && COMPARISON_CLASS_P (cond2)
> > -	  && (invert_tree_comparison
> > -	      (TREE_CODE (cond1),
> > -	       HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE
> (cond2))
> > -	  && operand_equal_p (TREE_OPERAND (cond1, 0),
> > -			      TREE_OPERAND (cond2, 0), 0)
> > -	  && operand_equal_p (TREE_OPERAND (cond1, 1),
> > -			      TREE_OPERAND (cond2, 1), 0));
> > +  if (COMPARISON_CLASS_P (cond1)
> > +      && COMPARISON_CLASS_P (cond2)
> > +      && (invert_tree_comparison
> > +	   (TREE_CODE (cond1),
> > +	    HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE
> (cond2))
> > +      && operand_equal_p (TREE_OPERAND (cond1, 0),
> > +			  TREE_OPERAND (cond2, 0), 0)
> > +      && operand_equal_p (TREE_OPERAND (cond1, 1),
> > +			  TREE_OPERAND (cond2, 1), 0))
> > +    return true;
> > +
> > +  if (TREE_CODE (cond1) == SSA_NAME
> > +      && TREE_CODE (cond2) == SSA_NAME)
> > +    {
> > +      gimple *gcond1 = SSA_NAME_DEF_STMT (cond1);
> > +      gimple *gcond2 = SSA_NAME_DEF_STMT (cond2);
> > +      if (!is_gimple_assign (gcond1) || !is_gimple_assign (gcond2))
> > +	return false;
> > +
> > +      tree_code code1 = gimple_assign_rhs_code (gcond1);
> > +      tree_code code2 = gimple_assign_rhs_code (gcond2);
> > +      return TREE_CODE_CLASS (code1) == tcc_comparison
> > +	     && TREE_CODE_CLASS (code2) == tcc_comparison
> > +	     && invert_tree_comparison (code1,
> > +		  HONOR_NANS (gimple_arg (gcond1, 0))) == code2
> > +	     && operand_equal_p (gimple_arg (gcond1, 0),
> > +				 gimple_arg (gcond2, 0), 0)
> > +	     && operand_equal_p (gimple_arg (gcond1, 1),
> > +				 gimple_arg (gcond2, 1), 0);
> > +    }
> > +
> > +  return false;
> 
> if we do extend inverse_condition_p please add an overload like

Done. 

> 
> bool
> inverse_condition_p (enum tree_code, tree op00, tree op01,
>                      enum tree_code, tree op10, tree op11)
> 
> so you can avoid some code duplication here.
> 
> >  }
> >
> >  /* Return a tree for the comparison which is the combination of diff
> > --git a/gcc/match.pd b/gcc/match.pd index
> >
> 6d691d302b339c0e4556b40af158b5208c12d08f..bad49dd348add751d9ec1e30
> 23e3
> > 4d9ac123194f 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -1160,6 +1160,32 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >       (convert (bit_and (negate (convert:utype { pmop[0]; }))
> >  		       (convert:utype @1)))))))
> >
> > +/* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.
> > +*/ (simplify  (bit_ior
> > +  (bit_and:c (convert? @0) @2)
> > +  (bit_and:c (convert? @1) @3))
> 
> in case the comparison returns a signed bool this might turn out wrong.
> Maybe simply use zero_one_valued_p@0 here instead of (convert? @0)?

I think I still need the convert as the comparison gets promoted to int and
the predicate doesn't handle extensions.

So I left the convert but added the zero_one_valued_p@0 so that it's
checking that the result of the comparison itself is only ever 0 or 1.

> 
> > +   (if (inverse_conditions_p (@0, @1)
> > +	/* The scalar version has to be canonicalized after vectorization
> > +	   because it makes unconditional loads conditional ones, which
> > +	   means we lose vectorization because the loads may trap.  */
> > +	&& canonicalize_math_after_vectorization_p ())
> > +    (bit_and (cond @0 @2 @3) { build_each_one_cst (type); })))
> 
> I think you should restrict this to INTEGRAL_TYPE_P and use build_one_cst
> (type) (also see below).
> 
> you can do inverse_onditions_p with lock-step for over tcc_comparison and
> inverted_tcc_comparison{,_with_nans} (see existing examples).

I couldn't figure out how to combine this approach with the zero_one_valued_p
predicate. The zero_one_valued_p would need to be on (cmp @0 @1) but I don't
think that is allowed.

> 
> > +(simplify
> > + (bit_ior
> > +  (bit_and:c (convert? (vec_cond:s @0 @4 integer_zerop)) @2)
> > +  (bit_and:c (convert? (vec_cond:s @1 @4 integer_zerop)) @3))
> > +   (if (inverse_conditions_p (@0, @1)
> > +	&& integer_onep (@4))
> > +    (bit_and (vec_cond @0 @2 @3) @4)))
> > +/* Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d.  */
> > +(simplify  (bit_ior
> > +  (bit_and:c (convert? (vec_cond:s @0 integer_minus_onep
> > +integer_zerop)) @2)
> > +  (bit_and:c (convert? (vec_cond:s @1 integer_minus_onep
> integer_zerop)) @3))
> > +   (if (inverse_conditions_p (@0, @1))
> > +    (vec_cond @0 @2 @3)))
> 
> I think the duplication is pre-mature optimization - we should get the
> (bit_and (..) integer_minus_onep) simplified.  Also didn't we have (vec_cond
> @0 -1 0) -> (view_convert @0) when the modes match?

This wasn't in my tree at the time. I could use this representation instead, but it
wouldn't shorten the match tree. Combining them as you suggested below seems
best.

> We might want to add (match zero_minus_one_valued_p) or use
> truth_valued_p (with appropriate vector semantics, plus extend it).
> Why do you need (convert? ...) for the vector case btw?
> 

Since we have to defer the scalar version then the vectorizer won't 

> I guess the integral type and vector type cases are similar enough that the
> patterns can be merged with conditionalizing only the replacement.
> 

Merged. 


Bootstrapped Regtested on aarch64-none-linux-gnu,
x86_64-pc-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* fold-const.cc (inverse_conditions_p): Traverse if SSA_NAME.
	(inverse_conditions_p_1): New.
	* match.pd: Add new rule.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/if-compare_1.c: New test.
	* gcc.target/aarch64/if-compare_2.c: New test.

--- inline copy of patch ---

diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
index 99021a82df4977b179b45db04e3083012c63067a..0628ffd395416d74bc031e3e4ac8246894ca564d 100644
--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -2829,20 +2829,55 @@ compcode_to_comparison (enum comparison_code code)
     }
 }
 
+
+/* Helper of inverse_condition_p.  Returns TRUE if CODE1 is the inverse of
+   CODE2 and OP00 == OP10 and OP01 == OP11.  */
+
+static bool
+inverse_conditions_p_1 (enum tree_code code1, tree op00, tree op01,
+			enum tree_code code2, tree op10, tree op11)
+{
+  return (invert_tree_comparison (code1, HONOR_NANS (op00)) == code2)
+      && operand_equal_p (op00, op10, 0)
+      && operand_equal_p (op01, op11, 0);
+}
+
 /* Return true if COND1 tests the opposite condition of COND2.  */
 
 bool
 inverse_conditions_p (const_tree cond1, const_tree cond2)
 {
-  return (COMPARISON_CLASS_P (cond1)
-	  && COMPARISON_CLASS_P (cond2)
-	  && (invert_tree_comparison
-	      (TREE_CODE (cond1),
-	       HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE (cond2))
-	  && operand_equal_p (TREE_OPERAND (cond1, 0),
-			      TREE_OPERAND (cond2, 0), 0)
-	  && operand_equal_p (TREE_OPERAND (cond1, 1),
-			      TREE_OPERAND (cond2, 1), 0));
+  if (COMPARISON_CLASS_P (cond1)
+      && COMPARISON_CLASS_P (cond2)
+      && inverse_conditions_p_1 (TREE_CODE (cond1),
+				 TREE_OPERAND (cond1, 0),
+				 TREE_OPERAND (cond1, 1),
+				 TREE_CODE (cond2),
+				 TREE_OPERAND (cond2, 0),
+				 TREE_OPERAND (cond2, 1)))
+    return true;
+
+  if (TREE_CODE (cond1) == SSA_NAME
+      && TREE_CODE (cond2) == SSA_NAME)
+    {
+      gimple *gcond1 = SSA_NAME_DEF_STMT (cond1);
+      gimple *gcond2 = SSA_NAME_DEF_STMT (cond2);
+      if (!is_gimple_assign (gcond1) || !is_gimple_assign (gcond2))
+	return false;
+
+      tree_code code1 = gimple_assign_rhs_code (gcond1);
+      tree_code code2 = gimple_assign_rhs_code (gcond2);
+      return TREE_CODE_CLASS (code1) == tcc_comparison
+	     && TREE_CODE_CLASS (code2) == tcc_comparison
+	     && inverse_conditions_p_1 (code1,
+					gimple_arg (gcond1, 0),
+					gimple_arg (gcond1, 1),
+					code2,
+					gimple_arg (gcond2, 0),
+					gimple_arg (gcond2, 1));
+    }
+
+  return false;
 }
 
 /* Return a tree for the comparison which is the combination of
diff --git a/gcc/match.pd b/gcc/match.pd
index 24cbbbb5bc1d718bd03af7712fc7255213f2a742..0a459f2804e296f4336aee70e351ddc45486c867 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1872,6 +1872,30 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (if (INTEGRAL_TYPE_P (type))
   (bit_and @0 @1)))
 
+/* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.
+   and vector Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d. */
+(simplify
+ (bit_ior
+  (bit_and:c (convert? zero_one_valued_p@0) @2)
+  (bit_and:c (convert? zero_one_valued_p@1) @3))
+   (if (INTEGRAL_TYPE_P (type)
+	&& inverse_conditions_p (@0, @1)
+	/* The scalar version has to be canonicalized after vectorization
+	   because it makes unconditional loads conditional ones, which
+	   means we lose vectorization because the loads may trap.  */
+	&& canonicalize_math_after_vectorization_p ())
+    (bit_and (cond @0 @2 @3) { build_one_cst (type); })))
+(simplify
+ (bit_ior
+  (bit_and:c (vec_cond:s @0 @4 integer_zerop) @2)
+  (bit_and:c (vec_cond:s @1 @4 integer_zerop) @3))
+   (if (inverse_conditions_p (@0, @1))
+    (switch
+     (if (integer_onep (@4))
+      (bit_and (vec_cond @0 @2 @3) @4))
+     (if (integer_minus_onep (@4))
+      (vec_cond @0 @2 @3)))))
+
 /* Transform X & -Y into X * Y when Y is { 0 or 1 }.  */
 (simplify
  (bit_and:c (convert? (negate zero_one_valued_p@0)) @1)
diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_1.c b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..05a1292fa90c70b14a7985122f43711f55d047ea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+/*
+**zoo2:
+**	cmp	w0, w1
+**	csel	w0, w2, w3, lt
+**	and	w0, w0, 1
+**	ret
+*/
+int zoo2 (int a, int b, int c, int d)
+{
+   return ((a < b) & c) | ((a >= b) & d);
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_2.c b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..34bc65f5db10eae81b8dee3316dfb7d12bf471c8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
@@ -0,0 +1,32 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+**foo:
+**	cmgt	v0.4s, v1.4s, v0.4s
+**	bsl	v0.16b, v2.16b, v3.16b
+**	ret
+*/
+v4si foo (v4si a, v4si b, v4si c, v4si d) {
+    return ((a < b) & c) | ((a >= b) & d);
+}
+
+
+/**
+**bar:
+**...
+**	cmge	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**	bsl	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**	and	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**...
+*/
+void bar (int * restrict a, int * restrict b, int * restrict c,
+	  int * restrict d, int * restrict res, int n)
+{
+  for (int i = 0; i < (n & -4); i++)
+    res[i] = ((a[i] < b[i]) & c[i]) | ((a[i] >= b[i]) & d[i]);
+}
+
  
Andrew Pinski July 6, 2022, 2:09 a.m. UTC | #3
On Tue, Jul 5, 2022 at 8:16 AM Tamar Christina via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
>
>
> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Monday, June 20, 2022 9:57 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
> > Subject: Re: [PATCH]middle-end simplify complex if expressions where
> > comparisons are inverse of one another.
> >
> > On Thu, 16 Jun 2022, Tamar Christina wrote:
> >
> > > Hi All,
> > >
> > > This optimizes the following sequence
> > >
> > >   ((a < b) & c) | ((a >= b) & d)
> > >
> > > into
> > >
> > >   (a < b ? c : d) & 1
> > >
> > > for scalar. On vector we can omit the & 1.
> > >
> > > This changes the code generation from
> > >
> > > zoo2:
> > >     cmp     w0, w1
> > >     cset    w0, lt
> > >     cset    w1, ge
> > >     and     w0, w0, w2
> > >     and     w1, w1, w3
> > >     orr     w0, w0, w1
> > >     ret
> > >
> > > into
> > >
> > >     cmp     w0, w1
> > >     csel    w0, w2, w3, lt
> > >     and     w0, w0, 1
> > >     ret
> > >
> > > and significantly reduces the number of selects we have to do in the
> > > vector code.
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > > and no issues.
> > >
> > > Ok for master?
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > >     * fold-const.cc (inverse_conditions_p): Traverse if SSA_NAME.
> > >     * match.pd: Add new rule.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >     * gcc.target/aarch64/if-compare_1.c: New test.
> > >     * gcc.target/aarch64/if-compare_2.c: New test.
> > >
> > > --- inline copy of patch --
> > > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc index
> > >
> > 39a5a52958d87497f301826e706886b290771a2d..f180599b90150acd3ed895a64
> > 280
> > > aa3255061256 100644
> > > --- a/gcc/fold-const.cc
> > > +++ b/gcc/fold-const.cc
> > > @@ -2833,15 +2833,38 @@ compcode_to_comparison (enum
> > comparison_code
> > > code)  bool  inverse_conditions_p (const_tree cond1, const_tree cond2)
> > > {
> > > -  return (COMPARISON_CLASS_P (cond1)
> > > -     && COMPARISON_CLASS_P (cond2)
> > > -     && (invert_tree_comparison
> > > -         (TREE_CODE (cond1),
> > > -          HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE
> > (cond2))
> > > -     && operand_equal_p (TREE_OPERAND (cond1, 0),
> > > -                         TREE_OPERAND (cond2, 0), 0)
> > > -     && operand_equal_p (TREE_OPERAND (cond1, 1),
> > > -                         TREE_OPERAND (cond2, 1), 0));
> > > +  if (COMPARISON_CLASS_P (cond1)
> > > +      && COMPARISON_CLASS_P (cond2)
> > > +      && (invert_tree_comparison
> > > +      (TREE_CODE (cond1),
> > > +       HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE
> > (cond2))
> > > +      && operand_equal_p (TREE_OPERAND (cond1, 0),
> > > +                     TREE_OPERAND (cond2, 0), 0)
> > > +      && operand_equal_p (TREE_OPERAND (cond1, 1),
> > > +                     TREE_OPERAND (cond2, 1), 0))
> > > +    return true;
> > > +
> > > +  if (TREE_CODE (cond1) == SSA_NAME
> > > +      && TREE_CODE (cond2) == SSA_NAME)
> > > +    {
> > > +      gimple *gcond1 = SSA_NAME_DEF_STMT (cond1);
> > > +      gimple *gcond2 = SSA_NAME_DEF_STMT (cond2);
> > > +      if (!is_gimple_assign (gcond1) || !is_gimple_assign (gcond2))
> > > +   return false;
> > > +
> > > +      tree_code code1 = gimple_assign_rhs_code (gcond1);
> > > +      tree_code code2 = gimple_assign_rhs_code (gcond2);
> > > +      return TREE_CODE_CLASS (code1) == tcc_comparison
> > > +        && TREE_CODE_CLASS (code2) == tcc_comparison
> > > +        && invert_tree_comparison (code1,
> > > +             HONOR_NANS (gimple_arg (gcond1, 0))) == code2
> > > +        && operand_equal_p (gimple_arg (gcond1, 0),
> > > +                            gimple_arg (gcond2, 0), 0)
> > > +        && operand_equal_p (gimple_arg (gcond1, 1),
> > > +                            gimple_arg (gcond2, 1), 0);
> > > +    }
> > > +
> > > +  return false;
> >
> > if we do extend inverse_condition_p please add an overload like
>
> Done.
>
> >
> > bool
> > inverse_condition_p (enum tree_code, tree op00, tree op01,
> >                      enum tree_code, tree op10, tree op11)
> >
> > so you can avoid some code duplication here.
> >
> > >  }
> > >
> > >  /* Return a tree for the comparison which is the combination of diff
> > > --git a/gcc/match.pd b/gcc/match.pd index
> > >
> > 6d691d302b339c0e4556b40af158b5208c12d08f..bad49dd348add751d9ec1e30
> > 23e3
> > > 4d9ac123194f 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -1160,6 +1160,32 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > >       (convert (bit_and (negate (convert:utype { pmop[0]; }))
> > >                    (convert:utype @1)))))))
> > >
> > > +/* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.
> > > +*/ (simplify  (bit_ior
> > > +  (bit_and:c (convert? @0) @2)
> > > +  (bit_and:c (convert? @1) @3))
> >
> > in case the comparison returns a signed bool this might turn out wrong.
> > Maybe simply use zero_one_valued_p@0 here instead of (convert? @0)?
>
> I think I still need the convert as the comparison gets promoted to int and
> the predicate doesn't handle extensions.
>
> So I left the convert but added the zero_one_valued_p@0 such that it's
> checking that the result of the comparison itself is at least 0 or 1.
>
> >
> > > +   (if (inverse_conditions_p (@0, @1)
> > > +   /* The scalar version has to be canonicalized after vectorization
> > > +      because it makes unconditional loads conditional ones, which
> > > +      means we lose vectorization because the loads may trap.  */
> > > +   && canonicalize_math_after_vectorization_p ())
> > > +    (bit_and (cond @0 @2 @3) { build_each_one_cst (type); })))
> >
> > I think you should restrict this to INTEGRAL_TYPE_P and use build_one_cst
> > (type) (also see below).
> >
> > you can do inverse_onditions_p with lock-step for over tcc_comparison and
> > inverted_tcc_comparison{,_with_nans} (see existing examples).
>
> I couldn't figure out how to combine this approach with the zero_one_valued_p
> predicate. The zero_one_valued_p would need to be on (cmp @0 @1) but don't
> think that is allowed.
>
> >
> > > +(simplify
> > > + (bit_ior
> > > +  (bit_and:c (convert? (vec_cond:s @0 @4 integer_zerop)) @2)
> > > +  (bit_and:c (convert? (vec_cond:s @1 @4 integer_zerop)) @3))
> > > +   (if (inverse_conditions_p (@0, @1)
> > > +   && integer_onep (@4))
> > > +    (bit_and (vec_cond @0 @2 @3) @4)))
> > > +/* Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d.  */
> > > +(simplify  (bit_ior
> > > +  (bit_and:c (convert? (vec_cond:s @0 integer_minus_onep
> > > +integer_zerop)) @2)
> > > +  (bit_and:c (convert? (vec_cond:s @1 integer_minus_onep
> > integer_zerop)) @3))
> > > +   (if (inverse_conditions_p (@0, @1))
> > > +    (vec_cond @0 @2 @3)))
> >
> > I think the duplication is pre-mature optimization - we should get the
> > (bit_and (..) integer_minus_onep) simplified.  Also didn't we have (vec_cond
> > @0 -1 0) -> (view_convert @0) when the modes match?
>
> This wasn't in my tree at the time, I could use this representation instead but it
> wouldn't shorten the match tree. Combining them as you suggested below seems
> most optimal.
>
> > We might want to add (match zero_minus_one_valued_p) or use
> > truth_valued_p (with appropriate vector semantics, plus extend it).
> > Why do you need (convert? ...) for the vector case btw?
> >
>
> Since we have to defer the scalar version then the vectorizer won't
>
> > I guess the integral type and vector type cases are similar enough that the
> > patterns can be merged with conditionalizing only the replacement.
> >
>
> Merged.
>
>
> Bootstrapped Regtested on aarch64-none-linux-gnu,
> x86_64-pc-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>         * fold-const.cc (inverse_conditions_p): Traverse if SSA_NAME.
>         (inverse_conditions_p_1): New.
>         * match.pd: Add new rule.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/if-compare_1.c: New test.
>         * gcc.target/aarch64/if-compare_2.c: New test.
>
> --- inline copy of patch ---
>
> diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> index 99021a82df4977b179b45db04e3083012c63067a..0628ffd395416d74bc031e3e4ac8246894ca564d 100644
> --- a/gcc/fold-const.cc
> +++ b/gcc/fold-const.cc
> @@ -2829,20 +2829,55 @@ compcode_to_comparison (enum comparison_code code)
>      }
>  }
>
> +
> +/* Helper of inverse_condition_p.  Returns TRUE if CODE1 is the inverse of
> +   CODE2 and OP00 == OP10 and OP01 == OP11.  */
> +
> +static bool
> +inverse_conditions_p_1 (enum tree_code code1, tree op00, tree op01,
> +                       enum tree_code code2, tree op10, tree op11)
> +{
> +  return (invert_tree_comparison (code1, HONOR_NANS (op00)) == code2)
> +      && operand_equal_p (op00, op10, 0)
> +      && operand_equal_p (op01, op11, 0);
> +}
> +
>  /* Return true if COND1 tests the opposite condition of COND2.  */
>
>  bool
>  inverse_conditions_p (const_tree cond1, const_tree cond2)
>  {
> -  return (COMPARISON_CLASS_P (cond1)
> -         && COMPARISON_CLASS_P (cond2)
> -         && (invert_tree_comparison
> -             (TREE_CODE (cond1),
> -              HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE (cond2))
> -         && operand_equal_p (TREE_OPERAND (cond1, 0),
> -                             TREE_OPERAND (cond2, 0), 0)
> -         && operand_equal_p (TREE_OPERAND (cond1, 1),
> -                             TREE_OPERAND (cond2, 1), 0));
> +  if (COMPARISON_CLASS_P (cond1)
> +      && COMPARISON_CLASS_P (cond2)
> +      && inverse_conditions_p_1 (TREE_CODE (cond1),
> +                                TREE_OPERAND (cond1, 0),
> +                                TREE_OPERAND (cond1, 1),
> +                                TREE_CODE (cond2),
> +                                TREE_OPERAND (cond2, 0),
> +                                TREE_OPERAND (cond2, 1)))
> +    return true;
> +
> +  if (TREE_CODE (cond1) == SSA_NAME
> +      && TREE_CODE (cond2) == SSA_NAME)
> +    {
> +      gimple *gcond1 = SSA_NAME_DEF_STMT (cond1);
> +      gimple *gcond2 = SSA_NAME_DEF_STMT (cond2);
> +      if (!is_gimple_assign (gcond1) || !is_gimple_assign (gcond2))
> +       return false;
> +
> +      tree_code code1 = gimple_assign_rhs_code (gcond1);
> +      tree_code code2 = gimple_assign_rhs_code (gcond2);
> +      return TREE_CODE_CLASS (code1) == tcc_comparison
> +            && TREE_CODE_CLASS (code2) == tcc_comparison
> +            && inverse_conditions_p_1 (code1,
> +                                       gimple_arg (gcond1, 0),
> +                                       gimple_arg (gcond1, 1),
> +                                       code2,
> +                                       gimple_arg (gcond2, 0),
> +                                       gimple_arg (gcond2, 1));
> +    }
> +
> +  return false;
>  }
>
>  /* Return a tree for the comparison which is the combination of
> diff --git a/gcc/match.pd b/gcc/match.pd
> index 24cbbbb5bc1d718bd03af7712fc7255213f2a742..0a459f2804e296f4336aee70e351ddc45486c867 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -1872,6 +1872,30 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   (if (INTEGRAL_TYPE_P (type))
>    (bit_and @0 @1)))
>
> +/* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.
> +   and vector Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d. */
> +(simplify
> + (bit_ior
> +  (bit_and:c (convert? zero_one_valued_p@0) @2)
> +  (bit_and:c (convert? zero_one_valued_p@1) @3))
> +   (if (INTEGRAL_TYPE_P (type)
> +       && inverse_conditions_p (@0, @1)
> +       /* The scalar version has to be canonicalized after vectorization
> +          because it makes unconditional loads conditional ones, which
> +          means we lose vectorization because the loads may trap.  */
> +       && canonicalize_math_after_vectorization_p ())
> +    (bit_and (cond @0 @2 @3) { build_one_cst (type); })))

I know this is not part of the current pattern but could you add:
(((a < b) & c) | ((-(a >= b)) & d)) -> a < b ? c : d

Not your fault, but there are now two different predicates for a
boolean-like operand:
zero_one_valued_p and truth_valued_p, and a third way to describe it is
to use SSA_NAME and check ssa_name_has_boolean_range.

Also, why can't you just do something similar to:
+ /* Similar but for comparisons which have been inverted already,
+    Note it is hard to similulate inverted tcc_comparison due to NaNs
+    so a double for loop is needed and then compare the inverse code
+    with the result of invert_tree_comparison is needed.  */
+ (for cmp (tcc_comparison)
+  (for icmp (tcc_comparison)
+   (simplify
+    (bitop:c (rbitop:c (icmp @0 @1) @2) (cmp@3 @0 @1))
+     (with { enum tree_code ic = invert_tree_comparison
+             (cmp, HONOR_NANS (@0)); }
+      (if (ic == icmp)
+       (bitop @3 @2)))))))

Where you match everything in match and simplify instead of needing to
use inverse_conditions_p?

Thanks,
Andrew Pinski


> +(simplify
> + (bit_ior
> +  (bit_and:c (vec_cond:s @0 @4 integer_zerop) @2)
> +  (bit_and:c (vec_cond:s @1 @4 integer_zerop) @3))
> +   (if (inverse_conditions_p (@0, @1))
> +    (switch
> +     (if (integer_onep (@4))
> +      (bit_and (vec_cond @0 @2 @3) @4))
> +     (if (integer_minus_onep (@4))
> +      (vec_cond @0 @2 @3)))))
> +
>  /* Transform X & -Y into X * Y when Y is { 0 or 1 }.  */
>  (simplify
>   (bit_and:c (convert? (negate zero_one_valued_p@0)) @1)
> diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_1.c b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..05a1292fa90c70b14a7985122f43711f55d047ea
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +/*
> +**zoo2:
> +**     cmp     w0, w1
> +**     csel    w0, w2, w3, lt
> +**     and     w0, w0, 1
> +**     ret
> +*/
> +int zoo2 (int a, int b, int c, int d)
> +{
> +   return ((a < b) & c) | ((a >= b) & d);
> +}
> +
> diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_2.c b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..34bc65f5db10eae81b8dee3316dfb7d12bf471c8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
> @@ -0,0 +1,32 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -std=c99" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +
> +/*
> +**foo:
> +**     cmgt    v0.4s, v1.4s, v0.4s
> +**     bsl     v0.16b, v2.16b, v3.16b
> +**     ret
> +*/
> +v4si foo (v4si a, v4si b, v4si c, v4si d) {
> +    return ((a < b) & c) | ((a >= b) & d);
> +}
> +
> +
> +/**
> +**bar:
> +**...
> +**     cmge    v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**     bsl     v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> +**     and     v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> +**...
> +*/
> +void bar (int * restrict a, int * restrict b, int * restrict c,
> +         int * restrict d, int * restrict res, int n)
> +{
> +  for (int i = 0; i < (n & -4); i++)
> +    res[i] = ((a[i] < b[i]) & c[i]) | ((a[i] >= b[i]) & d[i]);
> +}
> +
  
Tamar Christina July 6, 2022, 4:06 p.m. UTC | #4
> -----Original Message-----
> From: Andrew Pinski <pinskia@gmail.com>
> Sent: Wednesday, July 6, 2022 3:10 AM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: Richard Biener <rguenther@suse.de>; nd <nd@arm.com>; gcc-
> patches@gcc.gnu.org
> Subject: Re: [PATCH]middle-end simplify complex if expressions where
> comparisons are inverse of one another.
> 
> On Tue, Jul 5, 2022 at 8:16 AM Tamar Christina via Gcc-patches <gcc-
> patches@gcc.gnu.org> wrote:
> >
> >
> >
> > > -----Original Message-----
> > > From: Richard Biener <rguenther@suse.de>
> > > Sent: Monday, June 20, 2022 9:57 AM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
> > > Subject: Re: [PATCH]middle-end simplify complex if expressions where
> > > comparisons are inverse of one another.
> > >
> > > On Thu, 16 Jun 2022, Tamar Christina wrote:
> > >
> > > > Hi All,
> > > >
> > > > This optimizes the following sequence
> > > >
> > > >   ((a < b) & c) | ((a >= b) & d)
> > > >
> > > > into
> > > >
> > > >   (a < b ? c : d) & 1
> > > >
> > > > for scalar. On vector we can omit the & 1.
> > > >
> > > > This changes the code generation from
> > > >
> > > > zoo2:
> > > >     cmp     w0, w1
> > > >     cset    w0, lt
> > > >     cset    w1, ge
> > > >     and     w0, w0, w2
> > > >     and     w1, w1, w3
> > > >     orr     w0, w0, w1
> > > >     ret
> > > >
> > > > into
> > > >
> > > >     cmp     w0, w1
> > > >     csel    w0, w2, w3, lt
> > > >     and     w0, w0, 1
> > > >     ret
> > > >
> > > > and significantly reduces the number of selects we have to do in
> > > > the vector code.
> > > >
> > > > Bootstrapped Regtested on aarch64-none-linux-gnu,
> > > > x86_64-pc-linux-gnu and no issues.
> > > >
> > > > Ok for master?
> > > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > >     * fold-const.cc (inverse_conditions_p): Traverse if SSA_NAME.
> > > >     * match.pd: Add new rule.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > >     * gcc.target/aarch64/if-compare_1.c: New test.
> > > >     * gcc.target/aarch64/if-compare_2.c: New test.
> > > >
> > > > --- inline copy of patch --
> > > > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc index
> > > >
> > >
> 39a5a52958d87497f301826e706886b290771a2d..f180599b90150acd3ed895a64
> > > 280
> > > > aa3255061256 100644
> > > > --- a/gcc/fold-const.cc
> > > > +++ b/gcc/fold-const.cc
> > > > @@ -2833,15 +2833,38 @@ compcode_to_comparison (enum
> > > comparison_code
> > > > code)  bool  inverse_conditions_p (const_tree cond1, const_tree
> > > > cond2) {
> > > > -  return (COMPARISON_CLASS_P (cond1)
> > > > -     && COMPARISON_CLASS_P (cond2)
> > > > -     && (invert_tree_comparison
> > > > -         (TREE_CODE (cond1),
> > > > -          HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE
> > > (cond2))
> > > > -     && operand_equal_p (TREE_OPERAND (cond1, 0),
> > > > -                         TREE_OPERAND (cond2, 0), 0)
> > > > -     && operand_equal_p (TREE_OPERAND (cond1, 1),
> > > > -                         TREE_OPERAND (cond2, 1), 0));
> > > > +  if (COMPARISON_CLASS_P (cond1)
> > > > +      && COMPARISON_CLASS_P (cond2)
> > > > +      && (invert_tree_comparison
> > > > +      (TREE_CODE (cond1),
> > > > +       HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE
> > > (cond2))
> > > > +      && operand_equal_p (TREE_OPERAND (cond1, 0),
> > > > +                     TREE_OPERAND (cond2, 0), 0)
> > > > +      && operand_equal_p (TREE_OPERAND (cond1, 1),
> > > > +                     TREE_OPERAND (cond2, 1), 0))
> > > > +    return true;
> > > > +
> > > > +  if (TREE_CODE (cond1) == SSA_NAME
> > > > +      && TREE_CODE (cond2) == SSA_NAME)
> > > > +    {
> > > > +      gimple *gcond1 = SSA_NAME_DEF_STMT (cond1);
> > > > +      gimple *gcond2 = SSA_NAME_DEF_STMT (cond2);
> > > > +      if (!is_gimple_assign (gcond1) || !is_gimple_assign (gcond2))
> > > > +   return false;
> > > > +
> > > > +      tree_code code1 = gimple_assign_rhs_code (gcond1);
> > > > +      tree_code code2 = gimple_assign_rhs_code (gcond2);
> > > > +      return TREE_CODE_CLASS (code1) == tcc_comparison
> > > > +        && TREE_CODE_CLASS (code2) == tcc_comparison
> > > > +        && invert_tree_comparison (code1,
> > > > +             HONOR_NANS (gimple_arg (gcond1, 0))) == code2
> > > > +        && operand_equal_p (gimple_arg (gcond1, 0),
> > > > +                            gimple_arg (gcond2, 0), 0)
> > > > +        && operand_equal_p (gimple_arg (gcond1, 1),
> > > > +                            gimple_arg (gcond2, 1), 0);
> > > > +    }
> > > > +
> > > > +  return false;
> > >
> > > if we do extend inverse_condition_p please add an overload like
> >
> > Done.
> >
> > >
> > > bool
> > > inverse_condition_p (enum tree_code, tree op00, tree op01,
> > >                      enum tree_code, tree op10, tree op11)
> > >
> > > so you can avoid some code duplication here.
> > >
> > > >  }
> > > >
> > > >  /* Return a tree for the comparison which is the combination of
> > > > diff --git a/gcc/match.pd b/gcc/match.pd index
> > > >
> > >
> 6d691d302b339c0e4556b40af158b5208c12d08f..bad49dd348add751d9ec1e30
> > > 23e3
> > > > 4d9ac123194f 100644
> > > > --- a/gcc/match.pd
> > > > +++ b/gcc/match.pd
> > > > @@ -1160,6 +1160,32 @@ DEFINE_INT_AND_FLOAT_ROUND_FN
> (RINT)
> > > >       (convert (bit_and (negate (convert:utype { pmop[0]; }))
> > > >                    (convert:utype @1)))))))
> > > >
> > > > +/* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.
> > > > +*/ (simplify  (bit_ior
> > > > +  (bit_and:c (convert? @0) @2)
> > > > +  (bit_and:c (convert? @1) @3))
> > >
> > > in case the comparison returns a signed bool this might turn out wrong.
> > > Maybe simply use zero_one_valued_p@0 here instead of (convert?
> @0)?
> >
> > I think I still need the convert as the comparison gets promoted to
> > int and the predicate doesn't handle extensions.
> >
> > So I left the convert but added the zero_one_valued_p@0 such that it's
> > checking that the result of the comparison itself is at least 0 or 1.
> >
> > >
> > > > +   (if (inverse_conditions_p (@0, @1)
> > > > +   /* The scalar version has to be canonicalized after vectorization
> > > > +      because it makes unconditional loads conditional ones, which
> > > > +      means we lose vectorization because the loads may trap.  */
> > > > +   && canonicalize_math_after_vectorization_p ())
> > > > +    (bit_and (cond @0 @2 @3) { build_each_one_cst (type); })))
> > >
> > > I think you should restrict this to INTEGRAL_TYPE_P and use
> > > build_one_cst
> > > (type) (also see below).
> > >
> > > you can do inverse_onditions_p with lock-step for over
> > > tcc_comparison and inverted_tcc_comparison{,_with_nans} (see existing
> examples).
> >
> > I couldn't figure out how to combine this approach with the
> > zero_one_valued_p predicate. The zero_one_valued_p would need to be
> on
> > (cmp @0 @1) but don't think that is allowed.
> >
> > >
> > > > +(simplify
> > > > + (bit_ior
> > > > +  (bit_and:c (convert? (vec_cond:s @0 @4 integer_zerop)) @2)
> > > > +  (bit_and:c (convert? (vec_cond:s @1 @4 integer_zerop)) @3))
> > > > +   (if (inverse_conditions_p (@0, @1)
> > > > +   && integer_onep (@4))
> > > > +    (bit_and (vec_cond @0 @2 @3) @4)))
> > > > +/* Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d.  */
> > > > +(simplify  (bit_ior
> > > > +  (bit_and:c (convert? (vec_cond:s @0 integer_minus_onep
> > > > +integer_zerop)) @2)
> > > > +  (bit_and:c (convert? (vec_cond:s @1 integer_minus_onep
> > > integer_zerop)) @3))
> > > > +   (if (inverse_conditions_p (@0, @1))
> > > > +    (vec_cond @0 @2 @3)))
> > >
> > > I think the duplication is pre-mature optimization - we should get
> > > the (bit_and (..) integer_minus_onep) simplified.  Also didn't we
> > > have (vec_cond
> > > @0 -1 0) -> (view_convert @0) when the modes match?
> >
> > This wasn't in my tree at the time, I could use this representation
> > instead but it wouldn't shorten the match tree. Combining them as you
> > suggested below seems most optimal.
> >
> > > We might want to add (match zero_minus_one_valued_p) or use
> > > truth_valued_p (with appropriate vector semantics, plus extend it).
> > > Why do you need (convert? ...) for the vector case btw?
> > >
> >
> > Since we have to defer the scalar version then the vectorizer won't
> >
> > > I guess the integral type and vector type cases are similar enough
> > > that the patterns can be merged with conditionalizing only the
> replacement.
> > >
> >
> > Merged.
> >
> >
> > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > and no issues.
> >
> > Ok for master?
> >
> > Thanks,
> > Tamar
> >
> > gcc/ChangeLog:
> >
> >         * fold-const.cc (inverse_conditions_p): Traverse if SSA_NAME.
> >         (inverse_conditions_p_1): New.
> >         * match.pd: Add new rule.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/aarch64/if-compare_1.c: New test.
> >         * gcc.target/aarch64/if-compare_2.c: New test.
> >
> > --- inline copy of patch ---
> >
> > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc index
> >
> 99021a82df4977b179b45db04e3083012c63067a..0628ffd395416d74bc031e3e4
> ac8
> > 246894ca564d 100644
> > --- a/gcc/fold-const.cc
> > +++ b/gcc/fold-const.cc
> > @@ -2829,20 +2829,55 @@ compcode_to_comparison (enum
> comparison_code code)
> >      }
> >  }
> >
> > +
> > +/* Helper of inverse_condition_p.  Returns TRUE if CODE1 is the inverse of
> > +   CODE2 and OP00 == OP10 and OP01 == OP11.  */
> > +
> > +static bool
> > +inverse_conditions_p_1 (enum tree_code code1, tree op00, tree op01,
> > +                       enum tree_code code2, tree op10, tree op11) {
> > +  return (invert_tree_comparison (code1, HONOR_NANS (op00)) ==
> code2)
> > +      && operand_equal_p (op00, op10, 0)
> > +      && operand_equal_p (op01, op11, 0); }
> > +
> >  /* Return true if COND1 tests the opposite condition of COND2.  */
> >
> >  bool
> >  inverse_conditions_p (const_tree cond1, const_tree cond2)  {
> > -  return (COMPARISON_CLASS_P (cond1)
> > -         && COMPARISON_CLASS_P (cond2)
> > -         && (invert_tree_comparison
> > -             (TREE_CODE (cond1),
> > -              HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE
> (cond2))
> > -         && operand_equal_p (TREE_OPERAND (cond1, 0),
> > -                             TREE_OPERAND (cond2, 0), 0)
> > -         && operand_equal_p (TREE_OPERAND (cond1, 1),
> > -                             TREE_OPERAND (cond2, 1), 0));
> > +  if (COMPARISON_CLASS_P (cond1)
> > +      && COMPARISON_CLASS_P (cond2)
> > +      && inverse_conditions_p_1 (TREE_CODE (cond1),
> > +                                TREE_OPERAND (cond1, 0),
> > +                                TREE_OPERAND (cond1, 1),
> > +                                TREE_CODE (cond2),
> > +                                TREE_OPERAND (cond2, 0),
> > +                                TREE_OPERAND (cond2, 1)))
> > +    return true;
> > +
> > +  if (TREE_CODE (cond1) == SSA_NAME
> > +      && TREE_CODE (cond2) == SSA_NAME)
> > +    {
> > +      gimple *gcond1 = SSA_NAME_DEF_STMT (cond1);
> > +      gimple *gcond2 = SSA_NAME_DEF_STMT (cond2);
> > +      if (!is_gimple_assign (gcond1) || !is_gimple_assign (gcond2))
> > +       return false;
> > +
> > +      tree_code code1 = gimple_assign_rhs_code (gcond1);
> > +      tree_code code2 = gimple_assign_rhs_code (gcond2);
> > +      return TREE_CODE_CLASS (code1) == tcc_comparison
> > +            && TREE_CODE_CLASS (code2) == tcc_comparison
> > +            && inverse_conditions_p_1 (code1,
> > +                                       gimple_arg (gcond1, 0),
> > +                                       gimple_arg (gcond1, 1),
> > +                                       code2,
> > +                                       gimple_arg (gcond2, 0),
> > +                                       gimple_arg (gcond2, 1));
> > +    }
> > +
> > +  return false;
> >  }
> >
> >  /* Return a tree for the comparison which is the combination of diff
> > --git a/gcc/match.pd b/gcc/match.pd index
> >
> 24cbbbb5bc1d718bd03af7712fc7255213f2a742..0a459f2804e296f4336aee70e3
> 51
> > ddc45486c867 100644
> > --- a/gcc/match.pd
> > +++ b/gcc/match.pd
> > @@ -1872,6 +1872,30 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> >   (if (INTEGRAL_TYPE_P (type))
> >    (bit_and @0 @1)))
> >
> > +/* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.
> > +   and vector Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c :
> > +d. */ (simplify  (bit_ior
> > +  (bit_and:c (convert? zero_one_valued_p@0) @2)
> > +  (bit_and:c (convert? zero_one_valued_p@1) @3))
> > +   (if (INTEGRAL_TYPE_P (type)
> > +       && inverse_conditions_p (@0, @1)
> > +       /* The scalar version has to be canonicalized after vectorization
> > +          because it makes unconditional loads conditional ones, which
> > +          means we lose vectorization because the loads may trap.  */
> > +       && canonicalize_math_after_vectorization_p ())
> > +    (bit_and (cond @0 @2 @3) { build_one_cst (type); })))
> 
> I Know this is not part of the current pattern but could you add:
> (((a < b) & c) | ((-(a >= b)) & d)) -> a < b ? c : d
> 
> Not your fault but there are now like two different predicates for a boolean
> like operand.
> zero_one_valued_p and truth_valued_p and a third way to describe it is to
> use SSA_NAME and check ssa_name_has_boolean_range.
> 
> Also why can't you just do:
> Something similar like:
> + /* Similar but for comparisons which have been inverted already,
> +    Note it is hard to similulate inverted tcc_comparison due to NaNs
> +    so a double for loop is needed and then compare the inverse code
> +    with the result of invert_tree_comparison is needed.  */ (for cmp
> + (tcc_comparison)  (for icmp (tcc_comparison)
> +   (simplify
> +    (bitop:c (rbitop:c (icmp @0 @1) @2) (cmp@3 @0 @1))
> +     (with { enum tree_code ic = invert_tree_comparison
> +             (cmp, HONOR_NANS (@0)); }
> +      (if (ic == icmp)
> +       (bitop @3 @2)))))))
> 
> Where you match everything in match and simplify instead of needing to use
> inverse_conditions_p?

As I mentioned above in the reply to Richi, this is because I can't apply the predicate
zero_one_valued_p to the comparison result if I decompose the comparison.
zero_one_valued_p@(cmp @0 @1) and other variants I've tried do not compile.

Is there a way I can do so?

Thanks,
Tamar

> 
> Thanks,
> Andrew Pinski
> 
> 
> > +(simplify
> > + (bit_ior
> > +  (bit_and:c (vec_cond:s @0 @4 integer_zerop) @2)
> > +  (bit_and:c (vec_cond:s @1 @4 integer_zerop) @3))
> > +   (if (inverse_conditions_p (@0, @1))
> > +    (switch
> > +     (if (integer_onep (@4))
> > +      (bit_and (vec_cond @0 @2 @3) @4))
> > +     (if (integer_minus_onep (@4))
> > +      (vec_cond @0 @2 @3)))))
> > +
> >  /* Transform X & -Y into X * Y when Y is { 0 or 1 }.  */
> >  (simplify
> >   (bit_and:c (convert? (negate zero_one_valued_p@0)) @1)
> > diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_1.c b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..05a1292fa90c70b14a7985122f43711f55d047ea
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
> > @@ -0,0 +1,16 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-O" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > +} */
> > +
> > +/*
> > +**zoo2:
> > +**     cmp     w0, w1
> > +**     csel    w0, w2, w3, lt
> > +**     and     w0, w0, 1
> > +**     ret
> > +*/
> > +int zoo2 (int a, int b, int c, int d) {
> > +   return ((a < b) & c) | ((a >= b) & d); }
> > +
> > diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_2.c b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
> > new file mode 100644
> > index 0000000000000000000000000000000000000000..34bc65f5db10eae81b8dee3316dfb7d12bf471c8
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
> > @@ -0,0 +1,32 @@
> > +/* { dg-do compile } */
> > +/* { dg-additional-options "-O3 -std=c99" } */
> > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > +} */
> > +
> > +typedef int v4si __attribute__ ((vector_size (16)));
> > +
> > +/*
> > +**foo:
> > +**     cmgt    v0.4s, v1.4s, v0.4s
> > +**     bsl     v0.16b, v2.16b, v3.16b
> > +**     ret
> > +*/
> > +v4si foo (v4si a, v4si b, v4si c, v4si d) {
> > +    return ((a < b) & c) | ((a >= b) & d); }
> > +
> > +
> > +/**
> > +**bar:
> > +**...
> > +**     cmge    v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> > +**     bsl     v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > +**     and     v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > +**...
> > +*/
> > +void bar (int * restrict a, int * restrict b, int * restrict c,
> > +         int * restrict d, int * restrict res, int n) {
> > +  for (int i = 0; i < (n & -4); i++)
> > +    res[i] = ((a[i] < b[i]) & c[i]) | ((a[i] >= b[i]) & d[i]); }
> > +
  
Andrew Pinski July 6, 2022, 7:37 p.m. UTC | #5
On Wed, Jul 6, 2022 at 9:06 AM Tamar Christina <Tamar.Christina@arm.com> wrote:
>
> > -----Original Message-----
> > From: Andrew Pinski <pinskia@gmail.com>
> > Sent: Wednesday, July 6, 2022 3:10 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: Richard Biener <rguenther@suse.de>; nd <nd@arm.com>; gcc-
> > patches@gcc.gnu.org
> > Subject: Re: [PATCH]middle-end simplify complex if expressions where
> > comparisons are inverse of one another.
> >
> > On Tue, Jul 5, 2022 at 8:16 AM Tamar Christina via Gcc-patches <gcc-
> > patches@gcc.gnu.org> wrote:
> > >
> > >
> > >
> > > > -----Original Message-----
> > > > From: Richard Biener <rguenther@suse.de>
> > > > Sent: Monday, June 20, 2022 9:57 AM
> > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
> > > > Subject: Re: [PATCH]middle-end simplify complex if expressions where
> > > > comparisons are inverse of one another.
> > > >
> > > > On Thu, 16 Jun 2022, Tamar Christina wrote:
> > > >
> > > > > Hi All,
> > > > >
> > > > > This optimizes the following sequence
> > > > >
> > > > >   ((a < b) & c) | ((a >= b) & d)
> > > > >
> > > > > into
> > > > >
> > > > >   (a < b ? c : d) & 1
> > > > >
> > > > > for scalar. On vector we can omit the & 1.
> > > > >
> > > > > This changes the code generation from
> > > > >
> > > > > zoo2:
> > > > >     cmp     w0, w1
> > > > >     cset    w0, lt
> > > > >     cset    w1, ge
> > > > >     and     w0, w0, w2
> > > > >     and     w1, w1, w3
> > > > >     orr     w0, w0, w1
> > > > >     ret
> > > > >
> > > > > into
> > > > >
> > > > >     cmp     w0, w1
> > > > >     csel    w0, w2, w3, lt
> > > > >     and     w0, w0, 1
> > > > >     ret
> > > > >
> > > > > and significantly reduces the number of selects we have to do in
> > > > > the vector code.
> > > > >
> > > > > Bootstrapped Regtested on aarch64-none-linux-gnu,
> > > > > x86_64-pc-linux-gnu and no issues.
> > > > >
> > > > > Ok for master?
> > > > >
> > > > > Thanks,
> > > > > Tamar
> > > > >
> > > > > gcc/ChangeLog:
> > > > >
> > > > >     * fold-const.cc (inverse_conditions_p): Traverse if SSA_NAME.
> > > > >     * match.pd: Add new rule.
> > > > >
> > > > > gcc/testsuite/ChangeLog:
> > > > >
> > > > >     * gcc.target/aarch64/if-compare_1.c: New test.
> > > > >     * gcc.target/aarch64/if-compare_2.c: New test.
> > > > >
> > > > > --- inline copy of patch --
> > > > > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc index
> > > > >
> > > >
> > 39a5a52958d87497f301826e706886b290771a2d..f180599b90150acd3ed895a64
> > > > 280
> > > > > aa3255061256 100644
> > > > > --- a/gcc/fold-const.cc
> > > > > +++ b/gcc/fold-const.cc
> > > > > @@ -2833,15 +2833,38 @@ compcode_to_comparison (enum
> > > > comparison_code
> > > > > code)  bool  inverse_conditions_p (const_tree cond1, const_tree
> > > > > cond2) {
> > > > > -  return (COMPARISON_CLASS_P (cond1)
> > > > > -     && COMPARISON_CLASS_P (cond2)
> > > > > -     && (invert_tree_comparison
> > > > > -         (TREE_CODE (cond1),
> > > > > -          HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE
> > > > (cond2))
> > > > > -     && operand_equal_p (TREE_OPERAND (cond1, 0),
> > > > > -                         TREE_OPERAND (cond2, 0), 0)
> > > > > -     && operand_equal_p (TREE_OPERAND (cond1, 1),
> > > > > -                         TREE_OPERAND (cond2, 1), 0));
> > > > > +  if (COMPARISON_CLASS_P (cond1)
> > > > > +      && COMPARISON_CLASS_P (cond2)
> > > > > +      && (invert_tree_comparison
> > > > > +      (TREE_CODE (cond1),
> > > > > +       HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE
> > > > (cond2))
> > > > > +      && operand_equal_p (TREE_OPERAND (cond1, 0),
> > > > > +                     TREE_OPERAND (cond2, 0), 0)
> > > > > +      && operand_equal_p (TREE_OPERAND (cond1, 1),
> > > > > +                     TREE_OPERAND (cond2, 1), 0))
> > > > > +    return true;
> > > > > +
> > > > > +  if (TREE_CODE (cond1) == SSA_NAME
> > > > > +      && TREE_CODE (cond2) == SSA_NAME)
> > > > > +    {
> > > > > +      gimple *gcond1 = SSA_NAME_DEF_STMT (cond1);
> > > > > +      gimple *gcond2 = SSA_NAME_DEF_STMT (cond2);
> > > > > +      if (!is_gimple_assign (gcond1) || !is_gimple_assign (gcond2))
> > > > > +   return false;
> > > > > +
> > > > > +      tree_code code1 = gimple_assign_rhs_code (gcond1);
> > > > > +      tree_code code2 = gimple_assign_rhs_code (gcond2);
> > > > > +      return TREE_CODE_CLASS (code1) == tcc_comparison
> > > > > +        && TREE_CODE_CLASS (code2) == tcc_comparison
> > > > > +        && invert_tree_comparison (code1,
> > > > > +             HONOR_NANS (gimple_arg (gcond1, 0))) == code2
> > > > > +        && operand_equal_p (gimple_arg (gcond1, 0),
> > > > > +                            gimple_arg (gcond2, 0), 0)
> > > > > +        && operand_equal_p (gimple_arg (gcond1, 1),
> > > > > +                            gimple_arg (gcond2, 1), 0);
> > > > > +    }
> > > > > +
> > > > > +  return false;
> > > >
> > > > if we do extend inverse_condition_p please add an overload like
> > >
> > > Done.
> > >
> > > >
> > > > bool
> > > > inverse_condition_p (enum tree_code, tree op00, tree op01,
> > > >                      enum tree_code, tree op10, tree op11)
> > > >
> > > > so you can avoid some code duplication here.
> > > >
> > > > >  }
> > > > >
> > > > >  /* Return a tree for the comparison which is the combination of
> > > > > diff --git a/gcc/match.pd b/gcc/match.pd index
> > > > >
> > > >
> > 6d691d302b339c0e4556b40af158b5208c12d08f..bad49dd348add751d9ec1e30
> > > > 23e3
> > > > > 4d9ac123194f 100644
> > > > > --- a/gcc/match.pd
> > > > > +++ b/gcc/match.pd
> > > > > @@ -1160,6 +1160,32 @@ DEFINE_INT_AND_FLOAT_ROUND_FN
> > (RINT)
> > > > >       (convert (bit_and (negate (convert:utype { pmop[0]; }))
> > > > >                    (convert:utype @1)))))))
> > > > >
> > > > > +/* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.
> > > > > +*/ (simplify  (bit_ior
> > > > > +  (bit_and:c (convert? @0) @2)
> > > > > +  (bit_and:c (convert? @1) @3))
> > > >
> > > > in case the comparison returns a signed bool this might turn out wrong.
> > > > Maybe simply use zero_one_valued_p@0 here instead of (convert?
> > @0)?
> > >
> > > I think I still need the convert as the comparison gets promoted to
> > > int and the predicate doesn't handle extensions.
> > >
> > > So I left the convert but added the zero_one_valued_p@0 such that it's
> > > checking that the result of the comparison itself is at least 0 or 1.
> > >
> > > >
> > > > > +   (if (inverse_conditions_p (@0, @1)
> > > > > +   /* The scalar version has to be canonicalized after vectorization
> > > > > +      because it makes unconditional loads conditional ones, which
> > > > > +      means we lose vectorization because the loads may trap.  */
> > > > > +   && canonicalize_math_after_vectorization_p ())
> > > > > +    (bit_and (cond @0 @2 @3) { build_each_one_cst (type); })))
> > > >
> > > > I think you should restrict this to INTEGRAL_TYPE_P and use
> > > > build_one_cst
> > > > (type) (also see below).
> > > >
> > > > you can do inverse_onditions_p with lock-step for over
> > > > tcc_comparison and inverted_tcc_comparison{,_with_nans} (see existing
> > examples).
> > >
> > > I couldn't figure out how to combine this approach with the
> > > zero_one_valued_p predicate. The zero_one_valued_p would need to be
> > on
> > > (cmp @0 @1) but don't think that is allowed.
> > >
> > > >
> > > > > +(simplify
> > > > > + (bit_ior
> > > > > +  (bit_and:c (convert? (vec_cond:s @0 @4 integer_zerop)) @2)
> > > > > +  (bit_and:c (convert? (vec_cond:s @1 @4 integer_zerop)) @3))
> > > > > +   (if (inverse_conditions_p (@0, @1)
> > > > > +   && integer_onep (@4))
> > > > > +    (bit_and (vec_cond @0 @2 @3) @4)))
> > > > > +/* Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d.  */
> > > > > +(simplify  (bit_ior
> > > > > +  (bit_and:c (convert? (vec_cond:s @0 integer_minus_onep
> > > > > +integer_zerop)) @2)
> > > > > +  (bit_and:c (convert? (vec_cond:s @1 integer_minus_onep
> > > > integer_zerop)) @3))
> > > > > +   (if (inverse_conditions_p (@0, @1))
> > > > > +    (vec_cond @0 @2 @3)))
> > > >
> > > > I think the duplication is pre-mature optimization - we should get
> > > > the (bit_and (..) integer_minus_onep) simplified.  Also didn't we
> > > > have (vec_cond
> > > > @0 -1 0) -> (view_convert @0) when the modes match?
> > >
> > > This wasn't in my tree at the time, I could use this representation
> > > instead but it wouldn't shorten the match tree. Combining them as you
> > > suggested below seems most optimal.
> > >
> > > > We might want to add (match zero_minus_one_valued_p) or use
> > > > truth_valued_p (with appropriate vector semantics, plus extend it).
> > > > Why do you need (convert? ...) for the vector case btw?
> > > >
> > >
> > > Since we have to defer the scalar version then the vectorizer won't
> > >
> > > > I guess the integral type and vector type cases are similar enough
> > > > that the patterns can be merged with conditionalizing only the
> > replacement.
> > > >
> > >
> > > Merged.
> > >
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > > and no issues.
> > >
> > > Ok for master?
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > >         * fold-const.cc (inverse_conditions_p): Traverse if SSA_NAME.
> > >         (inverse_conditions_p_1): New.
> > >         * match.pd: Add new rule.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         * gcc.target/aarch64/if-compare_1.c: New test.
> > >         * gcc.target/aarch64/if-compare_2.c: New test.
> > >
> > > --- inline copy of patch ---
> > >
> > > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc index
> > >
> > 99021a82df4977b179b45db04e3083012c63067a..0628ffd395416d74bc031e3e4
> > ac8
> > > 246894ca564d 100644
> > > --- a/gcc/fold-const.cc
> > > +++ b/gcc/fold-const.cc
> > > @@ -2829,20 +2829,55 @@ compcode_to_comparison (enum
> > comparison_code code)
> > >      }
> > >  }
> > >
> > > +
> > > +/* Helper of inverse_condition_p.  Returns TRUE if CODE1 is the inverse of
> > > +   CODE2 and OP00 == OP10 and OP01 == OP11.  */
> > > +
> > > +static bool
> > > +inverse_conditions_p_1 (enum tree_code code1, tree op00, tree op01,
> > > +                       enum tree_code code2, tree op10, tree op11)
> > > +{
> > > +  return (invert_tree_comparison (code1, HONOR_NANS (op00)) == code2)
> > > +      && operand_equal_p (op00, op10, 0)
> > > +      && operand_equal_p (op01, op11, 0);
> > > +}
> > > +
> > >  /* Return true if COND1 tests the opposite condition of COND2.  */
> > >
> > >  bool
> > >  inverse_conditions_p (const_tree cond1, const_tree cond2)  {
> > > -  return (COMPARISON_CLASS_P (cond1)
> > > -         && COMPARISON_CLASS_P (cond2)
> > > -         && (invert_tree_comparison
> > > -             (TREE_CODE (cond1),
> > > -              HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE
> > (cond2))
> > > -         && operand_equal_p (TREE_OPERAND (cond1, 0),
> > > -                             TREE_OPERAND (cond2, 0), 0)
> > > -         && operand_equal_p (TREE_OPERAND (cond1, 1),
> > > -                             TREE_OPERAND (cond2, 1), 0));
> > > +  if (COMPARISON_CLASS_P (cond1)
> > > +      && COMPARISON_CLASS_P (cond2)
> > > +      && inverse_conditions_p_1 (TREE_CODE (cond1),
> > > +                                TREE_OPERAND (cond1, 0),
> > > +                                TREE_OPERAND (cond1, 1),
> > > +                                TREE_CODE (cond2),
> > > +                                TREE_OPERAND (cond2, 0),
> > > +                                TREE_OPERAND (cond2, 1)))
> > > +    return true;
> > > +
> > > +  if (TREE_CODE (cond1) == SSA_NAME
> > > +      && TREE_CODE (cond2) == SSA_NAME)
> > > +    {
> > > +      gimple *gcond1 = SSA_NAME_DEF_STMT (cond1);
> > > +      gimple *gcond2 = SSA_NAME_DEF_STMT (cond2);
> > > +      if (!is_gimple_assign (gcond1) || !is_gimple_assign (gcond2))
> > > +       return false;
> > > +
> > > +      tree_code code1 = gimple_assign_rhs_code (gcond1);
> > > +      tree_code code2 = gimple_assign_rhs_code (gcond2);
> > > +      return TREE_CODE_CLASS (code1) == tcc_comparison
> > > +            && TREE_CODE_CLASS (code2) == tcc_comparison
> > > +            && inverse_conditions_p_1 (code1,
> > > +                                       gimple_arg (gcond1, 0),
> > > +                                       gimple_arg (gcond1, 1),
> > > +                                       code2,
> > > +                                       gimple_arg (gcond2, 0),
> > > +                                       gimple_arg (gcond2, 1));
> > > +    }
> > > +
> > > +  return false;
> > >  }
> > >
> > >  /* Return a tree for the comparison which is the combination of diff
> > > --git a/gcc/match.pd b/gcc/match.pd index
> > >
> > 24cbbbb5bc1d718bd03af7712fc7255213f2a742..0a459f2804e296f4336aee70e3
> > 51
> > > ddc45486c867 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -1872,6 +1872,30 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > >   (if (INTEGRAL_TYPE_P (type))
> > >    (bit_and @0 @1)))
> > >
> > > +/* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.
> > > +   and vector Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d.  */
> > > +(simplify
> > > + (bit_ior
> > > +  (bit_and:c (convert? zero_one_valued_p@0) @2)
> > > +  (bit_and:c (convert? zero_one_valued_p@1) @3))
> > > +   (if (INTEGRAL_TYPE_P (type)
> > > +       && inverse_conditions_p (@0, @1)
> > > +       /* The scalar version has to be canonicalized after vectorization
> > > +          because it makes unconditional loads conditional ones, which
> > > +          means we lose vectorization because the loads may trap.  */
> > > +       && canonicalize_math_after_vectorization_p ())
> > > +    (bit_and (cond @0 @2 @3) { build_one_cst (type); })))
> >
> > I Know this is not part of the current pattern but could you add:
> > (((a < b) & c) | ((-(a >= b)) & d)) -> a < b ? c : d
> >
> > Not your fault but there are now like two different predicates for a boolean
> > like operand.
> > zero_one_valued_p and truth_valued_p and a third way to describe it is to
> > use SSA_NAME and check ssa_name_has_boolean_range.
> >
> > Also why can't you just do:
> > Something similar like:
> > + /* Similar but for comparisons which have been inverted already.
> > +    Note it is hard to simulate an inverted tcc_comparison due to NaNs,
> > +    so a double for loop is needed and then the inverse code has to be
> > +    compared with the result of invert_tree_comparison.  */
> > + (for cmp (tcc_comparison)
> > +  (for icmp (tcc_comparison)
> > +   (simplify
> > +    (bitop:c (rbitop:c (icmp @0 @1) @2) (cmp@3 @0 @1))
> > +     (with { enum tree_code ic = invert_tree_comparison
> > +             (cmp, HONOR_NANS (@0)); }
> > +      (if (ic == icmp)
> > +       (bitop @3 @2)))))))
> >
> > Where you match everything in match and simplify instead of needing to use
> > inverse_conditions_p?
>
> As I mentioned above in the reply to Richi, this is because I can't apply the predicate
> zero_one_valued_p to the comparison result if I decompose the comparison.
> zero_one_valued_p@(cmp @0 @1) and other variants I've tried do not compile.
>
> Is there a way I can do so?


Thinking about it, maybe add a check for a non-vector type or just a
boolean type; comparisons will always have either a bool or a vector
type.
Note you might need to check for signed vs. unsigned bools because of
the convert (and yes, some languages have signed bools, e.g. Ada).
That is something like this (formatting needs to be fixed):
(for cmp (tcc_comparison)
 (for icmp (tcc_comparison)
  (simplify
   (bit_ior
    (bit_and:c (convert? (cmp:c@0 @4 @5)) @2)
    (bit_and:c (convert? (icmp@1 @4 @5)) @3))
   (if (TREE_CODE (TREE_TYPE (@0)) == BOOLEAN_TYPE
        && TYPE_SIGN (TREE_TYPE (@0)) == UNSIGNED
        && TREE_CODE (TREE_TYPE (@1)) == BOOLEAN_TYPE
        && TYPE_SIGN (TREE_TYPE (@1)) == UNSIGNED
        /* The scalar version has to be canonicalized after vectorization
           because it makes unconditional loads conditional ones, which
           means we lose vectorization because the loads may trap.  */
        && canonicalize_math_after_vectorization_p ())
    (with { enum tree_code ic = invert_tree_comparison
            (cmp, HONOR_NANS (@0)); }
     (if (ic == icmp)
      (bit_and (cond @0 @2 @3) { build_each_one_cst (type); })))))))
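
To spell out why the UNSIGNED/boolean-type checks (or, equivalently, a
zero-or-one guarantee on the condition) matter: if "true" were represented
as -1 rather than 1, the & against the data operand would no longer isolate
bit 0, so rewriting to (cond ? c : d) & 1 would be wrong.  A small
stand-alone C illustration, with made-up values and not taken from any
patch:

  #include <stdio.h>

  int main (void)
  {
    int c = 6, d = 5;

    /* 0/1-style condition: the & keeps only bit 0 of the selected value,
       which is why the scalar fold needs the trailing & 1.  */
    int t = 1;
    printf ("%d\n", (t & c) | (!t & d));   /* 0, i.e. (t ? c : d) & 1 */

    /* 0/-1-style condition (vector masks, signed bools): the whole
       selected value survives, which is why the vector fold can drop
       the & 1.  */
    int s = -1;
    printf ("%d\n", (s & c) | (~s & d));   /* 6, i.e. s ? c : d */

    return 0;
  }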


>
> Thanks,
> Tamar
>
> >
> > Thanks,
> > Andrew Pinski
> >
> >
> > > +(simplify
> > > + (bit_ior
> > > +  (bit_and:c (vec_cond:s @0 @4 integer_zerop) @2)
> > > +  (bit_and:c (vec_cond:s @1 @4 integer_zerop) @3))
> > > +   (if (inverse_conditions_p (@0, @1))
> > > +    (switch
> > > +     (if (integer_onep (@4))
> > > +      (bit_and (vec_cond @0 @2 @3) @4))
> > > +     (if (integer_minus_onep (@4))
> > > +      (vec_cond @0 @2 @3)))))
> > > +
> > >  /* Transform X & -Y into X * Y when Y is { 0 or 1 }.  */  (simplify
> > >   (bit_and:c (convert? (negate zero_one_valued_p@0)) @1) diff --git
> > > a/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
> > > b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..05a1292fa90c70b14a7985122f
> > 43
> > > 711f55d047ea
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
> > > @@ -0,0 +1,16 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-additional-options "-O" } */
> > > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > > +} */
> > > +
> > > +/*
> > > +**zoo2:
> > > +**     cmp     w0, w1
> > > +**     csel    w0, w2, w3, lt
> > > +**     and     w0, w0, 1
> > > +**     ret
> > > +*/
> > > +int zoo2 (int a, int b, int c, int d) {
> > > +   return ((a < b) & c) | ((a >= b) & d); }
> > > +
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
> > > b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
> > > new file mode 100644
> > > index
> > >
> > 0000000000000000000000000000000000000000..34bc65f5db10eae81b8dee331
> > 6df
> > > b7d12bf471c8
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
> > > @@ -0,0 +1,32 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-additional-options "-O3 -std=c99" } */
> > > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > > +} */
> > > +
> > > +typedef int v4si __attribute__ ((vector_size (16)));
> > > +
> > > +/*
> > > +**foo:
> > > +**     cmgt    v0.4s, v1.4s, v0.4s
> > > +**     bsl     v0.16b, v2.16b, v3.16b
> > > +**     ret
> > > +*/
> > > +v4si foo (v4si a, v4si b, v4si c, v4si d) {
> > > +    return ((a < b) & c) | ((a >= b) & d); }
> > > +
> > > +
> > > +/**
> > > +**bar:
> > > +**...
> > > +**     cmge    v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> > > +**     bsl     v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > +**     and     v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > +**...
> > > +*/
> > > +void bar (int * restrict a, int * restrict b, int * restrict c,
> > > +         int * restrict d, int * restrict res, int n) {
> > > +  for (int i = 0; i < (n & -4); i++)
> > > +    res[i] = ((a[i] < b[i]) & c[i]) | ((a[i] >= b[i]) & d[i]); }
> > > +
  
Tamar Christina July 7, 2022, 7:48 a.m. UTC | #6
> -----Original Message-----
> From: Andrew Pinski <pinskia@gmail.com>
> Sent: Wednesday, July 6, 2022 8:37 PM
> To: Tamar Christina <Tamar.Christina@arm.com>
> Cc: Richard Biener <rguenther@suse.de>; nd <nd@arm.com>; gcc-
> patches@gcc.gnu.org
> Subject: Re: [PATCH]middle-end simplify complex if expressions where
> comparisons are inverse of one another.
> 
> On Wed, Jul 6, 2022 at 9:06 AM Tamar Christina <Tamar.Christina@arm.com>
> wrote:
> >
> > > -----Original Message-----
> > > From: Andrew Pinski <pinskia@gmail.com>
> > > Sent: Wednesday, July 6, 2022 3:10 AM
> > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > Cc: Richard Biener <rguenther@suse.de>; nd <nd@arm.com>; gcc-
> > > patches@gcc.gnu.org
> > > Subject: Re: [PATCH]middle-end simplify complex if expressions where
> > > comparisons are inverse of one another.
> > >
> > > On Tue, Jul 5, 2022 at 8:16 AM Tamar Christina via Gcc-patches <gcc-
> > > patches@gcc.gnu.org> wrote:
> > > >
> > > >
> > > >
> > > > > -----Original Message-----
> > > > > From: Richard Biener <rguenther@suse.de>
> > > > > Sent: Monday, June 20, 2022 9:57 AM
> > > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
> > > > > Subject: Re: [PATCH]middle-end simplify complex if expressions
> > > > > where comparisons are inverse of one another.
> > > > >
> > > > > On Thu, 16 Jun 2022, Tamar Christina wrote:
> > > > >
> > > > > > Hi All,
> > > > > >
> > > > > > This optimizes the following sequence
> > > > > >
> > > > > >   ((a < b) & c) | ((a >= b) & d)
> > > > > >
> > > > > > into
> > > > > >
> > > > > >   (a < b ? c : d) & 1
> > > > > >
> > > > > > for scalar. On vector we can omit the & 1.
> > > > > >
> > > > > > This changes the code generation from
> > > > > >
> > > > > > zoo2:
> > > > > >     cmp     w0, w1
> > > > > >     cset    w0, lt
> > > > > >     cset    w1, ge
> > > > > >     and     w0, w0, w2
> > > > > >     and     w1, w1, w3
> > > > > >     orr     w0, w0, w1
> > > > > >     ret
> > > > > >
> > > > > > into
> > > > > >
> > > > > >     cmp     w0, w1
> > > > > >     csel    w0, w2, w3, lt
> > > > > >     and     w0, w0, 1
> > > > > >     ret
> > > > > >
> > > > > > and significantly reduces the number of selects we have to do
> > > > > > in the vector code.
> > > > > >
> > > > > > Bootstrapped Regtested on aarch64-none-linux-gnu,
> > > > > > x86_64-pc-linux-gnu and no issues.
> > > > > >
> > > > > > Ok for master?
> > > > > >
> > > > > > Thanks,
> > > > > > Tamar
> > > > > >
> > > > > > gcc/ChangeLog:
> > > > > >
> > > > > >     * fold-const.cc (inverse_conditions_p): Traverse if SSA_NAME.
> > > > > >     * match.pd: Add new rule.
> > > > > >
> > > > > > gcc/testsuite/ChangeLog:
> > > > > >
> > > > > >     * gcc.target/aarch64/if-compare_1.c: New test.
> > > > > >     * gcc.target/aarch64/if-compare_2.c: New test.
> > > > > >
> > > > > > --- inline copy of patch --
> > > > > > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc index
> > > > > >
> > > > >
> > >
> 39a5a52958d87497f301826e706886b290771a2d..f180599b90150acd3ed895a64
> > > > > 280
> > > > > > aa3255061256 100644
> > > > > > --- a/gcc/fold-const.cc
> > > > > > +++ b/gcc/fold-const.cc
> > > > > > @@ -2833,15 +2833,38 @@ compcode_to_comparison (enum
> > > > > comparison_code
> > > > > > code)  bool  inverse_conditions_p (const_tree cond1,
> > > > > > const_tree
> > > > > > cond2) {
> > > > > > -  return (COMPARISON_CLASS_P (cond1)
> > > > > > -     && COMPARISON_CLASS_P (cond2)
> > > > > > -     && (invert_tree_comparison
> > > > > > -         (TREE_CODE (cond1),
> > > > > > -          HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE
> > > > > (cond2))
> > > > > > -     && operand_equal_p (TREE_OPERAND (cond1, 0),
> > > > > > -                         TREE_OPERAND (cond2, 0), 0)
> > > > > > -     && operand_equal_p (TREE_OPERAND (cond1, 1),
> > > > > > -                         TREE_OPERAND (cond2, 1), 0));
> > > > > > +  if (COMPARISON_CLASS_P (cond1)
> > > > > > +      && COMPARISON_CLASS_P (cond2)
> > > > > > +      && (invert_tree_comparison
> > > > > > +      (TREE_CODE (cond1),
> > > > > > +       HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE
> > > > > (cond2))
> > > > > > +      && operand_equal_p (TREE_OPERAND (cond1, 0),
> > > > > > +                     TREE_OPERAND (cond2, 0), 0)
> > > > > > +      && operand_equal_p (TREE_OPERAND (cond1, 1),
> > > > > > +                     TREE_OPERAND (cond2, 1), 0))
> > > > > > +    return true;
> > > > > > +
> > > > > > +  if (TREE_CODE (cond1) == SSA_NAME
> > > > > > +      && TREE_CODE (cond2) == SSA_NAME)
> > > > > > +    {
> > > > > > +      gimple *gcond1 = SSA_NAME_DEF_STMT (cond1);
> > > > > > +      gimple *gcond2 = SSA_NAME_DEF_STMT (cond2);
> > > > > > +      if (!is_gimple_assign (gcond1) || !is_gimple_assign (gcond2))
> > > > > > +   return false;
> > > > > > +
> > > > > > +      tree_code code1 = gimple_assign_rhs_code (gcond1);
> > > > > > +      tree_code code2 = gimple_assign_rhs_code (gcond2);
> > > > > > +      return TREE_CODE_CLASS (code1) == tcc_comparison
> > > > > > +        && TREE_CODE_CLASS (code2) == tcc_comparison
> > > > > > +        && invert_tree_comparison (code1,
> > > > > > +             HONOR_NANS (gimple_arg (gcond1, 0))) == code2
> > > > > > +        && operand_equal_p (gimple_arg (gcond1, 0),
> > > > > > +                            gimple_arg (gcond2, 0), 0)
> > > > > > +        && operand_equal_p (gimple_arg (gcond1, 1),
> > > > > > +                            gimple_arg (gcond2, 1), 0);
> > > > > > +    }
> > > > > > +
> > > > > > +  return false;
> > > > >
> > > > > if we do extend inverse_condition_p please add an overload like
> > > >
> > > > Done.
> > > >
> > > > >
> > > > > bool
> > > > > inverse_condition_p (enum tree_code, tree op00, tree op01,
> > > > >                      enum tree_code, tree op10, tree op11)
> > > > >
> > > > > so you can avoid some code duplication here.
> > > > >
> > > > > >  }
> > > > > >
> > > > > >  /* Return a tree for the comparison which is the combination
> > > > > > of diff --git a/gcc/match.pd b/gcc/match.pd index
> > > > > >
> > > > >
> > >
> 6d691d302b339c0e4556b40af158b5208c12d08f..bad49dd348add751d9ec1e30
> > > > > 23e3
> > > > > > 4d9ac123194f 100644
> > > > > > --- a/gcc/match.pd
> > > > > > +++ b/gcc/match.pd
> > > > > > @@ -1160,6 +1160,32 @@ DEFINE_INT_AND_FLOAT_ROUND_FN
> > > (RINT)
> > > > > >       (convert (bit_and (negate (convert:utype { pmop[0]; }))
> > > > > >                    (convert:utype @1)))))))
> > > > > >
> > > > > > +/* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.
> > > > > > +*/ (simplify  (bit_ior
> > > > > > +  (bit_and:c (convert? @0) @2)
> > > > > > +  (bit_and:c (convert? @1) @3))
> > > > >
> > > > > in case the comparison returns a signed bool this might turn out
> wrong.
> > > > > Maybe simply use zero_one_valued_p@0 here instead of (convert?
> > > @0)?
> > > >
> > > > I think I still need the convert as the comparison gets promoted
> > > > to int and the predicate doesn't handle extensions.
> > > >
> > > > So I left the convert but added the zero_one_valued_p@0 such that
> > > > it's checking that the result of the comparison itself is at least 0 or 1.
> > > >
> > > > >
> > > > > > +   (if (inverse_conditions_p (@0, @1)
> > > > > > +   /* The scalar version has to be canonicalized after vectorization
> > > > > > +      because it makes unconditional loads conditional ones, which
> > > > > > +      means we lose vectorization because the loads may trap.  */
> > > > > > +   && canonicalize_math_after_vectorization_p ())
> > > > > > +    (bit_and (cond @0 @2 @3) { build_each_one_cst (type);
> > > > > > + })))
> > > > >
> > > > > I think you should restrict this to INTEGRAL_TYPE_P and use
> > > > > build_one_cst
> > > > > (type) (also see below).
> > > > >
> > > > > you can do inverse_onditions_p with lock-step for over
> > > > > tcc_comparison and inverted_tcc_comparison{,_with_nans} (see
> > > > > existing
> > > examples).
> > > >
> > > > I couldn't figure out how to combine this approach with the
> > > > zero_one_valued_p predicate. The zero_one_valued_p would need to
> > > > be
> > > on
> > > > (cmp @0 @1) but don't think that is allowed.
> > > >
> > > > >
> > > > > > +(simplify
> > > > > > + (bit_ior
> > > > > > +  (bit_and:c (convert? (vec_cond:s @0 @4 integer_zerop)) @2)
> > > > > > +  (bit_and:c (convert? (vec_cond:s @1 @4 integer_zerop)) @3))
> > > > > > +   (if (inverse_conditions_p (@0, @1)
> > > > > > +   && integer_onep (@4))
> > > > > > +    (bit_and (vec_cond @0 @2 @3) @4)))
> > > > > > +/* Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d.
> > > > > > +*/ (simplify  (bit_ior
> > > > > > +  (bit_and:c (convert? (vec_cond:s @0 integer_minus_onep
> > > > > > +integer_zerop)) @2)
> > > > > > +  (bit_and:c (convert? (vec_cond:s @1 integer_minus_onep
> > > > > integer_zerop)) @3))
> > > > > > +   (if (inverse_conditions_p (@0, @1))
> > > > > > +    (vec_cond @0 @2 @3)))
> > > > >
> > > > > I think the duplication is pre-mature optimization - we should
> > > > > get the (bit_and (..) integer_minus_onep) simplified.  Also
> > > > > didn't we have (vec_cond
> > > > > @0 -1 0) -> (view_convert @0) when the modes match?
> > > >
> > > > This wasn't in my tree at the time, I could use this
> > > > representation instead but it wouldn't shorten the match tree.
> > > > Combining them as you suggested below seems most optimal.
> > > >
> > > > > We might want to add (match zero_minus_one_valued_p) or use
> > > > > truth_valued_p (with appropriate vector semantics, plus extend it).
> > > > > Why do you need (convert? ...) for the vector case btw?
> > > > >
> > > >
> > > > Since we have to defer the scalar version then the vectorizer
> > > > won't
> > > >
> > > > > I guess the integral type and vector type cases are similar
> > > > > enough that the patterns can be merged with conditionalizing
> > > > > only the
> > > replacement.
> > > > >
> > > >
> > > > Merged.
> > > >
> > > >
> > > > Bootstrapped Regtested on aarch64-none-linux-gnu,
> > > > x86_64-pc-linux-gnu and no issues.
> > > >
> > > > Ok for master?
> > > >
> > > > Thanks,
> > > > Tamar
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > >         * fold-const.cc (inverse_conditions_p): Traverse if SSA_NAME.
> > > >         (inverse_conditions_p_1): New.
> > > >         * match.pd: Add new rule.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > >         * gcc.target/aarch64/if-compare_1.c: New test.
> > > >         * gcc.target/aarch64/if-compare_2.c: New test.
> > > >
> > > > --- inline copy of patch ---
> > > >
> > > > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc index
> > > >
> > >
> 99021a82df4977b179b45db04e3083012c63067a..0628ffd395416d74bc031e3e4
> > > ac8
> > > > 246894ca564d 100644
> > > > --- a/gcc/fold-const.cc
> > > > +++ b/gcc/fold-const.cc
> > > > @@ -2829,20 +2829,55 @@ compcode_to_comparison (enum
> > > comparison_code code)
> > > >      }
> > > >  }
> > > >
> > > > +
> > > > +/* Helper of inverse_condition_p.  Returns TRUE if CODE1 is the
> inverse of
> > > > +   CODE2 and OP00 == OP10 and OP01 == OP11.  */
> > > > +
> > > > +static bool
> > > > +inverse_conditions_p_1 (enum tree_code code1, tree op00, tree
> op01,
> > > > +                       enum tree_code code2, tree op10, tree
> > > > +op11) {
> > > > +  return (invert_tree_comparison (code1, HONOR_NANS (op00)) ==
> > > code2)
> > > > +      && operand_equal_p (op00, op10, 0)
> > > > +      && operand_equal_p (op01, op11, 0); }
> > > > +
> > > >  /* Return true if COND1 tests the opposite condition of COND2.
> > > > */
> > > >
> > > >  bool
> > > >  inverse_conditions_p (const_tree cond1, const_tree cond2)  {
> > > > -  return (COMPARISON_CLASS_P (cond1)
> > > > -         && COMPARISON_CLASS_P (cond2)
> > > > -         && (invert_tree_comparison
> > > > -             (TREE_CODE (cond1),
> > > > -              HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE
> > > (cond2))
> > > > -         && operand_equal_p (TREE_OPERAND (cond1, 0),
> > > > -                             TREE_OPERAND (cond2, 0), 0)
> > > > -         && operand_equal_p (TREE_OPERAND (cond1, 1),
> > > > -                             TREE_OPERAND (cond2, 1), 0));
> > > > +  if (COMPARISON_CLASS_P (cond1)
> > > > +      && COMPARISON_CLASS_P (cond2)
> > > > +      && inverse_conditions_p_1 (TREE_CODE (cond1),
> > > > +                                TREE_OPERAND (cond1, 0),
> > > > +                                TREE_OPERAND (cond1, 1),
> > > > +                                TREE_CODE (cond2),
> > > > +                                TREE_OPERAND (cond2, 0),
> > > > +                                TREE_OPERAND (cond2, 1)))
> > > > +    return true;
> > > > +
> > > > +  if (TREE_CODE (cond1) == SSA_NAME
> > > > +      && TREE_CODE (cond2) == SSA_NAME)
> > > > +    {
> > > > +      gimple *gcond1 = SSA_NAME_DEF_STMT (cond1);
> > > > +      gimple *gcond2 = SSA_NAME_DEF_STMT (cond2);
> > > > +      if (!is_gimple_assign (gcond1) || !is_gimple_assign (gcond2))
> > > > +       return false;
> > > > +
> > > > +      tree_code code1 = gimple_assign_rhs_code (gcond1);
> > > > +      tree_code code2 = gimple_assign_rhs_code (gcond2);
> > > > +      return TREE_CODE_CLASS (code1) == tcc_comparison
> > > > +            && TREE_CODE_CLASS (code2) == tcc_comparison
> > > > +            && inverse_conditions_p_1 (code1,
> > > > +                                       gimple_arg (gcond1, 0),
> > > > +                                       gimple_arg (gcond1, 1),
> > > > +                                       code2,
> > > > +                                       gimple_arg (gcond2, 0),
> > > > +                                       gimple_arg (gcond2, 1));
> > > > +    }
> > > > +
> > > > +  return false;
> > > >  }
> > > >
> > > >  /* Return a tree for the comparison which is the combination of
> > > > diff --git a/gcc/match.pd b/gcc/match.pd index
> > > >
> > >
> 24cbbbb5bc1d718bd03af7712fc7255213f2a742..0a459f2804e296f4336aee70e3
> > > 51
> > > > ddc45486c867 100644
> > > > --- a/gcc/match.pd
> > > > +++ b/gcc/match.pd
> > > > @@ -1872,6 +1872,30 @@ DEFINE_INT_AND_FLOAT_ROUND_FN
> (RINT)
> > > >   (if (INTEGRAL_TYPE_P (type))
> > > >    (bit_and @0 @1)))
> > > >
> > > > +/* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.
> > > > +   and vector Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d.  */
> > > > +(simplify
> > > > + (bit_ior
> > > > +  (bit_and:c (convert? zero_one_valued_p@0) @2)
> > > > +  (bit_and:c (convert? zero_one_valued_p@1) @3))
> > > > +   (if (INTEGRAL_TYPE_P (type)
> > > > +       && inverse_conditions_p (@0, @1)
> > > > +       /* The scalar version has to be canonicalized after vectorization
> > > > +          because it makes unconditional loads conditional ones, which
> > > > +          means we lose vectorization because the loads may trap.  */
> > > > +       && canonicalize_math_after_vectorization_p ())
> > > > +    (bit_and (cond @0 @2 @3) { build_one_cst (type); })))
> > >
> > > I Know this is not part of the current pattern but could you add:
> > > (((a < b) & c) | ((-(a >= b)) & d)) -> a < b ? c : d
> > >
> > > Not your fault but there are now like two different predicates for a
> > > boolean like operand.
> > > zero_one_valued_p and truth_valued_p and a third way to describe it
> > > is to use SSA_NAME and check ssa_name_has_boolean_range.
> > >
> > > Also why can't you just do:
> > > Something similar like:
> > > + /* Similar but for comparisons which have been inverted already.
> > > +    Note it is hard to simulate an inverted tcc_comparison due to NaNs,
> > > +    so a double for loop is needed and then the inverse code has to be
> > > +    compared with the result of invert_tree_comparison.  */
> > > + (for cmp (tcc_comparison)
> > > +  (for icmp (tcc_comparison)
> > > +   (simplify
> > > +    (bitop:c (rbitop:c (icmp @0 @1) @2) (cmp@3 @0 @1))
> > > +     (with { enum tree_code ic = invert_tree_comparison
> > > +             (cmp, HONOR_NANS (@0)); }
> > > +      (if (ic == icmp)
> > > +       (bitop @3 @2)))))))
> > >
> > > Where you match everything in match and simplify instead of needing
> > > to use inverse_conditions_p?
> >
> > As I mentioned above in the reply to Richi, this is because I can't
> > apply the predicate zero_one_valued_p to the comparison result if I
> decompose the comparison.
> > zero_one_valued_p@(cmp @0 @1) and other variants I've tried do not
> compile.
> >
> > Is there a way I can do so?
> 
> 
> Thinking about it maybe adding a check for non vector type or just a boolean
> type, comparisons will always have either a bool or an vector type.
> Note you might need to check for signed vs unsigned bool's because of the
> convert (and yes some languages have signed bools, Ada).

Yes, that's why Richi requested the use of zero_one_valued_p: all we care
about is that the comparison result is 0 or 1, since it's going to be
masked with 1.

> That is something like (formating needs to be fixed):
> (for cmp (tcc_comparison)
>   (for icmp (tcc_comparison)
>    (simplify
>  (bit_ior
>   (bit_and:c (convert? (cmp:c@0 @4 @5)) @2)
>   (bit_and:c (convert? (icmp@1 @4 @5)) @3))
>    (if (TREE_CODE (TREE_TYPE (@0)) == BOOLEAN_TYPE
>         && TYPE_SIGN (TREE_TYPE (@0)) == UNSIGNED
>         && TREE_CODE (TREE_TYPE (@1)) == BOOLEAN_TYPE
>         && TYPE_SIGN (TREE_TYPE (@1)) == UNSIGNED
>        /* The scalar version has to be canonicalized after vectorization
>           because it makes unconditional loads conditional ones, which
>           means we lose vectorization because the loads may trap.  */
>        && canonicalize_math_after_vectorization_p ())
>      (with { enum tree_code ic = invert_tree_comparison
>              (cmp, HONOR_NANS (@0)); }
>      (if (ic == icmp)
>     (bit_and (cond @0 @2 @3) { build_each_one_cst (type); })))
> 

zero_one_valued_p is more general as it doesn't limit the operation to just
Booleans.  So I'd like to hear what Richi thinks before not using it.
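
For a concrete sense of what "more general" buys (illustrative only, and my
reading of the predicate, so treat the details as approximate):
zero_one_valued_p keys off the value range rather than the type, so
int-typed operands qualify as well, e.g.

  int
  examples (int x, int a, int b)
  {
    int t = x & 1;     /* INTEGER_TYPE, but the only possible values are 0 and 1.  */
    int u = (a < b);   /* comparison promoted to int: likewise only 0 or 1.  */
    return t + u;
  }

Neither t nor u ever has BOOLEAN_TYPE, so a check based purely on
TREE_CODE (TREE_TYPE (...)) == BOOLEAN_TYPE would not accept them.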

Thanks,
Tamar

> 
> >
> > Thanks,
> > Tamar
> >
> > >
> > > Thanks,
> > > Andrew Pinski
> > >
> > >
> > > > +(simplify
> > > > + (bit_ior
> > > > +  (bit_and:c (vec_cond:s @0 @4 integer_zerop) @2)
> > > > +  (bit_and:c (vec_cond:s @1 @4 integer_zerop) @3))
> > > > +   (if (inverse_conditions_p (@0, @1))
> > > > +    (switch
> > > > +     (if (integer_onep (@4))
> > > > +      (bit_and (vec_cond @0 @2 @3) @4))
> > > > +     (if (integer_minus_onep (@4))
> > > > +      (vec_cond @0 @2 @3)))))
> > > > +
> > > >  /* Transform X & -Y into X * Y when Y is { 0 or 1 }.  */  (simplify
> > > >   (bit_and:c (convert? (negate zero_one_valued_p@0)) @1) diff
> > > > --git a/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
> > > > b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
> > > > new file mode 100644
> > > > index
> > > >
> > >
> 0000000000000000000000000000000000000000..05a1292fa90c70b14a7985122f
> > > 43
> > > > 711f55d047ea
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
> > > > @@ -0,0 +1,16 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-additional-options "-O" } */
> > > > +/* { dg-final { check-function-bodies "**" "" "" { target { le }
> > > > +} } } */
> > > > +
> > > > +/*
> > > > +**zoo2:
> > > > +**     cmp     w0, w1
> > > > +**     csel    w0, w2, w3, lt
> > > > +**     and     w0, w0, 1
> > > > +**     ret
> > > > +*/
> > > > +int zoo2 (int a, int b, int c, int d) {
> > > > +   return ((a < b) & c) | ((a >= b) & d); }
> > > > +
> > > > diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
> > > > b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
> > > > new file mode 100644
> > > > index
> > > >
> > >
> 0000000000000000000000000000000000000000..34bc65f5db10eae81b8dee331
> > > 6df
> > > > b7d12bf471c8
> > > > --- /dev/null
> > > > +++ b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
> > > > @@ -0,0 +1,32 @@
> > > > +/* { dg-do compile } */
> > > > +/* { dg-additional-options "-O3 -std=c99" } */
> > > > +/* { dg-final { check-function-bodies "**" "" "" { target { le }
> > > > +} } } */
> > > > +
> > > > +typedef int v4si __attribute__ ((vector_size (16)));
> > > > +
> > > > +/*
> > > > +**foo:
> > > > +**     cmgt    v0.4s, v1.4s, v0.4s
> > > > +**     bsl     v0.16b, v2.16b, v3.16b
> > > > +**     ret
> > > > +*/
> > > > +v4si foo (v4si a, v4si b, v4si c, v4si d) {
> > > > +    return ((a < b) & c) | ((a >= b) & d); }
> > > > +
> > > > +
> > > > +/**
> > > > +**bar:
> > > > +**...
> > > > +**     cmge    v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> > > > +**     bsl     v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > > +**     and     v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > > +**...
> > > > +*/
> > > > +void bar (int * restrict a, int * restrict b, int * restrict c,
> > > > +         int * restrict d, int * restrict res, int n) {
> > > > +  for (int i = 0; i < (n & -4); i++)
> > > > +    res[i] = ((a[i] < b[i]) & c[i]) | ((a[i] >= b[i]) & d[i]); }
> > > > +
  
Jeff Law July 7, 2022, 4:07 p.m. UTC | #7
On 7/5/2022 8:09 PM, Andrew Pinski via Gcc-patches wrote:
>
> Not your fault but there are now like two different predicates for a
> boolean like operand.
> zero_one_valued_p and truth_valued_p and a third way to describe it is
> to use SSA_NAME and check ssa_name_has_boolean_range.
The latter is meant to catch cases where analysis indicates that a given 
SSA_NAME only takes on the values 0 or 1, regardless of the actual size 
of the SSA_NAME.
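
(To illustrate the kind of case that covers, not taken from any testcase: in

  int
  g (unsigned int x)
  {
    int t = x >> 31;   /* assuming 32-bit unsigned int: only ever 0 or 1 */
    return t;
  }

t is a full-width INTEGER_TYPE, yet nonzero-bits/range analysis can prove it
only takes the values 0 or 1, which is exactly the situation a check like
ssa_name_has_boolean_range is meant to recognize.)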

It pre-dates having reasonable range information available in DOM, and
from reviewing the existing uses in DOM, I would expect Ranger to make
most, if not all, of this code useless.

jeff
  
Richard Biener July 8, 2022, 11:03 a.m. UTC | #8
On Thu, 7 Jul 2022, Tamar Christina wrote:

> > -----Original Message-----
> > From: Andrew Pinski <pinskia@gmail.com>
> > Sent: Wednesday, July 6, 2022 8:37 PM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: Richard Biener <rguenther@suse.de>; nd <nd@arm.com>; gcc-
> > patches@gcc.gnu.org
> > Subject: Re: [PATCH]middle-end simplify complex if expressions where
> > comparisons are inverse of one another.
> > 
> > On Wed, Jul 6, 2022 at 9:06 AM Tamar Christina <Tamar.Christina@arm.com>
> > wrote:
> > >
> > > > -----Original Message-----
> > > > From: Andrew Pinski <pinskia@gmail.com>
> > > > Sent: Wednesday, July 6, 2022 3:10 AM
> > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > Cc: Richard Biener <rguenther@suse.de>; nd <nd@arm.com>; gcc-
> > > > patches@gcc.gnu.org
> > > > Subject: Re: [PATCH]middle-end simplify complex if expressions where
> > > > comparisons are inverse of one another.
> > > >
> > > > On Tue, Jul 5, 2022 at 8:16 AM Tamar Christina via Gcc-patches <gcc-
> > > > patches@gcc.gnu.org> wrote:
> > > > >
> > > > >
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: Richard Biener <rguenther@suse.de>
> > > > > > Sent: Monday, June 20, 2022 9:57 AM
> > > > > > To: Tamar Christina <Tamar.Christina@arm.com>
> > > > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>
> > > > > > Subject: Re: [PATCH]middle-end simplify complex if expressions
> > > > > > where comparisons are inverse of one another.
> > > > > >
> > > > > > On Thu, 16 Jun 2022, Tamar Christina wrote:
> > > > > >
> > > > > > > Hi All,
> > > > > > >
> > > > > > > This optimizes the following sequence
> > > > > > >
> > > > > > >   ((a < b) & c) | ((a >= b) & d)
> > > > > > >
> > > > > > > into
> > > > > > >
> > > > > > >   (a < b ? c : d) & 1
> > > > > > >
> > > > > > > for scalar. On vector we can omit the & 1.
> > > > > > >
> > > > > > > This changes the code generation from
> > > > > > >
> > > > > > > zoo2:
> > > > > > >     cmp     w0, w1
> > > > > > >     cset    w0, lt
> > > > > > >     cset    w1, ge
> > > > > > >     and     w0, w0, w2
> > > > > > >     and     w1, w1, w3
> > > > > > >     orr     w0, w0, w1
> > > > > > >     ret
> > > > > > >
> > > > > > > into
> > > > > > >
> > > > > > >     cmp     w0, w1
> > > > > > >     csel    w0, w2, w3, lt
> > > > > > >     and     w0, w0, 1
> > > > > > >     ret
> > > > > > >
> > > > > > > and significantly reduces the number of selects we have to do
> > > > > > > in the vector code.
> > > > > > >
> > > > > > > Bootstrapped Regtested on aarch64-none-linux-gnu,
> > > > > > > x86_64-pc-linux-gnu and no issues.
> > > > > > >
> > > > > > > Ok for master?
> > > > > > >
> > > > > > > Thanks,
> > > > > > > Tamar
> > > > > > >
> > > > > > > gcc/ChangeLog:
> > > > > > >
> > > > > > >     * fold-const.cc (inverse_conditions_p): Traverse if SSA_NAME.
> > > > > > >     * match.pd: Add new rule.
> > > > > > >
> > > > > > > gcc/testsuite/ChangeLog:
> > > > > > >
> > > > > > >     * gcc.target/aarch64/if-compare_1.c: New test.
> > > > > > >     * gcc.target/aarch64/if-compare_2.c: New test.
> > > > > > >
> > > > > > > --- inline copy of patch --
> > > > > > > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc index
> > > > > > >
> > > > > >
> > > >
> > 39a5a52958d87497f301826e706886b290771a2d..f180599b90150acd3ed895a64
> > > > > > 280
> > > > > > > aa3255061256 100644
> > > > > > > --- a/gcc/fold-const.cc
> > > > > > > +++ b/gcc/fold-const.cc
> > > > > > > @@ -2833,15 +2833,38 @@ compcode_to_comparison (enum
> > > > > > comparison_code
> > > > > > > code)  bool  inverse_conditions_p (const_tree cond1,
> > > > > > > const_tree
> > > > > > > cond2) {
> > > > > > > -  return (COMPARISON_CLASS_P (cond1)
> > > > > > > -     && COMPARISON_CLASS_P (cond2)
> > > > > > > -     && (invert_tree_comparison
> > > > > > > -         (TREE_CODE (cond1),
> > > > > > > -          HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE
> > > > > > (cond2))
> > > > > > > -     && operand_equal_p (TREE_OPERAND (cond1, 0),
> > > > > > > -                         TREE_OPERAND (cond2, 0), 0)
> > > > > > > -     && operand_equal_p (TREE_OPERAND (cond1, 1),
> > > > > > > -                         TREE_OPERAND (cond2, 1), 0));
> > > > > > > +  if (COMPARISON_CLASS_P (cond1)
> > > > > > > +      && COMPARISON_CLASS_P (cond2)
> > > > > > > +      && (invert_tree_comparison
> > > > > > > +      (TREE_CODE (cond1),
> > > > > > > +       HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE
> > > > > > (cond2))
> > > > > > > +      && operand_equal_p (TREE_OPERAND (cond1, 0),
> > > > > > > +                     TREE_OPERAND (cond2, 0), 0)
> > > > > > > +      && operand_equal_p (TREE_OPERAND (cond1, 1),
> > > > > > > +                     TREE_OPERAND (cond2, 1), 0))
> > > > > > > +    return true;
> > > > > > > +
> > > > > > > +  if (TREE_CODE (cond1) == SSA_NAME
> > > > > > > +      && TREE_CODE (cond2) == SSA_NAME)
> > > > > > > +    {
> > > > > > > +      gimple *gcond1 = SSA_NAME_DEF_STMT (cond1);
> > > > > > > +      gimple *gcond2 = SSA_NAME_DEF_STMT (cond2);
> > > > > > > +      if (!is_gimple_assign (gcond1) || !is_gimple_assign (gcond2))
> > > > > > > +   return false;
> > > > > > > +
> > > > > > > +      tree_code code1 = gimple_assign_rhs_code (gcond1);
> > > > > > > +      tree_code code2 = gimple_assign_rhs_code (gcond2);
> > > > > > > +      return TREE_CODE_CLASS (code1) == tcc_comparison
> > > > > > > +        && TREE_CODE_CLASS (code2) == tcc_comparison
> > > > > > > +        && invert_tree_comparison (code1,
> > > > > > > +             HONOR_NANS (gimple_arg (gcond1, 0))) == code2
> > > > > > > +        && operand_equal_p (gimple_arg (gcond1, 0),
> > > > > > > +                            gimple_arg (gcond2, 0), 0)
> > > > > > > +        && operand_equal_p (gimple_arg (gcond1, 1),
> > > > > > > +                            gimple_arg (gcond2, 1), 0);
> > > > > > > +    }
> > > > > > > +
> > > > > > > +  return false;
> > > > > >
> > > > > > if we do extend inverse_condition_p please add an overload like
> > > > >
> > > > > Done.
> > > > >
> > > > > >
> > > > > > bool
> > > > > > inverse_condition_p (enum tree_code, tree op00, tree op01,
> > > > > >                      enum tree_code, tree op10, tree op11)
> > > > > >
> > > > > > so you can avoid some code duplication here.
> > > > > >
> > > > > > >  }
> > > > > > >
> > > > > > >  /* Return a tree for the comparison which is the combination
> > > > > > > of diff --git a/gcc/match.pd b/gcc/match.pd index
> > > > > > >
> > > > > >
> > > >
> > 6d691d302b339c0e4556b40af158b5208c12d08f..bad49dd348add751d9ec1e30
> > > > > > 23e3
> > > > > > > 4d9ac123194f 100644
> > > > > > > --- a/gcc/match.pd
> > > > > > > +++ b/gcc/match.pd
> > > > > > > @@ -1160,6 +1160,32 @@ DEFINE_INT_AND_FLOAT_ROUND_FN
> > > > (RINT)
> > > > > > >       (convert (bit_and (negate (convert:utype { pmop[0]; }))
> > > > > > >                    (convert:utype @1)))))))
> > > > > > >
> > > > > > > +/* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.
> > > > > > > +*/ (simplify  (bit_ior
> > > > > > > +  (bit_and:c (convert? @0) @2)
> > > > > > > +  (bit_and:c (convert? @1) @3))
> > > > > >
> > > > > > in case the comparison returns a signed bool this might turn out
> > wrong.
> > > > > > Maybe simply use zero_one_valued_p@0 here instead of (convert?
> > > > @0)?
> > > > >
> > > > > I think I still need the convert as the comparison gets promoted
> > > > > to int and the predicate doesn't handle extensions.
> > > > >
> > > > > So I left the convert but added the zero_one_valued_p@0 such that
> > > > > it's checking that the result of the comparison itself is at least 0 or 1.
> > > > >
> > > > > >
> > > > > > > +   (if (inverse_conditions_p (@0, @1)
> > > > > > > +   /* The scalar version has to be canonicalized after vectorization
> > > > > > > +      because it makes unconditional loads conditional ones, which
> > > > > > > +      means we lose vectorization because the loads may trap.  */
> > > > > > > +   && canonicalize_math_after_vectorization_p ())
> > > > > > > +    (bit_and (cond @0 @2 @3) { build_each_one_cst (type);
> > > > > > > + })))
> > > > > >
> > > > > > I think you should restrict this to INTEGRAL_TYPE_P and use
> > > > > > build_one_cst
> > > > > > (type) (also see below).
> > > > > >
> > > > > > you can do inverse_onditions_p with lock-step for over
> > > > > > tcc_comparison and inverted_tcc_comparison{,_with_nans} (see
> > > > > > existing
> > > > examples).
> > > > >
> > > > > I couldn't figure out how to combine this approach with the
> > > > > zero_one_valued_p predicate. The zero_one_valued_p would need to
> > > > > be
> > > > on
> > > > > (cmp @0 @1) but don't think that is allowed.
> > > > >
> > > > > >
> > > > > > > +(simplify
> > > > > > > + (bit_ior
> > > > > > > +  (bit_and:c (convert? (vec_cond:s @0 @4 integer_zerop)) @2)
> > > > > > > +  (bit_and:c (convert? (vec_cond:s @1 @4 integer_zerop)) @3))
> > > > > > > +   (if (inverse_conditions_p (@0, @1)
> > > > > > > +   && integer_onep (@4))
> > > > > > > +    (bit_and (vec_cond @0 @2 @3) @4)))
> > > > > > > +/* Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d.
> > > > > > > +*/ (simplify  (bit_ior
> > > > > > > +  (bit_and:c (convert? (vec_cond:s @0 integer_minus_onep
> > > > > > > +integer_zerop)) @2)
> > > > > > > +  (bit_and:c (convert? (vec_cond:s @1 integer_minus_onep
> > > > > > integer_zerop)) @3))
> > > > > > > +   (if (inverse_conditions_p (@0, @1))
> > > > > > > +    (vec_cond @0 @2 @3)))
> > > > > >
> > > > > > I think the duplication is pre-mature optimization - we should
> > > > > > get the (bit_and (..) integer_minus_onep) simplified.  Also
> > > > > > didn't we have (vec_cond
> > > > > > @0 -1 0) -> (view_convert @0) when the modes match?
> > > > >
> > > > > This wasn't in my tree at the time, I could use this
> > > > > representation instead but it wouldn't shorten the match tree.
> > > > > Combining them as you suggested below seems most optimal.
> > > > >
> > > > > > We might want to add (match zero_minus_one_valued_p) or use
> > > > > > truth_valued_p (with appropriate vector semantics, plus extend it).
> > > > > > Why do you need (convert? ...) for the vector case btw?
> > > > > >
> > > > >
> > > > > Since we have to defer the scalar version then the vectorizer
> > > > > won't
> > > > >
> > > > > > I guess the integral type and vector type cases are similar
> > > > > > enough that the patterns can be merged with conditionalizing
> > > > > > only the
> > > > replacement.
> > > > > >
> > > > >
> > > > > Merged.
> > > > >
> > > > >
> > > > > Bootstrapped Regtested on aarch64-none-linux-gnu,
> > > > > x86_64-pc-linux-gnu and no issues.
> > > > >
> > > > > Ok for master?
> > > > >
> > > > > Thanks,
> > > > > Tamar
> > > > >
> > > > > gcc/ChangeLog:
> > > > >
> > > > >         * fold-const.cc (inverse_conditions_p): Traverse if SSA_NAME.
> > > > >         (inverse_conditions_p_1): New.
> > > > >         * match.pd: Add new rule.
> > > > >
> > > > > gcc/testsuite/ChangeLog:
> > > > >
> > > > >         * gcc.target/aarch64/if-compare_1.c: New test.
> > > > >         * gcc.target/aarch64/if-compare_2.c: New test.
> > > > >
> > > > > --- inline copy of patch ---
> > > > >
> > > > > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc index
> > > > >
> > > >
> > 99021a82df4977b179b45db04e3083012c63067a..0628ffd395416d74bc031e3e4
> > > > ac8
> > > > > 246894ca564d 100644
> > > > > --- a/gcc/fold-const.cc
> > > > > +++ b/gcc/fold-const.cc
> > > > > @@ -2829,20 +2829,55 @@ compcode_to_comparison (enum
> > > > comparison_code code)
> > > > >      }
> > > > >  }
> > > > >
> > > > > +
> > > > > +/* Helper of inverse_condition_p.  Returns TRUE if CODE1 is the
> > inverse of
> > > > > +   CODE2 and OP00 == OP10 and OP01 == OP11.  */
> > > > > +
> > > > > +static bool
> > > > > +inverse_conditions_p_1 (enum tree_code code1, tree op00, tree
> > op01,
> > > > > +                       enum tree_code code2, tree op10, tree
> > > > > +op11) {
> > > > > +  return (invert_tree_comparison (code1, HONOR_NANS (op00)) ==
> > > > code2)
> > > > > +      && operand_equal_p (op00, op10, 0)
> > > > > +      && operand_equal_p (op01, op11, 0); }
> > > > > +
> > > > >  /* Return true if COND1 tests the opposite condition of COND2.
> > > > > */
> > > > >
> > > > >  bool
> > > > >  inverse_conditions_p (const_tree cond1, const_tree cond2)  {
> > > > > -  return (COMPARISON_CLASS_P (cond1)
> > > > > -         && COMPARISON_CLASS_P (cond2)
> > > > > -         && (invert_tree_comparison
> > > > > -             (TREE_CODE (cond1),
> > > > > -              HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE
> > > > (cond2))
> > > > > -         && operand_equal_p (TREE_OPERAND (cond1, 0),
> > > > > -                             TREE_OPERAND (cond2, 0), 0)
> > > > > -         && operand_equal_p (TREE_OPERAND (cond1, 1),
> > > > > -                             TREE_OPERAND (cond2, 1), 0));
> > > > > +  if (COMPARISON_CLASS_P (cond1)
> > > > > +      && COMPARISON_CLASS_P (cond2)
> > > > > +      && inverse_conditions_p_1 (TREE_CODE (cond1),
> > > > > +                                TREE_OPERAND (cond1, 0),
> > > > > +                                TREE_OPERAND (cond1, 1),
> > > > > +                                TREE_CODE (cond2),
> > > > > +                                TREE_OPERAND (cond2, 0),
> > > > > +                                TREE_OPERAND (cond2, 1)))
> > > > > +    return true;
> > > > > +
> > > > > +  if (TREE_CODE (cond1) == SSA_NAME
> > > > > +      && TREE_CODE (cond2) == SSA_NAME)
> > > > > +    {
> > > > > +      gimple *gcond1 = SSA_NAME_DEF_STMT (cond1);
> > > > > +      gimple *gcond2 = SSA_NAME_DEF_STMT (cond2);
> > > > > +      if (!is_gimple_assign (gcond1) || !is_gimple_assign (gcond2))
> > > > > +       return false;
> > > > > +
> > > > > +      tree_code code1 = gimple_assign_rhs_code (gcond1);
> > > > > +      tree_code code2 = gimple_assign_rhs_code (gcond2);
> > > > > +      return TREE_CODE_CLASS (code1) == tcc_comparison
> > > > > +            && TREE_CODE_CLASS (code2) == tcc_comparison
> > > > > +            && inverse_conditions_p_1 (code1,
> > > > > +                                       gimple_arg (gcond1, 0),
> > > > > +                                       gimple_arg (gcond1, 1),
> > > > > +                                       code2,
> > > > > +                                       gimple_arg (gcond2, 0),
> > > > > +                                       gimple_arg (gcond2, 1));
> > > > > +    }
> > > > > +
> > > > > +  return false;
> > > > >  }
> > > > >
> > > > >  /* Return a tree for the comparison which is the combination of
> > > > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > > > index 24cbbbb5bc1d718bd03af7712fc7255213f2a742..0a459f2804e296f4336aee70e351ddc45486c867 100644
> > > > > --- a/gcc/match.pd
> > > > > +++ b/gcc/match.pd
> > > > > @@ -1872,6 +1872,30 @@ DEFINE_INT_AND_FLOAT_ROUND_FN
> > (RINT)
> > > > >   (if (INTEGRAL_TYPE_P (type))
> > > > >    (bit_and @0 @1)))
> > > > >
> > > > > +/* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.
> > > > > > +   and vector Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d. */
> > > > > > +(simplify
> > > > > > + (bit_ior
> > > > > > +  (bit_and:c (convert? zero_one_valued_p@0) @2)
> > > > > > +  (bit_and:c (convert? zero_one_valued_p@1) @3))
> > > > > +   (if (INTEGRAL_TYPE_P (type)
> > > > > +       && inverse_conditions_p (@0, @1)
> > > > > +       /* The scalar version has to be canonicalized after vectorization
> > > > > +          because it makes unconditional loads conditional ones, which
> > > > > +          means we lose vectorization because the loads may trap.  */
> > > > > +       && canonicalize_math_after_vectorization_p ())
> > > > > +    (bit_and (cond @0 @2 @3) { build_one_cst (type); })))
> > > >
> > > > I Know this is not part of the current pattern but could you add:
> > > > (((a < b) & c) | ((-(a >= b)) & d)) -> a < b ? c : d
> > > >
> > > > Not your fault, but there are now two different predicates for a
> > > > boolean-like operand, zero_one_valued_p and truth_valued_p, and a third
> > > > way to describe it is to use SSA_NAME and check
> > > > ssa_name_has_boolean_range.
> > > >
> > > > Also why can't you just do:
> > > > Something similar like:
> > > > + /* Similar but for comparisons which have been inverted already,
> > > > +    Note it is hard to simulate inverted tcc_comparison due to NaNs,
> > > > +    so a double for loop is needed and then the inverse code is
> > > > +    compared with the result of invert_tree_comparison.  */
> > > > + (for cmp (tcc_comparison)
> > > > +  (for icmp (tcc_comparison)
> > > > +   (simplify
> > > > +    (bitop:c (rbitop:c (icmp @0 @1) @2) (cmp@3 @0 @1))
> > > > +     (with { enum tree_code ic = invert_tree_comparison
> > > > +             (cmp, HONOR_NANS (@0)); }
> > > > +      (if (ic == icmp)
> > > > +       (bitop @3 @2)))))))
> > > >
> > > > Where you match everything in match and simplify instead of needing
> > > > to use inverse_conditions_p?
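
As a rough sketch of the ssa_name_has_boolean_range route mentioned above
(illustrative only, not part of the patch; it assumes the helper declared
in tree-ssanames.h):

/* Hypothetical check: treat an operand as boolean-like when it is an
   SSA_NAME known to take only the values 0 and 1.  */
static bool
operand_has_boolean_range_p (tree op)
{
  return TREE_CODE (op) == SSA_NAME && ssa_name_has_boolean_range (op);
}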
> > >
> > > As I mentioned above in the reply to Richi, this is because I can't
> > > apply the predicate zero_one_valued_p to the comparison result if I
> > decompose the comparison.
> > > zero_one_valued_p@(cmp @0 @1) and other variants I've tried do not
> > compile.
> > >
> > > Is there a way I can do so?
> > 
> > 
> > Thinking about it, maybe adding a check for a non-vector type or just a
> > boolean type would work; comparisons will always have either a bool or a
> > vector type.  Note you might need to check for signed vs unsigned bools
> > because of the convert (and yes, some languages have signed bools, Ada).
> 
> Yes, that's why Richi requested the use of zero_one_valued_p: all we
> care about is that the comparison result is 0 or 1, since it's going to
> be masked with 1.
> 
> > That is something like (formating needs to be fixed):
> > (for cmp (tcc_comparison)
> >   (for icmp (tcc_comparison)
> >    (simplify
> >  (bit_ior
> >   (bit_and:c (convert? (cmp:c@0 @4 @5)) @2)
> >   (bit_and:c (convert? (icmp@1 @4 @5)) @3))
> >    (if (TREE_CODE (TREE_TYPE (@0)) == BOOLEAN_TYPE
> >         && TYPE_SIGN (TREE_TYPE (@0)) == UNSIGNED
> >         && TREE_CODE (TREE_TYPE (@1)) == BOOLEAN_TYPE
> >         && TYPE_SIGN (TREE_TYPE (@1)) == UNSIGNED
> >        /* The scalar version has to be canonicalized after vectorization
> >           because it makes unconditional loads conditional ones, which
> >           means we lose vectorization because the loads may trap.  */
> >        && canonicalize_math_after_vectorization_p ())
> >      (with { enum tree_code ic = invert_tree_comparison
> >              (cmp, HONOR_NANS (@0)); }
> >      (if (ic == icmp)
> >     (bit_and (cond @0 @2 @3) { build_each_one_cst (type); })))
> > 
> 
> zero_one_valued_p is more general as it doesn't limit the operation to just
> Booleans.  So I'd like to hear what Richi thinks before not using it.

True, but then we'd need the inverse as well or use

> >  (bit_ior
> >   (bit_and:c (convert? zero_one_valued_p@0) @2)
> >   (bit_and:c (convert? (logical_inverted_value @0)) @3))

where logical_inverted somewhat contradicts using zero_one_valued
instead of truth_valued_p (I think the former might not work for
vector booleans?).
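
For reference, this is where the vector case differs: with the GNU vector
extension a lane-wise comparison yields 0 or -1 per lane rather than 0 or 1,
which is why the vector patterns look at the -1/0 (or 1/0) constants in the
vec_cond arms instead.  A standalone illustration (not part of the patch):

typedef int v4si __attribute__ ((vector_size (16)));

int
main (void)
{
  v4si a = { 1, 2, 3, 4 };
  v4si b = { 4, 3, 2, 1 };
  v4si m = a < b;               /* lanes are -1 where true, 0 where false */
  v4si c = { 5, 5, 5, 5 };
  v4si d = { 8, 8, 8, 8 };
  v4si r = (m & c) | (~m & d);  /* selects c where a < b, d elsewhere */
  return (r[0] == 5 && r[3] == 8) ? 0 : 1;
}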

In the end I'd prefer zero_one_valued_p but avoiding
inverse_conditions_p would be nice.

Richard.


> Thanks,
> Tamar
> 
> > 
> > >
> > > Thanks,
> > > Tamar
> > >
> > > >
> > > > Thanks,
> > > > Andrew Pinski
> > > >
> > > >
> > > > > +(simplify
> > > > > + (bit_ior
> > > > > +  (bit_and:c (vec_cond:s @0 @4 integer_zerop) @2)
> > > > > +  (bit_and:c (vec_cond:s @1 @4 integer_zerop) @3))
> > > > > +   (if (inverse_conditions_p (@0, @1))
> > > > > +    (switch
> > > > > +     (if (integer_onep (@4))
> > > > > +      (bit_and (vec_cond @0 @2 @3) @4))
> > > > > +     (if (integer_minus_onep (@4))
> > > > > +      (vec_cond @0 @2 @3)))))
> > > > > +
> > > > >  /* Transform X & -Y into X * Y when Y is { 0 or 1 }.  */
> > > > >  (simplify
> > > > >   (bit_and:c (convert? (negate zero_one_valued_p@0)) @1)
> > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_1.c b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
> > > > > new file mode 100644
> > > > > index 0000000000000000000000000000000000000000..05a1292fa90c70b14a7985122f43711f55d047ea
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
> > > > > @@ -0,0 +1,16 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-additional-options "-O" } */
> > > > > +/* { dg-final { check-function-bodies "**" "" "" { target { le }
> > > > > +} } } */
> > > > > +
> > > > > +/*
> > > > > +**zoo2:
> > > > > +**     cmp     w0, w1
> > > > > +**     csel    w0, w2, w3, lt
> > > > > +**     and     w0, w0, 1
> > > > > +**     ret
> > > > > +*/
> > > > > +int zoo2 (int a, int b, int c, int d) {
> > > > > +   return ((a < b) & c) | ((a >= b) & d); }
> > > > > +
> > > > > diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_2.c b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
> > > > > new file mode 100644
> > > > > index 0000000000000000000000000000000000000000..34bc65f5db10eae81b8dee3316dfb7d12bf471c8
> > > > > --- /dev/null
> > > > > +++ b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
> > > > > @@ -0,0 +1,32 @@
> > > > > +/* { dg-do compile } */
> > > > > +/* { dg-additional-options "-O3 -std=c99" } */
> > > > > +/* { dg-final { check-function-bodies "**" "" "" { target { le }
> > > > > +} } } */
> > > > > +
> > > > > +typedef int v4si __attribute__ ((vector_size (16)));
> > > > > +
> > > > > +/*
> > > > > +**foo:
> > > > > +**     cmgt    v0.4s, v1.4s, v0.4s
> > > > > +**     bsl     v0.16b, v2.16b, v3.16b
> > > > > +**     ret
> > > > > +*/
> > > > > +v4si foo (v4si a, v4si b, v4si c, v4si d) {
> > > > > +    return ((a < b) & c) | ((a >= b) & d); }
> > > > > +
> > > > > +
> > > > > +/**
> > > > > +**bar:
> > > > > +**...
> > > > > +**     cmge    v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> > > > > +**     bsl     v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > > > +**     and     v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > > > +**...
> > > > > +*/
> > > > > +void bar (int * restrict a, int * restrict b, int * restrict c,
> > > > > +         int * restrict d, int * restrict res, int n) {
> > > > > +  for (int i = 0; i < (n & -4); i++)
> > > > > +    res[i] = ((a[i] < b[i]) & c[i]) | ((a[i] >= b[i]) & d[i]); }
> > > > > +
>
  
Tamar Christina Sept. 23, 2022, 9:16 a.m. UTC | #9
Hello,

> where logical_inverted is somewhat contradicting using zero_one_valued
> instead of truth_valued_p (I think the former might not work for vector
> booleans?).
> 
> In the end I'd prefer zero_one_valued_p but avoiding inverse_conditions_p
> would be nice.
> 
> Richard.

It's not pretty but I've made it work and added more tests.

Bootstrapped Regtested on aarch64-none-linux-gnu,
x86_64-pc-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* match.pd: Add new rule.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/if-compare_1.c: New test.
	* gcc.target/aarch64/if-compare_2.c: New test.

--- inline copy of patch ---

diff --git a/gcc/match.pd b/gcc/match.pd
index b61ed70e69b881a49177f10f20c1f92712bb8665..39da61bf117a6eb2924fc8a6473fb37ddadd60e9 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1903,6 +1903,101 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (if (INTEGRAL_TYPE_P (type))
   (bit_and @0 @1)))
 
+(for cmp (tcc_comparison)
+     icmp (inverted_tcc_comparison)
+ /* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.  */
+ (simplify
+  (bit_ior
+   (bit_and:c (convert? zero_one_valued_p@0) @2)
+   (bit_and:c (convert? zero_one_valued_p@1) @3))
+    (with {
+      enum tree_code c1
+	= (TREE_CODE (@0) == SSA_NAME
+	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@0)) : TREE_CODE (@0));
+
+      enum tree_code c2
+	= (TREE_CODE (@1) == SSA_NAME
+	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@1)) : TREE_CODE (@1));
+     }
+    (if (INTEGRAL_TYPE_P (type)
+	 && c1 == cmp
+	 && c2 == icmp
+	 /* The scalar version has to be canonicalized after vectorization
+	    because it makes unconditional loads conditional ones, which
+	    means we lose vectorization because the loads may trap.  */
+	 && canonicalize_math_after_vectorization_p ())
+     (bit_and (cond @0 @2 @3) { build_one_cst (type); }))))
+
+ /* Fold ((-(a < b) & c) | (-(a >= b) & d)) into a < b ? c : d.  */
+ (simplify
+  (bit_ior
+   (cond zero_one_valued_p@0 @2 zerop)
+   (cond zero_one_valued_p@1 @3 zerop))
+    (with {
+      enum tree_code c1
+	= (TREE_CODE (@0) == SSA_NAME
+	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@0)) : TREE_CODE (@0));
+
+      enum tree_code c2
+	= (TREE_CODE (@1) == SSA_NAME
+	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@1)) : TREE_CODE (@1));
+     }
+    (if (INTEGRAL_TYPE_P (type)
+	 && c1 == cmp
+	 && c2 == icmp
+	 /* The scalar version has to be canonicalized after vectorization
+	    because it makes unconditional loads conditional ones, which
+	    means we lose vectorization because the loads may trap.  */
+	 && canonicalize_math_after_vectorization_p ())
+    (cond @0 @2 @3))))
+
+ /* Vector Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d. 
+    and ((~(a < b) & c) | (~(a >= b) & d)) into a < b ? c : d.  */
+ (simplify
+  (bit_ior
+   (bit_and:c (vec_cond:s @0 @4 @5) @2)
+   (bit_and:c (vec_cond:s @1 @4 @5) @3))
+    (with {
+      enum tree_code c1
+	= (TREE_CODE (@0) == SSA_NAME
+	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@0)) : TREE_CODE (@0));
+
+      enum tree_code c2
+	= (TREE_CODE (@1) == SSA_NAME
+	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@1)) : TREE_CODE (@1));
+     }
+     (if (c1 == cmp && c2 == icmp)
+      (if (integer_zerop (@5))
+       (switch
+	(if (integer_onep (@4))
+	 (bit_and (vec_cond @0 @2 @3) @4))
+	(if (integer_minus_onep (@4))
+	 (vec_cond @0 @2 @3)))
+      (if (integer_zerop (@4))
+       (switch
+	(if (integer_onep (@5))
+	 (bit_and (vec_cond @0 @3 @2) @5))
+	(if (integer_minus_onep (@5))
+	 (vec_cond @0 @3 @2))))))))
+
+ /* Scalar Vectorized Fold ((-(a < b) & c) | (-(a >= b) & d))
+    into a < b ? d : c.  */
+ (simplify
+  (bit_ior
+   (vec_cond:s @0 @2 integer_zerop)
+   (vec_cond:s @1 @3 integer_zerop))
+    (with {
+      enum tree_code c1
+	= (TREE_CODE (@0) == SSA_NAME
+	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@0)) : TREE_CODE (@0));
+
+      enum tree_code c2
+	= (TREE_CODE (@1) == SSA_NAME
+	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@1)) : TREE_CODE (@1));
+     }
+     (if (c1 == cmp && c2 == icmp)
+      (vec_cond @0 @2 @3)))))
+
 /* Transform X & -Y into X * Y when Y is { 0 or 1 }.  */
 (simplify
  (bit_and:c (convert? (negate zero_one_valued_p@0)) @1)
diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_1.c b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..53bbd779a30e1a30e0ce0e4e5eaf589bfaf570fe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-additional-options "-O -save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+extern void abort ();
+
+/*
+**zoo1:
+**	cmp	w0, w1
+**	csel	w0, w2, w3, lt
+**	and	w0, w0, 1
+**	ret
+*/
+__attribute((noipa, noinline))
+int zoo1 (int a, int b, int c, int d)
+{
+   return ((a < b) & c) | ((a >= b) & d);
+}
+
+/*
+**zoo2:
+**	cmp	w0, w1
+**	csel	w0, w2, w3, lt
+**	ret
+*/
+__attribute((noipa, noinline))
+int zoo2 (int a, int b, int c, int d)
+{
+   return (-(a < b) & c) | (-(a >= b) & d);
+}
+
+int main ()
+{
+  if (zoo1 (-3, 3, 5, 8) != 1)
+    abort ();
+
+  if (zoo1 (3, -3, 5, 8) != 0)
+    abort ();
+
+  if (zoo2 (-3, 3, 5, 8) != 5)
+    abort ();
+
+  if (zoo2 (3, -3, 5, 8) != 8)
+    abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_2.c b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..14988abac45989578b198f28c7c0ea203959c08b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
@@ -0,0 +1,96 @@
+/* { dg-do run } */
+/* { dg-additional-options "-O3 -std=c99 -save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#pragma GCC target "+nosve"
+
+#include <string.h>
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+**foo1:
+**	cmgt	v0.4s, v1.4s, v0.4s
+**	bsl	v0.16b, v2.16b, v3.16b
+**	ret
+*/
+v4si foo1 (v4si a, v4si b, v4si c, v4si d) {
+    return ((a < b) & c) | ((a >= b) & d);
+}
+
+/*
+**foo2:
+**	cmgt	v0.4s, v1.4s, v0.4s
+**	bsl	v0.16b, v3.16b, v2.16b
+**	ret
+*/
+v4si foo2 (v4si a, v4si b, v4si c, v4si d) {
+    return (~(a < b) & c) | (~(a >= b) & d);
+}
+
+
+/**
+**bar1:
+**...
+**	cmge	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**	bsl	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**	and	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**...
+*/
+void bar1 (int * restrict a, int * restrict b, int * restrict c,
+	  int * restrict d, int * restrict res, int n)
+{
+  for (int i = 0; i < (n & -4); i++)
+    res[i] = ((a[i] < b[i]) & c[i]) | ((a[i] >= b[i]) & d[i]);
+}
+
+/**
+**bar2:
+**...
+**	cmge	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**	bsl	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**...
+*/
+void bar2 (int * restrict a, int * restrict b, int * restrict c,
+	  int * restrict d, int * restrict res, int n)
+{
+  for (int i = 0; i < (n & -4); i++)
+    res[i] = (-(a[i] < b[i]) & c[i]) | (-(a[i] >= b[i]) & d[i]);
+}
+
+extern void abort ();
+
+int main ()
+{
+
+  v4si a = { -3, -3, -3, -3 };
+  v4si b = { 3, 3, 3, 3 };
+  v4si c = { 5, 5, 5, 5 };
+  v4si d = { 8, 8, 8, 8 };
+
+  v4si res1 = foo1 (a, b, c, d);
+  if (memcmp (&res1, &c, 16UL) != 0)
+    abort ();
+
+  v4si res2 = foo2 (a, b, c, d);
+  if (memcmp (&res2, &d, 16UL) != 0)
+   abort ();
+
+  int ar[4] = { -3, -3, -3, -3 };
+  int br[4] = { 3, 3, 3, 3 };
+  int cr[4] = { 5, 5, 5, 5 };
+  int dr[4] = { 8, 8, 8, 8 };
+
+  int exp1[4] = { 1, 1, 1, 1 };
+  int res3[4];
+  bar1 ((int*)&ar, (int*)&br, (int*)&cr, (int*)&dr, (int*)&res3, 4);
+  if (memcmp (&res3, &exp1, 16UL) != 0)
+    abort ();
+
+  int res4[4];
+  bar2 ((int*)&ar, (int*)&br, (int*)&cr, (int*)&dr, (int*)&res4, 4);
+  if (memcmp (&res4, &cr, 16UL) != 0)
+    abort ();
+
+  return 0;
+}
  
Richard Biener Sept. 23, 2022, 1:10 p.m. UTC | #10
On Fri, 23 Sep 2022, Tamar Christina wrote:

> Hello,
> 
> > where logical_inverted is somewhat contradicting using zero_one_valued
> > instead of truth_valued_p (I think the former might not work for vector
> > booleans?).
> > 
> > In the end I'd prefer zero_one_valued_p but avoiding inverse_conditions_p
> > would be nice.
> > 
> > Richard.
> 
> It's not pretty but I've made it work and added more tests.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu,
> x86_64-pc-linux-gnu and no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
> 	* match.pd: Add new rule.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/aarch64/if-compare_1.c: New test.
> 	* gcc.target/aarch64/if-compare_2.c: New test.
> 
> --- inline copy of patch ---
> 
> diff --git a/gcc/match.pd b/gcc/match.pd
> index b61ed70e69b881a49177f10f20c1f92712bb8665..39da61bf117a6eb2924fc8a6473fb37ddadd60e9 100644
> --- a/gcc/match.pd
> +++ b/gcc/match.pd
> @@ -1903,6 +1903,101 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
>   (if (INTEGRAL_TYPE_P (type))
>    (bit_and @0 @1)))
>  
> +(for cmp (tcc_comparison)
> +     icmp (inverted_tcc_comparison)
> + /* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.  */
> + (simplify
> +  (bit_ior
> +   (bit_and:c (convert? zero_one_valued_p@0) @2)
> +   (bit_and:c (convert? zero_one_valued_p@1) @3))
> +    (with {
> +      enum tree_code c1
> +	= (TREE_CODE (@0) == SSA_NAME
> +	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@0)) : TREE_CODE (@0));
> +
> +      enum tree_code c2
> +	= (TREE_CODE (@1) == SSA_NAME
> +	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@1)) : TREE_CODE (@1));
> +     }
> +    (if (INTEGRAL_TYPE_P (type)
> +	 && c1 == cmp
> +	 && c2 == icmp

So that doesn't have any advantage over doing

 (simplify
  (bit_ior
   (bit_and:c (convert? (cmp@0 @01 @02)) @2)
   (bit_and:c (convert? (icmp@1 @11 @12)) @3))
...

I don't remember if that's what we had before.

> +	 /* The scalar version has to be canonicalized after vectorization
> +	    because it makes unconditional loads conditional ones, which
> +	    means we lose vectorization because the loads may trap.  */
> +	 && canonicalize_math_after_vectorization_p ())
> +     (bit_and (cond @0 @2 @3) { build_one_cst (type); }))))
> +
> + /* Fold ((-(a < b) & c) | (-(a >= b) & d)) into a < b ? c : d.  */

The comment doesn't match the pattern below?

> + (simplify
> +  (bit_ior
> +   (cond zero_one_valued_p@0 @2 zerop)
> +   (cond zero_one_valued_p@1 @3 zerop))
> +    (with {
> +      enum tree_code c1
> +	= (TREE_CODE (@0) == SSA_NAME
> +	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@0)) : TREE_CODE (@0));
> +
> +      enum tree_code c2
> +	= (TREE_CODE (@1) == SSA_NAME
> +	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@1)) : TREE_CODE (@1));
> +     }
> +    (if (INTEGRAL_TYPE_P (type)
> +	 && c1 == cmp
> +	 && c2 == icmp
> +	 /* The scalar version has to be canonicalized after vectorization
> +	    because it makes unconditional loads conditional ones, which
> +	    means we lose vectorization because the loads may trap.  */
> +	 && canonicalize_math_after_vectorization_p ())
> +    (cond @0 @2 @3))))
> +
> + /* Vector Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d. 
> +    and ((~(a < b) & c) | (~(a >= b) & d)) into a < b ? c : d.  */
> + (simplify
> +  (bit_ior
> +   (bit_and:c (vec_cond:s @0 @4 @5) @2)
> +   (bit_and:c (vec_cond:s @1 @4 @5) @3))
> +    (with {
> +      enum tree_code c1
> +	= (TREE_CODE (@0) == SSA_NAME
> +	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@0)) : TREE_CODE (@0));
> +
> +      enum tree_code c2
> +	= (TREE_CODE (@1) == SSA_NAME
> +	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@1)) : TREE_CODE (@1));
> +     }
> +     (if (c1 == cmp && c2 == icmp)
> +      (if (integer_zerop (@5))
> +       (switch
> +	(if (integer_onep (@4))
> +	 (bit_and (vec_cond @0 @2 @3) @4))
> +	(if (integer_minus_onep (@4))
> +	 (vec_cond @0 @2 @3)))
> +      (if (integer_zerop (@4))
> +       (switch
> +	(if (integer_onep (@5))
> +	 (bit_and (vec_cond @0 @3 @2) @5))
> +	(if (integer_minus_onep (@5))
> +	 (vec_cond @0 @3 @2))))))))
> +
> + /* Scalar Vectorized Fold ((-(a < b) & c) | (-(a >= b) & d))
> +    into a < b ? d : c.  */
> + (simplify
> +  (bit_ior
> +   (vec_cond:s @0 @2 integer_zerop)
> +   (vec_cond:s @1 @3 integer_zerop))
> +    (with {
> +      enum tree_code c1
> +	= (TREE_CODE (@0) == SSA_NAME
> +	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@0)) : TREE_CODE (@0));
> +
> +      enum tree_code c2
> +	= (TREE_CODE (@1) == SSA_NAME
> +	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@1)) : TREE_CODE (@1));
> +     }
> +     (if (c1 == cmp && c2 == icmp)
> +      (vec_cond @0 @2 @3)))))
> +

As you say, it's not pretty.  When looking at

int zoo1 (int a, int b, int c, int d)
{  
   return ((a < b) & c) | ((a >= b) & d);
}

I see that we fail to transform this to

  _Bool tem = a < b;
   return (tem & c) | (!tem & d);

but when feeding that in as source then forwprop undoes this transform.
Still when in this form the patterns would be a lot easier to
handle?
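
Roughly, over that form a single pattern could avoid the icmp iteration
altogether; a hypothetical sketch only (not what the patch implements, and
ignoring the convert? wrappers):

(simplify
 (bit_ior
  (bit_and:c zero_one_valued_p@0 @2)
  (bit_and:c (bit_xor @0 integer_onep) @3))
 (if (INTEGRAL_TYPE_P (type)
      && canonicalize_math_after_vectorization_p ())
  (bit_and (cond @0 @2 @3) { build_one_cst (type); })))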

Richard.


>  /* Transform X & -Y into X * Y when Y is { 0 or 1 }.  */
>  (simplify
>   (bit_and:c (convert? (negate zero_one_valued_p@0)) @1)
> diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_1.c b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..53bbd779a30e1a30e0ce0e4e5eaf589bfaf570fe
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
> @@ -0,0 +1,47 @@
> +/* { dg-do run } */
> +/* { dg-additional-options "-O -save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +extern void abort ();
> +
> +/*
> +**zoo1:
> +**	cmp	w0, w1
> +**	csel	w0, w2, w3, lt
> +**	and	w0, w0, 1
> +**	ret
> +*/
> +__attribute((noipa, noinline))
> +int zoo1 (int a, int b, int c, int d)
> +{
> +   return ((a < b) & c) | ((a >= b) & d);
> +}
> +
> +/*
> +**zoo2:
> +**	cmp	w0, w1
> +**	csel	w0, w2, w3, lt
> +**	ret
> +*/
> +__attribute((noipa, noinline))
> +int zoo2 (int a, int b, int c, int d)
> +{
> +   return (-(a < b) & c) | (-(a >= b) & d);
> +}
> +
> +int main ()
> +{
> +  if (zoo1 (-3, 3, 5, 8) != 1)
> +    abort ();
> +
> +  if (zoo1 (3, -3, 5, 8) != 0)
> +    abort ();
> +
> +  if (zoo2 (-3, 3, 5, 8) != 5)
> +    abort ();
> +
> +  if (zoo2 (3, -3, 5, 8) != 8)
> +    abort ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_2.c b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..14988abac45989578b198f28c7c0ea203959c08b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
> @@ -0,0 +1,96 @@
> +/* { dg-do run } */
> +/* { dg-additional-options "-O3 -std=c99 -save-temps" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#pragma GCC target "+nosve"
> +
> +#include <string.h>
> +
> +typedef int v4si __attribute__ ((vector_size (16)));
> +
> +/*
> +**foo1:
> +**	cmgt	v0.4s, v1.4s, v0.4s
> +**	bsl	v0.16b, v2.16b, v3.16b
> +**	ret
> +*/
> +v4si foo1 (v4si a, v4si b, v4si c, v4si d) {
> +    return ((a < b) & c) | ((a >= b) & d);
> +}
> +
> +/*
> +**foo2:
> +**	cmgt	v0.4s, v1.4s, v0.4s
> +**	bsl	v0.16b, v3.16b, v2.16b
> +**	ret
> +*/
> +v4si foo2 (v4si a, v4si b, v4si c, v4si d) {
> +    return (~(a < b) & c) | (~(a >= b) & d);
> +}
> +
> +
> +/**
> +**bar1:
> +**...
> +**	cmge	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**	bsl	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> +**	and	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> +**...
> +*/
> +void bar1 (int * restrict a, int * restrict b, int * restrict c,
> +	  int * restrict d, int * restrict res, int n)
> +{
> +  for (int i = 0; i < (n & -4); i++)
> +    res[i] = ((a[i] < b[i]) & c[i]) | ((a[i] >= b[i]) & d[i]);
> +}
> +
> +/**
> +**bar2:
> +**...
> +**	cmge	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**	bsl	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> +**...
> +*/
> +void bar2 (int * restrict a, int * restrict b, int * restrict c,
> +	  int * restrict d, int * restrict res, int n)
> +{
> +  for (int i = 0; i < (n & -4); i++)
> +    res[i] = (-(a[i] < b[i]) & c[i]) | (-(a[i] >= b[i]) & d[i]);
> +}
> +
> +extern void abort ();
> +
> +int main ()
> +{
> +
> +  v4si a = { -3, -3, -3, -3 };
> +  v4si b = { 3, 3, 3, 3 };
> +  v4si c = { 5, 5, 5, 5 };
> +  v4si d = { 8, 8, 8, 8 };
> +
> +  v4si res1 = foo1 (a, b, c, d);
> +  if (memcmp (&res1, &c, 16UL) != 0)
> +    abort ();
> +
> +  v4si res2 = foo2 (a, b, c, d);
> +  if (memcmp (&res2, &d, 16UL) != 0)
> +   abort ();
> +
> +  int ar[4] = { -3, -3, -3, -3 };
> +  int br[4] = { 3, 3, 3, 3 };
> +  int cr[4] = { 5, 5, 5, 5 };
> +  int dr[4] = { 8, 8, 8, 8 };
> +
> +  int exp1[4] = { 1, 1, 1, 1 };
> +  int res3[4];
> +  bar1 ((int*)&ar, (int*)&br, (int*)&cr, (int*)&dr, (int*)&res3, 4);
> +  if (memcmp (&res3, &exp1, 16UL) != 0)
> +    abort ();
> +
> +  int res4[4];
> +  bar2 ((int*)&ar, (int*)&br, (int*)&cr, (int*)&dr, (int*)&res4, 4);
> +  if (memcmp (&res4, &cr, 16UL) != 0)
> +    abort ();
> +
> +  return 0;
> +}
> 
> 
>
  
Richard Biener Sept. 26, 2022, 10:42 a.m. UTC | #11
On Fri, 23 Sep 2022, Tamar Christina wrote:

> > -----Original Message-----
> > From: Richard Biener <rguenther@suse.de>
> > Sent: Friday, September 23, 2022 9:10 AM
> > To: Tamar Christina <Tamar.Christina@arm.com>
> > Cc: Andrew Pinski <pinskia@gmail.com>; nd <nd@arm.com>; gcc-
> > patches@gcc.gnu.org
> > Subject: RE: [PATCH]middle-end simplify complex if expressions where
> > comparisons are inverse of one another.
> > 
> > On Fri, 23 Sep 2022, Tamar Christina wrote:
> > 
> > > Hello,
> > >
> > > > where logical_inverted is somewhat contradicting using
> > > > zero_one_valued instead of truth_valued_p (I think the former might
> > > > not work for vector booleans?).
> > > >
> > > > In the end I'd prefer zero_one_valued_p but avoiding
> > > > inverse_conditions_p would be nice.
> > > >
> > > > Richard.
> > >
> > > It's not pretty but I've made it work and added more tests.
> > >
> > > Bootstrapped Regtested on aarch64-none-linux-gnu, x86_64-pc-linux-gnu
> > > and no issues.
> > >
> > > Ok for master?
> > >
> > > Thanks,
> > > Tamar
> > >
> > > gcc/ChangeLog:
> > >
> > > 	* match.pd: Add new rule.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > > 	* gcc.target/aarch64/if-compare_1.c: New test.
> > > 	* gcc.target/aarch64/if-compare_2.c: New test.
> > >
> > > --- inline copy of patch ---
> > >
> > > diff --git a/gcc/match.pd b/gcc/match.pd
> > > index b61ed70e69b881a49177f10f20c1f92712bb8665..39da61bf117a6eb2924fc8a6473fb37ddadd60e9 100644
> > > --- a/gcc/match.pd
> > > +++ b/gcc/match.pd
> > > @@ -1903,6 +1903,101 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
> > >   (if (INTEGRAL_TYPE_P (type))
> > >    (bit_and @0 @1)))
> > >
> > > +(for cmp (tcc_comparison)
> > > +     icmp (inverted_tcc_comparison)
> > > + /* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.
> > > +*/  (simplify
> > > +  (bit_ior
> > > +   (bit_and:c (convert? zero_one_valued_p@0) @2)
> > > +   (bit_and:c (convert? zero_one_valued_p@1) @3))
> > > +    (with {
> > > +      enum tree_code c1
> > > +	= (TREE_CODE (@0) == SSA_NAME
> > > +	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@0)) :
> > TREE_CODE
> > > +(@0));
> > > +
> > > +      enum tree_code c2
> > > +	= (TREE_CODE (@1) == SSA_NAME
> > > +	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@1)) :
> > TREE_CODE (@1));
> > > +     }
> > > +    (if (INTEGRAL_TYPE_P (type)
> > > +	 && c1 == cmp
> > > +	 && c2 == icmp
> > 
> > So that doesn't have any advantage over doing
> > 
> >  (simplify
> >   (bit_ior
> >    (bit_and:c (convert? (cmp@0 @01 @02)) @2)
> >    (bit_and:c (convert? (icmp@1 @11 @12)) @3)) ...
> > 
> > I don't remember if that's what we had before.
> 
> No, the specific problem has always been applying zero_one_valued_p to the right type.
> Before it was much shorter because I was using the tree helper function to get the inverses.
> 
> But with your suggestion I think I can do zero_one_valued_p on @0 and @1 instead.

But with comparisons and INTEGRAL_TYPE_P the value is always zero or one,
so I'm confused.
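
That zero-or-one guarantee is exactly what makes the scalar rewrite valid;
as a quick standalone check (illustrative only, not part of the testsuite):

#include <assert.h>

/* With a 0/1 condition, ((a < b) & c) | ((a >= b) & d) equals
   (a < b ? c : d) & 1; with a 0/-1 condition it would not.  */
static int
orig_form (int a, int b, int c, int d)
{
  return ((a < b) & c) | ((a >= b) & d);
}

static int
folded_form (int a, int b, int c, int d)
{
  return (a < b ? c : d) & 1;
}

int
main (void)
{
  static const int v[] = { -3, -1, 0, 1, 3, 5, 8 };
  const int n = sizeof v / sizeof v[0];
  for (int i = 0; i < n; i++)
    for (int j = 0; j < n; j++)
      for (int k = 0; k < n; k++)
        for (int l = 0; l < n; l++)
          assert (orig_form (v[i], v[j], v[k], v[l])
                  == folded_form (v[i], v[j], v[k], v[l]));
  return 0;
}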

> > 
> > > +	 /* The scalar version has to be canonicalized after vectorization
> > > +	    because it makes unconditional loads conditional ones, which
> > > +	    means we lose vectorization because the loads may trap.  */
> > > +	 && canonicalize_math_after_vectorization_p ())
> > > +     (bit_and (cond @0 @2 @3) { build_one_cst (type); }))))
> > > +
> > > + /* Fold ((-(a < b) & c) | (-(a >= b) & d)) into a < b ? c : d.  */
> > 
> > The comment doesn't match the pattern below?
> 
> The pattern in the comment gets rewritten to this form eventually,
> so I match it instead.  I can update the comment, but I thought the above
> made it clearer why these belong together?
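
Concretely, the reason this second pattern matches cond expressions rather
than the literal negated form: by the time it runs, earlier folding has
already turned each half into a conditional, roughly (illustrative, not
taken from a dump):

/* Source form.  */
int f (int a, int b, int c, int d)
{
  return (-(a < b) & c) | (-(a >= b) & d);
}

/* What earlier canonicalization leaves for the pattern to see, roughly:
     (a < b ? c : 0) | (a >= b ? d : 0)
   which the new rule then folds to a < b ? c : d.  */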

Please mention the canonicalized form in the comment as well.

> > 
> > > + (simplify
> > > +  (bit_ior
> > > +   (cond zero_one_valued_p@0 @2 zerop)
> > > +   (cond zero_one_valued_p@1 @3 zerop))
> > > +    (with {
> > > +      enum tree_code c1
> > > +	= (TREE_CODE (@0) == SSA_NAME
> > > +	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@0)) :
> > TREE_CODE
> > > +(@0));
> > > +
> > > +      enum tree_code c2
> > > +	= (TREE_CODE (@1) == SSA_NAME
> > > +	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@1)) :
> > TREE_CODE (@1));
> > > +     }
> > > +    (if (INTEGRAL_TYPE_P (type)
> > > +	 && c1 == cmp
> > > +	 && c2 == icmp
> > > +	 /* The scalar version has to be canonicalized after vectorization
> > > +	    because it makes unconditional loads conditional ones, which
> > > +	    means we lose vectorization because the loads may trap.  */
> > > +	 && canonicalize_math_after_vectorization_p ())
> > > +    (cond @0 @2 @3))))
> > > +
> > > + /* Vector Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d.
> > > +    and ((~(a < b) & c) | (~(a >= b) & d)) into a < b ? c : d.  */
> > > +(simplify
> > > +  (bit_ior
> > > +   (bit_and:c (vec_cond:s @0 @4 @5) @2)
> > > +   (bit_and:c (vec_cond:s @1 @4 @5) @3))
> > > +    (with {
> > > +      enum tree_code c1
> > > +	= (TREE_CODE (@0) == SSA_NAME
> > > +	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@0)) :
> > TREE_CODE
> > > +(@0));
> > > +
> > > +      enum tree_code c2
> > > +	= (TREE_CODE (@1) == SSA_NAME
> > > +	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@1)) :
> > TREE_CODE (@1));
> > > +     }
> > > +     (if (c1 == cmp && c2 == icmp)
> > > +      (if (integer_zerop (@5))
> > > +       (switch
> > > +	(if (integer_onep (@4))
> > > +	 (bit_and (vec_cond @0 @2 @3) @4))
> > > +	(if (integer_minus_onep (@4))
> > > +	 (vec_cond @0 @2 @3)))
> > > +      (if (integer_zerop (@4))
> > > +       (switch
> > > +	(if (integer_onep (@5))
> > > +	 (bit_and (vec_cond @0 @3 @2) @5))
> > > +	(if (integer_minus_onep (@5))
> > > +	 (vec_cond @0 @3 @2))))))))
> > > +
> > > + /* Scalar Vectorized Fold ((-(a < b) & c) | (-(a >= b) & d))
> > > +    into a < b ? d : c.  */
> > > + (simplify
> > > +  (bit_ior
> > > +   (vec_cond:s @0 @2 integer_zerop)
> > > +   (vec_cond:s @1 @3 integer_zerop))
> > > +    (with {
> > > +      enum tree_code c1
> > > +	= (TREE_CODE (@0) == SSA_NAME
> > > +	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@0)) :
> > TREE_CODE
> > > +(@0));
> > > +
> > > +      enum tree_code c2
> > > +	= (TREE_CODE (@1) == SSA_NAME
> > > +	   ? gimple_assign_rhs_code (SSA_NAME_DEF_STMT (@1)) :
> > TREE_CODE (@1));
> > > +     }
> > > +     (if (c1 == cmp && c2 == icmp)
> > > +      (vec_cond @0 @2 @3)))))
> > > +
> > 
> > As you say, it's not pretty.  When looking at
> > 
> > int zoo1 (int a, int b, int c, int d)
> > {
> >    return ((a < b) & c) | ((a >= b) & d); }
> > 
> > I see that we fail to transform this to
> > 
> >   _Bool tem = a < b;
> >    return (tem & c) | (!tem & d);
> > 
> > but when feeding that in as source then forwprop undoes this transform.
> > Still when in this form the patterns would be a lot easier to handle?
> 
> Hmm perhaps, that would remove the need for icmp, but with your suggestion above I think
> I can clean this up, let me give it a try.

Sure.

Thanks,
Richard.

> Tamar.
> 
> > 
> > Richard.
> > 
> > 
> > >  /* Transform X & -Y into X * Y when Y is { 0 or 1 }.  */
> > >  (simplify
> > >   (bit_and:c (convert? (negate zero_one_valued_p@0)) @1)
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_1.c b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
> > > new file mode 100644
> > > index 0000000000000000000000000000000000000000..53bbd779a30e1a30e0ce0e4e5eaf589bfaf570fe
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
> > > @@ -0,0 +1,47 @@
> > > +/* { dg-do run } */
> > > +/* { dg-additional-options "-O -save-temps" } */
> > > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } }
> > > +} */
> > > +
> > > +extern void abort ();
> > > +
> > > +/*
> > > +**zoo1:
> > > +**	cmp	w0, w1
> > > +**	csel	w0, w2, w3, lt
> > > +**	and	w0, w0, 1
> > > +**	ret
> > > +*/
> > > +__attribute((noipa, noinline))
> > > +int zoo1 (int a, int b, int c, int d) {
> > > +   return ((a < b) & c) | ((a >= b) & d); }
> > > +
> > > +/*
> > > +**zoo2:
> > > +**	cmp	w0, w1
> > > +**	csel	w0, w2, w3, lt
> > > +**	ret
> > > +*/
> > > +__attribute((noipa, noinline))
> > > +int zoo2 (int a, int b, int c, int d)
> > > +{
> > > +   return (-(a < b) & c) | (-(a >= b) & d);
> > > +}
> > > +
> > > +int main ()
> > > +{
> > > +  if (zoo1 (-3, 3, 5, 8) != 1)
> > > +    abort ();
> > > +
> > > +  if (zoo1 (3, -3, 5, 8) != 0)
> > > +    abort ();
> > > +
> > > +  if (zoo2 (-3, 3, 5, 8) != 5)
> > > +    abort ();
> > > +
> > > +  if (zoo2 (3, -3, 5, 8) != 8)
> > > +    abort ();
> > > +
> > > +  return 0;
> > > +}
> > > diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_2.c b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
> > > new file mode 100644
> > > index 0000000000000000000000000000000000000000..14988abac45989578b198f28c7c0ea203959c08b
> > > --- /dev/null
> > > +++ b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
> > > @@ -0,0 +1,96 @@
> > > +/* { dg-do run } */
> > > +/* { dg-additional-options "-O3 -std=c99 -save-temps" } */
> > > +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> > > +
> > > +#pragma GCC target "+nosve"
> > > +
> > > +#include <string.h>
> > > +
> > > +typedef int v4si __attribute__ ((vector_size (16)));
> > > +
> > > +/*
> > > +**foo1:
> > > +**	cmgt	v0.4s, v1.4s, v0.4s
> > > +**	bsl	v0.16b, v2.16b, v3.16b
> > > +**	ret
> > > +*/
> > > +v4si foo1 (v4si a, v4si b, v4si c, v4si d) {
> > > +    return ((a < b) & c) | ((a >= b) & d);
> > > +}
> > > +
> > > +/*
> > > +**foo2:
> > > +**	cmgt	v0.4s, v1.4s, v0.4s
> > > +**	bsl	v0.16b, v3.16b, v2.16b
> > > +**	ret
> > > +*/
> > > +v4si foo2 (v4si a, v4si b, v4si c, v4si d) {
> > > +    return (~(a < b) & c) | (~(a >= b) & d);
> > > +}
> > > +
> > > +
> > > +/**
> > > +**bar1:
> > > +**...
> > > +**	cmge	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> > > +**	bsl	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > +**	and	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > +**...
> > > +*/
> > > +void bar1 (int * restrict a, int * restrict b, int * restrict c,
> > > +	  int * restrict d, int * restrict res, int n)
> > > +{
> > > +  for (int i = 0; i < (n & -4); i++)
> > > +    res[i] = ((a[i] < b[i]) & c[i]) | ((a[i] >= b[i]) & d[i]);
> > > +}
> > > +
> > > +/**
> > > +**bar2:
> > > +**...
> > > +**	cmge	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> > > +**	bsl	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
> > > +**...
> > > +*/
> > > +void bar2 (int * restrict a, int * restrict b, int * restrict c,
> > > +	  int * restrict d, int * restrict res, int n)
> > > +{
> > > +  for (int i = 0; i < (n & -4); i++)
> > > +    res[i] = (-(a[i] < b[i]) & c[i]) | (-(a[i] >= b[i]) & d[i]);
> > > +}
> > > +
> > > +extern void abort ();
> > > +
> > > +int main ()
> > > +{
> > > +
> > > +  v4si a = { -3, -3, -3, -3 };
> > > +  v4si b = { 3, 3, 3, 3 };
> > > +  v4si c = { 5, 5, 5, 5 };
> > > +  v4si d = { 8, 8, 8, 8 };
> > > +
> > > +  v4si res1 = foo1 (a, b, c, d);
> > > +  if (memcmp (&res1, &c, 16UL) != 0)
> > > +    abort ();
> > > +
> > > +  v4si res2 = foo2 (a, b, c, d);
> > > +  if (memcmp (&res2, &d, 16UL) != 0)
> > > +   abort ();
> > > +
> > > +  int ar[4] = { -3, -3, -3, -3 };
> > > +  int br[4] = { 3, 3, 3, 3 };
> > > +  int cr[4] = { 5, 5, 5, 5 };
> > > +  int dr[4] = { 8, 8, 8, 8 };
> > > +
> > > +  int exp1[4] = { 1, 1, 1, 1 };
> > > +  int res3[4];
> > > +  bar1 ((int*)&ar, (int*)&br, (int*)&cr, (int*)&dr, (int*)&res3, 4);
> > > +  if (memcmp (&res3, &exp1, 16UL) != 0)
> > > +    abort ();
> > > +
> > > +  int res4[4];
> > > +  bar2 ((int*)&ar, (int*)&br, (int*)&cr, (int*)&dr, (int*)&res4, 4);
> > > +  if (memcmp (&res4, &cr, 16UL) != 0)
> > > +    abort ();
> > > +
> > > +  return 0;
> > > +}
> > >
> > >
> > >
> > 
> > --
> > Richard Biener <rguenther@suse.de>
> > SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461
> > Nuernberg,
> > Germany; GF: Ivo Totev, Andrew Myers, Andrew McDonald, Boudien
> > Moerman;
> > HRB 36809 (AG Nuernberg)
>
  
Tamar Christina Oct. 31, 2022, 11:42 a.m. UTC | #12
Hi,

This is a cleaned up version addressing all feedback.

Bootstrapped Regtested on aarch64-none-linux-gnu,
x86_64-pc-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* match.pd: Add new rule.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/if-compare_1.c: New test.
	* gcc.target/aarch64/if-compare_2.c: New test.

--- inline copy of patch ---

diff --git a/gcc/match.pd b/gcc/match.pd
index 297c4d7a2355c2faa2afb7cfa0b4e171a161ef39..463132bfd35cf61f50381cedd07d7617c638034b 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1903,6 +1903,61 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (if (INTEGRAL_TYPE_P (type))
   (bit_and @0 @1)))
 
+(for cmp (tcc_comparison)
+     icmp (inverted_tcc_comparison)
+ /* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.  */
+ (simplify
+  (bit_ior
+   (bit_and:c (convert? (cmp@0  @01 @02)) @3)
+   (bit_and:c (convert? (icmp@4 @01 @02)) @5))
+    (if (INTEGRAL_TYPE_P (type)
+	 /* The scalar version has to be canonicalized after vectorization
+	    because it makes unconditional loads conditional ones, which
+	    means we lose vectorization because the loads may trap.  */
+	 && canonicalize_math_after_vectorization_p ())
+     (bit_and (cond @0 @3 @5) { build_one_cst (type); })))
+
+ /* Fold ((-(a < b) & c) | (-(a >= b) & d)) into a < b ? c : d.  This is
+    canonicalized further and we recognize the conditional form:
+    (a < b ? c : 0) | (a >= b ? d : 0) into a < b ? c : d.  */
+ (simplify
+  (bit_ior
+   (cond (cmp@0  @01 @02) @3 zerop)
+   (cond (icmp@4 @01 @02) @5 zerop))
+    (if (INTEGRAL_TYPE_P (type)
+	 /* The scalar version has to be canonicalized after vectorization
+	    because it makes unconditional loads conditional ones, which
+	    means we lose vectorization because the loads may trap.  */
+	 && canonicalize_math_after_vectorization_p ())
+    (cond @0 @3 @5)))
+
+ /* Vector Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d. 
+    and ((~(a < b) & c) | (~(a >= b) & d)) into a < b ? c : d.  */
+ (simplify
+  (bit_ior
+   (bit_and:c (vec_cond:s (cmp@0 @6 @7) @4 @5) @2)
+   (bit_and:c (vec_cond:s (icmp@1 @6 @7) @4 @5) @3))
+    (if (integer_zerop (@5))
+     (switch
+      (if (integer_onep (@4))
+       (bit_and (vec_cond @0 @2 @3) @4))
+	(if (integer_minus_onep (@4))
+	 (vec_cond @0 @2 @3)))
+    (if (integer_zerop (@4))
+     (switch
+      (if (integer_onep (@5))
+       (bit_and (vec_cond @0 @3 @2) @5))
+      (if (integer_minus_onep (@5))
+       (vec_cond @0 @3 @2))))))
+
+ /* Scalar Vectorized Fold ((-(a < b) & c) | (-(a >= b) & d))
+    into a < b ? d : c.  */
+ (simplify
+  (bit_ior
+   (vec_cond:s (cmp@0 @4 @5) @2 integer_zerop)
+   (vec_cond:s (icmp@1 @4 @5) @3 integer_zerop))
+    (vec_cond @0 @2 @3)))
+
 /* Transform X & -Y into X * Y when Y is { 0 or 1 }.  */
 (simplify
  (bit_and:c (convert? (negate zero_one_valued_p@0)) @1)
diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_1.c b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..53bbd779a30e1a30e0ce0e4e5eaf589bfaf570fe
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
@@ -0,0 +1,47 @@
+/* { dg-do run } */
+/* { dg-additional-options "-O -save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+extern void abort ();
+
+/*
+**zoo1:
+**	cmp	w0, w1
+**	csel	w0, w2, w3, lt
+**	and	w0, w0, 1
+**	ret
+*/
+__attribute((noipa, noinline))
+int zoo1 (int a, int b, int c, int d)
+{
+   return ((a < b) & c) | ((a >= b) & d);
+}
+
+/*
+**zoo2:
+**	cmp	w0, w1
+**	csel	w0, w2, w3, lt
+**	ret
+*/
+__attribute((noipa, noinline))
+int zoo2 (int a, int b, int c, int d)
+{
+   return (-(a < b) & c) | (-(a >= b) & d);
+}
+
+int main ()
+{
+  if (zoo1 (-3, 3, 5, 8) != 1)
+    abort ();
+
+  if (zoo1 (3, -3, 5, 8) != 0)
+    abort ();
+
+  if (zoo2 (-3, 3, 5, 8) != 5)
+    abort ();
+
+  if (zoo2 (3, -3, 5, 8) != 8)
+    abort ();
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_2.c b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..14988abac45989578b198f28c7c0ea203959c08b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
@@ -0,0 +1,96 @@
+/* { dg-do run } */
+/* { dg-additional-options "-O3 -std=c99 -save-temps" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#pragma GCC target "+nosve"
+
+#include <string.h>
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+**foo1:
+**	cmgt	v0.4s, v1.4s, v0.4s
+**	bsl	v0.16b, v2.16b, v3.16b
+**	ret
+*/
+v4si foo1 (v4si a, v4si b, v4si c, v4si d) {
+    return ((a < b) & c) | ((a >= b) & d);
+}
+
+/*
+**foo2:
+**	cmgt	v0.4s, v1.4s, v0.4s
+**	bsl	v0.16b, v3.16b, v2.16b
+**	ret
+*/
+v4si foo2 (v4si a, v4si b, v4si c, v4si d) {
+    return (~(a < b) & c) | (~(a >= b) & d);
+}
+
+
+/**
+**bar1:
+**...
+**	cmge	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**	bsl	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**	and	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**...
+*/
+void bar1 (int * restrict a, int * restrict b, int * restrict c,
+	  int * restrict d, int * restrict res, int n)
+{
+  for (int i = 0; i < (n & -4); i++)
+    res[i] = ((a[i] < b[i]) & c[i]) | ((a[i] >= b[i]) & d[i]);
+}
+
+/**
+**bar2:
+**...
+**	cmge	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**	bsl	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**...
+*/
+void bar2 (int * restrict a, int * restrict b, int * restrict c,
+	  int * restrict d, int * restrict res, int n)
+{
+  for (int i = 0; i < (n & -4); i++)
+    res[i] = (-(a[i] < b[i]) & c[i]) | (-(a[i] >= b[i]) & d[i]);
+}
+
+extern void abort ();
+
+int main ()
+{
+
+  v4si a = { -3, -3, -3, -3 };
+  v4si b = { 3, 3, 3, 3 };
+  v4si c = { 5, 5, 5, 5 };
+  v4si d = { 8, 8, 8, 8 };
+
+  v4si res1 = foo1 (a, b, c, d);
+  if (memcmp (&res1, &c, 16UL) != 0)
+    abort ();
+
+  v4si res2 = foo2 (a, b, c, d);
+  if (memcmp (&res2, &d, 16UL) != 0)
+   abort ();
+
+  int ar[4] = { -3, -3, -3, -3 };
+  int br[4] = { 3, 3, 3, 3 };
+  int cr[4] = { 5, 5, 5, 5 };
+  int dr[4] = { 8, 8, 8, 8 };
+
+  int exp1[4] = { 1, 1, 1, 1 };
+  int res3[4];
+  bar1 ((int*)&ar, (int*)&br, (int*)&cr, (int*)&dr, (int*)&res3, 4);
+  if (memcmp (&res3, &exp1, 16UL) != 0)
+    abort ();
+
+  int res4[4];
+  bar2 ((int*)&ar, (int*)&br, (int*)&cr, (int*)&dr, (int*)&res4, 4);
+  if (memcmp (&res4, &cr, 16UL) != 0)
+    abort ();
+
+  return 0;
+}
  
Jeff Law Oct. 31, 2022, 9:23 p.m. UTC | #13
On 10/31/22 05:42, Tamar Christina via Gcc-patches wrote:
> Hi,
>
> This is a cleaned up version addressing all feedback.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu,
> x86_64-pc-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	* match.pd: Add new rule.
>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/aarch64/if-compare_1.c: New test.
> 	* gcc.target/aarch64/if-compare_2.c: New test.

OK

jeff
  

Patch

--- a/gcc/fold-const.cc
+++ b/gcc/fold-const.cc
@@ -2833,15 +2833,38 @@  compcode_to_comparison (enum comparison_code code)
 bool
 inverse_conditions_p (const_tree cond1, const_tree cond2)
 {
-  return (COMPARISON_CLASS_P (cond1)
-	  && COMPARISON_CLASS_P (cond2)
-	  && (invert_tree_comparison
-	      (TREE_CODE (cond1),
-	       HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE (cond2))
-	  && operand_equal_p (TREE_OPERAND (cond1, 0),
-			      TREE_OPERAND (cond2, 0), 0)
-	  && operand_equal_p (TREE_OPERAND (cond1, 1),
-			      TREE_OPERAND (cond2, 1), 0));
+  if (COMPARISON_CLASS_P (cond1)
+      && COMPARISON_CLASS_P (cond2)
+      && (invert_tree_comparison
+	   (TREE_CODE (cond1),
+	    HONOR_NANS (TREE_OPERAND (cond1, 0))) == TREE_CODE (cond2))
+      && operand_equal_p (TREE_OPERAND (cond1, 0),
+			  TREE_OPERAND (cond2, 0), 0)
+      && operand_equal_p (TREE_OPERAND (cond1, 1),
+			  TREE_OPERAND (cond2, 1), 0))
+    return true;
+
+  if (TREE_CODE (cond1) == SSA_NAME
+      && TREE_CODE (cond2) == SSA_NAME)
+    {
+      gimple *gcond1 = SSA_NAME_DEF_STMT (cond1);
+      gimple *gcond2 = SSA_NAME_DEF_STMT (cond2);
+      if (!is_gimple_assign (gcond1) || !is_gimple_assign (gcond2))
+	return false;
+
+      tree_code code1 = gimple_assign_rhs_code (gcond1);
+      tree_code code2 = gimple_assign_rhs_code (gcond2);
+      return TREE_CODE_CLASS (code1) == tcc_comparison
+	     && TREE_CODE_CLASS (code2) == tcc_comparison
+	     && invert_tree_comparison (code1,
+		  HONOR_NANS (gimple_arg (gcond1, 0))) == code2
+	     && operand_equal_p (gimple_arg (gcond1, 0),
+				 gimple_arg (gcond2, 0), 0)
+	     && operand_equal_p (gimple_arg (gcond1, 1),
+				 gimple_arg (gcond2, 1), 0);
+    }
+
+  return false;
 }
 
 /* Return a tree for the comparison which is the combination of
diff --git a/gcc/match.pd b/gcc/match.pd
index 6d691d302b339c0e4556b40af158b5208c12d08f..bad49dd348add751d9ec1e3023e34d9ac123194f 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -1160,6 +1160,32 @@  DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
      (convert (bit_and (negate (convert:utype { pmop[0]; }))
 		       (convert:utype @1)))))))
 
+/* Fold (((a < b) & c) | ((a >= b) & d)) into (a < b ? c : d) & 1.  */
+(simplify
+ (bit_ior
+  (bit_and:c (convert? @0) @2)
+  (bit_and:c (convert? @1) @3))
+   (if (inverse_conditions_p (@0, @1)
+	/* The scalar version has to be canonicalized after vectorization
+	   because it makes unconditional loads conditional ones, which
+	   means we lose vectorization because the loads may trap.  */
+	&& canonicalize_math_after_vectorization_p ())
+    (bit_and (cond @0 @2 @3) { build_each_one_cst (type); })))
+(simplify
+ (bit_ior
+  (bit_and:c (convert? (vec_cond:s @0 @4 integer_zerop)) @2)
+  (bit_and:c (convert? (vec_cond:s @1 @4 integer_zerop)) @3))
+   (if (inverse_conditions_p (@0, @1)
+	&& integer_onep (@4))
+    (bit_and (vec_cond @0 @2 @3) @4)))
+/* Fold (((a < b) & c) | ((a >= b) & d)) into a < b ? c : d.  */
+(simplify
+ (bit_ior
+  (bit_and:c (convert? (vec_cond:s @0 integer_minus_onep integer_zerop)) @2)
+  (bit_and:c (convert? (vec_cond:s @1 integer_minus_onep integer_zerop)) @3))
+   (if (inverse_conditions_p (@0, @1))
+    (vec_cond @0 @2 @3)))
+
 /* X % Y is smaller than Y.  */
 (for cmp (lt ge)
  (simplify
diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_1.c b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
new file mode 100644
index 0000000000000000000000000000000000000000..05a1292fa90c70b14a7985122f43711f55d047ea
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/if-compare_1.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-O" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+/*
+**zoo2:
+**	cmp	w0, w1
+**	csel	w0, w2, w3, lt
+**	and	w0, w0, 1
+**	ret
+*/
+int zoo2 (int a, int b, int c, int d)
+{
+   return ((a < b) & c) | ((a >= b) & d);
+}
+
diff --git a/gcc/testsuite/gcc.target/aarch64/if-compare_2.c b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..34bc65f5db10eae81b8dee3316dfb7d12bf471c8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/if-compare_2.c
@@ -0,0 +1,32 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+typedef int v4si __attribute__ ((vector_size (16)));
+
+/*
+**foo:
+**	cmgt	v0.4s, v1.4s, v0.4s
+**	bsl	v0.16b, v2.16b, v3.16b
+**	ret
+*/
+v4si foo (v4si a, v4si b, v4si c, v4si d) {
+    return ((a < b) & c) | ((a >= b) & d);
+}
+
+
+/**
+**bar:
+**...
+**	cmge	v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
+**	bsl	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**	and	v[0-9]+.16b, v[0-9]+.16b, v[0-9]+.16b
+**...
+*/
+void bar (int * restrict a, int * restrict b, int * restrict c,
+	  int * restrict d, int * restrict res, int n)
+{
+  for (int i = 0; i < (n & -4); i++)
+    res[i] = ((a[i] < b[i]) & c[i]) | ((a[i] >= b[i]) & d[i]);
+}
+