[ifcombine] adjust for narrowing converts before shifts [PR118206]

Message ID: orzfjzi3ml.fsf@lxoliva.fsfla.org
State: New
Series: [ifcombine] adjust for narrowing converts before shifts [PR118206]

Commit Message

Alexandre Oliva Jan. 10, 2025, 6:28 a.m. UTC
  A narrowing conversion and a shift both drop bits from the loaded
value, but we need to take into account which one comes first to get
the right number of bits and mask.
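
For illustration only (not part of the patch), here is a minimal standalone
program mirroring the new testcase gcc.dg/field-merge-18.c: the narrowing
conversion to unsigned char and the right shift by 4 each drop bits from the
16-bit load, and the order determines which bits of b end up being tested.

#include <assert.h>

int
main (void)
{
  unsigned short b = 0xff78;
  /* Narrowing conversion first, then shift: only bits 4..7 of b survive.  */
  assert ((((unsigned char) b) >> 4) == 0x07);
  /* Shift first, then narrowing conversion: bits 4..11 of b survive.  */
  assert (((unsigned char) (b >> 4)) == 0xf7);
  return 0;
}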

Fold when applying masks to parts, comparing the parts, and combining
the results, on the off chance either mask happens to be zero.
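
A rough sketch of the folding point (illustration at the source level only,
not the internal trees built by ifcombine): once either mask is known to be
all-zero, the masked part and its comparison reduce to constants, so the
combined condition can collapse to the other part.

#include <assert.h>

int
main (void)
{
  unsigned x = 42, y = 8;
  /* A zero mask makes the first test a known constant (always true),
     so the whole conjunction reduces to the second test alone.  */
  assert ((((x & 0) == 0) && (y == 8)) == (y == 8));
  return 0;
}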

Regstrapped on x86_64-linux-gnu.  Ok to install?


for  gcc/ChangeLog

	PR tree-optimization/118206
	* gimple-fold.cc (decode_field_reference): Account for upper
	bits dropped by narrowing conversions whether before or after
	a right shift.
	(fold_truth_andor_for_ifcombine): Fold masks, compares, and
	combined results.

for  gcc/testsuite/ChangeLog

	PR tree-optimization/118206
	* gcc.dg/field-merge-18.c: New.
---
 gcc/gimple-fold.cc                    |   39 ++++++++++++++++++++++++----
 gcc/testsuite/gcc.dg/field-merge-18.c |   46 +++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/field-merge-18.c
  

Comments

Richard Biener Jan. 10, 2025, 7:49 a.m. UTC | #1
On Fri, 10 Jan 2025, Alexandre Oliva wrote:

> 
> A narrowing conversion and a shift both drop bits from the loaded
> value, but we need to take into account which one comes first to get
> the right number of bits and mask.
> 
> Fold when applying masks to parts, comparing the parts, and combining
> the results, on the off chance either mask happens to be zero.
> 
> Regstrapped on x86_64-linux-gnu.  Ok to install?

OK.

Richard.


Patch

diff --git a/gcc/gimple-fold.cc b/gcc/gimple-fold.cc
index c8a726e0ae3f3..d95f04213ee40 100644
--- a/gcc/gimple-fold.cc
+++ b/gcc/gimple-fold.cc
@@ -7547,6 +7547,7 @@  decode_field_reference (tree *pexp, HOST_WIDE_INT *pbitsize,
   int shiftrt = 0;
   tree res_ops[2];
   machine_mode mode;
+  bool convert_before_shift = false;
 
   *load = NULL;
   *psignbit = false;
@@ -7651,6 +7652,12 @@  decode_field_reference (tree *pexp, HOST_WIDE_INT *pbitsize,
       if (*load)
 	loc[3] = gimple_location (*load);
       exp = res_ops[0];
+      /* This looks backwards, but we're going back the def chain, so if we
+	 find the conversion here, after finding a shift, that's because the
+	 convert appears before the shift, and we should thus adjust the bit
+	 pos and size because of the shift after adjusting it due to type
+	 conversion.  */
+      convert_before_shift = true;
     }
 
   /* Identify the load, if there is one.  */
@@ -7693,6 +7700,15 @@  decode_field_reference (tree *pexp, HOST_WIDE_INT *pbitsize,
   *pvolatilep = volatilep;
 
   /* Adjust shifts...  */
+  if (convert_before_shift
+      && outer_type && *pbitsize > TYPE_PRECISION (outer_type))
+    {
+      HOST_WIDE_INT excess = *pbitsize - TYPE_PRECISION (outer_type);
+      if (*preversep ? !BYTES_BIG_ENDIAN : BYTES_BIG_ENDIAN)
+	*pbitpos += excess;
+      *pbitsize -= excess;
+    }
+
   if (shiftrt)
     {
       if (!*preversep ? !BYTES_BIG_ENDIAN : BYTES_BIG_ENDIAN)
@@ -7701,7 +7717,8 @@  decode_field_reference (tree *pexp, HOST_WIDE_INT *pbitsize,
     }
 
   /* ... and bit position.  */
-  if (outer_type && *pbitsize > TYPE_PRECISION (outer_type))
+  if (!convert_before_shift
+      && outer_type && *pbitsize > TYPE_PRECISION (outer_type))
     {
       HOST_WIDE_INT excess = *pbitsize - TYPE_PRECISION (outer_type);
       if (*preversep ? !BYTES_BIG_ENDIAN : BYTES_BIG_ENDIAN)
@@ -8377,6 +8394,8 @@  fold_truth_andor_for_ifcombine (enum tree_code code, tree truth_type,
   if (get_best_mode (end_bit - first_bit, first_bit, 0, ll_end_region,
 		     ll_align, BITS_PER_WORD, volatilep, &lnmode))
     l_split_load = false;
+  /* ??? If ll and rl share the same load, reuse that?
+     See PR 118206 -> gcc.dg/field-merge-18.c  */
   else
     {
       /* Consider the possibility of recombining loads if any of the
@@ -8757,11 +8776,11 @@  fold_truth_andor_for_ifcombine (enum tree_code code, tree truth_type,
       /* Apply masks.  */
       for (int j = 0; j < 2; j++)
 	if (mask[j] != wi::mask (0, true, mask[j].get_precision ()))
-	  op[j] = build2_loc (locs[j][2], BIT_AND_EXPR, type,
-			      op[j], wide_int_to_tree (type, mask[j]));
+	  op[j] = fold_build2_loc (locs[j][2], BIT_AND_EXPR, type,
+				   op[j], wide_int_to_tree (type, mask[j]));
 
-      cmp[i] = build2_loc (i ? rloc : lloc, wanted_code, truth_type,
-			   op[0], op[1]);
+      cmp[i] = fold_build2_loc (i ? rloc : lloc, wanted_code, truth_type,
+				op[0], op[1]);
     }
 
   /* Reorder the compares if needed.  */
@@ -8773,7 +8792,15 @@  fold_truth_andor_for_ifcombine (enum tree_code code, tree truth_type,
   if (parts == 1)
     result = cmp[0];
   else if (!separatep || !maybe_separate)
-    result = build2_loc (rloc, orig_code, truth_type, cmp[0], cmp[1]);
+    {
+      /* Only fold if any of the cmp is known, otherwise we may lose the
+	 sequence point, and that may prevent further optimizations.  */
+      if (TREE_CODE (cmp[0]) == INTEGER_CST
+	  || TREE_CODE (cmp[1]) == INTEGER_CST)
+	result = fold_build2_loc (rloc, orig_code, truth_type, cmp[0], cmp[1]);
+      else
+	result = build2_loc (rloc, orig_code, truth_type, cmp[0], cmp[1]);
+    }
   else
     {
       result = cmp[0];
diff --git a/gcc/testsuite/gcc.dg/field-merge-18.c b/gcc/testsuite/gcc.dg/field-merge-18.c
new file mode 100644
index 0000000000000..e8413873d2418
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/field-merge-18.c
@@ -0,0 +1,46 @@ 
+/* { dg-do run } */
+/* { dg-options "-O1" } */
+
+/* PR tree-optimization/118206 */
+/* Check that shifts, whether before or after narrowing conversions, mask out
+   the bits that are to be discarded.  */
+
+/* This only uses bits from the least significant byte in the short.  */
+__attribute__((noipa)) int
+foo (const void *x)
+{
+  unsigned short b;
+  __builtin_memcpy (&b, x, sizeof (short));
+  if ((b & 15) != 8)
+    return 1;
+  if ((((unsigned char) b) >> 4) > 7)
+    return 1;
+  return 0;
+}
+
+__attribute__((noipa)) int
+bar (const void *x)
+{
+  unsigned short b;
+  __builtin_memcpy (&b, x, sizeof (short));
+  if ((b & 15) != 8)
+    return 1;
+  if ((unsigned char)(b >> 4) > 7)
+    return 1;
+  return 0;
+}
+
+int
+main ()
+{
+  unsigned short a = 0x78 - 0x80 - 0x80;
+  if (foo (&a) != 0 || bar (&a) != (a > 0xff))
+    __builtin_abort ();
+  unsigned short b = 0x88;
+  if (foo (&b) != 1 || bar (&b) != 1)
+    __builtin_abort ();
+  unsigned short c = 8;
+  if (foo (&c) != 0 || bar (&c) != 0)
+    __builtin_abort ();
+  return 0;
+}