[3/4] middle-end: Implement preferred_div_as_shifts_over_mult [PR108583]

Message ID Y/yjM04lLCimbeu4@arm.com
State Committed
Series [1/4] middle-end: Revert can_special_div_by_const changes [PR108583]

Commit Message

Tamar Christina Feb. 27, 2023, 12:33 p.m. UTC
  Hi All,

As Richard S. requested, this now implements a hook,
preferred_div_as_shifts_over_mult, that indicates whether a target prefers that
the vectorizer decompose division into shifts rather than multiplication when
possible.

In order to use this we need to check whether the current precision has
enough bits to do the operation without any of the additions overflowing.

We use range information to determine this and only do the transformation if
we're sure an overflow won't occur.  This now uses ranger to do the range
check.

This seems to work better than vect_get_range_info, which uses range_query,
but I have not switched the interface of vect_get_range_info over in this PR
fix.

As Andy said before, initializing a ranger instance is cheap but not free,
and if the intention is to call it often during a pass it should be
instantiated at pass startup and passed along to the places that need it.
That is a big refactoring and doesn't seem right to do in this PR, but we
should do it in GCC 14.

Currently we only instantiate it after a long series of much cheaper checks.
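
For reference, a minimal scalar sketch of the transformation (uint16_t
elements and a divisor of 0xff; the names are illustrative, not taken from
the patch):

  #include <assert.h>
  #include <stdint.h>

  /* x / 255 == (x + ((x + 257) >> 8)) >> 8 whenever x + 257 does not wrap
     in 16 bits; that no-wrap condition is what the range check above
     guarantees.  */
  static uint16_t
  div255 (uint16_t x)
  {
    uint16_t t1 = x + 257;   /* x + N + 2, with N = 255.  */
    uint16_t t2 = t1 >> 8;   /* (x + N + 2) / (N + 1).  */
    uint16_t t3 = t2 + x;
    return t3 >> 8;          /* final division by N + 1.  */
  }

  int
  main (void)
  {
    for (uint32_t x = 0; x <= 0xffffu - 257; x++)
      assert (div255 ((uint16_t) x) == x / 255);
    return 0;
  }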

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	PR target/108583
	* target.def (preferred_div_as_shifts_over_mult): New.
	* doc/tm.texi.in: Document it.
	* doc/tm.texi: Regenerate.
	* targhooks.cc (default_preferred_div_as_shifts_over_mult): New.
	* targhooks.h (default_preferred_div_as_shifts_over_mult): New.
	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Use it.

gcc/testsuite/ChangeLog:

	PR target/108583
	* gcc.dg/vect/vect-div-bitmask-4.c: New test.
	* gcc.dg/vect/vect-div-bitmask-5.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 50a8872a6695b18b9bed0d393bacf733833633db..c85196015e2e53047fcc65d32ef2d3203d2a6bab 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6137,6 +6137,9 @@ instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT (void)
+When decomposing a division operation, if possible prefer to decompose the
+operation as shifts rather than multiplication by magic constants.
 @end deftypefn
 
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 3e07978a02f4e6077adae6cadc93ea4273295f1f..0051017a7fd67691a343470f36ad4fc32c8e7e15 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4173,6 +4173,7 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
 
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
 
diff --git a/gcc/target.def b/gcc/target.def
index e0a5c7adbd962f5d08ed08d1d81afa2c2baa64a5..8cc18b1f3c5de24c21faf891b9d4d0b6fd5b59d7 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1868,6 +1868,15 @@ correct for most targets.",
  poly_uint64, (const_tree type),
  default_preferred_vector_alignment)
 
+/* Returns whether the target has a preference for decomposing divisions using
+   shifts rather than multiplies.  */
+DEFHOOK
+(preferred_div_as_shifts_over_mult,
+ "When decomposing a division operation, if possible prefer to decompose the\n\
+operation as shifts rather than multiplication by magic constants.",
+ bool, (void),
+ default_preferred_div_as_shifts_over_mult)
+
 /* Return true if vector alignment is reachable (by peeling N
    iterations) for the given scalar type.  */
 DEFHOOK
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index a6a4809ca91baa5d7fad2244549317a31390f0c2..dda011c59fbd5973ee648dfea195619cc41c71bc 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -53,6 +53,8 @@ extern scalar_int_mode default_unwind_word_mode (void);
 extern unsigned HOST_WIDE_INT default_shift_truncation_mask
   (machine_mode);
 extern unsigned int default_min_divisions_for_recip_mul (machine_mode);
+extern bool
+default_preferred_div_as_shifts_over_mult (void);
 extern int default_mode_rep_extended (scalar_int_mode, scalar_int_mode);
 
 extern tree default_stack_protect_guard (void);
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index 211525720a620d6f533e2da91e03877337a931e7..6396f344eef09dd61f358938846a1c02a70b31d8 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1483,6 +1483,15 @@ default_preferred_vector_alignment (const_tree type)
   return TYPE_ALIGN (type);
 }
 
+/* The default implementation of
+   TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT.  */
+
+bool
+default_preferred_div_as_shifts_over_mult (void)
+{
+  return false;
+}
+
 /* By default assume vectors of element TYPE require a multiple of the natural
    alignment of TYPE.  TYPE is naturally aligned if IS_PACKED is false.  */
 bool
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
new file mode 100644
index 0000000000000000000000000000000000000000..c81f8946922250234bf759e0a0a04ea8c1f73e3c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+typedef unsigned __attribute__((__vector_size__ (16))) V;
+
+static __attribute__((__noinline__)) __attribute__((__noclone__)) V
+foo (V v, unsigned short i)
+{
+  v /= i;
+  return v;
+}
+
+int
+main (void)
+{
+  V v = foo ((V) { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, 0xffff);
+  for (unsigned i = 0; i < sizeof (v) / sizeof (v[0]); i++)
+    if (v[i] != 0x00010001)
+      __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
new file mode 100644
index 0000000000000000000000000000000000000000..b4eb1a4dacba481e6306b49914d2a29b933de625
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
@@ -0,0 +1,58 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "divmod pattern recognized" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 1766ce277d6b88d8aa3be77e7c8abb504a10a735..31f2a6753b4faccb77351c8c5afed9775888b60f 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3913,6 +3913,84 @@ vect_recog_divmod_pattern (vec_info *vinfo,
 
       return pattern_stmt;
     }
+  else if ((cst = uniform_integer_cst_p (oprnd1))
+	   && TYPE_UNSIGNED (itype)
+	   && rhs_code == TRUNC_DIV_EXPR
+	   && vectype
+	   && targetm.vectorize.preferred_div_as_shifts_over_mult ())
+    {
+      /* div optimizations using narrowings
+       we can do the division e.g. shorts by 255 faster by calculating it as
+       (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+       double the precision of x.
+
+       If we imagine a short as being composed of two blocks of bytes then
+       adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
+       adding 1 to each sub component:
+
+	    short value of 16-bits
+       ┌──────────────┬────────────────┐
+       │              │                │
+       └──────────────┴────────────────┘
+	 8-bit part1 ▲  8-bit part2   ▲
+		     │                │
+		     │                │
+		    +1               +1
+
+       after the first addition, we have to shift right by 8, and narrow the
+       results back to a byte.  Remember that the addition must be done in
+       double the precision of the input.  However if we know that the addition
+       `x + 257` does not overflow then we can do the operation in the current
+       precision.  In which case we don't need the pack and unpacks.  */
+      auto wcst = wi::to_wide (cst);
+      int pow = wi::exact_log2 (wcst + 1);
+      if (pow == (int) (element_precision (vectype) / 2))
+	{
+	  gimple *stmt = SSA_NAME_DEF_STMT (oprnd0);
+
+	  gimple_ranger ranger;
+	  int_range_max r;
+
+	  /* Check that no overflow will occur.  If we don't have range
+	     information we can't perform the optimization.  */
+
+	  if (ranger.range_of_expr (r, oprnd0, stmt))
+	    {
+	      wide_int max = r.upper_bound ();
+	      wide_int one = wi::to_wide (build_one_cst (itype));
+	      wide_int adder = wi::add (one, wi::lshift (one, pow));
+	      wi::overflow_type ovf;
+	      wi::add (max, adder, UNSIGNED, &ovf);
+	      if (ovf == wi::OVF_NONE)
+		{
+		  *type_out = vectype;
+		  tree tadder = wide_int_to_tree (itype, adder);
+		  tree rshift = wide_int_to_tree (itype, pow);
+
+		  tree new_lhs1 = vect_recog_temp_ssa_var (itype, NULL);
+		  gassign *patt1
+		    = gimple_build_assign (new_lhs1, PLUS_EXPR, oprnd0, tadder);
+		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+		  tree new_lhs2 = vect_recog_temp_ssa_var (itype, NULL);
+		  patt1 = gimple_build_assign (new_lhs2, RSHIFT_EXPR, new_lhs1,
+					       rshift);
+		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+		  tree new_lhs3 = vect_recog_temp_ssa_var (itype, NULL);
+		  patt1 = gimple_build_assign (new_lhs3, PLUS_EXPR, new_lhs2,
+					       oprnd0);
+		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+		  tree new_lhs4 = vect_recog_temp_ssa_var (itype, NULL);
+		  pattern_stmt = gimple_build_assign (new_lhs4, RSHIFT_EXPR,
+						      new_lhs3, rshift);
+
+		  return pattern_stmt;
+		}
+	    }
+	}
+    }
 
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))
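
As a hypothetical illustration (placeholder names, not part of this patch;
the target-side opt-in lives elsewhere in this series), a backend would
enable this with the hook's current (void) signature roughly as:

  /* In the target's backend .cc file.  */
  static bool
  example_preferred_div_as_shifts_over_mult (void)
  {
    return true;
  }

  #undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
  #define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
    example_preferred_div_as_shifts_over_mult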
  

Comments

Tamar Christina March 6, 2023, 11:23 a.m. UTC | #1
Ping,

And updated the hook to allow targets to differentiate between ISAs.

As Andy said before, initializing a ranger instance is cheap but not free,
and if the intention is to call it often during a pass it should be
instantiated at pass startup and passed along to the places that need it.
That is a big refactoring and doesn't seem right to do in this PR, but we
should do it in GCC 14.

Currently we only instantiate it after a long series of much cheaper checks.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	PR target/108583
	* target.def (preferred_div_as_shifts_over_mult): New.
	* doc/tm.texi.in: Document it.
	* doc/tm.texi: Regenerate.
	* targhooks.cc (default_preferred_div_as_shifts_over_mult): New.
	* targhooks.h (default_preferred_div_as_shifts_over_mult): New.
	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Use it.

gcc/testsuite/ChangeLog:

	PR target/108583
	* gcc.dg/vect/vect-div-bitmask-4.c: New test.
	* gcc.dg/vect/vect-div-bitmask-5.c: New test.

--- inline copy of patch ---

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 50a8872a6695b18b9bed0d393bacf733833633db..f69f7f036272e867ea1c3fee851b117f057f68c5 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6137,6 +6137,10 @@ instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT (const_tree @var{type})
+If possible, when decomposing a division operation of vectors of
+type @var{type} during vectorization, prefer to use shifts rather than
+multiplication by magic constants.
 @end deftypefn
 
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 3e07978a02f4e6077adae6cadc93ea4273295f1f..0051017a7fd67691a343470f36ad4fc32c8e7e15 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4173,6 +4173,7 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
 
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
 
diff --git a/gcc/target.def b/gcc/target.def
index e0a5c7adbd962f5d08ed08d1d81afa2c2baa64a5..bdee9b7f9c941508738fac49593b5baa525e2915 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1868,6 +1868,16 @@ correct for most targets.",
  poly_uint64, (const_tree type),
  default_preferred_vector_alignment)
 
+/* Returns whether the target has a preference for decomposing divisions using
+   shifts rather than multiplies.  */
+DEFHOOK
+(preferred_div_as_shifts_over_mult,
+ "If possible, when decomposing a division operation of vectors of\n\
+type @var{type} during vectorization, prefer to use shifts rather than\n\
+multiplication by magic constants.",
+ bool, (const_tree type),
+ default_preferred_div_as_shifts_over_mult)
+
 /* Return true if vector alignment is reachable (by peeling N
    iterations) for the given scalar type.  */
 DEFHOOK
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index a6a4809ca91baa5d7fad2244549317a31390f0c2..a207963b9e6eb9300df0043e1b79aa6c941d0f7f 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -53,6 +53,8 @@ extern scalar_int_mode default_unwind_word_mode (void);
 extern unsigned HOST_WIDE_INT default_shift_truncation_mask
   (machine_mode);
 extern unsigned int default_min_divisions_for_recip_mul (machine_mode);
+extern bool default_preferred_div_as_shifts_over_mult
+  (const_tree);
 extern int default_mode_rep_extended (scalar_int_mode, scalar_int_mode);
 
 extern tree default_stack_protect_guard (void);
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index 211525720a620d6f533e2da91e03877337a931e7..becea6ef4b6329cfa0b676f8d844630fbdc97f20 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1483,6 +1483,15 @@ default_preferred_vector_alignment (const_tree type)
   return TYPE_ALIGN (type);
 }
 
+/* The default implementation of
+   TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT.  */
+
+bool
+default_preferred_div_as_shifts_over_mult (const_tree /* type */)
+{
+  return false;
+}
+
 /* By default assume vectors of element TYPE require a multiple of the natural
    alignment of TYPE.  TYPE is naturally aligned if IS_PACKED is false.  */
 bool
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
new file mode 100644
index 0000000000000000000000000000000000000000..c81f8946922250234bf759e0a0a04ea8c1f73e3c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+typedef unsigned __attribute__((__vector_size__ (16))) V;
+
+static __attribute__((__noinline__)) __attribute__((__noclone__)) V
+foo (V v, unsigned short i)
+{
+  v /= i;
+  return v;
+}
+
+int
+main (void)
+{
+  V v = foo ((V) { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, 0xffff);
+  for (unsigned i = 0; i < sizeof (v) / sizeof (v[0]); i++)
+    if (v[i] != 0x00010001)
+      __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
new file mode 100644
index 0000000000000000000000000000000000000000..b4eb1a4dacba481e6306b49914d2a29b933de625
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
@@ -0,0 +1,58 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "divmod pattern recognized" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index 5ab5b944a573ad24ce8427aff24fc5215bf05dac..26ed91d58fa4709a67c903ad446d267a3113c172 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -3346,6 +3346,20 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
 		    param_avoid_fma_max_bits));
   bool defer = check_defer;
   bool seen_negate_p = false;
+
+  /* There is no numerical difference between fused and unfused integer FMAs,
+     and the assumption below that FMA is as cheap as addition is unlikely
+     to be true, especially if the multiplication occurs multiple times on
+     the same chain.  E.g., for something like:
+
+	 (((a * b) + c) >> 1) + (a * b)
+
+     we do not want to duplicate the a * b into two additions, not least
+     because the result is not a natural FMA chain.  */
+  if (ANY_INTEGRAL_TYPE_P (type)
+      && !has_single_use (mul_result))
+    return false;
+
   /* Make sure that the multiplication statement becomes dead after
      the transformation, thus that all uses are transformed to FMAs.
      This means we assume that an FMA operation has the same cost
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 1766ce277d6b88d8aa3be77e7c8abb504a10a735..27fb4c6a59f0182c4a836f96ab6b5e2e405a18a0 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3913,6 +3913,84 @@ vect_recog_divmod_pattern (vec_info *vinfo,
 
       return pattern_stmt;
     }
+  else if ((cst = uniform_integer_cst_p (oprnd1))
+	   && TYPE_UNSIGNED (itype)
+	   && rhs_code == TRUNC_DIV_EXPR
+	   && vectype
+	   && targetm.vectorize.preferred_div_as_shifts_over_mult (vectype))
+    {
+      /* div optimizations using narrowings
+       we can do the division e.g. shorts by 255 faster by calculating it as
+       (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
+       double the precision of x.
+
+       If we imagine a short as being composed of two blocks of bytes then
+       adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
+       adding 1 to each sub component:
+
+	    short value of 16-bits
+       ┌──────────────┬────────────────┐
+       │              │                │
+       └──────────────┴────────────────┘
+	 8-bit part1 ▲  8-bit part2   ▲
+		     │                │
+		     │                │
+		    +1               +1
+
+       after the first addition, we have to shift right by 8, and narrow the
+       results back to a byte.  Remember that the addition must be done in
+       double the precision of the input.  However if we know that the addition
+       `x + 257` does not overflow then we can do the operation in the current
+       precision.  In which case we don't need the pack and unpacks.  */
+      auto wcst = wi::to_wide (cst);
+      int pow = wi::exact_log2 (wcst + 1);
+      if (pow == (int) (element_precision (vectype) / 2))
+	{
+	  gimple *stmt = SSA_NAME_DEF_STMT (oprnd0);
+
+	  gimple_ranger ranger;
+	  int_range_max r;
+
+	  /* Check that no overflow will occur.  If we don't have range
+	     information we can't perform the optimization.  */
+
+	  if (ranger.range_of_expr (r, oprnd0, stmt))
+	    {
+	      wide_int max = r.upper_bound ();
+	      wide_int one = wi::to_wide (build_one_cst (itype));
+	      wide_int adder = wi::add (one, wi::lshift (one, pow));
+	      wi::overflow_type ovf;
+	      wi::add (max, adder, UNSIGNED, &ovf);
+	      if (ovf == wi::OVF_NONE)
+		{
+		  *type_out = vectype;
+		  tree tadder = wide_int_to_tree (itype, adder);
+		  tree rshift = wide_int_to_tree (itype, pow);
+
+		  tree new_lhs1 = vect_recog_temp_ssa_var (itype, NULL);
+		  gassign *patt1
+		    = gimple_build_assign (new_lhs1, PLUS_EXPR, oprnd0, tadder);
+		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+		  tree new_lhs2 = vect_recog_temp_ssa_var (itype, NULL);
+		  patt1 = gimple_build_assign (new_lhs2, RSHIFT_EXPR, new_lhs1,
+					       rshift);
+		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+		  tree new_lhs3 = vect_recog_temp_ssa_var (itype, NULL);
+		  patt1 = gimple_build_assign (new_lhs3, PLUS_EXPR, new_lhs2,
+					       oprnd0);
+		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+		  tree new_lhs4 = vect_recog_temp_ssa_var (itype, NULL);
+		  pattern_stmt = gimple_build_assign (new_lhs4, RSHIFT_EXPR,
+						      new_lhs3, rshift);
+
+		  return pattern_stmt;
+		}
+	    }
+	}
+    }
 
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))
  
Richard Sandiford March 8, 2023, 8:55 a.m. UTC | #2
Tamar Christina <Tamar.Christina@arm.com> writes:
> Ping,
>
> And updated the hook to allow targets to differentiate between ISAs.
>
> As Andy said before, initializing a ranger instance is cheap but not free,
> and if the intention is to call it often during a pass it should be
> instantiated at pass startup and passed along to the places that need it.
> That is a big refactoring and doesn't seem right to do in this PR, but we
> should do it in GCC 14.
>
> Currently we only instantiate it after a long series of much cheaper checks.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>         PR target/108583
>         * target.def (preferred_div_as_shifts_over_mult): New.
>         * doc/tm.texi.in: Document it.
>         * doc/tm.texi: Regenerate.
>         * targhooks.cc (default_preferred_div_as_shifts_over_mult): New.
>         * targhooks.h (default_preferred_div_as_shifts_over_mult): New.
>         * tree-vect-patterns.cc (vect_recog_divmod_pattern): Use it.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/108583
>         * gcc.dg/vect/vect-div-bitmask-4.c: New test.
>         * gcc.dg/vect/vect-div-bitmask-5.c: New test.
>
> --- inline copy of patch ---
>
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 50a8872a6695b18b9bed0d393bacf733833633db..f69f7f036272e867ea1c3fee851b117f057f68c5 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -6137,6 +6137,10 @@ instruction pattern.  There is no need for the hook to handle these two
>  implementation approaches itself.
>  @end deftypefn
>
> +@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT (const_tree @var{type})
> +If possible, when decomposing a division operation of vectors of
> +type @var{type} during vectorization, prefer to use shifts rather than
> +multiplication by magic constants.
>  @end deftypefn
>
>  @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index 3e07978a02f4e6077adae6cadc93ea4273295f1f..0051017a7fd67691a343470f36ad4fc32c8e7e15 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -4173,6 +4173,7 @@ address;  but often a machine-dependent strategy can generate better code.
>
>  @hook TARGET_VECTORIZE_VEC_PERM_CONST
>
> +@hook TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
>
>  @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
>
> diff --git a/gcc/target.def b/gcc/target.def
> index e0a5c7adbd962f5d08ed08d1d81afa2c2baa64a5..bdee9b7f9c941508738fac49593b5baa525e2915 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -1868,6 +1868,16 @@ correct for most targets.",
>   poly_uint64, (const_tree type),
>   default_preferred_vector_alignment)
>
> +/* Returns whether the target has a preference for decomposing divisions using
> +   shifts rather than multiplies.  */
> +DEFHOOK
> +(preferred_div_as_shifts_over_mult,
> + "If possible, when decomposing a division operation of vectors of\n\
> +type @var{type} during vectorization, prefer to use shifts rather than\n\
> +multiplication by magic constants.",

Both approaches require shifts though.  How about:

  Sometimes it is possible to implement a vector division using a sequence
  of two addition-shift pairs, giving four instructions in total.
  Return true if taking this approach for @var{vectype} is likely
  to be better than using a sequence involving highpart multiplication.

It should also say what the default is, more below.

> + bool, (const_tree type),
> + default_preferred_div_as_shifts_over_mult)
> +
>  /* Return true if vector alignment is reachable (by peeling N
>     iterations) for the given scalar type.  */
>  DEFHOOK
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index a6a4809ca91baa5d7fad2244549317a31390f0c2..a207963b9e6eb9300df0043e1b79aa6c941d0f7f 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -53,6 +53,8 @@ extern scalar_int_mode default_unwind_word_mode (void);
>  extern unsigned HOST_WIDE_INT default_shift_truncation_mask
>    (machine_mode);
>  extern unsigned int default_min_divisions_for_recip_mul (machine_mode);
> +extern bool default_preferred_div_as_shifts_over_mult
> +  (const_tree);
>  extern int default_mode_rep_extended (scalar_int_mode, scalar_int_mode);
>
>  extern tree default_stack_protect_guard (void);
> diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> index 211525720a620d6f533e2da91e03877337a931e7..becea6ef4b6329cfa0b676f8d844630fbdc97f20 100644
> --- a/gcc/targhooks.cc
> +++ b/gcc/targhooks.cc
> @@ -1483,6 +1483,15 @@ default_preferred_vector_alignment (const_tree type)
>    return TYPE_ALIGN (type);
>  }
>
> +/* The default implementation of
> +   TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT.  */
> +
> +bool
> +default_preferred_div_as_shifts_over_mult (const_tree /* type */)
> +{
> +  return false;
> +}
> +

I think the default should be true for targets without highpart multiplication,
since the fallback isn't possible then.  Either that, or we should skip
calling the hook when the fallback isn't possible.  E.g. maybe we could
test and record can_mult_highpart_p before the new code, and skip the
hook test when can_mult_highpart_p is false.
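
A sketch of that first option, with the sense inverted so that shifts are
preferred exactly when no highpart multiply exists (targhooks.cc would need
optabs-query.h for can_mult_highpart_p):

  bool
  default_preferred_div_as_shifts_over_mult (const_tree type)
  {
    /* Without a highpart multiply the multiplication-based expansion
       isn't possible, so prefer the shift sequence.  */
    return !can_mult_highpart_p (TYPE_MODE (type), TYPE_UNSIGNED (type));
  }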

>  /* By default assume vectors of element TYPE require a multiple of the natural
>     alignment of TYPE.  TYPE is naturally aligned if IS_PACKED is false.  */
>  bool
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..c81f8946922250234bf759e0a0a04ea8c1f73e3c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
> @@ -0,0 +1,25 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +typedef unsigned __attribute__((__vector_size__ (16))) V;
> +
> +static __attribute__((__noinline__)) __attribute__((__noclone__)) V
> +foo (V v, unsigned short i)
> +{
> +  v /= i;
> +  return v;
> +}
> +
> +int
> +main (void)
> +{
> +  V v = foo ((V) { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, 0xffff);
> +  for (unsigned i = 0; i < sizeof (v) / sizeof (v[0]); i++)
> +    if (v[i] != 0x00010001)
> +      __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..b4eb1a4dacba481e6306b49914d2a29b933de625
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
> @@ -0,0 +1,58 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include <stdio.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint8_t
> +
> +#ifndef DEBUG
> +#define DEBUG 0
> +#endif
> +
> +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> +
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] + level) / 0xff;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] + level) / 0xff;
> +}
> +
> +int main ()
> +{
> +  TYPE a[N];
> +  TYPE b[N];
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = BASE + i * 13;
> +      b[i] = BASE + i * 13;
> +      if (DEBUG)
> +        printf ("%d: 0x%x\n", i, a[i]);
> +    }
> +
> +  fun1 (a, N / 2, N);
> +  fun2 (b, N / 2, N);
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      if (DEBUG)
> +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> +
> +      if (a[i] != b[i])
> +        __builtin_abort ();
> +    }
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump "divmod pattern recognized" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
> index 5ab5b944a573ad24ce8427aff24fc5215bf05dac..26ed91d58fa4709a67c903ad446d267a3113c172 100644
> --- a/gcc/tree-ssa-math-opts.cc
> +++ b/gcc/tree-ssa-math-opts.cc
> @@ -3346,6 +3346,20 @@ convert_mult_to_fma (gimple *mul_stmt, tree op1, tree op2,
>                     param_avoid_fma_max_bits));
>    bool defer = check_defer;
>    bool seen_negate_p = false;
> +
> +  /* There is no numerical difference between fused and unfused integer FMAs,
> +     and the assumption below that FMA is as cheap as addition is unlikely
> +     to be true, especially if the multiplication occurs multiple times on
> +     the same chain.  E.g., for something like:
> +
> +        (((a * b) + c) >> 1) + (a * b)
> +
> +     we do not want to duplicate the a * b into two additions, not least
> +     because the result is not a natural FMA chain.  */
> +  if (ANY_INTEGRAL_TYPE_P (type)
> +      && !has_single_use (mul_result))
> +    return false;
> +
>    /* Make sure that the multiplication statement becomes dead after
>       the transformation, thus that all uses are transformed to FMAs.
>       This means we assume that an FMA operation has the same cost

I think this should be a separate patch, with its own testcase.
Sorry for not saying that until now.  The testcase from that thread
would be enough.
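
Something like this hypothetical sketch (not the testcase from the thread)
would exercise it:

  /* { dg-do compile } */
  /* { dg-options "-O2 -fdump-tree-widening_mul" } */

  int
  f (int a, int b, int c)
  {
    int ab = a * b;
    return ((ab + c) >> 1) + ab;
  }

The multiplication has two uses, so with the change it must not be duplicated
into two FMA candidates.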

> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index 1766ce277d6b88d8aa3be77e7c8abb504a10a735..27fb4c6a59f0182c4a836f96ab6b5e2e405a18a0 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -3913,6 +3913,84 @@ vect_recog_divmod_pattern (vec_info *vinfo,
>
>        return pattern_stmt;
>      }
> +  else if ((cst = uniform_integer_cst_p (oprnd1))
> +          && TYPE_UNSIGNED (itype)
> +          && rhs_code == TRUNC_DIV_EXPR
> +          && vectype
> +          && targetm.vectorize.preferred_div_as_shifts_over_mult (vectype))

The else isn't really necessary here.

> +    {
> +      /* div optimizations using narrowings
> +       we can do the division e.g. shorts by 255 faster by calculating it as
> +       (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
> +       double the precision of x.
> +
> +       If we imagine a short as being composed of two blocks of bytes then
> +       adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
> +       adding 1 to each sub component:
> +
> +           short value of 16-bits
> +       ┌──────────────┬────────────────┐
> +       │              │                │
> +       └──────────────┴────────────────┘
> +        8-bit part1 ▲  8-bit part2   ▲
> +                    │                │
> +                    │                │
> +                   +1               +1
> +
> +       after the first addition, we have to shift right by 8, and narrow the
> +       results back to a byte. Remember that the addition must be done in
> +       double the precision of the input.  However if we know that the addition
> +       `x + 257` does not overflow then we can do the operation in the current
> +       precision.  In which case we don't need the pack and unpacks.  */

I think this needs rewording.  The last two sentences describe
the real constraint: x + 257 must not overflow.  So the part
about "assuming the operation is done in double the precision of x"
and narrowing "the results back to a byte" don't apply.

AIUI, this is really an instance of the general transform:

  x // N == ((x+N+2) // (N+1) + x) // (N+1)  for 0 <= x < N(N+3)

(Hope I've got that right.  Proof sketch below if this isn't an
off-the-shelf result.)

So when 0 <= x < N(N+3) is guaranteed by the precision of the type,
the question becomes whether the operation overflows, like you say.
For smaller N, it's the N(N+3) bound that matters.

How about:

      /* We can use the relationship:

	   x // N == ((x+N+2) // (N+1) + x) // (N+1)  for 0 <= x < N(N+3)

         to optimize cases where N+1 is a power of 2, and where // (N+1)
         is therefore a shift right.  When operating in modes that are
         multiples of a byte in size, there are two cases:

         (1) N(N+3) is not representable, in which case the question
             becomes whether the replacement expression overflows.
             It is enough to test that x+N+2 does not overflow,
             i.e. that x < MAX-(N+1).

         (2) N(N+3) is representable, in which case it is the (only)
             bound that we need to check.

         ??? For now we just handle the case where // (N+1) is a shift
         right by half the precision, since some architectures can
         optimize the associated addition and shift combinations
         into single instructions.  */
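
(Note that for the pow == prec/2 shape handled so far we are always in
case (1): with N + 1 == 2^(prec/2) we get N(N+3) == 2^prec + 2^(prec/2) - 2,
which exceeds the type's maximum of 2^prec - 1.)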

> +      auto wcst = wi::to_wide (cst);
> +      int pow = wi::exact_log2 (wcst + 1);
> +      if (pow == (int) (element_precision (vectype) / 2))

Seems like this should be equivalent to:

  if (pow == prec / 2)

which would come in handy for the comment below.

> +       {
> +         gimple *stmt = SSA_NAME_DEF_STMT (oprnd0);
> +
> +         gimple_ranger ranger;
> +         int_range_max r;
> +
> +         /* Check that no overflow will occur.  If we don't have range
> +            information we can't perform the optimization.  */
> +
> +         if (ranger.range_of_expr (r, oprnd0, stmt))
> +           {
> +             wide_int max = r.upper_bound ();
> +             wide_int one = wi::to_wide (build_one_cst (itype));

Looks like this could be:

  auto one = wi::shwi (1, prec);

We shouldn't build trees just to convert them to wide_ints.

> +             wide_int adder = wi::add (one, wi::lshift (one, pow));
> +             wi::overflow_type ovf;
> +             wi::add (max, adder, UNSIGNED, &ovf);
> +             if (ovf == wi::OVF_NONE)
> +               {
> +                 *type_out = vectype;
> +                 tree tadder = wide_int_to_tree (itype, adder);
> +                 tree rshift = wide_int_to_tree (itype, pow);
> +
> +                 tree new_lhs1 = vect_recog_temp_ssa_var (itype, NULL);
> +                 gassign *patt1
> +                   = gimple_build_assign (new_lhs1, PLUS_EXPR, oprnd0, tadder);
> +                 append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
> +
> +                 tree new_lhs2 = vect_recog_temp_ssa_var (itype, NULL);
> +                 patt1 = gimple_build_assign (new_lhs2, RSHIFT_EXPR, new_lhs1,
> +                                              rshift);
> +                 append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
> +
> +                 tree new_lhs3 = vect_recog_temp_ssa_var (itype, NULL);
> +                 patt1 = gimple_build_assign (new_lhs3, PLUS_EXPR, new_lhs2,
> +                                              oprnd0);
> +                 append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
> +
> +                 tree new_lhs4 = vect_recog_temp_ssa_var (itype, NULL);
> +                 pattern_stmt = gimple_build_assign (new_lhs4, RSHIFT_EXPR,
> +                                                     new_lhs3, rshift);
> +
> +                 return pattern_stmt;
> +               }
> +           }
> +       }
> +    }
>
>    if (prec > HOST_BITS_PER_WIDE_INT
>        || integer_zerop (oprnd1))

LGTM otherwise, but would prefer to see the updated patch before acking.

Thanks,
Richard

----

Proof sketch, probably unnecessarily verbose/indirect:

The transform is:

  x // N  ==  ((x+N+2) // (N+1) + x) // (N+1)           [A]

For this to be valid, we need:

  0 <= x - N(((x+N+2) // (N+1) + x) // (N+1)) < N       [B]

Dividing into two cases:

(1) when 0 <= x < N:

  Then N+1 < x+N+2 < 2(N+1),
  so (x+N+2) // (N+1) == 1

  Substituting into [B] gives:

    0 <= x - N((1+x) // (N+1)) < N                      [C]

  And 0 <= x < N implies 1 <= 1+x < N+1,
  so (1+x) // (N+1) == 0.

  Substituting into [C] gives:

    0 <= x < N

  which is given.

(2) when x = K(N+1)-1+L for integral K and L, 0 <= L <= N:

  Then:

    (x+N+2) // (N+1) == (K(N+1)-1+L+N+2) // (N+1)
                     == ((K+1)(N+1)+L) // (N+1)
                     == K+1

  due to the range of L.

  Substituting into [B] gives:

    0 <= K(N+1)-1+L - N((K+1+K(N+1)-1+L) // (N+1)) < N
      <= K(N+1)-1+L - N((K+L+K(N+1)) // (N+1)) < N
      <= K(N+1)-1+L - N((K+L) // (N+1) + K) < N
      <= K+L-1 - N((K+L) // (N+1)) < N
      <= K+L - N((K+L) // (N+1)) < N+1                  [D]

  Dividing into three subcases:

  (2a) when (K+L) // (N+1) == 0

     This division result implies 0 <= K+L < N+1.

     Also, substituting the division into [D] gives:

       0 <= K+L < N+1

     which is the same condition.

  (2b) when (K+L) // (N+1) == 1

     This division result implies N+1 <= K+L < 2N+2.

     Also, substituting the division into [D] gives:

       0 <= K+L < 2N+1

     So for this case, [A] gives the wrong result iff K+L == 2N+1.

     Since L is bounded by N, the minimum K for which this is true
     is K == N+1.

     Therefore, for this case, the minimum invalid value of x is
     (N+1)(N+1)-1+N == N(N+3).

  (2c) when (K+L) // (N+1) >= 2

     This division result implies 2N+2 <= K+L.

     Since L is bounded by N, K >= N+2, and so x >= (N+2)(N+1)-1,
     i.e. x >= N(N+3)+1.  So the minimum invalid x for this case
     is greater than the one for (2b).

So [A] is valid if (but not only if) 0 <= x < N(N+3).
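
A quick brute-force check of that claim (standalone C, independent of the
patch):

  #include <assert.h>
  #include <stdint.h>

  int
  main (void)
  {
    /* Verify x // N == ((x+N+2) // (N+1) + x) // (N+1) for all
       0 <= x < N(N+3), for the power-of-two-minus-one divisors the
       pattern cares about.  */
    static const uint64_t ns[] = { 3, 7, 15, 255 };
    for (unsigned i = 0; i < sizeof ns / sizeof ns[0]; i++)
      {
        uint64_t n = ns[i];
        for (uint64_t x = 0; x < n * (n + 3); x++)
          assert (x / n == ((x + n + 2) / (n + 1) + x) / (n + 1));
      }
    return 0;
  }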
  
Tamar Christina March 9, 2023, 7:39 p.m. UTC | #3
Hi,

Here's the respun patch.

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	PR target/108583
	* target.def (preferred_div_as_shifts_over_mult): New.
	* doc/tm.texi.in: Document it.
	* doc/tm.texi: Regenerate.
	* targhooks.cc (default_preferred_div_as_shifts_over_mult): New.
	* targhooks.h (default_preferred_div_as_shifts_over_mult): New.
	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Use it.

gcc/testsuite/ChangeLog:

	PR target/108583
	* gcc.dg/vect/vect-div-bitmask-4.c: New test.
	* gcc.dg/vect/vect-div-bitmask-5.c: New test.

--- inline copy of patch ---

diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 50a8872a6695b18b9bed0d393bacf733833633db..bf7269e323de1a065d4d04376e5a2703cbb0f9fa 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6137,6 +6137,12 @@ instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT (const_tree @var{type})
+Sometimes it is possible to implement a vector division using a sequence
+of two addition-shift pairs, giving four instructions in total.
+Return true if taking this approach for @var{vectype} is likely
+to be better than using a sequence involving highpart multiplication.
+Default is false if @code{can_mult_highpart_p}, otherwise true.
 @end deftypefn
 
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 3e07978a02f4e6077adae6cadc93ea4273295f1f..0051017a7fd67691a343470f36ad4fc32c8e7e15 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4173,6 +4173,7 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
 
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
 
diff --git a/gcc/target.def b/gcc/target.def
index e0a5c7adbd962f5d08ed08d1d81afa2c2baa64a5..e4474a3ed6bd2f5f5c010bf0d40c2a371370490c 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1868,6 +1868,18 @@ correct for most targets.",
  poly_uint64, (const_tree type),
  default_preferred_vector_alignment)
 
+/* Returns whether the target has a preference for decomposing divisions using
+   shifts rather than multiplies.  */
+DEFHOOK
+(preferred_div_as_shifts_over_mult,
+ "Sometimes it is possible to implement a vector division using a sequence\n\
+of two addition-shift pairs, giving four instructions in total.\n\
+Return true if taking this approach for @var{vectype} is likely\n\
+to be better than using a sequence involving highpart multiplication.\n\
+Default is false if @code{can_mult_highpart_p}, otherwise true.",
+ bool, (const_tree type),
+ default_preferred_div_as_shifts_over_mult)
+
 /* Return true if vector alignment is reachable (by peeling N
    iterations) for the given scalar type.  */
 DEFHOOK
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index a6a4809ca91baa5d7fad2244549317a31390f0c2..a207963b9e6eb9300df0043e1b79aa6c941d0f7f 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -53,6 +53,8 @@ extern scalar_int_mode default_unwind_word_mode (void);
 extern unsigned HOST_WIDE_INT default_shift_truncation_mask
   (machine_mode);
 extern unsigned int default_min_divisions_for_recip_mul (machine_mode);
+extern bool default_preferred_div_as_shifts_over_mult
+  (const_tree);
 extern int default_mode_rep_extended (scalar_int_mode, scalar_int_mode);
 
 extern tree default_stack_protect_guard (void);
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index 211525720a620d6f533e2da91e03877337a931e7..7f39ff9b7ec2bf66625d48a47bb76e96c05a3233 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1483,6 +1483,15 @@ default_preferred_vector_alignment (const_tree type)
   return TYPE_ALIGN (type);
 }
 
+/* The default implementation of
+   TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT.  */
+
+bool
+default_preferred_div_as_shifts_over_mult (const_tree type)
+{
+  return can_mult_highpart_p (TYPE_MODE (type), TYPE_UNSIGNED (type));
+}
+
 /* By default assume vectors of element TYPE require a multiple of the natural
    alignment of TYPE.  TYPE is naturally aligned if IS_PACKED is false.  */
 bool
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
new file mode 100644
index 0000000000000000000000000000000000000000..c81f8946922250234bf759e0a0a04ea8c1f73e3c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
@@ -0,0 +1,25 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+typedef unsigned __attribute__((__vector_size__ (16))) V;
+
+static __attribute__((__noinline__)) __attribute__((__noclone__)) V
+foo (V v, unsigned short i)
+{
+  v /= i;
+  return v;
+}
+
+int
+main (void)
+{
+  V v = foo ((V) { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, 0xffff);
+  for (unsigned i = 0; i < sizeof (v) / sizeof (v[0]); i++)
+    if (v[i] != 0x00010001)
+      __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
new file mode 100644
index 0000000000000000000000000000000000000000..b4eb1a4dacba481e6306b49914d2a29b933de625
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
@@ -0,0 +1,58 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "divmod pattern recognized" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 1766ce277d6b88d8aa3be77e7c8abb504a10a735..183f1a623fbde34f505259cf8f4fb4d34e069614 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3914,6 +3914,83 @@ vect_recog_divmod_pattern (vec_info *vinfo,
       return pattern_stmt;
     }
 
+  if ((cst = uniform_integer_cst_p (oprnd1))
+	   && TYPE_UNSIGNED (itype)
+	   && rhs_code == TRUNC_DIV_EXPR
+	   && vectype
+	   && targetm.vectorize.preferred_div_as_shifts_over_mult (vectype))
+    {
+      /* We can use the relationship:
+
+	   x // N == ((x+N+2) // (N+1) + x) // (N+1)  for 0 <= x < N(N+3)
+
+	 to optimize cases where N+1 is a power of 2, and where // (N+1)
+	 is therefore a shift right.  When operating in modes that are
+	 multiples of a byte in size, there are two cases:
+
+	 (1) N(N+3) is not representable, in which case the question
+	     becomes whether the replacement expression overflows.
+	     It is enough to test that x+N+2 does not overflow,
+	     i.e. that x < MAX-(N+1).
+
+	 (2) N(N+3) is representable, in which case it is the (only)
+	     bound that we need to check.
+
+	 ??? For now we just handle the case where // (N+1) is a shift
+	 right by half the precision, since some architectures can
+	 optimize the associated addition and shift combinations
+	 into single instructions.  */
+
+      auto wcst = wi::to_wide (cst);
+      int pow = wi::exact_log2 (wcst + 1);
+      if (pow == prec / 2)
+	{
+	  gimple *stmt = SSA_NAME_DEF_STMT (oprnd0);
+
+	  gimple_ranger ranger;
+	  int_range_max r;
+
+	  /* Check that no overflow will occur.  If we don't have range
+	     information we can't perform the optimization.  */
+
+	  if (ranger.range_of_expr (r, oprnd0, stmt))
+	    {
+	      wide_int max = r.upper_bound ();
+	      wide_int one = wi::shwi (1, prec);
+	      wide_int adder = wi::add (one, wi::lshift (one, pow));
+	      wi::overflow_type ovf;
+	      wi::add (max, adder, UNSIGNED, &ovf);
+	      if (ovf == wi::OVF_NONE)
+		{
+		  *type_out = vectype;
+		  tree tadder = wide_int_to_tree (itype, adder);
+		  tree rshift = wide_int_to_tree (itype, pow);
+
+		  tree new_lhs1 = vect_recog_temp_ssa_var (itype, NULL);
+		  gassign *patt1
+		    = gimple_build_assign (new_lhs1, PLUS_EXPR, oprnd0, tadder);
+		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+		  tree new_lhs2 = vect_recog_temp_ssa_var (itype, NULL);
+		  patt1 = gimple_build_assign (new_lhs2, RSHIFT_EXPR, new_lhs1,
+					       rshift);
+		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+		  tree new_lhs3 = vect_recog_temp_ssa_var (itype, NULL);
+		  patt1 = gimple_build_assign (new_lhs3, PLUS_EXPR, new_lhs2,
+					       oprnd0);
+		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+		  tree new_lhs4 = vect_recog_temp_ssa_var (itype, NULL);
+		  pattern_stmt = gimple_build_assign (new_lhs4, RSHIFT_EXPR,
+						      new_lhs3, rshift);
+
+		  return pattern_stmt;
+		}
+	    }
+	}
+    }
+
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))
     return NULL;
  
Richard Sandiford March 10, 2023, 8:39 a.m. UTC | #4
Tamar Christina <Tamar.Christina@arm.com> writes:
> Hi,
>
> Here's the respun patch.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> 	PR target/108583
> 	* target.def (preferred_div_as_shifts_over_mult): New.
> 	* doc/tm.texi.in: Document it.
> 	* doc/tm.texi: Regenerate.
> 	* targhooks.cc (default_preferred_div_as_shifts_over_mult): New.
> 	* targhooks.h (default_preferred_div_as_shifts_over_mult): New.
> 	* tree-vect-patterns.cc (vect_recog_divmod_pattern): Use it.
>
> gcc/testsuite/ChangeLog:
>
> 	PR target/108583
> 	* gcc.dg/vect/vect-div-bitmask-4.c: New test.
> 	* gcc.dg/vect/vect-div-bitmask-5.c: New test.
>
> --- inline copy of patch ---
>
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 50a8872a6695b18b9bed0d393bacf733833633db..bf7269e323de1a065d4d04376e5a2703cbb0f9fa 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -6137,6 +6137,12 @@ instruction pattern.  There is no need for the hook to handle these two
>  implementation approaches itself.
>  @end deftypefn
>  
> +@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT (const_tree @var{type})
> +Sometimes it is possible to implement a vector division using a sequence
> +of two addition-shift pairs, giving four instructions in total.
> +Return true if taking this approach for @var{vectype} is likely
> +to be better than using a sequence involving highpart multiplication.
> +Default is false if @code{can_mult_highpart_p}, otherwise true.
>  @end deftypefn
>  
>  @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index 3e07978a02f4e6077adae6cadc93ea4273295f1f..0051017a7fd67691a343470f36ad4fc32c8e7e15 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -4173,6 +4173,7 @@ address;  but often a machine-dependent strategy can generate better code.
>  
>  @hook TARGET_VECTORIZE_VEC_PERM_CONST
>  
> +@hook TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
>  
>  @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
>  
> diff --git a/gcc/target.def b/gcc/target.def
> index e0a5c7adbd962f5d08ed08d1d81afa2c2baa64a5..e4474a3ed6bd2f5f5c010bf0d40c2a371370490c 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -1868,6 +1868,18 @@ correct for most targets.",
>   poly_uint64, (const_tree type),
>   default_preferred_vector_alignment)
>  
> +/* Returns whether the target has a preference for decomposing divisions using
> +   shifts rather than multiplies.  */
> +DEFHOOK
> +(preferred_div_as_shifts_over_mult,
> + "Sometimes it is possible to implement a vector division using a sequence\n\
> +of two addition-shift pairs, giving four instructions in total.\n\
> +Return true if taking this approach for @var{vectype} is likely\n\
> +to be better than using a sequence involving highpart multiplication.\n\
> +Default is false if @code{can_mult_highpart_p}, otherwise true.",
> + bool, (const_tree type),
> + default_preferred_div_as_shifts_over_mult)
> +
>  /* Return true if vector alignment is reachable (by peeling N
>     iterations) for the given scalar type.  */
>  DEFHOOK
> diff --git a/gcc/targhooks.h b/gcc/targhooks.h
> index a6a4809ca91baa5d7fad2244549317a31390f0c2..a207963b9e6eb9300df0043e1b79aa6c941d0f7f 100644
> --- a/gcc/targhooks.h
> +++ b/gcc/targhooks.h
> @@ -53,6 +53,8 @@ extern scalar_int_mode default_unwind_word_mode (void);
>  extern unsigned HOST_WIDE_INT default_shift_truncation_mask
>    (machine_mode);
>  extern unsigned int default_min_divisions_for_recip_mul (machine_mode);
> +extern bool default_preferred_div_as_shifts_over_mult
> +  (const_tree);
>  extern int default_mode_rep_extended (scalar_int_mode, scalar_int_mode);
>  
>  extern tree default_stack_protect_guard (void);
> diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
> index 211525720a620d6f533e2da91e03877337a931e7..7f39ff9b7ec2bf66625d48a47bb76e96c05a3233 100644
> --- a/gcc/targhooks.cc
> +++ b/gcc/targhooks.cc
> @@ -1483,6 +1483,15 @@ default_preferred_vector_alignment (const_tree type)
>    return TYPE_ALIGN (type);
>  }
>  
> +/* The default implementation of
> +   TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT.  */
> +
> +bool
> +default_preferred_div_as_shifts_over_mult (const_tree type)
> +{
> +  return can_mult_highpart_p (TYPE_MODE (type), TYPE_UNSIGNED (type));

The return value should be inverted.

> +}
> +
>  /* By default assume vectors of element TYPE require a multiple of the natural
>     alignment of TYPE.  TYPE is naturally aligned if IS_PACKED is false.  */
>  bool
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..c81f8946922250234bf759e0a0a04ea8c1f73e3c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
> @@ -0,0 +1,25 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +typedef unsigned __attribute__((__vector_size__ (16))) V;
> +
> +static __attribute__((__noinline__)) __attribute__((__noclone__)) V
> +foo (V v, unsigned short i)
> +{
> +  v /= i;
> +  return v;
> +}
> +
> +int
> +main (void)
> +{
> +  V v = foo ((V) { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, 0xffff);
> +  for (unsigned i = 0; i < sizeof (v) / sizeof (v[0]); i++)
> +    if (v[i] != 0x00010001)
> +      __builtin_abort ();
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..b4eb1a4dacba481e6306b49914d2a29b933de625
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
> @@ -0,0 +1,58 @@
> +/* { dg-require-effective-target vect_int } */
> +
> +#include <stdint.h>
> +#include <stdio.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint8_t 
> +
> +#ifndef DEBUG
> +#define DEBUG 0
> +#endif
> +
> +#define BASE ((TYPE) -1 < 0 ? -126 : 4)
> +
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] + level) / 0xff;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] = (pixel[i] + level) / 0xff;
> +}
> +
> +int main ()
> +{
> +  TYPE a[N];
> +  TYPE b[N];
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      a[i] = BASE + i * 13;
> +      b[i] = BASE + i * 13;
> +      if (DEBUG)
> +        printf ("%d: 0x%x\n", i, a[i]);
> +    }
> +
> +  fun1 (a, N / 2, N);
> +  fun2 (b, N / 2, N);
> +
> +  for (int i = 0; i < N; ++i)
> +    {
> +      if (DEBUG)
> +        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
> +
> +      if (a[i] != b[i])
> +        __builtin_abort ();
> +    }
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump "divmod pattern recognized" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
> index 1766ce277d6b88d8aa3be77e7c8abb504a10a735..183f1a623fbde34f505259cf8f4fb4d34e069614 100644
> --- a/gcc/tree-vect-patterns.cc
> +++ b/gcc/tree-vect-patterns.cc
> @@ -3914,6 +3914,83 @@ vect_recog_divmod_pattern (vec_info *vinfo,
>        return pattern_stmt;
>      }
>  
> +  if ((cst = uniform_integer_cst_p (oprnd1))
> +	   && TYPE_UNSIGNED (itype)
> +	   && rhs_code == TRUNC_DIV_EXPR
> +	   && vectype
> +	   && targetm.vectorize.preferred_div_as_shifts_over_mult (vectype))

Needs reindenting.
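
I.e. the condition has kept the indentation of the earlier `else if` form;
with a plain `if` the continuation lines would be, e.g.:

  if ((cst = uniform_integer_cst_p (oprnd1))
      && TYPE_UNSIGNED (itype)
      && rhs_code == TRUNC_DIV_EXPR
      && vectype
      && targetm.vectorize.preferred_div_as_shifts_over_mult (vectype))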

OK with those changes, thanks.

Richard

> +    {
> +      /* We can use the relationship:
> +
> +	   x // N == ((x+N+2) // (N+1) + x) // (N+1)  for 0 <= x < N(N+3)
> +
> +	 to optimize cases where N+1 is a power of 2, and where // (N+1)
> +	 is therefore a shift right.  When operating in modes that are
> +	 multiples of a byte in size, there are two cases:
> +
> +	 (1) N(N+3) is not representable, in which case the question
> +	     becomes whether the replacement expression overflows.
> +	     It is enough to test that x+N+2 does not overflow,
> +	     i.e. that x < MAX-(N+1).
> +
> +	 (2) N(N+3) is representable, in which case it is the (only)
> +	     bound that we need to check.
> +
> +	 ??? For now we just handle the case where // (N+1) is a shift
> +	 right by half the precision, since some architectures can
> +	 optimize the associated addition and shift combinations
> +	 into single instructions.  */
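
(As a quick sanity check of the identity with N = 255, where // (N + 1)
becomes >> 8: for x = 510 we get ((510 + 257) >> 8 + 510) >> 8
= (2 + 510) >> 8 = 2, matching 510 / 255 = 2, and here x + N + 2 = 767
fits comfortably even in 16 bits, so the overflow test passes.)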
> +
> +      auto wcst = wi::to_wide (cst);
> +      int pow = wi::exact_log2 (wcst + 1);
> +      if (pow == prec / 2)
> +	{
> +	  gimple *stmt = SSA_NAME_DEF_STMT (oprnd0);
> +
> +	  gimple_ranger ranger;
> +	  int_range_max r;
> +
> +	  /* Check that no overflow will occur.  If we don't have range
> +	     information we can't perform the optimization.  */
> +
> +	  if (ranger.range_of_expr (r, oprnd0, stmt))
> +	    {
> +	      wide_int max = r.upper_bound ();
> +	      wide_int one = wi::shwi (1, prec);
> +	      wide_int adder = wi::add (one, wi::lshift (one, pow));
> +	      wi::overflow_type ovf;
> +	      wi::add (max, adder, UNSIGNED, &ovf);
> +	      if (ovf == wi::OVF_NONE)
> +		{
> +		  *type_out = vectype;
> +		  tree tadder = wide_int_to_tree (itype, adder);
> +		  tree rshift = wide_int_to_tree (itype, pow);
> +
> +		  tree new_lhs1 = vect_recog_temp_ssa_var (itype, NULL);
> +		  gassign *patt1
> +		    = gimple_build_assign (new_lhs1, PLUS_EXPR, oprnd0, tadder);
> +		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
> +
> +		  tree new_lhs2 = vect_recog_temp_ssa_var (itype, NULL);
> +		  patt1 = gimple_build_assign (new_lhs2, RSHIFT_EXPR, new_lhs1,
> +					       rshift);
> +		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
> +
> +		  tree new_lhs3 = vect_recog_temp_ssa_var (itype, NULL);
> +		  patt1 = gimple_build_assign (new_lhs3, PLUS_EXPR, new_lhs2,
> +					       oprnd0);
> +		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
> +
> +		  tree new_lhs4 = vect_recog_temp_ssa_var (itype, NULL);
> +		  pattern_stmt = gimple_build_assign (new_lhs4, RSHIFT_EXPR,
> +						      new_lhs3, rshift);
> +
> +		  return pattern_stmt;
> +		}
> +	    }
> +	}
> +    }
> +
>    if (prec > HOST_BITS_PER_WIDE_INT
>        || integer_zerop (oprnd1))
>      return NULL;
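
FWIW, a target opts in by defining the hook in its backend; a minimal
hypothetical sketch follows (the actual aarch64 definition belongs to a
separate patch in this series and may well test something more refined):

  /* Hypothetical illustration only -- not the real aarch64 hook.  */
  static bool
  aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree /* type */)
  {
    /* Always prefer the two-addition/two-shift sequence.  */
    return true;
  }

  #undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
  #define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
    aarch64_vectorize_preferred_div_as_shifts_over_mult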
  

Patch

--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6137,6 +6137,9 @@  instruction pattern.  There is no need for the hook to handle these two
 implementation approaches itself.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT (void)
+When decomposing a division operation, if possible prefer to decompose the
+operation as shifts rather than multiplication by magic constants.
 @end deftypefn
 
 @deftypefn {Target Hook} tree TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION (unsigned @var{code}, tree @var{vec_type_out}, tree @var{vec_type_in})
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 3e07978a02f4e6077adae6cadc93ea4273295f1f..0051017a7fd67691a343470f36ad4fc32c8e7e15 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4173,6 +4173,7 @@  address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST
 
+@hook TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
 
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
 
diff --git a/gcc/target.def b/gcc/target.def
index e0a5c7adbd962f5d08ed08d1d81afa2c2baa64a5..8cc18b1f3c5de24c21faf891b9d4d0b6fd5b59d7 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1868,6 +1868,15 @@  correct for most targets.",
  poly_uint64, (const_tree type),
  default_preferred_vector_alignment)
 
+/* Returns whether the target has a preference for decomposing divisions using
+   shifts rather than multiplies.  */
+DEFHOOK
+(preferred_div_as_shifts_over_mult,
+ "When decomposing a division operation, if possible prefer to decompose the\n\
+operation as shifts rather than multiplication by magic constants.",
+ bool, (void),
+ default_preferred_div_as_shifts_over_mult)
+
 /* Return true if vector alignment is reachable (by peeling N
    iterations) for the given scalar type.  */
 DEFHOOK
diff --git a/gcc/targhooks.h b/gcc/targhooks.h
index a6a4809ca91baa5d7fad2244549317a31390f0c2..dda011c59fbd5973ee648dfea195619cc41c71bc 100644
--- a/gcc/targhooks.h
+++ b/gcc/targhooks.h
@@ -53,6 +53,8 @@  extern scalar_int_mode default_unwind_word_mode (void);
 extern unsigned HOST_WIDE_INT default_shift_truncation_mask
   (machine_mode);
 extern unsigned int default_min_divisions_for_recip_mul (machine_mode);
+extern bool
+default_preferred_div_as_shifts_over_mult (void);
 extern int default_mode_rep_extended (scalar_int_mode, scalar_int_mode);
 
 extern tree default_stack_protect_guard (void);
diff --git a/gcc/targhooks.cc b/gcc/targhooks.cc
index 211525720a620d6f533e2da91e03877337a931e7..6396f344eef09dd61f358938846a1c02a70b31d8 100644
--- a/gcc/targhooks.cc
+++ b/gcc/targhooks.cc
@@ -1483,6 +1483,15 @@  default_preferred_vector_alignment (const_tree type)
   return TYPE_ALIGN (type);
 }
 
+/* The default implementation of
+   TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT.  */
+
+bool
+default_preferred_div_as_shifts_over_mult (void)
+{
+  return false;
+}
+
 /* By default assume vectors of element TYPE require a multiple of the natural
    alignment of TYPE.  TYPE is naturally aligned if IS_PACKED is false.  */
 bool
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
new file mode 100644
index 0000000000000000000000000000000000000000..c81f8946922250234bf759e0a0a04ea8c1f73e3c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
@@ -0,0 +1,25 @@ 
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+typedef unsigned __attribute__((__vector_size__ (16))) V;
+
+static __attribute__((__noinline__)) __attribute__((__noclone__)) V
+foo (V v, unsigned short i)
+{
+  v /= i;
+  return v;
+}
+
+int
+main (void)
+{
+  V v = foo ((V) { 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff }, 0xffff);
+  for (unsigned i = 0; i < sizeof (v) / sizeof (v[0]); i++)
+    if (v[i] != 0x00010001)
+      __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
new file mode 100644
index 0000000000000000000000000000000000000000..b4eb1a4dacba481e6306b49914d2a29b933de625
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-5.c
@@ -0,0 +1,58 @@ 
+/* { dg-require-effective-target vect_int } */
+
+#include <stdint.h>
+#include <stdio.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint8_t 
+
+#ifndef DEBUG
+#define DEBUG 0
+#endif
+
+#define BASE ((TYPE) -1 < 0 ? -126 : 4)
+
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] = (pixel[i] + level) / 0xff;
+}
+
+int main ()
+{
+  TYPE a[N];
+  TYPE b[N];
+
+  for (int i = 0; i < N; ++i)
+    {
+      a[i] = BASE + i * 13;
+      b[i] = BASE + i * 13;
+      if (DEBUG)
+        printf ("%d: 0x%x\n", i, a[i]);
+    }
+
+  fun1 (a, N / 2, N);
+  fun2 (b, N / 2, N);
+
+  for (int i = 0; i < N; ++i)
+    {
+      if (DEBUG)
+        printf ("%d = 0x%x == 0x%x\n", i, a[i], b[i]);
+
+      if (a[i] != b[i])
+        __builtin_abort ();
+    }
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "divmod pattern recognized" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/tree-vect-patterns.cc b/gcc/tree-vect-patterns.cc
index 1766ce277d6b88d8aa3be77e7c8abb504a10a735..31f2a6753b4faccb77351c8c5afed9775888b60f 100644
--- a/gcc/tree-vect-patterns.cc
+++ b/gcc/tree-vect-patterns.cc
@@ -3913,6 +3913,84 @@  vect_recog_divmod_pattern (vec_info *vinfo,
 
       return pattern_stmt;
     }
+  else if ((cst = uniform_integer_cst_p (oprnd1))
+	   && TYPE_UNSIGNED (itype)
+	   && rhs_code == TRUNC_DIV_EXPR
+	   && vectype
+	   && targetm.vectorize.preferred_div_as_shifts_over_mult ())
+    {
+      /* Div optimizations using narrowings.
+       We can do the division of e.g. shorts by 255 faster by calculating it
+       as (x + ((x + 257) >> 8)) >> 8, assuming the operation is done in
+       double the precision of x.
+
+       If we imagine a short as being composed of two blocks of bytes then
+       adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
+       adding 1 to each sub component:
+
+	    short value of 16-bits
+       ┌──────────────┬────────────────┐
+       │              │                │
+       └──────────────┴────────────────┘
+	 8-bit part1 ▲  8-bit part2   ▲
+		     │                │
+		     │                │
+		    +1               +1
+
+       After the first addition we have to shift right by 8 and narrow the
+       results back to a byte.  Remember that the addition must be done in
+       double the precision of the input.  However, if we know that the
+       addition `x + 257` does not overflow, we can do the operation in the
+       current precision, in which case we don't need the packs and unpacks.  */
+      auto wcst = wi::to_wide (cst);
+      int pow = wi::exact_log2 (wcst + 1);
+      if (pow == (int) (element_precision (vectype) / 2))
+	{
+	  gimple *stmt = SSA_NAME_DEF_STMT (oprnd0);
+
+	  gimple_ranger ranger;
+	  int_range_max r;
+
+	  /* Check that no overflow will occur.  If we don't have range
+	     information we can't perform the optimization.  */
+
+	  if (ranger.range_of_expr (r, oprnd0, stmt))
+	    {
+	      wide_int max = r.upper_bound ();
+	      wide_int one = wi::to_wide (build_one_cst (itype));
+	      wide_int adder = wi::add (one, wi::lshift (one, pow));
+	      wi::overflow_type ovf;
+	      wi::add (max, adder, UNSIGNED, &ovf);
+	      if (ovf == wi::OVF_NONE)
+		{
+		  *type_out = vectype;
+		  tree tadder = wide_int_to_tree (itype, adder);
+		  tree rshift = wide_int_to_tree (itype, pow);
+
+		  tree new_lhs1 = vect_recog_temp_ssa_var (itype, NULL);
+		  gassign *patt1
+		    = gimple_build_assign (new_lhs1, PLUS_EXPR, oprnd0, tadder);
+		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+		  tree new_lhs2 = vect_recog_temp_ssa_var (itype, NULL);
+		  patt1 = gimple_build_assign (new_lhs2, RSHIFT_EXPR, new_lhs1,
+					       rshift);
+		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+		  tree new_lhs3 = vect_recog_temp_ssa_var (itype, NULL);
+		  patt1 = gimple_build_assign (new_lhs3, PLUS_EXPR, new_lhs2,
+					       oprnd0);
+		  append_pattern_def_seq (vinfo, stmt_vinfo, patt1, vectype);
+
+		  tree new_lhs4 = vect_recog_temp_ssa_var (itype, NULL);
+		  pattern_stmt = gimple_build_assign (new_lhs4, RSHIFT_EXPR,
+						      new_lhs3, rshift);
+
+		  return pattern_stmt;
+		}
+	    }
+	}
+    }
 
   if (prec > HOST_BITS_PER_WIDE_INT
       || integer_zerop (oprnd1))