[4/4] AArch64 sve2: rewrite pack + NARROWB + NARROWB to NARROWB + NARROWT

Message ID: Yy19kZozCiweoBcT@arm.com
State: Dropped
Series: [1/2] middle-end Support optimized division by pow2 bitmask

Commit Message

Tamar Christina Sept. 23, 2022, 9:34 a.m. UTC
Hi All,

This adds an RTL pattern that matches two NARROWB instructions whose results
are combined with a PACK, and rewrites the second NARROWB into a NARROWT.
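
In ACLE intrinsic terms the intent is roughly the following (an illustrative
sketch, not code from the patch; note Richard Sandiford's final comment below,
which questions whether the lane orders of the two forms really agree):

#include <arm_sve.h>

/* Before: two bottom narrows whose even-lane results are packed together
   with a UZP1.  Needs an SVE2-enabled -march to compile.  */
svuint8_t pack_of_narrows (svuint16_t a, svuint16_t b, svuint16_t c)
{
  svuint8_t lo = svaddhnb_u16 (a, c);	/* results land in even byte lanes */
  svuint8_t hi = svaddhnb_u16 (b, c);
  return svuzp1_u8 (lo, hi);
}

/* After: the second bottom narrow becomes a merging top narrow, so the
   separate UZP1 is no longer needed.  */
svuint8_t narrow_bottom_top (svuint16_t a, svuint16_t b, svuint16_t c)
{
  svuint8_t lo = svaddhnb_u16 (a, c);
  return svaddhnt_u16 (lo, b, c);	/* fills the odd byte lanes of LO */
}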

For the example:

void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
{
  for (int i = 0; i < (n & -16); i+=1)
    pixel[i] += (pixel[i] * level) / 0xff;
}

we generate:

        addhnb  z6.b, z0.h, z4.h
        addhnb  z5.b, z1.h, z4.h
        addhnb  z0.b, z0.h, z6.h
        addhnt  z0.b, z1.h, z5.h
        add     z0.b, z0.b, z2.b

instead of:

        addhnb  z6.b, z1.h, z4.h
        addhnb  z5.b, z0.h, z4.h
        addhnb  z1.b, z1.h, z6.h
        addhnb  z0.b, z0.h, z5.h
        uzp1    z0.b, z0.b, z1.b
        add     z0.b, z0.b, z2.b

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* config/aarch64/aarch64-sve2.md (*aarch64_sve_pack_<sve_int_op><mode>):
	New.
	* config/aarch64/iterators.md (binary_top): New.

gcc/testsuite/ChangeLog:

	* gcc.dg/vect/vect-div-bitmask-4.c: New test.
	* gcc.target/aarch64/sve2/div-by-bitmask_2.c: New test.

--- inline copy of patch -- 
diff --git a/gcc/config/aarch64/aarch64-sve2.md b/gcc/config/aarch64/aarch64-sve2.md
index ab5dcc369481311e5bd68a1581265e1ce99b4b0f..0ee46c8b0d43467da4a6b98ad3c41e5d05d8cf38 100644
--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -1600,6 +1600,25 @@ (define_insn "@aarch64_sve_<sve_int_op><mode>"
   "<sve_int_op>\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
 )
 
+(define_insn_and_split "*aarch64_sve_pack_<sve_int_op><mode>"
+  [(set (match_operand:<VNARROW> 0 "register_operand" "=w")
+	(unspec:<VNARROW>
+	  [(match_operand:SVE_FULL_HSDI 1 "register_operand" "w")
+	   (subreg:SVE_FULL_HSDI (unspec:<VNARROW>
+	     [(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
+	      (match_operand:SVE_FULL_HSDI 3 "register_operand" "w")]
+	     SVE2_INT_BINARY_NARROWB) 0)]
+	  UNSPEC_PACK))]
+  "TARGET_SVE2"
+  "#"
+  "&& true"
+  [(const_int 0)]
+{
+  rtx tmp = lowpart_subreg (<VNARROW>mode, operands[1], <MODE>mode);
+  emit_insn (gen_aarch64_sve (<SVE2_INT_BINARY_NARROWB:binary_top>, <MODE>mode,
+			      operands[0], tmp, operands[2], operands[3]));
+})
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] Narrowing right shifts
 ;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 0dd9dc66f7ccd78acacb759662d0cd561cd5b4ef..37d8161a33b1c399d80be82afa67613a087389d4 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -3589,6 +3589,11 @@ (define_int_attr brk_op [(UNSPEC_BRKA "a") (UNSPEC_BRKB "b")
 
 (define_int_attr sve_pred_op [(UNSPEC_PFIRST "pfirst") (UNSPEC_PNEXT "pnext")])
 
+(define_int_attr binary_top [(UNSPEC_ADDHNB "UNSPEC_ADDHNT")
+			     (UNSPEC_RADDHNB "UNSPEC_RADDHNT")
+			     (UNSPEC_RSUBHNB "UNSPEC_RSUBHNT")
+			     (UNSPEC_SUBHNB "UNSPEC_SUBHNT")])
+
 (define_int_attr sve_int_op [(UNSPEC_ADCLB "adclb")
 			     (UNSPEC_ADCLT "adclt")
 			     (UNSPEC_ADDHNB "addhnb")
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
new file mode 100644
index 0000000000000000000000000000000000000000..0df08bda6fd3e33280307ea15c82dd9726897cfd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
@@ -0,0 +1,26 @@
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint32_t
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..cddcebdf15ecaa9dc515f58cdbced36c8038db1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
@@ -0,0 +1,56 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+/*
+** draw_bitmap1:
+** ...
+** 	addhnb	z6.b, z0.h, z4.h
+** 	addhnb	z5.b, z1.h, z4.h
+** 	addhnb	z0.b, z0.h, z6.h
+** 	addhnt	z0.b, z1.h, z5.h
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] += (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] += (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+** 	addhnb	z6.h, z0.s, z4.s
+** 	addhnb	z5.h, z1.s, z4.s
+** 	addhnb	z0.h, z0.s, z6.s
+** 	addhnt	z0.h, z1.s, z5.s
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] += (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+** 	addhnb	z6.s, z0.d, z4.d
+** 	addhnb	z5.s, z1.d, z4.d
+** 	addhnb	z0.s, z0.d, z6.d
+** 	addhnt	z0.s, z1.d, z5.d
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
  

Comments

Tamar Christina Oct. 31, 2022, 11:34 a.m. UTC | #1
Ping

> -----Original Message-----
> From: Gcc-patches <gcc-patches-
> bounces+tamar.christina=arm.com@gcc.gnu.org> On Behalf Of Tamar
> Christina via Gcc-patches
> Sent: Friday, September 23, 2022 10:34 AM
> To: gcc-patches@gcc.gnu.org
> Cc: Richard Earnshaw <Richard.Earnshaw@arm.com>; nd <nd@arm.com>;
> Richard Sandiford <Richard.Sandiford@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>
> Subject: [PATCH 4/4]AArch64 sve2: rewrite pack + NARROWB + NARROWB to
> NARROWB + NARROWT
> [...]

Tamar Christina Nov. 9, 2022, 8:33 a.m. UTC | #2
ping

> -----Original Message-----
> From: Tamar Christina
> Sent: Monday, October 31, 2022 11:35 AM
> To: Tamar Christina <Tamar.Christina@arm.com>; gcc-patches@gcc.gnu.org
> Cc: Richard Earnshaw <Richard.Earnshaw@arm.com>; nd <nd@arm.com>;
> Richard Sandiford <Richard.Sandiford@arm.com>; Marcus Shawcroft
> <Marcus.Shawcroft@arm.com>
> Subject: RE: [PATCH 4/4]AArch64 sve2: rewrite pack + NARROWB +
> NARROWB to NARROWB + NARROWT
> 
> Ping
> [...]

Richard Sandiford Nov. 12, 2022, 12:25 p.m. UTC | #3
Tamar Christina <tamar.christina@arm.com> writes:
> [...]
> --- a/gcc/config/aarch64/aarch64-sve2.md
> +++ b/gcc/config/aarch64/aarch64-sve2.md
> @@ -1600,6 +1600,25 @@ (define_insn "@aarch64_sve_<sve_int_op><mode>"
>    "<sve_int_op>\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
>  )
>  
> +(define_insn_and_split "*aarch64_sve_pack_<sve_int_op><mode>"
> +  [(set (match_operand:<VNARROW> 0 "register_operand" "=w")
> +	(unspec:<VNARROW>
> +	  [(match_operand:SVE_FULL_HSDI 1 "register_operand" "w")

"0" would be safer, in case the instruction is only split after RA.

> +	   (subreg:SVE_FULL_HSDI (unspec:<VNARROW>
> +	     [(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
> +	      (match_operand:SVE_FULL_HSDI 3 "register_operand" "w")]
> +	     SVE2_INT_BINARY_NARROWB) 0)]
> +	  UNSPEC_PACK))]

I think ideally this would be the canonical pattern, so that we can
drop the separate top unspecs.  That's more work though, and would
probably make sense to do once we have a generic way of representing
the pack.

So OK with the "0" change above.

Thanks,
Richard

> +  "TARGET_SVE2"
> +  "#"
> +  "&& true"
> +  [(const_int 0)]
> +{
> +  rtx tmp = lowpart_subreg (<VNARROW>mode, operands[1], <MODE>mode);
> +  emit_insn (gen_aarch64_sve (<SVE2_INT_BINARY_NARROWB:binary_top>, <MODE>mode,
> +			      operands[0], tmp, operands[2], operands[3]));
> +})
> +
>  ;; -------------------------------------------------------------------------
>  ;; ---- [INT] Narrowing right shifts
>  ;; -------------------------------------------------------------------------
> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
> index 0dd9dc66f7ccd78acacb759662d0cd561cd5b4ef..37d8161a33b1c399d80be82afa67613a087389d4 100644
> --- a/gcc/config/aarch64/iterators.md
> +++ b/gcc/config/aarch64/iterators.md
> @@ -3589,6 +3589,11 @@ (define_int_attr brk_op [(UNSPEC_BRKA "a") (UNSPEC_BRKB "b")
>  
>  (define_int_attr sve_pred_op [(UNSPEC_PFIRST "pfirst") (UNSPEC_PNEXT "pnext")])
>  
> +(define_int_attr binary_top [(UNSPEC_ADDHNB "UNSPEC_ADDHNT")
> +			     (UNSPEC_RADDHNB "UNSPEC_RADDHNT")
> +			     (UNSPEC_RSUBHNB "UNSPEC_RSUBHNT")
> +			     (UNSPEC_SUBHNB "UNSPEC_SUBHNT")])
> +
>  (define_int_attr sve_int_op [(UNSPEC_ADCLB "adclb")
>  			     (UNSPEC_ADCLT "adclt")
>  			     (UNSPEC_ADDHNB "addhnb")
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..0df08bda6fd3e33280307ea15c82dd9726897cfd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
> @@ -0,0 +1,26 @@
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
> +
> +#include <stdint.h>
> +#include "tree-vect.h"
> +
> +#define N 50
> +#define TYPE uint32_t
> +
> +__attribute__((noipa, noinline, optimize("O1")))
> +void fun1(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}
> +
> +__attribute__((noipa, noinline, optimize("O3")))
> +void fun2(TYPE* restrict pixel, TYPE level, int n)
> +{
> +  for (int i = 0; i < n; i+=1)
> +    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}
> +
> +#include "vect-div-bitmask.h"
> +
> +/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..cddcebdf15ecaa9dc515f58cdbced36c8038db1b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
> @@ -0,0 +1,56 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2 -std=c99" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
> +
> +#include <stdint.h>
> +
> +/*
> +** draw_bitmap1:
> +** ...
> +** 	addhnb	z6.b, z0.h, z4.h
> +** 	addhnb	z5.b, z1.h, z4.h
> +** 	addhnb	z0.b, z0.h, z6.h
> +** 	addhnt	z0.b, z1.h, z5.h
> +** ...
> +*/
> +void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] += (pixel[i] * level) / 0xff;
> +}
> +
> +void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] += (pixel[i] * level) / 0xfe;
> +}
> +
> +/*
> +** draw_bitmap3:
> +** ...
> +** 	addhnb	z6.h, z0.s, z4.s
> +** 	addhnb	z5.h, z1.s, z4.s
> +** 	addhnb	z0.h, z0.s, z6.s
> +** 	addhnt	z0.h, z1.s, z5.s
> +** ...
> +*/
> +void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] += (pixel[i] * level) / 0xffffU;
> +}
> +
> +/*
> +** draw_bitmap4:
> +** ...
> +** 	addhnb	z6.s, z0.d, z4.d
> +** 	addhnb	z5.s, z1.d, z4.d
> +** 	addhnb	z0.s, z0.d, z6.d
> +** 	addhnt	z0.s, z1.d, z5.d
> +** ...
> +*/
> +void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
> +{
> +  for (int i = 0; i < (n & -16); i+=1)
> +    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
> +}
  
Richard Sandiford Nov. 12, 2022, 12:33 p.m. UTC | #4
Richard Sandiford <richard.sandiford@arm.com> writes:
> Tamar Christina <tamar.christina@arm.com> writes:
>> [...]
>> +(define_insn_and_split "*aarch64_sve_pack_<sve_int_op><mode>"
>> +  [(set (match_operand:<VNARROW> 0 "register_operand" "=w")
>> +	(unspec:<VNARROW>
>> +	  [(match_operand:SVE_FULL_HSDI 1 "register_operand" "w")
>
> "0" would be safer, in case the instruction is only split after RA.
>
>> +	   (subreg:SVE_FULL_HSDI (unspec:<VNARROW>
>> +	     [(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
>> +	      (match_operand:SVE_FULL_HSDI 3 "register_operand" "w")]
>> +	     SVE2_INT_BINARY_NARROWB) 0)]
>> +	  UNSPEC_PACK))]
>
> I think ideally this would be the canonical pattern, so that we can
> drop the separate top unspecs.  That's more work though, and would
> probably make sense to do once we have a generic way of representing
> the pack.
>
> So OK with the "0" change above.

Hmm, actually, I take that back.  Is this transform really correct?
I think the blend corresponds to a TRN1 rather than a UZP1.
The bottom operations populate the lower half of each wider element
and the top operations populate the upper half.

Thanks,
Richard
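
Concretely (an illustrative scalar model, not part of the patch): the SVE2
bottom (B) narrowing forms write the even-numbered narrow lanes and zero the
odd-numbered ones, while the top (T) forms write the odd-numbered lanes and
preserve the even ones.  So ADDHNB followed by a merging ADDHNT produces a
per-pair interleave, whereas UZP1 of two bottom results concatenates their
even lanes:

#include <stdint.h>
#include <stdio.h>

#define NWIDE 4			/* halfword elements per toy vector */
#define NNARROW (2 * NWIDE)	/* byte elements per toy vector */

/* Model of ADDHNB: high halves go to even byte lanes, odd lanes zeroed.  */
static void addhnb (uint8_t d[NNARROW], const uint16_t a[NWIDE],
		    const uint16_t b[NWIDE])
{
  for (int i = 0; i < NWIDE; i++)
    {
      d[2 * i] = (uint8_t) (((uint16_t) (a[i] + b[i])) >> 8);
      d[2 * i + 1] = 0;
    }
}

/* Model of ADDHNT: high halves go to odd byte lanes, even lanes kept.  */
static void addhnt (uint8_t d[NNARROW], const uint16_t a[NWIDE],
		    const uint16_t b[NWIDE])
{
  for (int i = 0; i < NWIDE; i++)
    d[2 * i + 1] = (uint8_t) (((uint16_t) (a[i] + b[i])) >> 8);
}

/* Model of UZP1: even lanes of X, then even lanes of Y.  */
static void uzp1 (uint8_t d[NNARROW], const uint8_t x[NNARROW],
		  const uint8_t y[NNARROW])
{
  for (int i = 0; i < NWIDE; i++)
    {
      d[i] = x[2 * i];
      d[NWIDE + i] = y[2 * i];
    }
}

int main (void)
{
  uint16_t a[NWIDE] = { 0x0100, 0x0200, 0x0300, 0x0400 };
  uint16_t b[NWIDE] = { 0x0500, 0x0600, 0x0700, 0x0800 };
  uint16_t c[NWIDE] = { 0 };
  uint8_t lo[NNARROW], hi[NNARROW], packed[NNARROW], blended[NNARROW];

  addhnb (lo, a, c);
  addhnb (hi, b, c);
  uzp1 (packed, lo, hi);	/* concatenation of the even lanes */

  addhnb (blended, a, c);
  addhnt (blended, b, c);	/* per-pair interleave */

  for (int i = 0; i < NNARROW; i++)
    printf ("%d ", packed[i]);
  printf ("\n");
  for (int i = 0; i < NNARROW; i++)
    printf ("%d ", blended[i]);
  printf ("\n");
  return 0;
}

With these inputs the UZP1 version prints 1 2 3 4 5 6 7 8 while the
ADDHNB/ADDHNT blend prints 1 5 2 6 3 7 4 8, i.e. the TRN1-style interleave
the comment above describes.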
  

Patch

--- a/gcc/config/aarch64/aarch64-sve2.md
+++ b/gcc/config/aarch64/aarch64-sve2.md
@@ -1600,6 +1600,25 @@  (define_insn "@aarch64_sve_<sve_int_op><mode>"
   "<sve_int_op>\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
 )
 
+(define_insn_and_split "*aarch64_sve_pack_<sve_int_op><mode>"
+  [(set (match_operand:<VNARROW> 0 "register_operand" "=w")
+	(unspec:<VNARROW>
+	  [(match_operand:SVE_FULL_HSDI 1 "register_operand" "w")
+	   (subreg:SVE_FULL_HSDI (unspec:<VNARROW>
+	     [(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
+	      (match_operand:SVE_FULL_HSDI 3 "register_operand" "w")]
+	     SVE2_INT_BINARY_NARROWB) 0)]
+	  UNSPEC_PACK))]
+  "TARGET_SVE2"
+  "#"
+  "&& true"
+  [(const_int 0)]
+{
+  rtx tmp = lowpart_subreg (<VNARROW>mode, operands[1], <MODE>mode);
+  emit_insn (gen_aarch64_sve (<SVE2_INT_BINARY_NARROWB:binary_top>, <MODE>mode,
+			      operands[0], tmp, operands[2], operands[3]));
+})
+
 ;; -------------------------------------------------------------------------
 ;; ---- [INT] Narrowing right shifts
 ;; -------------------------------------------------------------------------
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 0dd9dc66f7ccd78acacb759662d0cd561cd5b4ef..37d8161a33b1c399d80be82afa67613a087389d4 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -3589,6 +3589,11 @@  (define_int_attr brk_op [(UNSPEC_BRKA "a") (UNSPEC_BRKB "b")
 
 (define_int_attr sve_pred_op [(UNSPEC_PFIRST "pfirst") (UNSPEC_PNEXT "pnext")])
 
+(define_int_attr binary_top [(UNSPEC_ADDHNB "UNSPEC_ADDHNT")
+			     (UNSPEC_RADDHNB "UNSPEC_RADDHNT")
+			     (UNSPEC_RSUBHNB "UNSPEC_RSUBHNT")
+			     (UNSPEC_SUBHNB "UNSPEC_SUBHNT")])
+
 (define_int_attr sve_int_op [(UNSPEC_ADCLB "adclb")
 			     (UNSPEC_ADCLT "adclt")
 			     (UNSPEC_ADDHNB "addhnb")
diff --git a/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
new file mode 100644
index 0000000000000000000000000000000000000000..0df08bda6fd3e33280307ea15c82dd9726897cfd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-div-bitmask-4.c
@@ -0,0 +1,26 @@ 
+/* { dg-require-effective-target vect_int } */
+/* { dg-additional-options "-fno-vect-cost-model" { target aarch64*-*-* } } */
+
+#include <stdint.h>
+#include "tree-vect.h"
+
+#define N 50
+#define TYPE uint32_t
+
+__attribute__((noipa, noinline, optimize("O1")))
+void fun1(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+__attribute__((noipa, noinline, optimize("O3")))
+void fun2(TYPE* restrict pixel, TYPE level, int n)
+{
+  for (int i = 0; i < n; i+=1)
+    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}
+
+#include "vect-div-bitmask.h"
+
+/* { dg-final { scan-tree-dump-not "vect_recog_divmod_pattern: detected" "vect" { target aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
new file mode 100644
index 0000000000000000000000000000000000000000..cddcebdf15ecaa9dc515f58cdbced36c8038db1b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve2/div-by-bitmask_2.c
@@ -0,0 +1,56 @@ 
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdint.h>
+
+/*
+** draw_bitmap1:
+** ...
+** 	addhnb	z6.b, z0.h, z4.h
+** 	addhnb	z5.b, z1.h, z4.h
+** 	addhnb	z0.b, z0.h, z6.h
+** 	addhnt	z0.b, z1.h, z5.h
+** ...
+*/
+void draw_bitmap1(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] += (pixel[i] * level) / 0xff;
+}
+
+void draw_bitmap2(uint8_t* restrict pixel, uint8_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] += (pixel[i] * level) / 0xfe;
+}
+
+/*
+** draw_bitmap3:
+** ...
+** 	addhnb	z6.h, z0.s, z4.s
+** 	addhnb	z5.h, z1.s, z4.s
+** 	addhnb	z0.h, z0.s, z6.s
+** 	addhnt	z0.h, z1.s, z5.s
+** ...
+*/
+void draw_bitmap3(uint16_t* restrict pixel, uint16_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] += (pixel[i] * level) / 0xffffU;
+}
+
+/*
+** draw_bitmap4:
+** ...
+** 	addhnb	z6.s, z0.d, z4.d
+** 	addhnb	z5.s, z1.d, z4.d
+** 	addhnb	z0.s, z0.d, z6.d
+** 	addhnt	z0.s, z1.d, z5.d
+** ...
+*/
+void draw_bitmap4(uint32_t* restrict pixel, uint32_t level, int n)
+{
+  for (int i = 0; i < (n & -16); i+=1)
+    pixel[i] += (pixel[i] * (uint64_t)level) / 0xffffffffUL;
+}