diff mbox series

[v2,rs6000] Add multiply-add expand pattern [PR103109]

Message ID	9c7df6aa-c768-114e-4b1e-df6ce65a63b9@linux.ibm.com
State	New
Headers	DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 24E3938582B3 Message-ID: <9c7df6aa-c768-114e-4b1e-df6ce65a63b9@linux.ibm.com> Date: Mon, 8 Aug 2022 14:04:07 +0800 MIME-Version: 1.0 User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Thunderbird/91.12.0 Content-Language: en-US To: gcc-patches <gcc-patches@gcc.gnu.org> Subject: [PATCH v2, rs6000] Add multiply-add expand pattern [PR103109] Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 7bit Precedence: list From: HAO CHEN GUI via Gcc-patches <gcc-patches@gcc.gnu.org> Reply-To: HAO CHEN GUI <guihaoc@linux.ibm.com> Cc: Peter Bergner <bergner@linux.ibm.com>, David <dje.gcc@gmail.com>, Segher Boessenkool <segher@kernel.crashing.org> Errors-To: gcc-patches-bounces+patchwork=sourceware.org@gcc.gnu.org Sender: "Gcc-patches" <gcc-patches-bounces+patchwork=sourceware.org@gcc.gnu.org>
Series	[v2,rs6000] Add multiply-add expand pattern [PR103109] \| [v2,rs6000] Add multiply-add expand pattern [PR103109]

Commit Message

HAO CHEN GUI Aug. 8, 2022, 6:04 a.m. UTC

  Hi,
  This patch adds an expand and several insns for multiply-add with three
64bit operands.

  Compared with last version, the main changes are:
1 The "maddld" pattern is reused for the low-part generation.
2 A runnable testcase replaces the original compiling case.
3 Fixes indention problems.

  Bootstrapped and tested on powerpc64-linux BE and LE with no regressions.
Is this okay for trunk? Any recommendations? Thanks a lot.

ChangeLog
2022-08-08  Haochen Gui  <guihaoc@linux.ibm.com>

gcc/
	PR target/103109
	* config/rs6000/rs6000.md (<u>maddditi4): New pattern for multiply-add.
	(<u>madddi4_highpart): New.
	(<u>madddi4_highpart_le): New.

gcc/testsuite/
	PR target/103109
	* gcc.target/powerpc/pr103109.c: New.



patch.diff

Comments

Kewen.Lin Aug. 9, 2022, 3:14 a.m. UTC | #1

Hi Haochen,

Thanks for the patch.

on 2022/8/8 14:04, HAO CHEN GUI wrote:
> Hi,
>   This patch adds an expand and several insns for multiply-add with three
> 64bit operands.
> 
>   Compared with last version, the main changes are:
> 1 The "maddld" pattern is reused for the low-part generation.
> 2 A runnable testcase replaces the original compiling case.
> 3 Fixes indention problems.
> 
>   Bootstrapped and tested on powerpc64-linux BE and LE with no regressions.
> Is this okay for trunk? Any recommendations? Thanks a lot.
> 
> ChangeLog
> 2022-08-08  Haochen Gui  <guihaoc@linux.ibm.com>
> 
> gcc/
> 	PR target/103109
> 	* config/rs6000/rs6000.md (<u>maddditi4): New pattern for multiply-add.
> 	(<u>madddi4_highpart): New.
> 	(<u>madddi4_highpart_le): New.
> 
> gcc/testsuite/
> 	PR target/103109
> 	* gcc.target/powerpc/pr103109.c: New.
> 
> 
> 
> patch.diff
> diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
> index c55ee7e171a..4c58023490a 100644
> --- a/gcc/config/rs6000/rs6000.md
> +++ b/gcc/config/rs6000/rs6000.md
> @@ -3217,7 +3217,7 @@ (define_expand "<u>mul<mode><dmode>3"
>    DONE;
>  })
> 
> -(define_insn "*maddld<mode>4"
> +(define_insn "maddld<mode>4"
>    [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
>  	(plus:GPR (mult:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
>  			    (match_operand:GPR 2 "gpc_reg_operand" "r"))
> @@ -3226,6 +3226,52 @@ (define_insn "*maddld<mode>4"
>    "maddld %0,%1,%2,%3"
>    [(set_attr "type" "mul")])
> 
> +(define_expand "<u>maddditi4"
> +  [(set (match_operand:TI 0 "gpc_reg_operand")
> +	(plus:TI
> +	  (mult:TI (any_extend:TI (match_operand:DI 1 "gpc_reg_operand"))
> +		   (any_extend:TI (match_operand:DI 2 "gpc_reg_operand")))
> +	  (any_extend:TI (match_operand:DI 3 "gpc_reg_operand"))))]
> +  "TARGET_MADDLD && TARGET_POWERPC64"
> +{
> +  rtx op0_lo = gen_rtx_SUBREG (DImode, operands[0], BYTES_BIG_ENDIAN ? 8 : 0);
> +  rtx op0_hi = gen_rtx_SUBREG (DImode, operands[0], BYTES_BIG_ENDIAN ? 0 : 8);
> +
> +  emit_insn (gen_maddlddi4 (op0_lo, operands[1], operands[2], operands[3]));
> +
> +  if (BYTES_BIG_ENDIAN)
> +    emit_insn (gen_<u>madddi4_highpart (op0_hi, operands[1], operands[2],
> +					operands[3]));
> +  else
> +    emit_insn (gen_<u>madddi4_highpart_le (op0_hi, operands[1], operands[2],
> +					   operands[3]));
> +  DONE;
> +})
> +
> +(define_insn "<u>madddi4_highpart"
> +  [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
> +	(subreg:DI
> +	  (plus:TI
> +	    (mult:TI (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
> +		     (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
> +	    (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r")))
> +	 0))]
> +  "TARGET_MADDLD && BYTES_BIG_ENDIAN && TARGET_POWERPC64"
> +  "maddhd<u> %0,%1,%2,%3"
> +  [(set_attr "type" "mul")])
> +
> +(define_insn "<u>madddi4_highpart_le"
> +  [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
> +	(subreg:DI
> +	  (plus:TI
> +	    (mult:TI (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
> +		     (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
> +	    (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r")))
> +	 8))]
> +  "TARGET_MADDLD && !BYTES_BIG_ENDIAN && TARGET_POWERPC64"
> +  "maddhd<u> %0,%1,%2,%3"
> +  [(set_attr "type" "mul")])
> +
>  (define_insn "udiv<mode>3"
>    [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
>          (udiv:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr103109.c b/gcc/testsuite/gcc.target/powerpc/pr103109.c
> new file mode 100644
> index 00000000000..969b9751b21
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr103109.c
> @@ -0,0 +1,110 @@
> +/* { dg-do run { target { has_arch_ppc64 } } } */
> +/* { dg-options "-O2 -mdejagnu-cpu=power9 -save-temps" } */
> +/* { dg-require-effective-target int128 } */
> +/* { dg-require-effective-target p9modulo_hw } */
> +/* { dg-final { scan-assembler-times {\mmaddld\M} 2 } } */
> +/* { dg-final { scan-assembler-times {\mmaddhd\M} 1 } } */
> +/* { dg-final { scan-assembler-times {\mmaddhdu\M} 1 } } */
> +

Maybe it's good to split this case into two, one for compiling and the other for running.
Since the generated asm is a test point here, with one separated case for compiling, we
can still have that part of test coverage on hosts which are unable to run this case.
You can move functions multiply_add and multiply_addu into one common header file, then
include it in both source files.

> +union U {
> +  __int128 i128;
> +  struct {
> +    long l1;
> +    long l2;
> +  } s;
> +};
> +
> +__int128
> +create_i128 (long most_sig, long least_sig)
> +{
> +  union U u;
> +
> +#if __LITTLE_ENDIAN__
> +  u.s.l1 = least_sig;
> +  u.s.l2 = most_sig;
> +#else
> +  u.s.l1 = most_sig;
> +  u.s.l2 = least_sig;
> +#endif
> +  return u.i128;
> +}
> +
> +
> +#define DEBUG 0
> +
> +#if DEBUG
> +#include <stdio.h>
> +#include <stdlib.h>
> +
> +void print_i128(__int128 val, int unsignedp)
> +{
> +  if (unsignedp)
> +    printf(" %llu ", (unsigned long long)(val >> 64));
> +  else
> +    printf(" %lld ", (signed long long)(val >> 64));
> +
> +  printf("%llu (0x%llx %llx)",
> +         (unsigned long long)(val & 0xFFFFFFFFFFFFFFFF),
> +         (unsigned long long)(val >> 64),
> +         (unsigned long long)(val & 0xFFFFFFFFFFFFFFFF));
> +}
> +#endif
> +
> +void abort (void);
> +
> +__attribute__((noinline))
> +__int128 multiply_add (long a, long b, long c)
> +{
> +  return (__int128) a * b + c;
> +}
> +
> +__attribute__((noinline))
> +unsigned __int128 multiply_addu (unsigned long a, unsigned long b,
> +				 unsigned long c)
> +{
> +  return (unsigned __int128) a * b + c;
> +}
> +
> +int main ()
> +{
> +  long a = 0xFEDCBA9876543210L;
> +  long b = 0x1000000L;
> +  long c = 0x123456L;
> +  __int128 expected_result = create_i128 (0xFFFFFFFFFFFEDCBAL,
> +					  0x9876543210123456L);
> +
> +  __int128 result = multiply_add (a, b, c);
> +
> +  if (result != expected_result)
> +    {
> +#if DEBUG
> +      printf ("ERROR: multiply_add (%lld, %lld, %lld) = ", a, b, c);
> +      print_i128 (result, 0);
> +      printf ("\n does not match expected_result = ");
> +      print_i128 (expected_result, 0);
> +      printf ("\n\n");
> +#else
> +      abort();
> +#endif
> +    }
> +
> +  unsigned long au = 0xFEDCBA9876543210UL;
> +  unsigned long bu = 0x1000000UL;
> +  unsigned long cu = 0x123456UL;
> +  unsigned __int128 expected_resultu = create_i128 (0x0000000000FEDCBAL,
> +						    0x9876543210123456L);
> +
> +  unsigned __int128 resultu = multiply_addu (au, bu, cu);
> +  if (resultu != expected_resultu)
> +    {
> +#if DEBUG
> +      printf ("ERROR: multiply_addu (%llu, %llu, %llu) = ", au, bu, cu);
> +      print_i128 (resultu, 1);
> +      printf ("\n does not match expected_result = ");
> +      print_i128 (expected_resultu, 1);
> +      printf ("\n\n");
> +#else
> +      abort();
> +#endif

Nit: better to add one explicit "return 0;" to avoid possible warning.

BR,
Kewen
> +    }
> +}

Segher Boessenkool Aug. 9, 2022, 9:34 p.m. UTC | #2

On Tue, Aug 09, 2022 at 11:14:16AM +0800, Kewen.Lin wrote:
> on 2022/8/8 14:04, HAO CHEN GUI wrote:
> > +/* { dg-do run { target { has_arch_ppc64 } } } */
> > +/* { dg-options "-O2 -mdejagnu-cpu=power9 -save-temps" } */
> > +/* { dg-require-effective-target int128 } */
> > +/* { dg-require-effective-target p9modulo_hw } */
> > +/* { dg-final { scan-assembler-times {\mmaddld\M} 2 } } */
> > +/* { dg-final { scan-assembler-times {\mmaddhd\M} 1 } } */
> > +/* { dg-final { scan-assembler-times {\mmaddhdu\M} 1 } } */
> > +
> 
> Maybe it's good to split this case into two, one for compiling and the other for running.
> Since the generated asm is a test point here, with one separated case for compiling, we
> can still have that part of test coverage on hosts which are unable to run this case.
> You can move functions multiply_add and multiply_addu into one common header file, then
> include it in both source files.

Yeah, good point.  You cannot make dg-do do different things on
different targets.  Fortunatelt just duplicating this test and then
removing the things not relevant to run resp. compile testing makes
things even more clear :-)

> Nit: better to add one explicit "return 0;" to avoid possible warning.

This is in main(), the C standard requires this to work without return
(and it is common).  But, before C99 the implicit return value from
main() was undefined, so yes, it could warn then.  Does it?


Segher

Segher Boessenkool Aug. 9, 2022, 9:43 p.m. UTC | #3

Hi!

On Mon, Aug 08, 2022 at 02:04:07PM +0800, HAO CHEN GUI wrote:
>   This patch adds an expand and several insns for multiply-add with three
> 64bit operands.

Also for maddld for 32-bit operands.

>    "maddld %0,%1,%2,%3"
>    [(set_attr "type" "mul")])

I suppose attr "size" isn't relevant for any of the cpus that implement
these instructions?

Okay for trunk.  Thanks!

(The testcase improvements can be done later).

Segher

HAO CHEN GUI Aug. 10, 2022, 5:20 a.m. UTC | #4

Hi Segher,
  Thanks for your comments. I checked the cost table. For P9 and P10, the
cost of all mul* insn is the same, not relevant to the size of operand.

  I will split the test case to one compiling and one runnable case.

Thanks.
Gui Haochen

On 10/8/2022 上午 5:43, Segher Boessenkool wrote:
> Hi!
> 
> On Mon, Aug 08, 2022 at 02:04:07PM +0800, HAO CHEN GUI wrote:
>>   This patch adds an expand and several insns for multiply-add with three
>> 64bit operands.
> 
> Also for maddld for 32-bit operands.
> 
>>    "maddld %0,%1,%2,%3"
>>    [(set_attr "type" "mul")])
> 
> I suppose attr "size" isn't relevant for any of the cpus that implement
> these instructions?
> 
> Okay for trunk.  Thanks!
> 
> (The testcase improvements can be done later).
> 
> 
> Segher

Kewen.Lin Aug. 10, 2022, 7:37 a.m. UTC | #5

on 2022/8/10 05:34, Segher Boessenkool wrote:
> On Tue, Aug 09, 2022 at 11:14:16AM +0800, Kewen.Lin wrote:
>> on 2022/8/8 14:04, HAO CHEN GUI wrote:
>>> +/* { dg-do run { target { has_arch_ppc64 } } } */
>>> +/* { dg-options "-O2 -mdejagnu-cpu=power9 -save-temps" } */
>>> +/* { dg-require-effective-target int128 } */
>>> +/* { dg-require-effective-target p9modulo_hw } */
>>> +/* { dg-final { scan-assembler-times {\mmaddld\M} 2 } } */
>>> +/* { dg-final { scan-assembler-times {\mmaddhd\M} 1 } } */
>>> +/* { dg-final { scan-assembler-times {\mmaddhdu\M} 1 } } */
>>> +
>>
>> Maybe it's good to split this case into two, one for compiling and the other for running.
>> Since the generated asm is a test point here, with one separated case for compiling, we
>> can still have that part of test coverage on hosts which are unable to run this case.
>> You can move functions multiply_add and multiply_addu into one common header file, then
>> include it in both source files.
> 
> Yeah, good point.  You cannot make dg-do do different things on
> different targets.  Fortunatelt just duplicating this test and then
> removing the things not relevant to run resp. compile testing makes
> things even more clear :-)
> 
>> Nit: better to add one explicit "return 0;" to avoid possible warning.
> 
> This is in main(), the C standard requires this to work without return
> (and it is common).  But, before C99 the implicit return value from
> main() was undefined, so yes, it could warn then.  Does it?
> 

Yes, exactly, with explicit -std=c89 -Wreturn-type it will have a warning:

warning: control reaches end of non-void function...

BR,
Kewen

diff mbox series

Patch

diff --git a/gcc/config/rs6000/rs6000.md b/gcc/config/rs6000/rs6000.md
index c55ee7e171a..4c58023490a 100644
--- a/gcc/config/rs6000/rs6000.md
+++ b/gcc/config/rs6000/rs6000.md
@@ -3217,7 +3217,7 @@  (define_expand "<u>mul<mode><dmode>3"
   DONE;
 })

-(define_insn "*maddld<mode>4"
+(define_insn "maddld<mode>4"
   [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
 	(plus:GPR (mult:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
 			    (match_operand:GPR 2 "gpc_reg_operand" "r"))
@@ -3226,6 +3226,52 @@  (define_insn "*maddld<mode>4"
   "maddld %0,%1,%2,%3"
   [(set_attr "type" "mul")])

+(define_expand "<u>maddditi4"
+  [(set (match_operand:TI 0 "gpc_reg_operand")
+	(plus:TI
+	  (mult:TI (any_extend:TI (match_operand:DI 1 "gpc_reg_operand"))
+		   (any_extend:TI (match_operand:DI 2 "gpc_reg_operand")))
+	  (any_extend:TI (match_operand:DI 3 "gpc_reg_operand"))))]
+  "TARGET_MADDLD && TARGET_POWERPC64"
+{
+  rtx op0_lo = gen_rtx_SUBREG (DImode, operands[0], BYTES_BIG_ENDIAN ? 8 : 0);
+  rtx op0_hi = gen_rtx_SUBREG (DImode, operands[0], BYTES_BIG_ENDIAN ? 0 : 8);
+
+  emit_insn (gen_maddlddi4 (op0_lo, operands[1], operands[2], operands[3]));
+
+  if (BYTES_BIG_ENDIAN)
+    emit_insn (gen_<u>madddi4_highpart (op0_hi, operands[1], operands[2],
+					operands[3]));
+  else
+    emit_insn (gen_<u>madddi4_highpart_le (op0_hi, operands[1], operands[2],
+					   operands[3]));
+  DONE;
+})
+
+(define_insn "<u>madddi4_highpart"
+  [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
+	(subreg:DI
+	  (plus:TI
+	    (mult:TI (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
+		     (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
+	    (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r")))
+	 0))]
+  "TARGET_MADDLD && BYTES_BIG_ENDIAN && TARGET_POWERPC64"
+  "maddhd<u> %0,%1,%2,%3"
+  [(set_attr "type" "mul")])
+
+(define_insn "<u>madddi4_highpart_le"
+  [(set (match_operand:DI 0 "gpc_reg_operand" "=r")
+	(subreg:DI
+	  (plus:TI
+	    (mult:TI (any_extend:TI (match_operand:DI 1 "gpc_reg_operand" "r"))
+		     (any_extend:TI (match_operand:DI 2 "gpc_reg_operand" "r")))
+	    (any_extend:TI (match_operand:DI 3 "gpc_reg_operand" "r")))
+	 8))]
+  "TARGET_MADDLD && !BYTES_BIG_ENDIAN && TARGET_POWERPC64"
+  "maddhd<u> %0,%1,%2,%3"
+  [(set_attr "type" "mul")])
+
 (define_insn "udiv<mode>3"
   [(set (match_operand:GPR 0 "gpc_reg_operand" "=r")
         (udiv:GPR (match_operand:GPR 1 "gpc_reg_operand" "r")
diff --git a/gcc/testsuite/gcc.target/powerpc/pr103109.c b/gcc/testsuite/gcc.target/powerpc/pr103109.c
new file mode 100644
index 00000000000..969b9751b21
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr103109.c
@@ -0,0 +1,110 @@ 
+/* { dg-do run { target { has_arch_ppc64 } } } */
+/* { dg-options "-O2 -mdejagnu-cpu=power9 -save-temps" } */
+/* { dg-require-effective-target int128 } */
+/* { dg-require-effective-target p9modulo_hw } */
+/* { dg-final { scan-assembler-times {\mmaddld\M} 2 } } */
+/* { dg-final { scan-assembler-times {\mmaddhd\M} 1 } } */
+/* { dg-final { scan-assembler-times {\mmaddhdu\M} 1 } } */
+
+union U {
+  __int128 i128;
+  struct {
+    long l1;
+    long l2;
+  } s;
+};
+
+__int128
+create_i128 (long most_sig, long least_sig)
+{
+  union U u;
+
+#if __LITTLE_ENDIAN__
+  u.s.l1 = least_sig;
+  u.s.l2 = most_sig;
+#else
+  u.s.l1 = most_sig;
+  u.s.l2 = least_sig;
+#endif
+  return u.i128;
+}
+
+
+#define DEBUG 0
+
+#if DEBUG
+#include <stdio.h>
+#include <stdlib.h>
+
+void print_i128(__int128 val, int unsignedp)
+{
+  if (unsignedp)
+    printf(" %llu ", (unsigned long long)(val >> 64));
+  else
+    printf(" %lld ", (signed long long)(val >> 64));
+
+  printf("%llu (0x%llx %llx)",
+         (unsigned long long)(val & 0xFFFFFFFFFFFFFFFF),
+         (unsigned long long)(val >> 64),
+         (unsigned long long)(val & 0xFFFFFFFFFFFFFFFF));
+}
+#endif
+
+void abort (void);
+
+__attribute__((noinline))
+__int128 multiply_add (long a, long b, long c)
+{
+  return (__int128) a * b + c;
+}
+
+__attribute__((noinline))
+unsigned __int128 multiply_addu (unsigned long a, unsigned long b,
+				 unsigned long c)
+{
+  return (unsigned __int128) a * b + c;
+}
+
+int main ()
+{
+  long a = 0xFEDCBA9876543210L;
+  long b = 0x1000000L;
+  long c = 0x123456L;
+  __int128 expected_result = create_i128 (0xFFFFFFFFFFFEDCBAL,
+					  0x9876543210123456L);
+
+  __int128 result = multiply_add (a, b, c);
+
+  if (result != expected_result)
+    {
+#if DEBUG
+      printf ("ERROR: multiply_add (%lld, %lld, %lld) = ", a, b, c);
+      print_i128 (result, 0);
+      printf ("\n does not match expected_result = ");
+      print_i128 (expected_result, 0);
+      printf ("\n\n");
+#else
+      abort();
+#endif
+    }
+
+  unsigned long au = 0xFEDCBA9876543210UL;
+  unsigned long bu = 0x1000000UL;
+  unsigned long cu = 0x123456UL;
+  unsigned __int128 expected_resultu = create_i128 (0x0000000000FEDCBAL,
+						    0x9876543210123456L);
+
+  unsigned __int128 resultu = multiply_addu (au, bu, cu);
+  if (resultu != expected_resultu)
+    {
+#if DEBUG
+      printf ("ERROR: multiply_addu (%llu, %llu, %llu) = ", au, bu, cu);
+      print_i128 (resultu, 1);
+      printf ("\n does not match expected_result = ");
+      print_i128 (expected_resultu, 1);
+      printf ("\n\n");
+#else
+      abort();
+#endif
+    }
+}