[PR/target,105666] RISC-V: Inhibit FP <--> int register moves via tune param

Message ID 20220523181209.2208136-1-vineetg@rivosinc.com
State Committed
Commit b646d7d279ae0c0d35564542d09866bf3e8afac0
Headers
Series [PR/target,105666] RISC-V: Inhibit FP <--> int register moves via tune param |

Commit Message

Vineet Gupta May 23, 2022, 6:12 p.m. UTC
  Under extreme register pressure, compiler can use FP <--> int
moves as a cheap alternate to spilling to memory.
This was seen with SPEC2017 FP benchmark 507.cactu:
ML_BSSN_Advect.cc:ML_BSSN_Advect_Body()

|	fmv.d.x	fa5,s9	# PDupwindNthSymm2Xt1, PDupwindNthSymm2Xt1
| .LVL325:
|	ld	s9,184(sp)		# _12469, %sfp
| ...
| .LVL339:
|	fmv.x.d	s4,fa5	# PDupwindNthSymm2Xt1, PDupwindNthSymm2Xt1
|

The FMV instructions could be costlier (than stack spill) on certain
micro-architectures, thus this needs to be a per-cpu tunable
(default being to inhibit on all existing RV cpus).

Testsuite run with new test reports 10 failures without the fix
corresponding to the build variations of pr105666.c

| 		=== gcc Summary ===
|
| # of expected passes		123318   (+10)
| # of unexpected failures	34       (-10)
| # of unexpected successes	4
| # of expected failures	780
| # of unresolved testcases	4
| # of unsupported tests	2796

gcc/Changelog:

	* config/riscv/riscv.cc: (struct riscv_tune_param): Add
	  fmv_cost.
	(rocket_tune_info): Add default fmv_cost 8.
	(sifive_7_tune_info): Ditto.
	(thead_c906_tune_info): Ditto.
	(optimize_size_tune_info): Ditto.
	(riscv_register_move_cost): Use fmv_cost for int<->fp moves.

gcc/testsuite/Changelog:

	* gcc.target/riscv/pr105666.c: New test.

Signed-off-by: Vineet Gupta <vineetg@rivosinc.com>
---
 gcc/config/riscv/riscv.cc                 |  9 ++++
 gcc/testsuite/gcc.target/riscv/pr105666.c | 55 +++++++++++++++++++++++
 2 files changed, 64 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/riscv/pr105666.c
  

Comments

Philipp Tomsich May 23, 2022, 7:37 p.m. UTC | #1
Good catch!

On Mon, 23 May 2022 at 20:12, Vineet Gupta <vineetg@rivosinc.com> wrote:

> Under extreme register pressure, compiler can use FP <--> int
> moves as a cheap alternate to spilling to memory.
> This was seen with SPEC2017 FP benchmark 507.cactu:
> ML_BSSN_Advect.cc:ML_BSSN_Advect_Body()
>
> |       fmv.d.x fa5,s9  # PDupwindNthSymm2Xt1, PDupwindNthSymm2Xt1
> | .LVL325:
> |       ld      s9,184(sp)              # _12469, %sfp
> | ...
> | .LVL339:
> |       fmv.x.d s4,fa5  # PDupwindNthSymm2Xt1, PDupwindNthSymm2Xt1
> |
>
> The FMV instructions could be costlier (than stack spill) on certain
> micro-architectures, thus this needs to be a per-cpu tunable
> (default being to inhibit on all existing RV cpus).
>
> Testsuite run with new test reports 10 failures without the fix
> corresponding to the build variations of pr105666.c
>
> |               === gcc Summary ===
> |
> | # of expected passes          123318   (+10)
> | # of unexpected failures      34       (-10)
> | # of unexpected successes     4
> | # of expected failures        780
> | # of unresolved testcases     4
> | # of unsupported tests        2796
>
> gcc/Changelog:
>
>         * config/riscv/riscv.cc: (struct riscv_tune_param): Add
>           fmv_cost.
>         (rocket_tune_info): Add default fmv_cost 8.
>         (sifive_7_tune_info): Ditto.
>         (thead_c906_tune_info): Ditto.
>         (optimize_size_tune_info): Ditto.
>         (riscv_register_move_cost): Use fmv_cost for int<->fp moves.
>
> gcc/testsuite/Changelog:
>
>         * gcc.target/riscv/pr105666.c: New test.
>
> Signed-off-by: Vineet Gupta <vineetg@rivosinc.com>
> ---
>  gcc/config/riscv/riscv.cc                 |  9 ++++
>  gcc/testsuite/gcc.target/riscv/pr105666.c | 55 +++++++++++++++++++++++
>  2 files changed, 64 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/riscv/pr105666.c
>
> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> index ee756aab6940..f3ac0d8865f0 100644
> --- a/gcc/config/riscv/riscv.cc
> +++ b/gcc/config/riscv/riscv.cc
> @@ -220,6 +220,7 @@ struct riscv_tune_param
>    unsigned short issue_rate;
>    unsigned short branch_cost;
>    unsigned short memory_cost;
> +  unsigned short fmv_cost;
>    bool slow_unaligned_access;
>  };
>
> @@ -285,6 +286,7 @@ static const struct riscv_tune_param rocket_tune_info
> = {
>    1,                                           /* issue_rate */
>    3,                                           /* branch_cost */
>    5,                                           /* memory_cost */
> +  8,                                           /* fmv_cost */
>    true,                                                /*
> slow_unaligned_access */
>  };
>
> @@ -298,6 +300,7 @@ static const struct riscv_tune_param
> sifive_7_tune_info = {
>    2,                                           /* issue_rate */
>    4,                                           /* branch_cost */
>    3,                                           /* memory_cost */
> +  8,                                           /* fmv_cost */
>    true,                                                /*
> slow_unaligned_access */
>  };
>
> @@ -311,6 +314,7 @@ static const struct riscv_tune_param
> thead_c906_tune_info = {
>    1,            /* issue_rate */
>    3,            /* branch_cost */
>    5,            /* memory_cost */
> +  8,           /* fmv_cost */
>    false,            /* slow_unaligned_access */
>  };
>
> @@ -324,6 +328,7 @@ static const struct riscv_tune_param
> optimize_size_tune_info = {
>    1,                                           /* issue_rate */
>    1,                                           /* branch_cost */
>    2,                                           /* memory_cost */
> +  8,                                           /* fmv_cost */
>    false,                                       /* slow_unaligned_access */
>  };
>
> @@ -4737,6 +4742,10 @@ static int
>  riscv_register_move_cost (machine_mode mode,
>                           reg_class_t from, reg_class_t to)
>  {
> +  if ((from == FP_REGS && to == GR_REGS) ||
> +      (from == GR_REGS && to == FP_REGS))
> +    return tune_param->fmv_cost;
> +
>    return riscv_secondary_memory_needed (mode, from, to) ? 8 : 2;
>  }
>
> diff --git a/gcc/testsuite/gcc.target/riscv/pr105666.c
> b/gcc/testsuite/gcc.target/riscv/pr105666.c
> new file mode 100644
> index 000000000000..904f3bc0763f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/riscv/pr105666.c
> @@ -0,0 +1,55 @@
> +/* Shamelessly plugged off
> gcc/testsuite/gcc.c-torture/execute/pr28982a.c.
> +
> +   The idea is to induce high register pressure for both int/fp registers
> +   so that they spill. By default FMV instructions would be used to stash
> +   int reg to a fp reg (and vice-versa) but that could be costlier than
> +   spilling to stack.  */
> +
> +/* { dg-do compile } */
> +/* { dg-options "-march=rv64g -ffast-math" } */
> +
> +#define NITER 4
> +#define NVARS 20
> +#define MULTI(X) \
> +  X( 0), X( 1), X( 2), X( 3), X( 4), X( 5), X( 6), X( 7), X( 8), X( 9), \
> +  X(10), X(11), X(12), X(13), X(14), X(15), X(16), X(17), X(18), X(19)
> +
> +#define DECLAREI(INDEX) inc##INDEX = incs[INDEX]
> +#define DECLAREF(INDEX) *ptr##INDEX = ptrs[INDEX], result##INDEX = 5
> +#define LOOP(INDEX) result##INDEX += result##INDEX * (*ptr##INDEX),
> ptr##INDEX += inc##INDEX
> +#define COPYOUT(INDEX) results[INDEX] = result##INDEX
> +
> +double *ptrs[NVARS];
> +double results[NVARS];
> +int incs[NVARS];
> +
> +void __attribute__((noinline))
> +foo (int n)
> +{
> +  int MULTI (DECLAREI);
> +  double MULTI (DECLAREF);
> +  while (n--)
> +    MULTI (LOOP);
> +  MULTI (COPYOUT);
> +}
> +
> +double input[NITER * NVARS];
> +
> +int
> +main (void)
> +{
> +  int i;
> +
> +  for (i = 0; i < NVARS; i++)
> +    ptrs[i] = input + i, incs[i] = i;
> +  for (i = 0; i < NITER * NVARS; i++)
> +    input[i] = i;
> +  foo (NITER);
> +  for (i = 0; i < NVARS; i++)
> +    if (results[i] != i * NITER * (NITER + 1) / 2)
> +      return 1;
> +  return 0;
> +}
> +
> +/* { dg-final { scan-assembler-not "\tfmv\\.d\\.x\t" } } */
> +/* { dg-final { scan-assembler-not "\tfmv\\.x\\.d\t" } } */
> --
> 2.32.0
>
>
  
Kito Cheng May 24, 2022, 7:59 a.m. UTC | #2
Committed, thanks!

On Tue, May 24, 2022 at 3:40 AM Philipp Tomsich
<philipp.tomsich@vrull.eu> wrote:
>
> Good catch!
>
> On Mon, 23 May 2022 at 20:12, Vineet Gupta <vineetg@rivosinc.com> wrote:
>
> > Under extreme register pressure, compiler can use FP <--> int
> > moves as a cheap alternate to spilling to memory.
> > This was seen with SPEC2017 FP benchmark 507.cactu:
> > ML_BSSN_Advect.cc:ML_BSSN_Advect_Body()
> >
> > |       fmv.d.x fa5,s9  # PDupwindNthSymm2Xt1, PDupwindNthSymm2Xt1
> > | .LVL325:
> > |       ld      s9,184(sp)              # _12469, %sfp
> > | ...
> > | .LVL339:
> > |       fmv.x.d s4,fa5  # PDupwindNthSymm2Xt1, PDupwindNthSymm2Xt1
> > |
> >
> > The FMV instructions could be costlier (than stack spill) on certain
> > micro-architectures, thus this needs to be a per-cpu tunable
> > (default being to inhibit on all existing RV cpus).
> >
> > Testsuite run with new test reports 10 failures without the fix
> > corresponding to the build variations of pr105666.c
> >
> > |               === gcc Summary ===
> > |
> > | # of expected passes          123318   (+10)
> > | # of unexpected failures      34       (-10)
> > | # of unexpected successes     4
> > | # of expected failures        780
> > | # of unresolved testcases     4
> > | # of unsupported tests        2796
> >
> > gcc/Changelog:
> >
> >         * config/riscv/riscv.cc: (struct riscv_tune_param): Add
> >           fmv_cost.
> >         (rocket_tune_info): Add default fmv_cost 8.
> >         (sifive_7_tune_info): Ditto.
> >         (thead_c906_tune_info): Ditto.
> >         (optimize_size_tune_info): Ditto.
> >         (riscv_register_move_cost): Use fmv_cost for int<->fp moves.
> >
> > gcc/testsuite/Changelog:
> >
> >         * gcc.target/riscv/pr105666.c: New test.
> >
> > Signed-off-by: Vineet Gupta <vineetg@rivosinc.com>
> > ---
> >  gcc/config/riscv/riscv.cc                 |  9 ++++
> >  gcc/testsuite/gcc.target/riscv/pr105666.c | 55 +++++++++++++++++++++++
> >  2 files changed, 64 insertions(+)
> >  create mode 100644 gcc/testsuite/gcc.target/riscv/pr105666.c
> >
> > diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> > index ee756aab6940..f3ac0d8865f0 100644
> > --- a/gcc/config/riscv/riscv.cc
> > +++ b/gcc/config/riscv/riscv.cc
> > @@ -220,6 +220,7 @@ struct riscv_tune_param
> >    unsigned short issue_rate;
> >    unsigned short branch_cost;
> >    unsigned short memory_cost;
> > +  unsigned short fmv_cost;
> >    bool slow_unaligned_access;
> >  };
> >
> > @@ -285,6 +286,7 @@ static const struct riscv_tune_param rocket_tune_info
> > = {
> >    1,                                           /* issue_rate */
> >    3,                                           /* branch_cost */
> >    5,                                           /* memory_cost */
> > +  8,                                           /* fmv_cost */
> >    true,                                                /*
> > slow_unaligned_access */
> >  };
> >
> > @@ -298,6 +300,7 @@ static const struct riscv_tune_param
> > sifive_7_tune_info = {
> >    2,                                           /* issue_rate */
> >    4,                                           /* branch_cost */
> >    3,                                           /* memory_cost */
> > +  8,                                           /* fmv_cost */
> >    true,                                                /*
> > slow_unaligned_access */
> >  };
> >
> > @@ -311,6 +314,7 @@ static const struct riscv_tune_param
> > thead_c906_tune_info = {
> >    1,            /* issue_rate */
> >    3,            /* branch_cost */
> >    5,            /* memory_cost */
> > +  8,           /* fmv_cost */
> >    false,            /* slow_unaligned_access */
> >  };
> >
> > @@ -324,6 +328,7 @@ static const struct riscv_tune_param
> > optimize_size_tune_info = {
> >    1,                                           /* issue_rate */
> >    1,                                           /* branch_cost */
> >    2,                                           /* memory_cost */
> > +  8,                                           /* fmv_cost */
> >    false,                                       /* slow_unaligned_access */
> >  };
> >
> > @@ -4737,6 +4742,10 @@ static int
> >  riscv_register_move_cost (machine_mode mode,
> >                           reg_class_t from, reg_class_t to)
> >  {
> > +  if ((from == FP_REGS && to == GR_REGS) ||
> > +      (from == GR_REGS && to == FP_REGS))
> > +    return tune_param->fmv_cost;
> > +
> >    return riscv_secondary_memory_needed (mode, from, to) ? 8 : 2;
> >  }
> >
> > diff --git a/gcc/testsuite/gcc.target/riscv/pr105666.c
> > b/gcc/testsuite/gcc.target/riscv/pr105666.c
> > new file mode 100644
> > index 000000000000..904f3bc0763f
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/riscv/pr105666.c
> > @@ -0,0 +1,55 @@
> > +/* Shamelessly plugged off
> > gcc/testsuite/gcc.c-torture/execute/pr28982a.c.
> > +
> > +   The idea is to induce high register pressure for both int/fp registers
> > +   so that they spill. By default FMV instructions would be used to stash
> > +   int reg to a fp reg (and vice-versa) but that could be costlier than
> > +   spilling to stack.  */
> > +
> > +/* { dg-do compile } */
> > +/* { dg-options "-march=rv64g -ffast-math" } */
> > +
> > +#define NITER 4
> > +#define NVARS 20
> > +#define MULTI(X) \
> > +  X( 0), X( 1), X( 2), X( 3), X( 4), X( 5), X( 6), X( 7), X( 8), X( 9), \
> > +  X(10), X(11), X(12), X(13), X(14), X(15), X(16), X(17), X(18), X(19)
> > +
> > +#define DECLAREI(INDEX) inc##INDEX = incs[INDEX]
> > +#define DECLAREF(INDEX) *ptr##INDEX = ptrs[INDEX], result##INDEX = 5
> > +#define LOOP(INDEX) result##INDEX += result##INDEX * (*ptr##INDEX),
> > ptr##INDEX += inc##INDEX
> > +#define COPYOUT(INDEX) results[INDEX] = result##INDEX
> > +
> > +double *ptrs[NVARS];
> > +double results[NVARS];
> > +int incs[NVARS];
> > +
> > +void __attribute__((noinline))
> > +foo (int n)
> > +{
> > +  int MULTI (DECLAREI);
> > +  double MULTI (DECLAREF);
> > +  while (n--)
> > +    MULTI (LOOP);
> > +  MULTI (COPYOUT);
> > +}
> > +
> > +double input[NITER * NVARS];
> > +
> > +int
> > +main (void)
> > +{
> > +  int i;
> > +
> > +  for (i = 0; i < NVARS; i++)
> > +    ptrs[i] = input + i, incs[i] = i;
> > +  for (i = 0; i < NITER * NVARS; i++)
> > +    input[i] = i;
> > +  foo (NITER);
> > +  for (i = 0; i < NVARS; i++)
> > +    if (results[i] != i * NITER * (NITER + 1) / 2)
> > +      return 1;
> > +  return 0;
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "\tfmv\\.d\\.x\t" } } */
> > +/* { dg-final { scan-assembler-not "\tfmv\\.x\\.d\t" } } */
> > --
> > 2.32.0
> >
> >
  
Vineet Gupta May 24, 2022, 7:38 p.m. UTC | #3
On 5/24/22 00:59, Kito Cheng wrote:
> Committed, thanks!

Thx for the quick action Kito,
Can this be backported to gcc 12 as well ?

Thx,
-Vineet

>
> On Tue, May 24, 2022 at 3:40 AM Philipp Tomsich
> <philipp.tomsich@vrull.eu> wrote:
>> Good catch!
>>
>> On Mon, 23 May 2022 at 20:12, Vineet Gupta <vineetg@rivosinc.com> wrote:
>>
>>> Under extreme register pressure, compiler can use FP <--> int
>>> moves as a cheap alternate to spilling to memory.
>>> This was seen with SPEC2017 FP benchmark 507.cactu:
>>> ML_BSSN_Advect.cc:ML_BSSN_Advect_Body()
>>>
>>> |       fmv.d.x fa5,s9  # PDupwindNthSymm2Xt1, PDupwindNthSymm2Xt1
>>> | .LVL325:
>>> |       ld      s9,184(sp)              # _12469, %sfp
>>> | ...
>>> | .LVL339:
>>> |       fmv.x.d s4,fa5  # PDupwindNthSymm2Xt1, PDupwindNthSymm2Xt1
>>> |
>>>
>>> The FMV instructions could be costlier (than stack spill) on certain
>>> micro-architectures, thus this needs to be a per-cpu tunable
>>> (default being to inhibit on all existing RV cpus).
>>>
>>> Testsuite run with new test reports 10 failures without the fix
>>> corresponding to the build variations of pr105666.c
>>>
>>> |               === gcc Summary ===
>>> |
>>> | # of expected passes          123318   (+10)
>>> | # of unexpected failures      34       (-10)
>>> | # of unexpected successes     4
>>> | # of expected failures        780
>>> | # of unresolved testcases     4
>>> | # of unsupported tests        2796
>>>
>>> gcc/Changelog:
>>>
>>>          * config/riscv/riscv.cc: (struct riscv_tune_param): Add
>>>            fmv_cost.
>>>          (rocket_tune_info): Add default fmv_cost 8.
>>>          (sifive_7_tune_info): Ditto.
>>>          (thead_c906_tune_info): Ditto.
>>>          (optimize_size_tune_info): Ditto.
>>>          (riscv_register_move_cost): Use fmv_cost for int<->fp moves.
>>>
>>> gcc/testsuite/Changelog:
>>>
>>>          * gcc.target/riscv/pr105666.c: New test.
>>>
>>> Signed-off-by: Vineet Gupta <vineetg@rivosinc.com>
>>> ---
>>>   gcc/config/riscv/riscv.cc                 |  9 ++++
>>>   gcc/testsuite/gcc.target/riscv/pr105666.c | 55 +++++++++++++++++++++++
>>>   2 files changed, 64 insertions(+)
>>>   create mode 100644 gcc/testsuite/gcc.target/riscv/pr105666.c
>>>
>>> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
>>> index ee756aab6940..f3ac0d8865f0 100644
>>> --- a/gcc/config/riscv/riscv.cc
>>> +++ b/gcc/config/riscv/riscv.cc
>>> @@ -220,6 +220,7 @@ struct riscv_tune_param
>>>     unsigned short issue_rate;
>>>     unsigned short branch_cost;
>>>     unsigned short memory_cost;
>>> +  unsigned short fmv_cost;
>>>     bool slow_unaligned_access;
>>>   };
>>>
>>> @@ -285,6 +286,7 @@ static const struct riscv_tune_param rocket_tune_info
>>> = {
>>>     1,                                           /* issue_rate */
>>>     3,                                           /* branch_cost */
>>>     5,                                           /* memory_cost */
>>> +  8,                                           /* fmv_cost */
>>>     true,                                                /*
>>> slow_unaligned_access */
>>>   };
>>>
>>> @@ -298,6 +300,7 @@ static const struct riscv_tune_param
>>> sifive_7_tune_info = {
>>>     2,                                           /* issue_rate */
>>>     4,                                           /* branch_cost */
>>>     3,                                           /* memory_cost */
>>> +  8,                                           /* fmv_cost */
>>>     true,                                                /*
>>> slow_unaligned_access */
>>>   };
>>>
>>> @@ -311,6 +314,7 @@ static const struct riscv_tune_param
>>> thead_c906_tune_info = {
>>>     1,            /* issue_rate */
>>>     3,            /* branch_cost */
>>>     5,            /* memory_cost */
>>> +  8,           /* fmv_cost */
>>>     false,            /* slow_unaligned_access */
>>>   };
>>>
>>> @@ -324,6 +328,7 @@ static const struct riscv_tune_param
>>> optimize_size_tune_info = {
>>>     1,                                           /* issue_rate */
>>>     1,                                           /* branch_cost */
>>>     2,                                           /* memory_cost */
>>> +  8,                                           /* fmv_cost */
>>>     false,                                       /* slow_unaligned_access */
>>>   };
>>>
>>> @@ -4737,6 +4742,10 @@ static int
>>>   riscv_register_move_cost (machine_mode mode,
>>>                            reg_class_t from, reg_class_t to)
>>>   {
>>> +  if ((from == FP_REGS && to == GR_REGS) ||
>>> +      (from == GR_REGS && to == FP_REGS))
>>> +    return tune_param->fmv_cost;
>>> +
>>>     return riscv_secondary_memory_needed (mode, from, to) ? 8 : 2;
>>>   }
>>>
>>> diff --git a/gcc/testsuite/gcc.target/riscv/pr105666.c
>>> b/gcc/testsuite/gcc.target/riscv/pr105666.c
>>> new file mode 100644
>>> index 000000000000..904f3bc0763f
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/riscv/pr105666.c
>>> @@ -0,0 +1,55 @@
>>> +/* Shamelessly plugged off
>>> gcc/testsuite/gcc.c-torture/execute/pr28982a.c.
>>> +
>>> +   The idea is to induce high register pressure for both int/fp registers
>>> +   so that they spill. By default FMV instructions would be used to stash
>>> +   int reg to a fp reg (and vice-versa) but that could be costlier than
>>> +   spilling to stack.  */
>>> +
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-march=rv64g -ffast-math" } */
>>> +
>>> +#define NITER 4
>>> +#define NVARS 20
>>> +#define MULTI(X) \
>>> +  X( 0), X( 1), X( 2), X( 3), X( 4), X( 5), X( 6), X( 7), X( 8), X( 9), \
>>> +  X(10), X(11), X(12), X(13), X(14), X(15), X(16), X(17), X(18), X(19)
>>> +
>>> +#define DECLAREI(INDEX) inc##INDEX = incs[INDEX]
>>> +#define DECLAREF(INDEX) *ptr##INDEX = ptrs[INDEX], result##INDEX = 5
>>> +#define LOOP(INDEX) result##INDEX += result##INDEX * (*ptr##INDEX),
>>> ptr##INDEX += inc##INDEX
>>> +#define COPYOUT(INDEX) results[INDEX] = result##INDEX
>>> +
>>> +double *ptrs[NVARS];
>>> +double results[NVARS];
>>> +int incs[NVARS];
>>> +
>>> +void __attribute__((noinline))
>>> +foo (int n)
>>> +{
>>> +  int MULTI (DECLAREI);
>>> +  double MULTI (DECLAREF);
>>> +  while (n--)
>>> +    MULTI (LOOP);
>>> +  MULTI (COPYOUT);
>>> +}
>>> +
>>> +double input[NITER * NVARS];
>>> +
>>> +int
>>> +main (void)
>>> +{
>>> +  int i;
>>> +
>>> +  for (i = 0; i < NVARS; i++)
>>> +    ptrs[i] = input + i, incs[i] = i;
>>> +  for (i = 0; i < NITER * NVARS; i++)
>>> +    input[i] = i;
>>> +  foo (NITER);
>>> +  for (i = 0; i < NVARS; i++)
>>> +    if (results[i] != i * NITER * (NITER + 1) / 2)
>>> +      return 1;
>>> +  return 0;
>>> +}
>>> +
>>> +/* { dg-final { scan-assembler-not "\tfmv\\.d\\.x\t" } } */
>>> +/* { dg-final { scan-assembler-not "\tfmv\\.x\\.d\t" } } */
>>> --
>>> 2.32.0
>>>
>>>
  
Kito Cheng June 2, 2022, 2:11 a.m. UTC | #4
I just hesitated for a few days about backporting this, but I think
it's OK to back port because
1. Simple enough
2. Good for general RISC-V core

Committed with your latest testsuite fix.

Thanks!



On Wed, May 25, 2022 at 3:38 AM Vineet Gupta <vineetg@rivosinc.com> wrote:
>
>
>
> On 5/24/22 00:59, Kito Cheng wrote:
> > Committed, thanks!
>
> Thx for the quick action Kito,
> Can this be backported to gcc 12 as well ?
>
> Thx,
> -Vineet
>
> >
> > On Tue, May 24, 2022 at 3:40 AM Philipp Tomsich
> > <philipp.tomsich@vrull.eu> wrote:
> >> Good catch!
> >>
> >> On Mon, 23 May 2022 at 20:12, Vineet Gupta <vineetg@rivosinc.com> wrote:
> >>
> >>> Under extreme register pressure, compiler can use FP <--> int
> >>> moves as a cheap alternate to spilling to memory.
> >>> This was seen with SPEC2017 FP benchmark 507.cactu:
> >>> ML_BSSN_Advect.cc:ML_BSSN_Advect_Body()
> >>>
> >>> |       fmv.d.x fa5,s9  # PDupwindNthSymm2Xt1, PDupwindNthSymm2Xt1
> >>> | .LVL325:
> >>> |       ld      s9,184(sp)              # _12469, %sfp
> >>> | ...
> >>> | .LVL339:
> >>> |       fmv.x.d s4,fa5  # PDupwindNthSymm2Xt1, PDupwindNthSymm2Xt1
> >>> |
> >>>
> >>> The FMV instructions could be costlier (than stack spill) on certain
> >>> micro-architectures, thus this needs to be a per-cpu tunable
> >>> (default being to inhibit on all existing RV cpus).
> >>>
> >>> Testsuite run with new test reports 10 failures without the fix
> >>> corresponding to the build variations of pr105666.c
> >>>
> >>> |               === gcc Summary ===
> >>> |
> >>> | # of expected passes          123318   (+10)
> >>> | # of unexpected failures      34       (-10)
> >>> | # of unexpected successes     4
> >>> | # of expected failures        780
> >>> | # of unresolved testcases     4
> >>> | # of unsupported tests        2796
> >>>
> >>> gcc/Changelog:
> >>>
> >>>          * config/riscv/riscv.cc: (struct riscv_tune_param): Add
> >>>            fmv_cost.
> >>>          (rocket_tune_info): Add default fmv_cost 8.
> >>>          (sifive_7_tune_info): Ditto.
> >>>          (thead_c906_tune_info): Ditto.
> >>>          (optimize_size_tune_info): Ditto.
> >>>          (riscv_register_move_cost): Use fmv_cost for int<->fp moves.
> >>>
> >>> gcc/testsuite/Changelog:
> >>>
> >>>          * gcc.target/riscv/pr105666.c: New test.
> >>>
> >>> Signed-off-by: Vineet Gupta <vineetg@rivosinc.com>
> >>> ---
> >>>   gcc/config/riscv/riscv.cc                 |  9 ++++
> >>>   gcc/testsuite/gcc.target/riscv/pr105666.c | 55 +++++++++++++++++++++++
> >>>   2 files changed, 64 insertions(+)
> >>>   create mode 100644 gcc/testsuite/gcc.target/riscv/pr105666.c
> >>>
> >>> diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
> >>> index ee756aab6940..f3ac0d8865f0 100644
> >>> --- a/gcc/config/riscv/riscv.cc
> >>> +++ b/gcc/config/riscv/riscv.cc
> >>> @@ -220,6 +220,7 @@ struct riscv_tune_param
> >>>     unsigned short issue_rate;
> >>>     unsigned short branch_cost;
> >>>     unsigned short memory_cost;
> >>> +  unsigned short fmv_cost;
> >>>     bool slow_unaligned_access;
> >>>   };
> >>>
> >>> @@ -285,6 +286,7 @@ static const struct riscv_tune_param rocket_tune_info
> >>> = {
> >>>     1,                                           /* issue_rate */
> >>>     3,                                           /* branch_cost */
> >>>     5,                                           /* memory_cost */
> >>> +  8,                                           /* fmv_cost */
> >>>     true,                                                /*
> >>> slow_unaligned_access */
> >>>   };
> >>>
> >>> @@ -298,6 +300,7 @@ static const struct riscv_tune_param
> >>> sifive_7_tune_info = {
> >>>     2,                                           /* issue_rate */
> >>>     4,                                           /* branch_cost */
> >>>     3,                                           /* memory_cost */
> >>> +  8,                                           /* fmv_cost */
> >>>     true,                                                /*
> >>> slow_unaligned_access */
> >>>   };
> >>>
> >>> @@ -311,6 +314,7 @@ static const struct riscv_tune_param
> >>> thead_c906_tune_info = {
> >>>     1,            /* issue_rate */
> >>>     3,            /* branch_cost */
> >>>     5,            /* memory_cost */
> >>> +  8,           /* fmv_cost */
> >>>     false,            /* slow_unaligned_access */
> >>>   };
> >>>
> >>> @@ -324,6 +328,7 @@ static const struct riscv_tune_param
> >>> optimize_size_tune_info = {
> >>>     1,                                           /* issue_rate */
> >>>     1,                                           /* branch_cost */
> >>>     2,                                           /* memory_cost */
> >>> +  8,                                           /* fmv_cost */
> >>>     false,                                       /* slow_unaligned_access */
> >>>   };
> >>>
> >>> @@ -4737,6 +4742,10 @@ static int
> >>>   riscv_register_move_cost (machine_mode mode,
> >>>                            reg_class_t from, reg_class_t to)
> >>>   {
> >>> +  if ((from == FP_REGS && to == GR_REGS) ||
> >>> +      (from == GR_REGS && to == FP_REGS))
> >>> +    return tune_param->fmv_cost;
> >>> +
> >>>     return riscv_secondary_memory_needed (mode, from, to) ? 8 : 2;
> >>>   }
> >>>
> >>> diff --git a/gcc/testsuite/gcc.target/riscv/pr105666.c
> >>> b/gcc/testsuite/gcc.target/riscv/pr105666.c
> >>> new file mode 100644
> >>> index 000000000000..904f3bc0763f
> >>> --- /dev/null
> >>> +++ b/gcc/testsuite/gcc.target/riscv/pr105666.c
> >>> @@ -0,0 +1,55 @@
> >>> +/* Shamelessly plugged off
> >>> gcc/testsuite/gcc.c-torture/execute/pr28982a.c.
> >>> +
> >>> +   The idea is to induce high register pressure for both int/fp registers
> >>> +   so that they spill. By default FMV instructions would be used to stash
> >>> +   int reg to a fp reg (and vice-versa) but that could be costlier than
> >>> +   spilling to stack.  */
> >>> +
> >>> +/* { dg-do compile } */
> >>> +/* { dg-options "-march=rv64g -ffast-math" } */
> >>> +
> >>> +#define NITER 4
> >>> +#define NVARS 20
> >>> +#define MULTI(X) \
> >>> +  X( 0), X( 1), X( 2), X( 3), X( 4), X( 5), X( 6), X( 7), X( 8), X( 9), \
> >>> +  X(10), X(11), X(12), X(13), X(14), X(15), X(16), X(17), X(18), X(19)
> >>> +
> >>> +#define DECLAREI(INDEX) inc##INDEX = incs[INDEX]
> >>> +#define DECLAREF(INDEX) *ptr##INDEX = ptrs[INDEX], result##INDEX = 5
> >>> +#define LOOP(INDEX) result##INDEX += result##INDEX * (*ptr##INDEX),
> >>> ptr##INDEX += inc##INDEX
> >>> +#define COPYOUT(INDEX) results[INDEX] = result##INDEX
> >>> +
> >>> +double *ptrs[NVARS];
> >>> +double results[NVARS];
> >>> +int incs[NVARS];
> >>> +
> >>> +void __attribute__((noinline))
> >>> +foo (int n)
> >>> +{
> >>> +  int MULTI (DECLAREI);
> >>> +  double MULTI (DECLAREF);
> >>> +  while (n--)
> >>> +    MULTI (LOOP);
> >>> +  MULTI (COPYOUT);
> >>> +}
> >>> +
> >>> +double input[NITER * NVARS];
> >>> +
> >>> +int
> >>> +main (void)
> >>> +{
> >>> +  int i;
> >>> +
> >>> +  for (i = 0; i < NVARS; i++)
> >>> +    ptrs[i] = input + i, incs[i] = i;
> >>> +  for (i = 0; i < NITER * NVARS; i++)
> >>> +    input[i] = i;
> >>> +  foo (NITER);
> >>> +  for (i = 0; i < NVARS; i++)
> >>> +    if (results[i] != i * NITER * (NITER + 1) / 2)
> >>> +      return 1;
> >>> +  return 0;
> >>> +}
> >>> +
> >>> +/* { dg-final { scan-assembler-not "\tfmv\\.d\\.x\t" } } */
> >>> +/* { dg-final { scan-assembler-not "\tfmv\\.x\\.d\t" } } */
> >>> --
> >>> 2.32.0
> >>>
> >>>
>
  

Patch

diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc
index ee756aab6940..f3ac0d8865f0 100644
--- a/gcc/config/riscv/riscv.cc
+++ b/gcc/config/riscv/riscv.cc
@@ -220,6 +220,7 @@  struct riscv_tune_param
   unsigned short issue_rate;
   unsigned short branch_cost;
   unsigned short memory_cost;
+  unsigned short fmv_cost;
   bool slow_unaligned_access;
 };
 
@@ -285,6 +286,7 @@  static const struct riscv_tune_param rocket_tune_info = {
   1,						/* issue_rate */
   3,						/* branch_cost */
   5,						/* memory_cost */
+  8,						/* fmv_cost */
   true,						/* slow_unaligned_access */
 };
 
@@ -298,6 +300,7 @@  static const struct riscv_tune_param sifive_7_tune_info = {
   2,						/* issue_rate */
   4,						/* branch_cost */
   3,						/* memory_cost */
+  8,						/* fmv_cost */
   true,						/* slow_unaligned_access */
 };
 
@@ -311,6 +314,7 @@  static const struct riscv_tune_param thead_c906_tune_info = {
   1,            /* issue_rate */
   3,            /* branch_cost */
   5,            /* memory_cost */
+  8,		/* fmv_cost */
   false,            /* slow_unaligned_access */
 };
 
@@ -324,6 +328,7 @@  static const struct riscv_tune_param optimize_size_tune_info = {
   1,						/* issue_rate */
   1,						/* branch_cost */
   2,						/* memory_cost */
+  8,						/* fmv_cost */
   false,					/* slow_unaligned_access */
 };
 
@@ -4737,6 +4742,10 @@  static int
 riscv_register_move_cost (machine_mode mode,
 			  reg_class_t from, reg_class_t to)
 {
+  if ((from == FP_REGS && to == GR_REGS) ||
+      (from == GR_REGS && to == FP_REGS))
+    return tune_param->fmv_cost;
+
   return riscv_secondary_memory_needed (mode, from, to) ? 8 : 2;
 }
 
diff --git a/gcc/testsuite/gcc.target/riscv/pr105666.c b/gcc/testsuite/gcc.target/riscv/pr105666.c
new file mode 100644
index 000000000000..904f3bc0763f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/pr105666.c
@@ -0,0 +1,55 @@ 
+/* Shamelessly plugged off gcc/testsuite/gcc.c-torture/execute/pr28982a.c.  
+
+   The idea is to induce high register pressure for both int/fp registers
+   so that they spill. By default FMV instructions would be used to stash
+   int reg to a fp reg (and vice-versa) but that could be costlier than
+   spilling to stack.  */
+
+/* { dg-do compile } */
+/* { dg-options "-march=rv64g -ffast-math" } */
+
+#define NITER 4
+#define NVARS 20
+#define MULTI(X) \
+  X( 0), X( 1), X( 2), X( 3), X( 4), X( 5), X( 6), X( 7), X( 8), X( 9), \
+  X(10), X(11), X(12), X(13), X(14), X(15), X(16), X(17), X(18), X(19)
+
+#define DECLAREI(INDEX) inc##INDEX = incs[INDEX]
+#define DECLAREF(INDEX) *ptr##INDEX = ptrs[INDEX], result##INDEX = 5
+#define LOOP(INDEX) result##INDEX += result##INDEX * (*ptr##INDEX), ptr##INDEX += inc##INDEX
+#define COPYOUT(INDEX) results[INDEX] = result##INDEX
+
+double *ptrs[NVARS];
+double results[NVARS];
+int incs[NVARS];
+
+void __attribute__((noinline))
+foo (int n)
+{
+  int MULTI (DECLAREI);
+  double MULTI (DECLAREF);
+  while (n--)
+    MULTI (LOOP);
+  MULTI (COPYOUT);
+}
+
+double input[NITER * NVARS];
+
+int
+main (void)
+{
+  int i;
+
+  for (i = 0; i < NVARS; i++)
+    ptrs[i] = input + i, incs[i] = i;
+  for (i = 0; i < NITER * NVARS; i++)
+    input[i] = i;
+  foo (NITER);
+  for (i = 0; i < NVARS; i++)
+    if (results[i] != i * NITER * (NITER + 1) / 2)
+      return 1;
+  return 0;
+}
+
+/* { dg-final { scan-assembler-not "\tfmv\\.d\\.x\t" } } */
+/* { dg-final { scan-assembler-not "\tfmv\\.x\\.d\t" } } */