Fix ARM optimized fma & fmaf implementations

Message ID 20240629204434.2289174-1-mickael9@gmail.com
State New
Headers
Series Fix ARM optimized fma & fmaf implementations |

Commit Message

Mickaël Thomas June 29, 2024, 8:44 p.m. UTC
  The vfma.f32|64 z, x, y instruction performs the operation
z += x * y without intermediate rounding.

The register used for z is both read and written by the instruction.
The inline assembly must therefore use the "+" constraint modifier.
Using "=" the generated assembly gives an incorrect result.

Current (incorrect) implementation:

In archive arm-none-eabi/thumb/v8-m.main+dp/hard/newlib/libm.a:

<fma>:
        eea0 0b01       vfma.f64        d0, d0, d1
        4770            bx      lr
<fmaf>:
        eea0 0a20       vfma.f32        s0, s0, s1
        4770            bx      lr

Correct implementation:

<fma>:
        eea0 2b01       vfma.f64        d2, d0, d1
        eeb0 0b42       vmov.f64        d0, d2
        4770            bx      lr

<fmaf>:
        eea0 1a20       vfma.f32        s2, s0, s1
        eeb0 0a41       vmov.f32        s0, s2
        4770            bx      lr
---
 newlib/libm/machine/arm/s_fma_arm.c  | 2 +-
 newlib/libm/machine/arm/sf_fma_arm.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
  

Comments

Richard Earnshaw (lists) July 1, 2024, 12:48 p.m. UTC | #1
On 29/06/2024 21:44, Mickaël Thomas wrote:
> The vfma.f32|64 z, x, y instruction performs the operation
> z += x * y without intermediate rounding.
> 
> The register used for z is both read and written by the instruction.
> The inline assembly must therefore use the "+" constraint modifier.
> Using "=" the generated assembly gives an incorrect result.
> 
> Current (incorrect) implementation:
> 
> In archive arm-none-eabi/thumb/v8-m.main+dp/hard/newlib/libm.a:
> 
> <fma>:
>         eea0 0b01       vfma.f64        d0, d0, d1
>         4770            bx      lr
> <fmaf>:
>         eea0 0a20       vfma.f32        s0, s0, s1
>         4770            bx      lr
> 
> Correct implementation:
> 
> <fma>:
>         eea0 2b01       vfma.f64        d2, d0, d1
>         eeb0 0b42       vmov.f64        d0, d2
>         4770            bx      lr
> 
> <fmaf>:
>         eea0 1a20       vfma.f32        s2, s0, s1
>         eeb0 0a41       vmov.f32        s0, s2
>         4770            bx      lr
> ---
>  newlib/libm/machine/arm/s_fma_arm.c  | 2 +-
>  newlib/libm/machine/arm/sf_fma_arm.c | 2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/newlib/libm/machine/arm/s_fma_arm.c b/newlib/libm/machine/arm/s_fma_arm.c
> index f945419b5..6a2b403e1 100644
> --- a/newlib/libm/machine/arm/s_fma_arm.c
> +++ b/newlib/libm/machine/arm/s_fma_arm.c
> @@ -41,7 +41,7 @@
>  double
>  fma (double x, double y, double z)
>  {
> -  asm ("vfma.f64 %P0, %P1, %P2" : "=w" (z) : "w" (x), "w" (y));
> +  asm ("vfma.f64 %P0, %P1, %P2" : "+w" (z) : "w" (x), "w" (y));
>    return z;
>  }
>  
> diff --git a/newlib/libm/machine/arm/sf_fma_arm.c b/newlib/libm/machine/arm/sf_fma_arm.c
> index 4befd9017..8d1b63d99 100644
> --- a/newlib/libm/machine/arm/sf_fma_arm.c
> +++ b/newlib/libm/machine/arm/sf_fma_arm.c
> @@ -41,7 +41,7 @@
>  float
>  fmaf (float x, float y, float z)
>  {
> -  asm ("vfma.f32 %0, %1, %2" : "=t" (z) : "t" (x), "t" (y));
> +  asm ("vfma.f32 %0, %1, %2" : "+t" (z) : "t" (x), "t" (y));
>    return z;
>  }
>  

Nice catch.  Patch applied.

Thank you.

R.
  

Patch

diff --git a/newlib/libm/machine/arm/s_fma_arm.c b/newlib/libm/machine/arm/s_fma_arm.c
index f945419b5..6a2b403e1 100644
--- a/newlib/libm/machine/arm/s_fma_arm.c
+++ b/newlib/libm/machine/arm/s_fma_arm.c
@@ -41,7 +41,7 @@ 
 double
 fma (double x, double y, double z)
 {
-  asm ("vfma.f64 %P0, %P1, %P2" : "=w" (z) : "w" (x), "w" (y));
+  asm ("vfma.f64 %P0, %P1, %P2" : "+w" (z) : "w" (x), "w" (y));
   return z;
 }
 
diff --git a/newlib/libm/machine/arm/sf_fma_arm.c b/newlib/libm/machine/arm/sf_fma_arm.c
index 4befd9017..8d1b63d99 100644
--- a/newlib/libm/machine/arm/sf_fma_arm.c
+++ b/newlib/libm/machine/arm/sf_fma_arm.c
@@ -41,7 +41,7 @@ 
 float
 fmaf (float x, float y, float z)
 {
-  asm ("vfma.f32 %0, %1, %2" : "=t" (z) : "t" (x), "t" (y));
+  asm ("vfma.f32 %0, %1, %2" : "+t" (z) : "t" (x), "t" (y));
   return z;
 }