i386: Support partial signbit/xorsign/copysign/abs/neg/and/xor/ior/andn for V2BF/V4BF

Message ID 20240904025252.1894695-1-admin@levyhsu.com
State Committed
Commit d0c86be1ce7131aeca2cf3304a8d65a00da4f12a
Headers
Series i386: Support partial signbit/xorsign/copysign/abs/neg/and/xor/ior/andn for V2BF/V4BF |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 success Test passed
linaro-tcwg-bot/tcwg_gcc_check--master-arm success Test passed

Commit Message

Levy Hsu Sept. 4, 2024, 2:52 a.m. UTC
  Hi

This patch adds support for bf16 operations in V2BF and V4BF modes on i386,
handling signbit, xorsign, copysign, abs, neg, and the and/xor/ior/andn
logical operations.

Bootstrapped and tested on x86_64-pc-linux-gnu.
Ok for trunk?

gcc/ChangeLog:

	* config/i386/i386.cc (ix86_build_const_vector): Add V2BF/V4BF.
	(ix86_build_signbit_mask): Add V2BF/V4BF.
	* config/i386/mmx.md: Modify supported logic ops to use VHBF_32_64.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/part-vect-absnegbf.c: New test.
---
 gcc/config/i386/i386.cc                       |  4 +
 gcc/config/i386/mmx.md                        | 74 +++++++++--------
 .../gcc.target/i386/part-vect-absnegbf.c      | 81 +++++++++++++++++++
 3 files changed, 124 insertions(+), 35 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c
  

Comments

Hongtao Liu Sept. 5, 2024, 1:35 a.m. UTC | #1
On Wed, Sep 4, 2024 at 10:53 AM Levy Hsu <admin@levyhsu.com> wrote:
>
> Hi
>
> This patch adds support for bf16 operations in V2BF and V4BF modes on i386,
> handling signbit, xorsign, copysign, abs, neg, and various logical operations.
>
> Bootstrapped and tested on x86-64-pc-linux-gnu.
> Ok for trunk?
Ok.
>
> gcc/ChangeLog:
>
>         * config/i386/i386.cc (ix86_build_const_vector): Add V2BF/V4BF.
>         (ix86_build_signbit_mask): Add V2BF/V4BF.
>         * config/i386/mmx.md: Modified supported logic op to use VHBF_32_64.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/part-vect-absnegbf.c: New test.
> ---
>  gcc/config/i386/i386.cc                       |  4 +
>  gcc/config/i386/mmx.md                        | 74 +++++++++--------
>  .../gcc.target/i386/part-vect-absnegbf.c      | 81 +++++++++++++++++++
>  3 files changed, 124 insertions(+), 35 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 78bf890f14b..2bbfb1bf5fc 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -16176,6 +16176,8 @@ ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
>      case E_V32BFmode:
>      case E_V16BFmode:
>      case E_V8BFmode:
> +    case E_V4BFmode:
> +    case E_V2BFmode:
>        n_elt = GET_MODE_NUNITS (mode);
>        v = rtvec_alloc (n_elt);
>        scalar_mode = GET_MODE_INNER (mode);
> @@ -16215,6 +16217,8 @@ ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
>      case E_V32BFmode:
>      case E_V16BFmode:
>      case E_V8BFmode:
> +    case E_V4BFmode:
> +    case E_V2BFmode:
>        vec_mode = mode;
>        imode = HImode;
>        break;
> diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
> index cb2697537a8..44adcd8d8e0 100644
> --- a/gcc/config/i386/mmx.md
> +++ b/gcc/config/i386/mmx.md
> @@ -121,7 +121,7 @@
>  ;; Mapping of vector float modes to an integer mode of the same size
>  (define_mode_attr mmxintvecmode
>    [(V2SF "V2SI") (V2SI "V2SI") (V4HI "V4HI") (V8QI "V8QI")
> -   (V4HF "V4HI") (V2HF "V2HI")])
> +   (V4HF "V4HI") (V2HF "V2HI") (V4BF "V4HI") (V2BF "V2HI")])
>
>  (define_mode_attr mmxintvecmodelower
>    [(V2SF "v2si") (V2SI "v2si") (V4HI "v4hi") (V8QI "v8qi")
> @@ -2091,18 +2091,22 @@
>    DONE;
>  })
>
> +(define_mode_iterator VHBF_32_64
> + [V2BF (V4BF "TARGET_MMX_WITH_SSE")
> +  V2HF (V4HF "TARGET_MMX_WITH_SSE")])
> +
>  (define_expand "<code><mode>2"
> -  [(set (match_operand:VHF_32_64 0 "register_operand")
> -       (absneg:VHF_32_64
> -         (match_operand:VHF_32_64 1 "register_operand")))]
> +  [(set (match_operand:VHBF_32_64 0 "register_operand")
> +       (absneg:VHBF_32_64
> +         (match_operand:VHBF_32_64 1 "register_operand")))]
>    "TARGET_SSE"
>    "ix86_expand_fp_absneg_operator (<CODE>, <MODE>mode, operands); DONE;")
>
>  (define_insn_and_split "*mmx_<code><mode>"
> -  [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x,x")
> -       (absneg:VHF_32_64
> -         (match_operand:VHF_32_64 1 "register_operand" "0,x,x")))
> -   (use (match_operand:VHF_32_64 2 "register_operand" "x,0,x"))]
> +  [(set (match_operand:VHBF_32_64 0 "register_operand" "=x,x,x")
> +       (absneg:VHBF_32_64
> +         (match_operand:VHBF_32_64 1 "register_operand" "0,x,x")))
> +   (use (match_operand:VHBF_32_64 2 "register_operand" "x,0,x"))]
>    "TARGET_SSE"
>    "#"
>    "&& reload_completed"
> @@ -2115,11 +2119,11 @@
>    [(set_attr "isa" "noavx,noavx,avx")])
>
>  (define_insn_and_split "*mmx_nabs<mode>2"
> -  [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x,x")
> -       (neg:VHF_32_64
> -         (abs:VHF_32_64
> -           (match_operand:VHF_32_64 1 "register_operand" "0,x,x"))))
> -   (use (match_operand:VHF_32_64 2 "register_operand" "x,0,x"))]
> +  [(set (match_operand:VHBF_32_64 0 "register_operand" "=x,x,x")
> +       (neg:VHBF_32_64
> +         (abs:VHBF_32_64
> +           (match_operand:VHBF_32_64 1 "register_operand" "0,x,x"))))
> +   (use (match_operand:VHBF_32_64 2 "register_operand" "x,0,x"))]
>    "TARGET_SSE"
>    "#"
>    "&& reload_completed"
> @@ -2410,11 +2414,11 @@
>  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
>
>  (define_insn "*mmx_andnot<mode>3"
> -  [(set (match_operand:VHF_32_64 0 "register_operand"    "=x,x")
> -       (and:VHF_32_64
> -         (not:VHF_32_64
> -           (match_operand:VHF_32_64 1 "register_operand" "0,x"))
> -         (match_operand:VHF_32_64 2 "register_operand"   "x,x")))]
> +  [(set (match_operand:VHBF_32_64 0 "register_operand"    "=x,x")
> +       (and:VHBF_32_64
> +         (not:VHBF_32_64
> +           (match_operand:VHBF_32_64 1 "register_operand" "0,x"))
> +         (match_operand:VHBF_32_64 2 "register_operand"   "x,x")))]
>    "TARGET_SSE"
>    "@
>     andnps\t{%2, %0|%0, %2}
> @@ -2425,10 +2429,10 @@
>     (set_attr "mode" "V4SF")])
>
>  (define_insn "<code><mode>3"
> -  [(set (match_operand:VHF_32_64 0 "register_operand"   "=x,x")
> -       (any_logic:VHF_32_64
> -         (match_operand:VHF_32_64 1 "register_operand" "%0,x")
> -         (match_operand:VHF_32_64 2 "register_operand" " x,x")))]
> +  [(set (match_operand:VHBF_32_64 0 "register_operand"   "=x,x")
> +       (any_logic:VHBF_32_64
> +         (match_operand:VHBF_32_64 1 "register_operand" "%0,x")
> +         (match_operand:VHBF_32_64 2 "register_operand" " x,x")))]
>    "TARGET_SSE"
>    "@
>     <logic>ps\t{%2, %0|%0, %2}
> @@ -2440,14 +2444,14 @@
>
>  (define_expand "copysign<mode>3"
>    [(set (match_dup 4)
> -       (and:VHF_32_64
> -         (not:VHF_32_64 (match_dup 3))
> -         (match_operand:VHF_32_64 1 "register_operand")))
> +       (and:VHBF_32_64
> +         (not:VHBF_32_64 (match_dup 3))
> +         (match_operand:VHBF_32_64 1 "register_operand")))
>     (set (match_dup 5)
> -       (and:VHF_32_64 (match_dup 3)
> -                 (match_operand:VHF_32_64 2 "register_operand")))
> -   (set (match_operand:VHF_32_64 0 "register_operand")
> -       (ior:VHF_32_64 (match_dup 4) (match_dup 5)))]
> +       (and:VHBF_32_64 (match_dup 3)
> +                 (match_operand:VHBF_32_64 2 "register_operand")))
> +   (set (match_operand:VHBF_32_64 0 "register_operand")
> +       (ior:VHBF_32_64 (match_dup 4) (match_dup 5)))]
>    "TARGET_SSE"
>  {
>    operands[3] = ix86_build_signbit_mask (<MODE>mode, true, false);
> @@ -2458,11 +2462,11 @@
>
>  (define_expand "xorsign<mode>3"
>    [(set (match_dup 4)
> -       (and:VHF_32_64 (match_dup 3)
> -                 (match_operand:VHF_32_64 2 "register_operand")))
> -   (set (match_operand:VHF_32_64 0 "register_operand")
> -       (xor:VHF_32_64 (match_dup 4)
> -                 (match_operand:VHF_32_64 1 "register_operand")))]
> +       (and:VHBF_32_64 (match_dup 3)
> +                 (match_operand:VHBF_32_64 2 "register_operand")))
> +   (set (match_operand:VHBF_32_64 0 "register_operand")
> +       (xor:VHBF_32_64 (match_dup 4)
> +                 (match_operand:VHBF_32_64 1 "register_operand")))]
>    "TARGET_SSE"
>  {
>    operands[3] = ix86_build_signbit_mask (<MODE>mode, true, false);
> @@ -2474,7 +2478,7 @@
>    [(set (match_operand:<mmxintvecmode> 0 "register_operand")
>         (lshiftrt:<mmxintvecmode>
>           (subreg:<mmxintvecmode>
> -           (match_operand:VHF_32_64 1 "register_operand") 0)
> +           (match_operand:VHBF_32_64 1 "register_operand") 0)
>           (match_dup 2)))]
>    "TARGET_SSE2"
>  {
> diff --git a/gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c b/gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c
> new file mode 100644
> index 00000000000..2d7ae35298e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c
> @@ -0,0 +1,81 @@
> +/* { dg-do run } */
> +/* { dg-options "-O1 -fdump-tree-vect-details -fdump-tree-slp-details -fdump-tree-optimized" } */
> +
> +extern void abort (void);
> +static void do_test (void);
> +
> +#define DO_TEST do_test
> +#define AVX512BF16
> +#include "avx512-check.h"
> +
> +__bf16 b_32[2], r_abs_32[2], r_neg_32[2];
> +__bf16 b_64[4], r_abs_64[4], r_neg_64[4];
> +
> +void
> +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
> +abs_32 (void)
> +{
> +  for (int i = 0; i < 2; i++)
> +    r_abs_32[i] = __builtin_fabsf16 (b_32[i]);
> +}
> +
> +void
> +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
> +neg_32 (void)
> +{
> +  for (int i = 0; i < 2; i++)
> +    r_neg_32[i] = -b_32[i];
> +}
> +
> +void
> +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
> +abs_64 (void)
> +{
> +  for (int i = 0; i < 4; i++)
> +    r_abs_64[i] = __builtin_fabsf16 (b_64[i]);
> +}
> +
> +void
> +__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
> +neg_64 (void)
> +{
> +  for (int i = 0; i < 4; i++)
> +    r_neg_64[i] = -b_64[i];
> +}
> +
> +void
> +check_absneg_results (__bf16 *b, __bf16 *r_abs, __bf16 *r_neg, int len)
> +{
> +  for (int i = 0; i < len; i++)
> +    {
> +      __bf16 expected_abs = __builtin_fabsf16 (b[i]);
> +      __bf16 expected_neg = -b[i];
> +      if (r_abs[i] != expected_abs || r_neg[i] != expected_neg)
> +        abort ();
> +    }
> +}
> +
> +static void
> +__attribute__ ((noinline, noclone))
> +do_test (void)
> +{
> +  float float_b[16] = {-1.2f, 3.4f, -5.6f, 7.8f};
> +
> +  for (int i = 0; i < 2; i++)
> +    b_32[i] = (__bf16) float_b[i];
> +
> +  for (int i = 0; i < 4; i++)
> +    b_64[i] = (__bf16) float_b[i];
> +
> +  abs_32 ();
> +  neg_32 ();
> +  check_absneg_results (b_32, r_abs_32, r_neg_32, 2);
> +
> +  abs_64 ();
> +  neg_64 ();
> +  check_absneg_results (b_64, r_abs_64, r_neg_64, 4);
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized using 4 byte vectors" 2 "slp1" } } */
> +/* { dg-final { scan-tree-dump-times "loop vectorized using 8 byte vectors" 2 "vect" { target { ! ia32 } } } } */
> +/* { dg-final { scan-tree-dump-times {(?n)ABS_EXPR <vect} 2 "optimized" { target { ! ia32 } } } } */
> --
> 2.31.1
>
  

Patch

diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index 78bf890f14b..2bbfb1bf5fc 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -16176,6 +16176,8 @@  ix86_build_const_vector (machine_mode mode, bool vect, rtx value)
     case E_V32BFmode:
     case E_V16BFmode:
     case E_V8BFmode:
+    case E_V4BFmode:
+    case E_V2BFmode:
       n_elt = GET_MODE_NUNITS (mode);
       v = rtvec_alloc (n_elt);
       scalar_mode = GET_MODE_INNER (mode);
@@ -16215,6 +16217,8 @@  ix86_build_signbit_mask (machine_mode mode, bool vect, bool invert)
     case E_V32BFmode:
     case E_V16BFmode:
     case E_V8BFmode:
+    case E_V4BFmode:
+    case E_V2BFmode:
       vec_mode = mode;
       imode = HImode;
       break;
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index cb2697537a8..44adcd8d8e0 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -121,7 +121,7 @@ 
 ;; Mapping of vector float modes to an integer mode of the same size
 (define_mode_attr mmxintvecmode
   [(V2SF "V2SI") (V2SI "V2SI") (V4HI "V4HI") (V8QI "V8QI")
-   (V4HF "V4HI") (V2HF "V2HI")])
+   (V4HF "V4HI") (V2HF "V2HI") (V4BF "V4HI") (V2BF "V2HI")])
 
 (define_mode_attr mmxintvecmodelower
   [(V2SF "v2si") (V2SI "v2si") (V4HI "v4hi") (V8QI "v8qi")
@@ -2091,18 +2091,22 @@ 
   DONE;
 })
 
+(define_mode_iterator VHBF_32_64
+ [V2BF (V4BF "TARGET_MMX_WITH_SSE")
+  V2HF (V4HF "TARGET_MMX_WITH_SSE")]) 
+
 (define_expand "<code><mode>2"
-  [(set (match_operand:VHF_32_64 0 "register_operand")
-	(absneg:VHF_32_64
-	  (match_operand:VHF_32_64 1 "register_operand")))]
+  [(set (match_operand:VHBF_32_64 0 "register_operand")
+	(absneg:VHBF_32_64
+	  (match_operand:VHBF_32_64 1 "register_operand")))]
   "TARGET_SSE"
   "ix86_expand_fp_absneg_operator (<CODE>, <MODE>mode, operands); DONE;")
 
 (define_insn_and_split "*mmx_<code><mode>"
-  [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x,x")
-	(absneg:VHF_32_64
-	  (match_operand:VHF_32_64 1 "register_operand" "0,x,x")))
-   (use (match_operand:VHF_32_64 2 "register_operand" "x,0,x"))]
+  [(set (match_operand:VHBF_32_64 0 "register_operand" "=x,x,x")
+	(absneg:VHBF_32_64
+	  (match_operand:VHBF_32_64 1 "register_operand" "0,x,x")))
+   (use (match_operand:VHBF_32_64 2 "register_operand" "x,0,x"))]
   "TARGET_SSE"
   "#"
   "&& reload_completed"
@@ -2115,11 +2119,11 @@ 
   [(set_attr "isa" "noavx,noavx,avx")])
 
 (define_insn_and_split "*mmx_nabs<mode>2"
-  [(set (match_operand:VHF_32_64 0 "register_operand" "=x,x,x")
-	(neg:VHF_32_64
-	  (abs:VHF_32_64
-	    (match_operand:VHF_32_64 1 "register_operand" "0,x,x"))))
-   (use (match_operand:VHF_32_64 2 "register_operand" "x,0,x"))]
+  [(set (match_operand:VHBF_32_64 0 "register_operand" "=x,x,x")
+	(neg:VHBF_32_64
+	  (abs:VHBF_32_64
+	    (match_operand:VHBF_32_64 1 "register_operand" "0,x,x"))))
+   (use (match_operand:VHBF_32_64 2 "register_operand" "x,0,x"))]
   "TARGET_SSE"
   "#"
   "&& reload_completed"
@@ -2410,11 +2414,11 @@ 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (define_insn "*mmx_andnot<mode>3"
-  [(set (match_operand:VHF_32_64 0 "register_operand"    "=x,x")
-	(and:VHF_32_64
-	  (not:VHF_32_64
-	    (match_operand:VHF_32_64 1 "register_operand" "0,x"))
-	  (match_operand:VHF_32_64 2 "register_operand"   "x,x")))]
+  [(set (match_operand:VHBF_32_64 0 "register_operand"    "=x,x")
+	(and:VHBF_32_64
+	  (not:VHBF_32_64
+	    (match_operand:VHBF_32_64 1 "register_operand" "0,x"))
+	  (match_operand:VHBF_32_64 2 "register_operand"   "x,x")))]
   "TARGET_SSE"
   "@
    andnps\t{%2, %0|%0, %2}
@@ -2425,10 +2429,10 @@ 
    (set_attr "mode" "V4SF")])
 
 (define_insn "<code><mode>3"
-  [(set (match_operand:VHF_32_64 0 "register_operand"   "=x,x")
-	(any_logic:VHF_32_64
-	  (match_operand:VHF_32_64 1 "register_operand" "%0,x")
-	  (match_operand:VHF_32_64 2 "register_operand" " x,x")))]
+  [(set (match_operand:VHBF_32_64 0 "register_operand"   "=x,x")
+	(any_logic:VHBF_32_64
+	  (match_operand:VHBF_32_64 1 "register_operand" "%0,x")
+	  (match_operand:VHBF_32_64 2 "register_operand" " x,x")))]
   "TARGET_SSE"
   "@
    <logic>ps\t{%2, %0|%0, %2}
@@ -2440,14 +2444,14 @@ 
 
 (define_expand "copysign<mode>3"
   [(set (match_dup 4)
-	(and:VHF_32_64
-	  (not:VHF_32_64 (match_dup 3))
-	  (match_operand:VHF_32_64 1 "register_operand")))
+	(and:VHBF_32_64
+	  (not:VHBF_32_64 (match_dup 3))
+	  (match_operand:VHBF_32_64 1 "register_operand")))
    (set (match_dup 5)
-	(and:VHF_32_64 (match_dup 3)
-		  (match_operand:VHF_32_64 2 "register_operand")))
-   (set (match_operand:VHF_32_64 0 "register_operand")
-	(ior:VHF_32_64 (match_dup 4) (match_dup 5)))]
+	(and:VHBF_32_64 (match_dup 3)
+		  (match_operand:VHBF_32_64 2 "register_operand")))
+   (set (match_operand:VHBF_32_64 0 "register_operand")
+	(ior:VHBF_32_64 (match_dup 4) (match_dup 5)))]
   "TARGET_SSE"
 {
   operands[3] = ix86_build_signbit_mask (<MODE>mode, true, false);
@@ -2458,11 +2462,11 @@ 
 
 (define_expand "xorsign<mode>3"
   [(set (match_dup 4)
-	(and:VHF_32_64 (match_dup 3)
-		  (match_operand:VHF_32_64 2 "register_operand")))
-   (set (match_operand:VHF_32_64 0 "register_operand")
-	(xor:VHF_32_64 (match_dup 4)
-		  (match_operand:VHF_32_64 1 "register_operand")))]
+	(and:VHBF_32_64 (match_dup 3)
+		  (match_operand:VHBF_32_64 2 "register_operand")))
+   (set (match_operand:VHBF_32_64 0 "register_operand")
+	(xor:VHBF_32_64 (match_dup 4)
+		  (match_operand:VHBF_32_64 1 "register_operand")))]
   "TARGET_SSE"
 {
   operands[3] = ix86_build_signbit_mask (<MODE>mode, true, false);
@@ -2474,7 +2478,7 @@ 
   [(set (match_operand:<mmxintvecmode> 0 "register_operand")
 	(lshiftrt:<mmxintvecmode>
 	  (subreg:<mmxintvecmode>
-	    (match_operand:VHF_32_64 1 "register_operand") 0)
+	    (match_operand:VHBF_32_64 1 "register_operand") 0)
 	  (match_dup 2)))]
   "TARGET_SSE2"
 {
diff --git a/gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c b/gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c
new file mode 100644
index 00000000000..2d7ae35298e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/part-vect-absnegbf.c
@@ -0,0 +1,81 @@ 
+/* { dg-do run } */
+/* { dg-options "-O1 -fdump-tree-vect-details -fdump-tree-slp-details -fdump-tree-optimized" } */
+
+extern void abort (void);
+static void do_test (void);
+
+#define DO_TEST do_test
+#define AVX512BF16
+#include "avx512-check.h"
+
+__bf16 b_32[2], r_abs_32[2], r_neg_32[2];
+__bf16 b_64[4], r_abs_64[4], r_neg_64[4];
+
+void
+__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
+abs_32 (void)
+{
+  for (int i = 0; i < 2; i++)
+    r_abs_32[i] = __builtin_fabsf16 (b_32[i]);
+}
+
+void
+__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
+neg_32 (void)
+{
+  for (int i = 0; i < 2; i++)
+    r_neg_32[i] = -b_32[i];
+}
+
+void
+__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
+abs_64 (void)
+{
+  for (int i = 0; i < 4; i++)
+    r_abs_64[i] = __builtin_fabsf16 (b_64[i]);
+}
+
+void
+__attribute__((optimize ("O2"), noinline, noipa, noclone, no_icf))
+neg_64 (void)
+{
+  for (int i = 0; i < 4; i++)
+    r_neg_64[i] = -b_64[i];
+}
+
+void
+check_absneg_results (__bf16 *b, __bf16 *r_abs, __bf16 *r_neg, int len)
+{
+  for (int i = 0; i < len; i++)
+    {
+      __bf16 expected_abs = __builtin_fabsf16 (b[i]);
+      __bf16 expected_neg = -b[i];
+      if (r_abs[i] != expected_abs || r_neg[i] != expected_neg)
+        abort ();
+    }
+}
+
+static void
+__attribute__ ((noinline, noclone))
+do_test (void)
+{
+  float float_b[16] = {-1.2f, 3.4f, -5.6f, 7.8f};
+
+  for (int i = 0; i < 2; i++)
+    b_32[i] = (__bf16) float_b[i];
+
+  for (int i = 0; i < 4; i++)
+    b_64[i] = (__bf16) float_b[i];
+
+  abs_32 ();
+  neg_32 ();
+  check_absneg_results (b_32, r_abs_32, r_neg_32, 2);
+
+  abs_64 ();
+  neg_64 ();
+  check_absneg_results (b_64, r_abs_64, r_neg_64, 4);
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized using 4 byte vectors" 2 "slp1" } } */
+/* { dg-final { scan-tree-dump-times "loop vectorized using 8 byte vectors" 2 "vect" { target { ! ia32 } } } } */
+/* { dg-final { scan-tree-dump-times {(?n)ABS_EXPR <vect} 2 "optimized" { target { ! ia32 } } } } */