aarch64: Fix libmvec benchmarks

Message ID 20231121143939.38465-1-Joe.Ramsay@arm.com
State Committed
Commit bd70d3bacfcd53b04b5b5dd7a5d10062ac6fa50a
Headers
Series aarch64: Fix libmvec benchmarks

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
redhat-pt-bot/TryBot-32bit success Build for i686
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Testing passed

Commit Message

Joe Ramsay Nov. 21, 2023, 2:39 p.m. UTC
  These were broken by the new atan2 functions, as they were only
set up for univariate functions. Arity is now detected from the
input file. This revealed a mistake: the double-precision inputs
were being used for both single- and double-precision routines,
which is now remedied.
---
Thanks,
Joe
 .../fpu/scripts/bench_libmvec_advsimd.py      | 66 ++++++++++++-------
 .../aarch64/fpu/scripts/bench_libmvec_sve.py  | 64 +++++++++++-------
 2 files changed, 81 insertions(+), 49 deletions(-)
  

Comments

Szabolcs Nagy Nov. 22, 2023, 9:12 a.m. UTC | #1
The 11/21/2023 14:39, Joe Ramsay wrote:
> These were broken by the new atan2 functions, as they were only
> set up for univariate functions. Arity is now detected from the
> input file - this revealed a mistake that the double-precision
> inputs were being used for both single- and double-precision
> routines, which is now remedied.
> ---
> Thanks,
> Joe


thanks, committed.
  

Patch

diff --git a/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py b/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py
index 3e124c7810..3661a24044 100644
--- a/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py
+++ b/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py
@@ -22,40 +22,49 @@  TEMPLATE = """
 #include <math.h>
 #include <arm_neon.h>
 
-#define STRIDE {stride}
+#define STRIDE {rowlen}
 
-#define CALL_BENCH_FUNC(v, i) (__extension__ ({{                         \\
-   {rtype} mx0 = {fname}(vld1q_f{prec_short} (variants[v].in[i].arg0));  \\
+#define CALL_BENCH_FUNC_1(v, i) (__extension__ ({{                                 \\
+   {rtype} mx0 = {fname}(vld1q_f{prec_short} (&variants[v].in->arg0[i * STRIDE])); \\
    mx0; }}))
 
-struct args
+#define CALL_BENCH_FUNC_2(v, i) (__extension__ ({{                                 \\
+   {rtype} mx0 = {fname}(vld1q_f{prec_short} (&variants[v].in->arg0[i * STRIDE]),  \\
+                         vld1q_f{prec_short} (&variants[v].in->arg1[i * STRIDE])); \\
+   mx0; }}))
+
+struct args_1
+{{
+  {stype} arg0[{nelems}];
+}};
+
+struct args_2
 {{
-  {stype} arg0[STRIDE];
-  double timing;
+  {stype} arg0[{nelems}];
+  {stype} arg1[{nelems}];
 }};
 
 struct _variants
 {{
   const char *name;
-  int count;
-  const struct args *in;
+  const struct args_{arity} *in;
 }};
 
-static const struct args in0[{rowcount}] = {{
+static const struct args_{arity} in0 = {{
 {in_data}
 }};
 
 static const struct _variants variants[1] = {{
-  {{"", {rowcount}, in0}},
+  {{"", &in0}},
 }};
 
 #define NUM_VARIANTS 1
-#define NUM_SAMPLES(i) (variants[i].count)
+#define NUM_SAMPLES(i) ({nelems} / STRIDE)
 #define VARIANT(i) (variants[i].name)
 
 static {rtype} volatile ret;
 
-#define BENCH_FUNC(i, j) ({{ ret = CALL_BENCH_FUNC(i, j); }})
+#define BENCH_FUNC(i, j) ({{ ret = CALL_BENCH_FUNC_{arity}(i, j); }})
 #define FUNCNAME "{fname}"
 #include <bench-libmvec-skeleton.c>
 """
@@ -63,27 +72,34 @@  static {rtype} volatile ret;
 def main(name):
     _, prec, _, func = name.split("-")
     scalar_to_advsimd_type = {"double": "float64x2_t", "float": "float32x4_t"}
-
-    stride = {"double": 2, "float": 4}[prec]
+    rowlen = {"double": 2, "float": 4}[prec]
     rtype = scalar_to_advsimd_type[prec]
     atype = scalar_to_advsimd_type[prec]
-    fname = f"_ZGVnN{stride}v_{func}{'f' if prec == 'float' else ''}"
     prec_short = {"double": 64, "float": 32}[prec]
-
-    with open(f"../benchtests/libmvec/{func}-inputs") as f:
-        in_vals = [l.strip() for l in f.readlines() if l and not l.startswith("#")]
-    in_vals = [in_vals[i:i+stride] for i in range(0, len(in_vals), stride)]
-    rowcount= len(in_vals)
-    in_data = ",\n".join("{{" + ", ".join(row) + "}, 0}" for row in in_vals)
-
-    print(TEMPLATE.format(stride=stride,
+    input_filename = {"double": f"{func}-inputs", "float": f"{func}f-inputs"}[prec]
+
+    with open(f"../benchtests/libmvec/{input_filename}") as f:
+        input_file = f.readlines()
+    in_vals = (l.strip() for l in input_file if l and not l.startswith("#"))
+    # Split in case of multivariate signature
+    in_vals = (l.split(", ") for l in in_vals)
+    # Transpose
+    in_vals = list(zip(*in_vals))
+    in_data = ",\n".join("{" + (", ".join(val for val in col) + "}")
+                         for col in in_vals)
+
+    arity = [l for l in input_file if l.startswith("## args: ")][0].count(prec)
+    fname = f"_ZGVnN{rowlen}{'v' * arity}_{func}{'f' if prec == 'float' else ''}"
+
+    print(TEMPLATE.format(rowlen=rowlen,
                           rtype=rtype,
                           atype=atype,
                           fname=fname,
                           prec_short=prec_short,
                           in_data=in_data,
-                          rowcount=rowcount,
-                          stype=prec))
+                          stype=prec,
+                          arity=arity,
+                          nelems=len(in_vals[0])))
 
 
 if __name__ == "__main__":
diff --git a/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py b/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py
index 66f2c8e0f4..5d9332be9c 100755
--- a/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py
+++ b/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py
@@ -22,46 +22,55 @@  TEMPLATE = """
 #include <math.h>
 #include <arm_sve.h>
 
-#define MAX_STRIDE {max_stride}
 #define STRIDE {stride}
 #define PTRUE svptrue_b{prec_short}
 #define SV_LOAD svld1_f{prec_short}
 #define SV_STORE svst1_f{prec_short}
 #define REQUIRE_SVE
 
-#define CALL_BENCH_FUNC(v, i) (__extension__ ({{                              \\
-   {rtype} mx0 = {fname}(SV_LOAD (PTRUE(), variants[v].in[i].arg0), PTRUE()); \\
+#define CALL_BENCH_FUNC_1(v, i) (__extension__ ({{                                       \\
+   {rtype} mx0 = {fname}(SV_LOAD (PTRUE(), &variants[v].in->arg0[i * STRIDE]), PTRUE()); \\
    mx0; }}))
 
-struct args
+#define CALL_BENCH_FUNC_2(v, i) (__extension__ ({{                              \\
+   {rtype} mx0 = {fname}(SV_LOAD (PTRUE(), &variants[v].in->arg0[i * STRIDE]),  \\
+                         SV_LOAD (PTRUE(), &variants[v].in->arg1[i * STRIDE]),  \\
+                         PTRUE());                                              \\
+   mx0; }}))
+
+struct args_1
 {{
-  {stype} arg0[MAX_STRIDE];
-  double timing;
+  {stype} arg0[{nelems}];
+}};
+
+struct args_2
+{{
+  {stype} arg0[{nelems}];
+  {stype} arg1[{nelems}];
 }};
 
 struct _variants
 {{
   const char *name;
-  int count;
-  const struct args *in;
+  const struct args_{arity} *in;
 }};
 
-static const struct args in0[{rowcount}] = {{
+static const struct args_{arity} in0 = {{
 {in_data}
 }};
 
 static const struct _variants variants[1] = {{
-  {{"", {rowcount}, in0}},
+  {{"", &in0}},
 }};
 
 #define NUM_VARIANTS 1
-#define NUM_SAMPLES(i) (variants[i].count)
+#define NUM_SAMPLES(i) ({nelems} / STRIDE)
 #define VARIANT(i) (variants[i].name)
 
 // Cannot pass volatile pointer to svst1. This still does not appear to get optimised out.
-static {stype} /*volatile*/ ret[MAX_STRIDE];
+static {stype} /*volatile*/ ret[{rowlen}];
 
-#define BENCH_FUNC(i, j) ({{ SV_STORE(PTRUE(), ret, CALL_BENCH_FUNC(i, j)); }})
+#define BENCH_FUNC(i, j) ({{ SV_STORE(PTRUE(), ret, CALL_BENCH_FUNC_{arity}(i, j)); }})
 #define FUNCNAME "{fname}"
 #include <bench-libmvec-skeleton.c>
 """
@@ -69,23 +78,29 @@  static {stype} /*volatile*/ ret[MAX_STRIDE];
 def main(name):
     _, prec, _, func = name.split("-")
     scalar_to_sve_type = {"double": "svfloat64_t", "float": "svfloat32_t"}
-
     stride = {"double": "svcntd()", "float": "svcntw()"}[prec]
     rtype = scalar_to_sve_type[prec]
     atype = scalar_to_sve_type[prec]
-    fname = f"_ZGVsMxv_{func}{'f' if prec == 'float' else ''}"
     prec_short = {"double": 64, "float": 32}[prec]
     # Max SVE vector length is 2048 bits. To ensure benchmarks are
     # vector-length-agnostic, but still use as wide vectors as
     # possible on any given target, divide input data into 2048-bit
     # rows, then load/store as many elements as the target will allow.
-    max_stride = 2048 // prec_short
-
-    with open(f"../benchtests/libmvec/{func}-inputs") as f:
-        in_vals = [l.strip() for l in f.readlines() if l and not l.startswith("#")]
-    in_vals = [in_vals[i:i+max_stride] for i in range(0, len(in_vals), max_stride)]
-    rowcount= len(in_vals)
-    in_data = ",\n".join("{{" + ", ".join(row) + "}, 0}" for row in in_vals)
+    rowlen = {"double": 32, "float": 64}[prec]
+    input_filename = {"double": f"{func}-inputs", "float": f"{func}f-inputs"}[prec]
+
+    with open(f"../benchtests/libmvec/{input_filename}") as f:
+        input_file = f.readlines()
+    in_vals = (l.strip() for l in input_file if l and not l.startswith("#"))
+    # Split in case of multivariate signature
+    in_vals = (l.split(", ") for l in in_vals)
+    # Transpose
+    in_vals = list(zip(*in_vals))
+    in_data = ",\n".join("{" + (", ".join(val for val in col) + "}")
+                         for col in in_vals)
+
+    arity = [l for l in input_file if l.startswith("## args: ")][0].count(prec)
+    fname = f"_ZGVsMx{'v' * arity}_{func}{'f' if prec == 'float' else ''}"
 
     print(TEMPLATE.format(stride=stride,
                           rtype=rtype,
@@ -93,9 +108,10 @@  def main(name):
                           fname=fname,
                           prec_short=prec_short,
                           in_data=in_data,
-                          rowcount=rowcount,
                           stype=prec,
-                          max_stride=max_stride))
+                          rowlen=rowlen,
+                          arity=arity,
+                          nelems=len(in_vals[0])))
 
 
 if __name__ == "__main__":