[v2,1/6] x86-64: Create microbenchmark infrastructure for libmvec

Message ID 20211112191800.790574-2-skpgkp2@gmail.com
State Superseded
Headers
Series Implement microbenchmark for libmvec |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Sunil Pandey Nov. 12, 2021, 7:17 p.m. UTC
  Add python script to generate libmvec microbenchmark from the input
values for each libmvec function using skeleton benchmark template.

Creates double and float benchmarks with vector length 1, 2, 4, 8,
and 16 for each libmvec function.  Vector length 1 corresponds to
scalar version of function and is included for vector function perf
comparison.
---
 sysdeps/x86_64/fpu/Makeconfig               |  35 ++
 sysdeps/x86_64/fpu/Makefile                 |  40 ++
 sysdeps/x86_64/fpu/bench-libmvec-skeleton.c | 104 +++++
 sysdeps/x86_64/fpu/scripts/bench_libmvec.py | 464 ++++++++++++++++++++
 4 files changed, 643 insertions(+)
 create mode 100644 sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
 create mode 100755 sysdeps/x86_64/fpu/scripts/bench_libmvec.py
  

Comments

Noah Goldstein Nov. 12, 2021, 9:02 p.m. UTC | #1
On Fri, Nov 12, 2021 at 1:19 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Add python script to generate libmvec microbenchmark from the input
> values for each libmvec function using skeleton benchmark template.
>
> Creates double and float benchmarks with vector length 1, 2, 4, 8,
> and 16 for each libmvec function.  Vector length 1 corresponds to
> scalar version of function and is included for vector function perf
> comparison.
> ---
>  sysdeps/x86_64/fpu/Makeconfig               |  35 ++
>  sysdeps/x86_64/fpu/Makefile                 |  40 ++
>  sysdeps/x86_64/fpu/bench-libmvec-skeleton.c | 104 +++++
>  sysdeps/x86_64/fpu/scripts/bench_libmvec.py | 464 ++++++++++++++++++++
>  4 files changed, 643 insertions(+)
>  create mode 100644 sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
>  create mode 100755 sysdeps/x86_64/fpu/scripts/bench_libmvec.py
>
> diff --git a/sysdeps/x86_64/fpu/Makeconfig b/sysdeps/x86_64/fpu/Makeconfig
> index 24aaee1a43..503e9b5ffa 100644
> --- a/sysdeps/x86_64/fpu/Makeconfig
> +++ b/sysdeps/x86_64/fpu/Makeconfig
> @@ -29,6 +29,23 @@ libmvec-funcs = \
>    sin \
>    sincos \
>
> +# Define libmvec function for benchtests directory.
> +libmvec-bench-funcs = \
> +
> +bench-libmvec-double = \
> +  $(addprefix double-vlen1-, $(libmvec-bench-funcs)) \
> +  $(addprefix double-vlen2-, $(libmvec-bench-funcs)) \
> +  $(addprefix double-vlen4-, $(libmvec-bench-funcs)) \
> +  $(addprefix double-vlen4-avx2-, $(libmvec-bench-funcs)) \
> +  $(addprefix double-vlen8-, $(libmvec-bench-funcs)) \
> +
> +bench-libmvec-float = \
> +  $(addsuffix f, $(addprefix float-vlen1-, $(libmvec-bench-funcs))) \
> +  $(addsuffix f, $(addprefix float-vlen4-, $(libmvec-bench-funcs))) \
> +  $(addsuffix f, $(addprefix float-vlen8-, $(libmvec-bench-funcs))) \
> +  $(addsuffix f, $(addprefix float-vlen8-avx2-, $(libmvec-bench-funcs))) \
> +  $(addsuffix f, $(addprefix float-vlen16-, $(libmvec-bench-funcs))) \
> +
>  # The base libmvec ABI tests.
>  libmvec-abi-func-tests = \
>    $(addprefix test-double-libmvec-,$(libmvec-funcs)) \
> @@ -83,5 +100,23 @@ $(common-objpfx)libmvec.mk: $(common-objpfx)config.make
>            echo "  \$$(float-vlen16-arch-ext-cflags)"; \
>            echo; \
>          done; \
> +        echo "endif"; \
> +        echo "ifeq (\$$(subdir),benchtests)"; \
> +        for t in $(libmvec-bench-funcs); do \
> +          echo "CFLAGS-bench-double-vlen4-$$t.c = \\"; \
> +          echo "  \$$(double-vlen4-arch-ext-cflags)"; \
> +          echo "CFLAGS-bench-double-vlen4-avx2-$$t.c = \\"; \
> +          echo "  \$$(double-vlen4-arch-ext2-cflags)"; \
> +          echo "CFLAGS-bench-double-vlen8-$$t.c = \\"; \
> +          echo "  \$$(double-vlen8-arch-ext-cflags)"; \
> +          echo; \
> +          echo "CFLAGS-bench-float-vlen8-$${t}f.c = \\"; \
> +          echo "  \$$(float-vlen8-arch-ext-cflags)"; \
> +          echo "CFLAGS-bench-float-vlen8-avx2-$${t}f.c = \\"; \
> +          echo "  \$$(float-vlen8-arch-ext2-cflags)"; \
> +          echo "CFLAGS-bench-float-vlen16-$${t}f.c = \\"; \
> +          echo "  \$$(float-vlen16-arch-ext-cflags)"; \
> +          echo; \
> +        done; \
>          echo "endif") > $@T
>         mv -f $@T $@
> diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile
> index d172ae815d..9fb587cf8f 100644
> --- a/sysdeps/x86_64/fpu/Makefile
> +++ b/sysdeps/x86_64/fpu/Makefile
> @@ -72,3 +72,43 @@ ifeq ($(subdir)$(config-cflags-mprefer-vector-width),mathyes)
>  # performance of sin and cos by more than 40% on Skylake.
>  CFLAGS-branred.c = -mprefer-vector-width=128
>  endif
> +
> +ifeq ($(subdir),benchtests)
> +double-vlen4-arch-ext-cflags = -mavx
> +double-vlen4-arch-ext2-cflags = -mavx2
> +double-vlen8-arch-ext-cflags = -mavx512f
> +
> +float-vlen8-arch-ext-cflags = -mavx
> +float-vlen8-arch-ext2-cflags = -mavx2
> +float-vlen16-arch-ext-cflags = -mavx512f
> +
> +bench-libmvec := $(bench-libmvec-double) $(bench-libmvec-float)
> +
> +ifeq (${BENCHSET},)
> +bench += $(bench-libmvec)
> +endif
> +
> +ifeq (${STATIC-BENCHTESTS},yes)
> +libmvec-benchtests = $(common-objpfx)mathvec/libmvec.a $(common-objpfx)math/libm.a
> +else
> +libmvec-benchtests = $(libmvec) $(libm)
> +endif
> +
> +$(addprefix $(objpfx)bench-,$(bench-libmvec-double)): $(libmvec-benchtests)
> +$(addprefix $(objpfx)bench-,$(bench-libmvec-float)): $(libmvec-benchtests)
> +bench-libmvec-deps = $(..)sysdeps/x86_64/fpu/bench-libmvec-skeleton.c bench-timing.h Makefile
> +
> +$(objpfx)bench-float-%.c: $(bench-libmvec-deps)
> +       { if [ -n "$($*-INCLUDE)" ]; then \
> +         cat $($*-INCLUDE); \
> +       fi; \
> +       $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py $(basename $(@F)); } > $@-tmp
> +       mv -f $@-tmp $@
> +
> +$(objpfx)bench-double-%.c: $(bench-libmvec-deps)
> +       { if [ -n "$($*-INCLUDE)" ]; then \
> +         cat $($*-INCLUDE); \
> +       fi; \
> +       $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py $(basename $(@F)); } > $@-tmp
> +       mv -f $@-tmp $@
> +endif
> diff --git a/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> new file mode 100644
> index 0000000000..d56a0c4462
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> @@ -0,0 +1,104 @@
> +/* Skeleton for libmvec benchmark programs.
> +   Copyright (C) 2021 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <string.h>
> +#include <stdint.h>
> +#include <stdbool.h>
> +#include <stdio.h>
> +#include <time.h>
> +#include <inttypes.h>
> +#include <bench-timing.h>
> +#include <json-lib.h>
> +#include <bench-util.h>
> +
> +#include <bench-util.c>
> +#include <math-tests-arch.h>
> +#define D_ITERS 10000
> +
> +int
> +main (int argc, char **argv)
> +{
> +  unsigned long i, k;
> +  timing_t start, end;
> +  json_ctx_t json_ctx;
> +
> +#if defined REQUIRE_AVX
> +  if (!CPU_FEATURE_ACTIVE (AVX))
> +    {
> +      printf ("AVX not supported.\n");
> +      return 0;
> +    }
> +#elif defined REQUIRE_AVX2
> +  if (!CPU_FEATURE_ACTIVE (AVX2))
> +    {
> +      printf ("AVX2 not supported.\n");
> +      return 0;
> +    }
> +#elif defined REQUIRE_AVX512F
> +  if (!CPU_FEATURE_ACTIVE (AVX512F))
> +    {
> +      printf ("AVX512F not supported.\n");
> +      return 0;
> +    }
> +#endif
> +
> +  bench_start ();
> +
> +#ifdef BENCH_INIT
> +  BENCH_INIT ();
> +#endif
> +
> +  json_init (&json_ctx, 2, stdout);
> +
> +  /* Begin function.  */
> +  json_attr_object_begin (&json_ctx, FUNCNAME);
> +
> +  for (int v = 0; v < NUM_VARIANTS; v++)
> +    {
> +      double d_total_time = 0;
> +      uint64_t cur;

Think these should also be type `timing_t`

> +      for (k = 0; k < D_ITERS; k++)
> +       {
> +         TIMING_NOW (start);
> +         for (i = 0; i < NUM_SAMPLES (v); i++)

What is the rationale for both `D_ITERS` and `NUM_SAMPLES (v)`? Why not
one loop that iterates for `D_ITERS * NUM_SAMPLES (v)`?

> +           BENCH_FUNC (v, i);
> +         TIMING_NOW (end);
> +
> +         TIMING_DIFF (cur, start, end);
> +
> +         d_total_time += cur;

Think this should be `TIMING_ACCUM(d_total_time, cur)`.

> +
> +       }
> +      double d_total_data_set = D_ITERS * NUM_SAMPLES (v) * STRIDE;
> +
> +      /* Begin variant.  */
> +      json_attr_object_begin (&json_ctx, VARIANT (v));
> +
> +      json_attr_double (&json_ctx, "duration", d_total_time);
> +      json_attr_double (&json_ctx, "iterations", d_total_data_set);
> +      json_attr_double (&json_ctx, "mean", d_total_time / d_total_data_set);
> +
> +      /* End variant.  */
> +      json_attr_object_end (&json_ctx);
> +    }
> +
> +  /* End function.  */
> +  json_attr_object_end (&json_ctx);
> +
> +  return 0;
> +}
> diff --git a/sysdeps/x86_64/fpu/scripts/bench_libmvec.py b/sysdeps/x86_64/fpu/scripts/bench_libmvec.py
> new file mode 100755
> index 0000000000..762865de8f
> --- /dev/null
> +++ b/sysdeps/x86_64/fpu/scripts/bench_libmvec.py
> @@ -0,0 +1,464 @@
> +#!/usr/bin/python3
> +# Copyright (C) 2021 Free Software Foundation, Inc.
> +# This file is part of the GNU C Library.
> +#
> +# The GNU C Library is free software; you can redistribute it and/or
> +# modify it under the terms of the GNU Lesser General Public
> +# License as published by the Free Software Foundation; either
> +# version 2.1 of the License, or (at your option) any later version.
> +#
> +# The GNU C Library is distributed in the hope that it will be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +# Lesser General Public License for more details.
> +#
> +# You should have received a copy of the GNU Lesser General Public
> +# License along with the GNU C Library; if not, see
> +# <https://www.gnu.org/licenses/>.
> +
> +"""Benchmark program generator script
> +
> +This script takes a function name as input and generates a program using
> +an libmvec input file located in the sysdeps/x86_64/fpu directory.  The
> +name of the input file should be of the form libmvec-foo-inputs where
> +'foo' is the name of the function.
> +"""
> +
> +from __future__ import print_function
> +import sys
> +import os
> +import itertools
> +import re
> +
> +# Macro definitions for functions that take no arguments.  For functions
> +# that take arguments, the STRUCT_TEMPLATE, ARGS_TEMPLATE and
> +# VARIANTS_TEMPLATE are used instead.
> +DEFINES_TEMPLATE = '''
> +#define CALL_BENCH_FUNC(v, i) %(func)s ()
> +#define NUM_VARIANTS (1)
> +#define NUM_SAMPLES(v) (1)
> +#define VARIANT(v) FUNCNAME "()"
> +'''
> +
> +# Structures to store arguments for the function call.  A function may
> +# have its inputs partitioned to represent distinct performance
> +# characteristics or distinct flavors of the function.  Each such
> +# variant is represented by the _VARIANT structure.  The ARGS structure
> +# represents a single set of arguments.
> +BENCH_VEC_TEMPLATE = '''
> +#define CALL_BENCH_FUNC(v, i) (__extension__ ({ \\
> +  %(defs)s mx0 = %(func)s (%(func_args)s); \\
> +  mx0; }))
> +'''
> +
> +BENCH_SCALAR_TEMPLATE = '''
> +#define CALL_BENCH_FUNC(v, i) %(func)s (%(func_args)s)
> +'''
> +
> +STRUCT_TEMPLATE = '''struct args
> +{
> +%(args)s
> +  double timing;
> +};
> +
> +struct _variants
> +{
> +  const char *name;
> +  int count;
> +  struct args *in;
> +};
> +'''
> +
> +# The actual input arguments.
> +ARGS_TEMPLATE = '''struct args in%(argnum)d[%(num_args)d] = {
> +%(args)s
> +};
> +'''
> +
> +# The actual variants, along with macros defined to access the variants.
> +VARIANTS_TEMPLATE = '''struct _variants variants[%(num_variants)d] = {
> +%(variants)s
> +};
> +
> +#define NUM_VARIANTS %(num_variants)d
> +#define NUM_SAMPLES(i) (variants[i].count)
> +#define VARIANT(i) (variants[i].name)
> +'''
> +
> +# Epilogue for the generated source file.
> +EPILOGUE = '''
> +#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j);})
> +#define FUNCNAME "%(func)s"
> +#include <bench-libmvec-skeleton.c>'''
> +
> +
> +def gen_source(func_types, directives, all_vals):
> +  """Generate source for the function
> +
> +  Generate the C source for the function from the values and
> +  directives.
> +
> +  Args:
> +    func: The function name
> +    directives: A dictionary of directives applicable to this function
> +    all_vals: A dictionary input values
> +  """
> +  # The includes go in first.
> +  for header in directives['includes']:
> +    print('#include <%s>' % header)
> +
> +  for header in directives['include-sources']:
> +    print('#include "%s"' % header)
> +
> +  argtype_vtable = {
> +    2: '128',
> +    4: '256',
> +    8: '512'
> +  }
> +  prefix_vtable = {
> +    2: 'b',
> +    4: 'c',
> +    8: 'e'
> +  }
> +
> +  # Get all the function properties
> +  funcname_argtype = ''
> +  float_flag = False
> +  if func_types[1] == 'float':
> +    float_flag = True
> +  avx_flag = False
> +  if func_types[3] == 'avx2':
> +    avx_flag = True
> +  funcname_stride = int(func_types[2][4:])
> +  funcname_origin = func_types[-1]
> +  if float_flag:
> +    funcname_origin = funcname_origin[:-1]
> +
> +  if funcname_stride == 1:
> +    # Prepare for scalar functions file generation
> +    funcname_prefix = ''
> +    funcname_prefix_1 = ''
> +    funcname_argtype = 'double'
> +    if float_flag:
> +      funcname_argtype = 'float'
> +  else:
> +    # Prepare for libmvec functions file generation
> +    funcname_prefix_1 = len(directives['args']) * 'v' + '_'
> +    aligned_stride = funcname_stride
> +    if float_flag:
> +      aligned_stride /= 2
> +    funcname_prefix = '_ZGV'
> +    if (avx_flag and (aligned_stride == 4)):
> +      funcname_prefix += 'd'
> +    else:
> +      funcname_prefix += prefix_vtable[aligned_stride]
> +    funcname_prefix = funcname_prefix + 'N' + func_types[2][4:]
> +    funcname_argtype = '__m' + argtype_vtable[aligned_stride]
> +    if not float_flag:
> +      funcname_argtype += 'd'
> +
> +  # Include x86intrin.h for vector functions
> +  if not funcname_stride == 1:
> +    print('#include <x86intrin.h>')
> +    if (avx_flag and (aligned_stride == 4)):
> +      # For bench-float-vlen8-avx2* and bench-double-vlen4-avx2*
> +      print('#define REQUIRE_AVX2')
> +    elif aligned_stride == 8:
> +      # For bench-float-vlen16* and bench-double-vlen8*
> +      print('#define REQUIRE_AVX512F')
> +    elif aligned_stride == 4:
> +      # For bench-float-vlen8* and bench-double-vlen4* without avx2
> +      print('#define REQUIRE_AVX')
> +  else:
> +    print('#define FUNCTYPE %s' % funcname_argtype)
> +
> +  print('#define STRIDE %d ' % funcname_stride)
> +
> +  funcname = funcname_prefix + funcname_prefix_1 + funcname_origin
> +  if float_flag:
> +    funcname += 'f'
> +
> +  funcname_rettype = funcname_argtype
> +  if directives['ret'] == '':
> +    funcname_rettype = 'void'
> +
> +  funcname_inputtype = []
> +  for arg, i in zip(directives['args'], itertools.count()):
> +    funcname_inputtype.append(funcname_argtype)
> +    if arg[0] == '<' and arg[-1] == '>':
> +      pos = arg.rfind('*')
> +      if pos == -1:
> +        die('Output argument must be a pointer type')
> +      funcname_inputtype[i] += ' *'
> +
> +  if not funcname_stride == 1:
> +    if len(directives['args']) == 2:
> +      print('extern %s %s (%s, %s);' % (funcname_rettype, funcname, funcname_inputtype[0], funcname_inputtype[1]))
> +    elif len(directives['args']) == 3:
> +      print('extern %s %s (%s, %s, %s);' % (funcname_rettype, funcname, funcname_inputtype[0], funcname_inputtype[1], funcname_inputtype[2]))
> +    else:
> +      print('extern %s %s (%s);' % (funcname_rettype, funcname, funcname_inputtype[0]))
> +
> +  # Print macros.  This branches out to a separate routine if
> +  # the function takes arguments.
> +  if not directives['args']:
> +    print(DEFINES_TEMPLATE % {'funcname': funcname})
> +    outargs = []
> +  else:
> +    outargs = _print_arg_data(funcname, float_flag, funcname_argtype, funcname_stride, directives, all_vals)
> +
> +  # Print the output variable definitions if necessary.
> +  for out in outargs:
> +    print(out)
> +
> +  # If we have a return value from the function, make sure it is
> +  # assigned to prevent the compiler from optimizing out the
> +  # call.
> +  getret = ''
> +
> +  if directives['ret']:
> +    if funcname_argtype != '':
> +      print('static %s volatile ret;' % funcname_argtype)
> +      getret = 'ret ='
> +    else:
> +      print('static %s volatile ret;' % directives['ret'])
> +      getret = 'ret ='
> +
> +  # Test initialization.
> +  if directives['init']:
> +    print('#define BENCH_INIT %s' % directives['init'])
> +
> +  print(EPILOGUE % {'getret': getret, 'func': funcname})
> +
> +
> +def _print_arg_data(func, float_flag, funcname_argtype, funcname_stride, directives, all_vals):
> +  """Print argument data
> +
> +  This is a helper function for gen_source that prints structure and
> +  values for arguments and their variants and returns output arguments
> +  if any are found.
> +
> +  Args:
> +    func: Function name
> +    float_flag: True if function is float type
> +    funcname_argtype: Type for vector variants
> +    funcname_stride: Vector Length
> +    directives: A dictionary of directives applicable to this function
> +    all_vals: A dictionary input values
> +
> +  Returns:
> +    Returns a list of definitions for function arguments that act as
> +    output parameters.
> +  """
> +  # First, all of the definitions.  We process writing of
> +  # CALL_BENCH_FUNC, struct args and also the output arguments
> +  # together in a single traversal of the arguments list.
> +  func_args = []
> +  _func_args = []
> +  arg_struct = []
> +  outargs = []
> +  # Conversion function for each type
> +  vtable = {
> +    '__m128d': '_mm_loadu_pd',
> +    '__m256d': '_mm256_loadu_pd',
> +    '__m512d': '_mm512_loadu_pd',
> +    '__m128': '_mm_loadu_ps',
> +    '__m256': '_mm256_loadu_ps',
> +    '__m512': '_mm512_loadu_ps',
> +    'double': '',
> +    'float': ''
> +  }
> +
> +  # For double max_vlen=8, for float max_vlen=16.
> +  if float_flag == True:
> +    max_vlen = 16
> +  else:
> +    max_vlen = 8
> +
> +  for arg, i in zip(directives['args'], itertools.count()):
> +    if arg[0] == '<' and arg[-1] == '>':
> +      outargs.append('static %s out%d __attribute__((used));' % (funcname_argtype, i))
> +      func_args.append('&out%d' % i)
> +      _func_args.append('&out%d' % i)
> +    else:
> +      arg_struct.append('  %s arg%d[STRIDE];' % (arg, i))
> +      func_args.append('%s (variants[v].in[i].arg%d)' %
> +                       (vtable[funcname_argtype], i))
> +      _func_args.append('variants[v].in[i].arg%d[0]' % i)
> +
> +  if funcname_stride == 1:
> +    print(BENCH_SCALAR_TEMPLATE % {'func': func,
> +                                   'func_args': ', '.join(_func_args)})
> +  elif directives['ret'] == '':
> +    print(BENCH_SCALAR_TEMPLATE % {'func': func,
> +                                   'func_args': ', '.join(func_args)})
> +  else:
> +    print(BENCH_VEC_TEMPLATE % {'func': func, 'func_args': ', '.join(func_args),
> +                                'defs': funcname_argtype})
> +  print(STRUCT_TEMPLATE % {'args': '\n'.join(arg_struct)})
> +
> +  # Now print the values.
> +  variants = []
> +  for (k, _vals), i in zip(all_vals.items(), itertools.count()):
> +    vals = []
> +    temp_vals = []
> +    j = 0
> +    temp_j = 0
> +    result_v = ['', '', '']
> +    for _v in _vals:
> +      nums = _v.split(',')
> +      for l in range(0, len(nums)):
> +        result_v[l] = result_v[l] + nums[l].strip() + ','
> +      j += 1
> +      temp_j += 1
> +
> +      if temp_j == funcname_stride:
> +        final_result = ''
> +        for l in range(0, len(nums)):
> +          final_result = final_result + '{' + result_v[l][:-1] + '},'
> +        temp_vals.append(final_result[:-1])
> +        temp_j = 0
> +        result_v = ['', '', '']
> +
> +      # Make sure amount of test data is multiple of max_vlen
> +      # to keep data size same for all vector length.
> +      if j == max_vlen:
> +        vals.extend(temp_vals)
> +        temp_vals = []
> +        j = 0
> +
> +    out = ['  {%s, 0},' % v for v in vals]
> +
> +    # Members for the variants structure list that we will
> +    # print later.
> +    variants.append('  {"%s", %d, in%d},' % (k, len(vals), i))
> +    print(ARGS_TEMPLATE % {'argnum': i, 'num_args': len(vals),
> +                           'args': '\n'.join(out)})
> +
> +  # Print the variants and the last set of macros.
> +  print(VARIANTS_TEMPLATE % {'num_variants': len(all_vals),
> +                             'variants': '\n'.join(variants)})
> +  return outargs
> +
> +
> +def _process_directive(d_name, d_val, func_args):
> +  """Process a directive.
> +
> +  Evaluate the directive name and value passed and return the
> +  processed value. This is a helper function for parse_file.
> +
> +  Args:
> +    d_name: Name of the directive
> +    d_val: The string value to process
> +
> +  Returns:
> +    The processed value, which may be the string as it is or an object
> +    that describes the directive.
> +  """
> +  # Process the directive values if necessary.  name and ret don't
> +  # need any processing.
> +  if d_name.startswith('include'):
> +    d_val = d_val.split(',')
> +  elif d_name == 'args':
> +    d_val = d_val.split(':')
> +    # Check if args type match
> +    if not d_val[0] == func_args:
> +      die("Args mismatch, should be %s, but get %s" % (d_val[0], func_args))
> +
> +  # Return the values.
> +  return d_val
> +
> +
> +def parse_file(func_types):
> +  """Parse an input file
> +
> +  Given a function name, open and parse an input file for the function
> +  and get the necessary parameters for the generated code and the list
> +  of inputs.
> +
> +  Args:
> +    func: The function name
> +
> +  Returns:
> +    A tuple of two elements, one a dictionary of directives and the
> +    other a dictionary of all input values.
> +  """
> +  all_vals = {}
> +  # Valid directives.
> +  directives = {
> +    'name': '',
> +    'args': [],
> +    'includes': [],
> +    'include-sources': [],
> +    'ret': '',
> +    'init': ''
> +  }
> +
> +  func = func_types[-1]
> +  try:
> +    with open('../sysdeps/x86_64/fpu/libmvec-%s-inputs' % func) as f:
> +      for line in f:
> +        # Look for directives and parse it if found.
> +        if line.startswith('##'):
> +          try:
> +            d_name, d_val = line[2:].split(':', 1)
> +            d_name = d_name.strip()
> +            d_val = d_val.strip()
> +            directives[d_name] = _process_directive(d_name, d_val, func_types[1])
> +          except (IndexError, KeyError):
> +            die('Invalid directive: %s' % line[2:])
> +
> +        # Skip blank lines and comments.
> +        line = line.split('#', 1)[0].rstrip()
> +        if not line:
> +          continue
> +
> +        # Otherwise, we're an input.  Add to the appropriate
> +        # input set.
> +        cur_name = directives['name']
> +        all_vals.setdefault(cur_name, [])
> +        all_vals[cur_name].append(line)
> +  except IOError as ex:
> +    die("Failed to open input file (%s): %s" % (ex.filename, ex.strerror))
> +
> +  return directives, all_vals
> +
> +
> +def die(msg):
> +  """Exit with an error
> +
> +  Prints an error message to the standard error stream and exits with
> +  a non-zero status.
> +
> +  Args:
> +    msg: The error message to print to standard error
> +  """
> +  print('%s\n' % msg, file=sys.stderr)
> +  sys.exit(os.EX_DATAERR)
> +
> +
> +def main(args):
> +  """Main function
> +
> +  Use the first command line argument as function name and parse its
> +  input file to generate C source that calls the function repeatedly
> +  for the input.
> +
> +  Args:
> +    args: The command line arguments with the program name dropped
> +
> +  Returns:
> +    os.EX_USAGE on error and os.EX_OK on success.
> +  """
> +  if len(args) != 1:
> +    print('Usage: %s <function>' % sys.argv[0])
> +    return os.EX_USAGE
> +
> +  func_types = args[0].split('-')
> +  directives, all_vals = parse_file(func_types)
> +  gen_source(func_types, directives, all_vals)
> +  return os.EX_OK
> +
> +
> +if __name__ == '__main__':
> +  sys.exit(main(sys.argv[1:]))
> --
> 2.31.1
>
  
Sunil Pandey Nov. 12, 2021, 10:49 p.m. UTC | #2
On Fri, Nov 12, 2021 at 1:02 PM Noah Goldstein <goldstein.w.n@gmail.com>
wrote:

> On Fri, Nov 12, 2021 at 1:19 PM Sunil K Pandey via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > Add python script to generate libmvec microbenchmark from the input
> > values for each libmvec function using skeleton benchmark template.
> >
> > Creates double and float benchmarks with vector length 1, 2, 4, 8,
> > and 16 for each libmvec function.  Vector length 1 corresponds to
> > scalar version of function and is included for vector function perf
> > comparison.
> > ---
> >  sysdeps/x86_64/fpu/Makeconfig               |  35 ++
> >  sysdeps/x86_64/fpu/Makefile                 |  40 ++
> >  sysdeps/x86_64/fpu/bench-libmvec-skeleton.c | 104 +++++
> >  sysdeps/x86_64/fpu/scripts/bench_libmvec.py | 464 ++++++++++++++++++++
> >  4 files changed, 643 insertions(+)
> >  create mode 100644 sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> >  create mode 100755 sysdeps/x86_64/fpu/scripts/bench_libmvec.py
> >
> > diff --git a/sysdeps/x86_64/fpu/Makeconfig
> b/sysdeps/x86_64/fpu/Makeconfig
> > index 24aaee1a43..503e9b5ffa 100644
> > --- a/sysdeps/x86_64/fpu/Makeconfig
> > +++ b/sysdeps/x86_64/fpu/Makeconfig
> > @@ -29,6 +29,23 @@ libmvec-funcs = \
> >    sin \
> >    sincos \
> >
> > +# Define libmvec function for benchtests directory.
> > +libmvec-bench-funcs = \
> > +
> > +bench-libmvec-double = \
> > +  $(addprefix double-vlen1-, $(libmvec-bench-funcs)) \
> > +  $(addprefix double-vlen2-, $(libmvec-bench-funcs)) \
> > +  $(addprefix double-vlen4-, $(libmvec-bench-funcs)) \
> > +  $(addprefix double-vlen4-avx2-, $(libmvec-bench-funcs)) \
> > +  $(addprefix double-vlen8-, $(libmvec-bench-funcs)) \
> > +
> > +bench-libmvec-float = \
> > +  $(addsuffix f, $(addprefix float-vlen1-, $(libmvec-bench-funcs))) \
> > +  $(addsuffix f, $(addprefix float-vlen4-, $(libmvec-bench-funcs))) \
> > +  $(addsuffix f, $(addprefix float-vlen8-, $(libmvec-bench-funcs))) \
> > +  $(addsuffix f, $(addprefix float-vlen8-avx2-,
> $(libmvec-bench-funcs))) \
> > +  $(addsuffix f, $(addprefix float-vlen16-, $(libmvec-bench-funcs))) \
> > +
> >  # The base libmvec ABI tests.
> >  libmvec-abi-func-tests = \
> >    $(addprefix test-double-libmvec-,$(libmvec-funcs)) \
> > @@ -83,5 +100,23 @@ $(common-objpfx)libmvec.mk:
> $(common-objpfx)config.make
> >            echo "  \$$(float-vlen16-arch-ext-cflags)"; \
> >            echo; \
> >          done; \
> > +        echo "endif"; \
> > +        echo "ifeq (\$$(subdir),benchtests)"; \
> > +        for t in $(libmvec-bench-funcs); do \
> > +          echo "CFLAGS-bench-double-vlen4-$$t.c = \\"; \
> > +          echo "  \$$(double-vlen4-arch-ext-cflags)"; \
> > +          echo "CFLAGS-bench-double-vlen4-avx2-$$t.c = \\"; \
> > +          echo "  \$$(double-vlen4-arch-ext2-cflags)"; \
> > +          echo "CFLAGS-bench-double-vlen8-$$t.c = \\"; \
> > +          echo "  \$$(double-vlen8-arch-ext-cflags)"; \
> > +          echo; \
> > +          echo "CFLAGS-bench-float-vlen8-$${t}f.c = \\"; \
> > +          echo "  \$$(float-vlen8-arch-ext-cflags)"; \
> > +          echo "CFLAGS-bench-float-vlen8-avx2-$${t}f.c = \\"; \
> > +          echo "  \$$(float-vlen8-arch-ext2-cflags)"; \
> > +          echo "CFLAGS-bench-float-vlen16-$${t}f.c = \\"; \
> > +          echo "  \$$(float-vlen16-arch-ext-cflags)"; \
> > +          echo; \
> > +        done; \
> >          echo "endif") > $@T
> >         mv -f $@T $@
> > diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile
> > index d172ae815d..9fb587cf8f 100644
> > --- a/sysdeps/x86_64/fpu/Makefile
> > +++ b/sysdeps/x86_64/fpu/Makefile
> > @@ -72,3 +72,43 @@ ifeq
> ($(subdir)$(config-cflags-mprefer-vector-width),mathyes)
> >  # performance of sin and cos by more than 40% on Skylake.
> >  CFLAGS-branred.c = -mprefer-vector-width=128
> >  endif
> > +
> > +ifeq ($(subdir),benchtests)
> > +double-vlen4-arch-ext-cflags = -mavx
> > +double-vlen4-arch-ext2-cflags = -mavx2
> > +double-vlen8-arch-ext-cflags = -mavx512f
> > +
> > +float-vlen8-arch-ext-cflags = -mavx
> > +float-vlen8-arch-ext2-cflags = -mavx2
> > +float-vlen16-arch-ext-cflags = -mavx512f
> > +
> > +bench-libmvec := $(bench-libmvec-double) $(bench-libmvec-float)
> > +
> > +ifeq (${BENCHSET},)
> > +bench += $(bench-libmvec)
> > +endif
> > +
> > +ifeq (${STATIC-BENCHTESTS},yes)
> > +libmvec-benchtests = $(common-objpfx)mathvec/libmvec.a
> $(common-objpfx)math/libm.a
> > +else
> > +libmvec-benchtests = $(libmvec) $(libm)
> > +endif
> > +
> > +$(addprefix $(objpfx)bench-,$(bench-libmvec-double)):
> $(libmvec-benchtests)
> > +$(addprefix $(objpfx)bench-,$(bench-libmvec-float)):
> $(libmvec-benchtests)
> > +bench-libmvec-deps = $(..)sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> bench-timing.h Makefile
> > +
> > +$(objpfx)bench-float-%.c: $(bench-libmvec-deps)
> > +       { if [ -n "$($*-INCLUDE)" ]; then \
> > +         cat $($*-INCLUDE); \
> > +       fi; \
> > +       $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py
> $(basename $(@F)); } > $@-tmp
> > +       mv -f $@-tmp $@
> > +
> > +$(objpfx)bench-double-%.c: $(bench-libmvec-deps)
> > +       { if [ -n "$($*-INCLUDE)" ]; then \
> > +         cat $($*-INCLUDE); \
> > +       fi; \
> > +       $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py
> $(basename $(@F)); } > $@-tmp
> > +       mv -f $@-tmp $@
> > +endif
> > diff --git a/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> > new file mode 100644
> > index 0000000000..d56a0c4462
> > --- /dev/null
> > +++ b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> > @@ -0,0 +1,104 @@
> > +/* Skeleton for libmvec benchmark programs.
> > +   Copyright (C) 2021 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <string.h>
> > +#include <stdint.h>
> > +#include <stdbool.h>
> > +#include <stdio.h>
> > +#include <time.h>
> > +#include <inttypes.h>
> > +#include <bench-timing.h>
> > +#include <json-lib.h>
> > +#include <bench-util.h>
> > +
> > +#include <bench-util.c>
> > +#include <math-tests-arch.h>
> > +#define D_ITERS 10000
> > +
> > +int
> > +main (int argc, char **argv)
> > +{
> > +  unsigned long i, k;
> > +  timing_t start, end;
> > +  json_ctx_t json_ctx;
> > +
> > +#if defined REQUIRE_AVX
> > +  if (!CPU_FEATURE_ACTIVE (AVX))
> > +    {
> > +      printf ("AVX not supported.\n");
> > +      return 0;
> > +    }
> > +#elif defined REQUIRE_AVX2
> > +  if (!CPU_FEATURE_ACTIVE (AVX2))
> > +    {
> > +      printf ("AVX2 not supported.\n");
> > +      return 0;
> > +    }
> > +#elif defined REQUIRE_AVX512F
> > +  if (!CPU_FEATURE_ACTIVE (AVX512F))
> > +    {
> > +      printf ("AVX512F not supported.\n");
> > +      return 0;
> > +    }
> > +#endif
> > +
> > +  bench_start ();
> > +
> > +#ifdef BENCH_INIT
> > +  BENCH_INIT ();
> > +#endif
> > +
> > +  json_init (&json_ctx, 2, stdout);
> > +
> > +  /* Begin function.  */
> > +  json_attr_object_begin (&json_ctx, FUNCNAME);
> > +
> > +  for (int v = 0; v < NUM_VARIANTS; v++)
> > +    {
> > +      double d_total_time = 0;
> > +      uint64_t cur;
>
> Think these should also be type `timing_t`
>

I do not see a difference if I use timing_t or uint64_t. In any case
variable cur stores the
difference between start and end time, not time.


>
> > +      for (k = 0; k < D_ITERS; k++)
> > +       {
> > +         TIMING_NOW (start);
> > +         for (i = 0; i < NUM_SAMPLES (v); i++)
>
> What is the rationale for both `D_ITERS` and `NUM_SAMPLES (v)`? Why not
> one loop that iterates for `D_ITERS * NUM_SAMPLES (v)`?
>

D_ITERS define how many times each variant full data set will run.
NUM_SAMPLES(v)
represent the number of data sets in variant v. Index v and i select, i'th
data set from
variant v and call vector function.  Having two loops simplifies logic.


> > +           BENCH_FUNC (v, i);
> > +         TIMING_NOW (end);
> > +
> > +         TIMING_DIFF (cur, start, end);
> > +
> > +         d_total_time += cur;
>
> Think this should be `TIMING_ACCUM(d_total_time, cur)`.
>

Not much difference, if I use TIMING_ACCUM or simply add cur to
d_total_time.


> > +
> > +       }
> > +      double d_total_data_set = D_ITERS * NUM_SAMPLES (v) * STRIDE;
> > +
> > +      /* Begin variant.  */
> > +      json_attr_object_begin (&json_ctx, VARIANT (v));
> > +
> > +      json_attr_double (&json_ctx, "duration", d_total_time);
> > +      json_attr_double (&json_ctx, "iterations", d_total_data_set);
> > +      json_attr_double (&json_ctx, "mean", d_total_time /
> d_total_data_set);
> > +
> > +      /* End variant.  */
> > +      json_attr_object_end (&json_ctx);
> > +    }
> > +
> > +  /* End function.  */
> > +  json_attr_object_end (&json_ctx);
> > +
> > +  return 0;
> > +}
> > diff --git a/sysdeps/x86_64/fpu/scripts/bench_libmvec.py
> b/sysdeps/x86_64/fpu/scripts/bench_libmvec.py
> > new file mode 100755
> > index 0000000000..762865de8f
> > --- /dev/null
> > +++ b/sysdeps/x86_64/fpu/scripts/bench_libmvec.py
> > @@ -0,0 +1,464 @@
> > +#!/usr/bin/python3
> > +# Copyright (C) 2021 Free Software Foundation, Inc.
> > +# This file is part of the GNU C Library.
> > +#
> > +# The GNU C Library is free software; you can redistribute it and/or
> > +# modify it under the terms of the GNU Lesser General Public
> > +# License as published by the Free Software Foundation; either
> > +# version 2.1 of the License, or (at your option) any later version.
> > +#
> > +# The GNU C Library is distributed in the hope that it will be useful,
> > +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +# Lesser General Public License for more details.
> > +#
> > +# You should have received a copy of the GNU Lesser General Public
> > +# License along with the GNU C Library; if not, see
> > +# <https://www.gnu.org/licenses/>.
> > +
> > +"""Benchmark program generator script
> > +
> > +This script takes a function name as input and generates a program using
> > +an libmvec input file located in the sysdeps/x86_64/fpu directory.  The
> > +name of the input file should be of the form libmvec-foo-inputs where
> > +'foo' is the name of the function.
> > +"""
> > +
> > +from __future__ import print_function
> > +import sys
> > +import os
> > +import itertools
> > +import re
> > +
> > +# Macro definitions for functions that take no arguments.  For functions
> > +# that take arguments, the STRUCT_TEMPLATE, ARGS_TEMPLATE and
> > +# VARIANTS_TEMPLATE are used instead.
> > +DEFINES_TEMPLATE = '''
> > +#define CALL_BENCH_FUNC(v, i) %(func)s ()
> > +#define NUM_VARIANTS (1)
> > +#define NUM_SAMPLES(v) (1)
> > +#define VARIANT(v) FUNCNAME "()"
> > +'''
> > +
> > +# Structures to store arguments for the function call.  A function may
> > +# have its inputs partitioned to represent distinct performance
> > +# characteristics or distinct flavors of the function.  Each such
> > +# variant is represented by the _VARIANT structure.  The ARGS structure
> > +# represents a single set of arguments.
> > +BENCH_VEC_TEMPLATE = '''
> > +#define CALL_BENCH_FUNC(v, i) (__extension__ ({ \\
> > +  %(defs)s mx0 = %(func)s (%(func_args)s); \\
> > +  mx0; }))
> > +'''
> > +
> > +BENCH_SCALAR_TEMPLATE = '''
> > +#define CALL_BENCH_FUNC(v, i) %(func)s (%(func_args)s)
> > +'''
> > +
> > +STRUCT_TEMPLATE = '''struct args
> > +{
> > +%(args)s
> > +  double timing;
> > +};
> > +
> > +struct _variants
> > +{
> > +  const char *name;
> > +  int count;
> > +  struct args *in;
> > +};
> > +'''
> > +
> > +# The actual input arguments.
> > +ARGS_TEMPLATE = '''struct args in%(argnum)d[%(num_args)d] = {
> > +%(args)s
> > +};
> > +'''
> > +
> > +# The actual variants, along with macros defined to access the variants.
> > +VARIANTS_TEMPLATE = '''struct _variants variants[%(num_variants)d] = {
> > +%(variants)s
> > +};
> > +
> > +#define NUM_VARIANTS %(num_variants)d
> > +#define NUM_SAMPLES(i) (variants[i].count)
> > +#define VARIANT(i) (variants[i].name)
> > +'''
> > +
> > +# Epilogue for the generated source file.
> > +EPILOGUE = '''
> > +#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j);})
> > +#define FUNCNAME "%(func)s"
> > +#include <bench-libmvec-skeleton.c>'''
> > +
> > +
> > +def gen_source(func_types, directives, all_vals):
> > +  """Generate source for the function
> > +
> > +  Generate the C source for the function from the values and
> > +  directives.
> > +
> > +  Args:
> > +    func: The function name
> > +    directives: A dictionary of directives applicable to this function
> > +    all_vals: A dictionary input values
> > +  """
> > +  # The includes go in first.
> > +  for header in directives['includes']:
> > +    print('#include <%s>' % header)
> > +
> > +  for header in directives['include-sources']:
> > +    print('#include "%s"' % header)
> > +
> > +  argtype_vtable = {
> > +    2: '128',
> > +    4: '256',
> > +    8: '512'
> > +  }
> > +  prefix_vtable = {
> > +    2: 'b',
> > +    4: 'c',
> > +    8: 'e'
> > +  }
> > +
> > +  # Get all the function properties
> > +  funcname_argtype = ''
> > +  float_flag = False
> > +  if func_types[1] == 'float':
> > +    float_flag = True
> > +  avx_flag = False
> > +  if func_types[3] == 'avx2':
> > +    avx_flag = True
> > +  funcname_stride = int(func_types[2][4:])
> > +  funcname_origin = func_types[-1]
> > +  if float_flag:
> > +    funcname_origin = funcname_origin[:-1]
> > +
> > +  if funcname_stride == 1:
> > +    # Prepare for scalar functions file generation
> > +    funcname_prefix = ''
> > +    funcname_prefix_1 = ''
> > +    funcname_argtype = 'double'
> > +    if float_flag:
> > +      funcname_argtype = 'float'
> > +  else:
> > +    # Prepare for libmvec functions file generation
> > +    funcname_prefix_1 = len(directives['args']) * 'v' + '_'
> > +    aligned_stride = funcname_stride
> > +    if float_flag:
> > +      aligned_stride /= 2
> > +    funcname_prefix = '_ZGV'
> > +    if (avx_flag and (aligned_stride == 4)):
> > +      funcname_prefix += 'd'
> > +    else:
> > +      funcname_prefix += prefix_vtable[aligned_stride]
> > +    funcname_prefix = funcname_prefix + 'N' + func_types[2][4:]
> > +    funcname_argtype = '__m' + argtype_vtable[aligned_stride]
> > +    if not float_flag:
> > +      funcname_argtype += 'd'
> > +
> > +  # Include x86intrin.h for vector functions
> > +  if not funcname_stride == 1:
> > +    print('#include <x86intrin.h>')
> > +    if (avx_flag and (aligned_stride == 4)):
> > +      # For bench-float-vlen8-avx2* and bench-double-vlen4-avx2*
> > +      print('#define REQUIRE_AVX2')
> > +    elif aligned_stride == 8:
> > +      # For bench-float-vlen16* and bench-double-vlen8*
> > +      print('#define REQUIRE_AVX512F')
> > +    elif aligned_stride == 4:
> > +      # For bench-float-vlen8* and bench-double-vlen4* without avx2
> > +      print('#define REQUIRE_AVX')
> > +  else:
> > +    print('#define FUNCTYPE %s' % funcname_argtype)
> > +
> > +  print('#define STRIDE %d ' % funcname_stride)
> > +
> > +  funcname = funcname_prefix + funcname_prefix_1 + funcname_origin
> > +  if float_flag:
> > +    funcname += 'f'
> > +
> > +  funcname_rettype = funcname_argtype
> > +  if directives['ret'] == '':
> > +    funcname_rettype = 'void'
> > +
> > +  funcname_inputtype = []
> > +  for arg, i in zip(directives['args'], itertools.count()):
> > +    funcname_inputtype.append(funcname_argtype)
> > +    if arg[0] == '<' and arg[-1] == '>':
> > +      pos = arg.rfind('*')
> > +      if pos == -1:
> > +        die('Output argument must be a pointer type')
> > +      funcname_inputtype[i] += ' *'
> > +
> > +  if not funcname_stride == 1:
> > +    if len(directives['args']) == 2:
> > +      print('extern %s %s (%s, %s);' % (funcname_rettype, funcname,
> funcname_inputtype[0], funcname_inputtype[1]))
> > +    elif len(directives['args']) == 3:
> > +      print('extern %s %s (%s, %s, %s);' % (funcname_rettype, funcname,
> funcname_inputtype[0], funcname_inputtype[1], funcname_inputtype[2]))
> > +    else:
> > +      print('extern %s %s (%s);' % (funcname_rettype, funcname,
> funcname_inputtype[0]))
> > +
> > +  # Print macros.  This branches out to a separate routine if
> > +  # the function takes arguments.
> > +  if not directives['args']:
> > +    print(DEFINES_TEMPLATE % {'funcname': funcname})
> > +    outargs = []
> > +  else:
> > +    outargs = _print_arg_data(funcname, float_flag, funcname_argtype,
> funcname_stride, directives, all_vals)
> > +
> > +  # Print the output variable definitions if necessary.
> > +  for out in outargs:
> > +    print(out)
> > +
> > +  # If we have a return value from the function, make sure it is
> > +  # assigned to prevent the compiler from optimizing out the
> > +  # call.
> > +  getret = ''
> > +
> > +  if directives['ret']:
> > +    if funcname_argtype != '':
> > +      print('static %s volatile ret;' % funcname_argtype)
> > +      getret = 'ret ='
> > +    else:
> > +      print('static %s volatile ret;' % directives['ret'])
> > +      getret = 'ret ='
> > +
> > +  # Test initialization.
> > +  if directives['init']:
> > +    print('#define BENCH_INIT %s' % directives['init'])
> > +
> > +  print(EPILOGUE % {'getret': getret, 'func': funcname})
> > +
> > +
> > +def _print_arg_data(func, float_flag, funcname_argtype,
> funcname_stride, directives, all_vals):
> > +  """Print argument data
> > +
> > +  This is a helper function for gen_source that prints structure and
> > +  values for arguments and their variants and returns output arguments
> > +  if any are found.
> > +
> > +  Args:
> > +    func: Function name
> > +    float_flag: True if function is float type
> > +    funcname_argtype: Type for vector variants
> > +    funcname_stride: Vector Length
> > +    directives: A dictionary of directives applicable to this function
> > +    all_vals: A dictionary input values
> > +
> > +  Returns:
> > +    Returns a list of definitions for function arguments that act as
> > +    output parameters.
> > +  """
> > +  # First, all of the definitions.  We process writing of
> > +  # CALL_BENCH_FUNC, struct args and also the output arguments
> > +  # together in a single traversal of the arguments list.
> > +  func_args = []
> > +  _func_args = []
> > +  arg_struct = []
> > +  outargs = []
> > +  # Conversion function for each type
> > +  vtable = {
> > +    '__m128d': '_mm_loadu_pd',
> > +    '__m256d': '_mm256_loadu_pd',
> > +    '__m512d': '_mm512_loadu_pd',
> > +    '__m128': '_mm_loadu_ps',
> > +    '__m256': '_mm256_loadu_ps',
> > +    '__m512': '_mm512_loadu_ps',
> > +    'double': '',
> > +    'float': ''
> > +  }
> > +
> > +  # For double max_vlen=8, for float max_vlen=16.
> > +  if float_flag == True:
> > +    max_vlen = 16
> > +  else:
> > +    max_vlen = 8
> > +
> > +  for arg, i in zip(directives['args'], itertools.count()):
> > +    if arg[0] == '<' and arg[-1] == '>':
> > +      outargs.append('static %s out%d __attribute__((used));' %
> (funcname_argtype, i))
> > +      func_args.append('&out%d' % i)
> > +      _func_args.append('&out%d' % i)
> > +    else:
> > +      arg_struct.append('  %s arg%d[STRIDE];' % (arg, i))
> > +      func_args.append('%s (variants[v].in[i].arg%d)' %
> > +                       (vtable[funcname_argtype], i))
> > +      _func_args.append('variants[v].in[i].arg%d[0]' % i)
> > +
> > +  if funcname_stride == 1:
> > +    print(BENCH_SCALAR_TEMPLATE % {'func': func,
> > +                                   'func_args': ', '.join(_func_args)})
> > +  elif directives['ret'] == '':
> > +    print(BENCH_SCALAR_TEMPLATE % {'func': func,
> > +                                   'func_args': ', '.join(func_args)})
> > +  else:
> > +    print(BENCH_VEC_TEMPLATE % {'func': func, 'func_args': ',
> '.join(func_args),
> > +                                'defs': funcname_argtype})
> > +  print(STRUCT_TEMPLATE % {'args': '\n'.join(arg_struct)})
> > +
> > +  # Now print the values.
> > +  variants = []
> > +  for (k, _vals), i in zip(all_vals.items(), itertools.count()):
> > +    vals = []
> > +    temp_vals = []
> > +    j = 0
> > +    temp_j = 0
> > +    result_v = ['', '', '']
> > +    for _v in _vals:
> > +      nums = _v.split(',')
> > +      for l in range(0, len(nums)):
> > +        result_v[l] = result_v[l] + nums[l].strip() + ','
> > +      j += 1
> > +      temp_j += 1
> > +
> > +      if temp_j == funcname_stride:
> > +        final_result = ''
> > +        for l in range(0, len(nums)):
> > +          final_result = final_result + '{' + result_v[l][:-1] + '},'
> > +        temp_vals.append(final_result[:-1])
> > +        temp_j = 0
> > +        result_v = ['', '', '']
> > +
> > +      # Make sure amount of test data is multiple of max_vlen
> > +      # to keep data size same for all vector length.
> > +      if j == max_vlen:
> > +        vals.extend(temp_vals)
> > +        temp_vals = []
> > +        j = 0
> > +
> > +    out = ['  {%s, 0},' % v for v in vals]
> > +
> > +    # Members for the variants structure list that we will
> > +    # print later.
> > +    variants.append('  {"%s", %d, in%d},' % (k, len(vals), i))
> > +    print(ARGS_TEMPLATE % {'argnum': i, 'num_args': len(vals),
> > +                           'args': '\n'.join(out)})
> > +
> > +  # Print the variants and the last set of macros.
> > +  print(VARIANTS_TEMPLATE % {'num_variants': len(all_vals),
> > +                             'variants': '\n'.join(variants)})
> > +  return outargs
> > +
> > +
> > +def _process_directive(d_name, d_val, func_args):
> > +  """Process a directive.
> > +
> > +  Evaluate the directive name and value passed and return the
> > +  processed value. This is a helper function for parse_file.
> > +
> > +  Args:
> > +    d_name: Name of the directive
> > +    d_val: The string value to process
> > +
> > +  Returns:
> > +    The processed value, which may be the string as it is or an object
> > +    that describes the directive.
> > +  """
> > +  # Process the directive values if necessary.  name and ret don't
> > +  # need any processing.
> > +  if d_name.startswith('include'):
> > +    d_val = d_val.split(',')
> > +  elif d_name == 'args':
> > +    d_val = d_val.split(':')
> > +    # Check if args type match
> > +    if not d_val[0] == func_args:
> > +      die("Args mismatch, should be %s, but get %s" % (d_val[0],
> func_args))
> > +
> > +  # Return the values.
> > +  return d_val
> > +
> > +
> > +def parse_file(func_types):
> > +  """Parse an input file
> > +
> > +  Given a function name, open and parse an input file for the function
> > +  and get the necessary parameters for the generated code and the list
> > +  of inputs.
> > +
> > +  Args:
> > +    func: The function name
> > +
> > +  Returns:
> > +    A tuple of two elements, one a dictionary of directives and the
> > +    other a dictionary of all input values.
> > +  """
> > +  all_vals = {}
> > +  # Valid directives.
> > +  directives = {
> > +    'name': '',
> > +    'args': [],
> > +    'includes': [],
> > +    'include-sources': [],
> > +    'ret': '',
> > +    'init': ''
> > +  }
> > +
> > +  func = func_types[-1]
> > +  try:
> > +    with open('../sysdeps/x86_64/fpu/libmvec-%s-inputs' % func) as f:
> > +      for line in f:
> > +        # Look for directives and parse it if found.
> > +        if line.startswith('##'):
> > +          try:
> > +            d_name, d_val = line[2:].split(':', 1)
> > +            d_name = d_name.strip()
> > +            d_val = d_val.strip()
> > +            directives[d_name] = _process_directive(d_name, d_val,
> func_types[1])
> > +          except (IndexError, KeyError):
> > +            die('Invalid directive: %s' % line[2:])
> > +
> > +        # Skip blank lines and comments.
> > +        line = line.split('#', 1)[0].rstrip()
> > +        if not line:
> > +          continue
> > +
> > +        # Otherwise, we're an input.  Add to the appropriate
> > +        # input set.
> > +        cur_name = directives['name']
> > +        all_vals.setdefault(cur_name, [])
> > +        all_vals[cur_name].append(line)
> > +  except IOError as ex:
> > +    die("Failed to open input file (%s): %s" % (ex.filename,
> ex.strerror))
> > +
> > +  return directives, all_vals
> > +
> > +
> > +def die(msg):
> > +  """Exit with an error
> > +
> > +  Prints an error message to the standard error stream and exits with
> > +  a non-zero status.
> > +
> > +  Args:
> > +    msg: The error message to print to standard error
> > +  """
> > +  print('%s\n' % msg, file=sys.stderr)
> > +  sys.exit(os.EX_DATAERR)
> > +
> > +
> > +def main(args):
> > +  """Main function
> > +
> > +  Use the first command line argument as function name and parse its
> > +  input file to generate C source that calls the function repeatedly
> > +  for the input.
> > +
> > +  Args:
> > +    args: The command line arguments with the program name dropped
> > +
> > +  Returns:
> > +    os.EX_USAGE on error and os.EX_OK on success.
> > +  """
> > +  if len(args) != 1:
> > +    print('Usage: %s <function>' % sys.argv[0])
> > +    return os.EX_USAGE
> > +
> > +  func_types = args[0].split('-')
> > +  directives, all_vals = parse_file(func_types)
> > +  gen_source(func_types, directives, all_vals)
> > +  return os.EX_OK
> > +
> > +
> > +if __name__ == '__main__':
> > +  sys.exit(main(sys.argv[1:]))
> > --
> > 2.31.1
> >
>
  
H.J. Lu Nov. 13, 2021, 7:47 p.m. UTC | #3
On Fri, Nov 12, 2021 at 2:51 PM Sunil Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Fri, Nov 12, 2021 at 1:02 PM Noah Goldstein <goldstein.w.n@gmail.com>
> wrote:
>
> > On Fri, Nov 12, 2021 at 1:19 PM Sunil K Pandey via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > >
> > > Add python script to generate libmvec microbenchmark from the input
> > > values for each libmvec function using skeleton benchmark template.
> > >
> > > Creates double and float benchmarks with vector length 1, 2, 4, 8,
> > > and 16 for each libmvec function.  Vector length 1 corresponds to
> > > scalar version of function and is included for vector function perf
> > > comparison.
> > > ---
> > >  sysdeps/x86_64/fpu/Makeconfig               |  35 ++
> > >  sysdeps/x86_64/fpu/Makefile                 |  40 ++
> > >  sysdeps/x86_64/fpu/bench-libmvec-skeleton.c | 104 +++++
> > >  sysdeps/x86_64/fpu/scripts/bench_libmvec.py | 464 ++++++++++++++++++++
> > >  4 files changed, 643 insertions(+)
> > >  create mode 100644 sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> > >  create mode 100755 sysdeps/x86_64/fpu/scripts/bench_libmvec.py
> > >
> > > diff --git a/sysdeps/x86_64/fpu/Makeconfig
> > b/sysdeps/x86_64/fpu/Makeconfig
> > > index 24aaee1a43..503e9b5ffa 100644
> > > --- a/sysdeps/x86_64/fpu/Makeconfig
> > > +++ b/sysdeps/x86_64/fpu/Makeconfig
> > > @@ -29,6 +29,23 @@ libmvec-funcs = \
> > >    sin \
> > >    sincos \
> > >
> > > +# Define libmvec function for benchtests directory.
> > > +libmvec-bench-funcs = \
> > > +
> > > +bench-libmvec-double = \
> > > +  $(addprefix double-vlen1-, $(libmvec-bench-funcs)) \
> > > +  $(addprefix double-vlen2-, $(libmvec-bench-funcs)) \
> > > +  $(addprefix double-vlen4-, $(libmvec-bench-funcs)) \
> > > +  $(addprefix double-vlen4-avx2-, $(libmvec-bench-funcs)) \
> > > +  $(addprefix double-vlen8-, $(libmvec-bench-funcs)) \
> > > +
> > > +bench-libmvec-float = \
> > > +  $(addsuffix f, $(addprefix float-vlen1-, $(libmvec-bench-funcs))) \
> > > +  $(addsuffix f, $(addprefix float-vlen4-, $(libmvec-bench-funcs))) \
> > > +  $(addsuffix f, $(addprefix float-vlen8-, $(libmvec-bench-funcs))) \
> > > +  $(addsuffix f, $(addprefix float-vlen8-avx2-,
> > $(libmvec-bench-funcs))) \
> > > +  $(addsuffix f, $(addprefix float-vlen16-, $(libmvec-bench-funcs))) \
> > > +
> > >  # The base libmvec ABI tests.
> > >  libmvec-abi-func-tests = \
> > >    $(addprefix test-double-libmvec-,$(libmvec-funcs)) \
> > > @@ -83,5 +100,23 @@ $(common-objpfx)libmvec.mk:
> > $(common-objpfx)config.make
> > >            echo "  \$$(float-vlen16-arch-ext-cflags)"; \
> > >            echo; \
> > >          done; \
> > > +        echo "endif"; \
> > > +        echo "ifeq (\$$(subdir),benchtests)"; \
> > > +        for t in $(libmvec-bench-funcs); do \
> > > +          echo "CFLAGS-bench-double-vlen4-$$t.c = \\"; \
> > > +          echo "  \$$(double-vlen4-arch-ext-cflags)"; \
> > > +          echo "CFLAGS-bench-double-vlen4-avx2-$$t.c = \\"; \
> > > +          echo "  \$$(double-vlen4-arch-ext2-cflags)"; \
> > > +          echo "CFLAGS-bench-double-vlen8-$$t.c = \\"; \
> > > +          echo "  \$$(double-vlen8-arch-ext-cflags)"; \
> > > +          echo; \
> > > +          echo "CFLAGS-bench-float-vlen8-$${t}f.c = \\"; \
> > > +          echo "  \$$(float-vlen8-arch-ext-cflags)"; \
> > > +          echo "CFLAGS-bench-float-vlen8-avx2-$${t}f.c = \\"; \
> > > +          echo "  \$$(float-vlen8-arch-ext2-cflags)"; \
> > > +          echo "CFLAGS-bench-float-vlen16-$${t}f.c = \\"; \
> > > +          echo "  \$$(float-vlen16-arch-ext-cflags)"; \
> > > +          echo; \
> > > +        done; \
> > >          echo "endif") > $@T
> > >         mv -f $@T $@
> > > diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile
> > > index d172ae815d..9fb587cf8f 100644
> > > --- a/sysdeps/x86_64/fpu/Makefile
> > > +++ b/sysdeps/x86_64/fpu/Makefile
> > > @@ -72,3 +72,43 @@ ifeq
> > ($(subdir)$(config-cflags-mprefer-vector-width),mathyes)
> > >  # performance of sin and cos by more than 40% on Skylake.
> > >  CFLAGS-branred.c = -mprefer-vector-width=128
> > >  endif
> > > +
> > > +ifeq ($(subdir),benchtests)
> > > +double-vlen4-arch-ext-cflags = -mavx
> > > +double-vlen4-arch-ext2-cflags = -mavx2
> > > +double-vlen8-arch-ext-cflags = -mavx512f
> > > +
> > > +float-vlen8-arch-ext-cflags = -mavx
> > > +float-vlen8-arch-ext2-cflags = -mavx2
> > > +float-vlen16-arch-ext-cflags = -mavx512f
> > > +
> > > +bench-libmvec := $(bench-libmvec-double) $(bench-libmvec-float)
> > > +
> > > +ifeq (${BENCHSET},)
> > > +bench += $(bench-libmvec)
> > > +endif
> > > +
> > > +ifeq (${STATIC-BENCHTESTS},yes)
> > > +libmvec-benchtests = $(common-objpfx)mathvec/libmvec.a
> > $(common-objpfx)math/libm.a
> > > +else
> > > +libmvec-benchtests = $(libmvec) $(libm)
> > > +endif
> > > +
> > > +$(addprefix $(objpfx)bench-,$(bench-libmvec-double)):
> > $(libmvec-benchtests)
> > > +$(addprefix $(objpfx)bench-,$(bench-libmvec-float)):
> > $(libmvec-benchtests)
> > > +bench-libmvec-deps = $(..)sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> > bench-timing.h Makefile
> > > +
> > > +$(objpfx)bench-float-%.c: $(bench-libmvec-deps)
> > > +       { if [ -n "$($*-INCLUDE)" ]; then \
> > > +         cat $($*-INCLUDE); \
> > > +       fi; \
> > > +       $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py
> > $(basename $(@F)); } > $@-tmp
> > > +       mv -f $@-tmp $@
> > > +
> > > +$(objpfx)bench-double-%.c: $(bench-libmvec-deps)
> > > +       { if [ -n "$($*-INCLUDE)" ]; then \
> > > +         cat $($*-INCLUDE); \
> > > +       fi; \
> > > +       $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py
> > $(basename $(@F)); } > $@-tmp
> > > +       mv -f $@-tmp $@
> > > +endif
> > > diff --git a/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> > b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> > > new file mode 100644
> > > index 0000000000..d56a0c4462
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> > > @@ -0,0 +1,104 @@
> > > +/* Skeleton for libmvec benchmark programs.
> > > +   Copyright (C) 2021 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#include <string.h>
> > > +#include <stdint.h>
> > > +#include <stdbool.h>
> > > +#include <stdio.h>
> > > +#include <time.h>
> > > +#include <inttypes.h>
> > > +#include <bench-timing.h>
> > > +#include <json-lib.h>
> > > +#include <bench-util.h>
> > > +
> > > +#include <bench-util.c>
> > > +#include <math-tests-arch.h>
> > > +#define D_ITERS 10000
> > > +
> > > +int
> > > +main (int argc, char **argv)
> > > +{
> > > +  unsigned long i, k;
> > > +  timing_t start, end;
> > > +  json_ctx_t json_ctx;
> > > +
> > > +#if defined REQUIRE_AVX
> > > +  if (!CPU_FEATURE_ACTIVE (AVX))
> > > +    {
> > > +      printf ("AVX not supported.\n");
> > > +      return 0;
> > > +    }
> > > +#elif defined REQUIRE_AVX2
> > > +  if (!CPU_FEATURE_ACTIVE (AVX2))
> > > +    {
> > > +      printf ("AVX2 not supported.\n");
> > > +      return 0;
> > > +    }
> > > +#elif defined REQUIRE_AVX512F
> > > +  if (!CPU_FEATURE_ACTIVE (AVX512F))
> > > +    {
> > > +      printf ("AVX512F not supported.\n");
> > > +      return 0;
> > > +    }
> > > +#endif
> > > +
> > > +  bench_start ();
> > > +
> > > +#ifdef BENCH_INIT
> > > +  BENCH_INIT ();
> > > +#endif
> > > +
> > > +  json_init (&json_ctx, 2, stdout);
> > > +
> > > +  /* Begin function.  */
> > > +  json_attr_object_begin (&json_ctx, FUNCNAME);
> > > +
> > > +  for (int v = 0; v < NUM_VARIANTS; v++)
> > > +    {
> > > +      double d_total_time = 0;
> > > +      uint64_t cur;
> >
> > Think these should also be type `timing_t`
> >
>
> I do not see a difference if I use timing_t or uint64_t. In any case
> variable cur stores the
> difference between start and end time, not time.
>
>
> >
> > > +      for (k = 0; k < D_ITERS; k++)
> > > +       {
> > > +         TIMING_NOW (start);
> > > +         for (i = 0; i < NUM_SAMPLES (v); i++)
> >
> > What is the rationale for both `D_ITERS` and `NUM_SAMPLES (v)`? Why not
> > one loop that iterates for `D_ITERS * NUM_SAMPLES (v)`?
> >
>
> D_ITERS define how many times each variant full data set will run.
> NUM_SAMPLES(v)
> represent the number of data sets in variant v. Index v and i select, i'th
> data set from
> variant v and call vector function.  Having two loops simplifies logic.
>
>
> > > +           BENCH_FUNC (v, i);
> > > +         TIMING_NOW (end);
> > > +
> > > +         TIMING_DIFF (cur, start, end);
> > > +
> > > +         d_total_time += cur;
> >.> > Think this should be `TIMING_ACCUM(d_total_time, cur)`.
> >
>
> Not much difference, if I use TIMING_ACCUM or simply add cur to
> d_total_time.
>

Please use TIMING_ACCUM (d_total_time, cur) to be consistent with
TIMING_DIFF (cur, start, end).

Thanks.
  
Sunil Pandey Nov. 14, 2021, 2:59 a.m. UTC | #4
On Sat, Nov 13, 2021 at 11:48 AM H.J. Lu <hjl.tools@gmail.com> wrote:

> On Fri, Nov 12, 2021 at 2:51 PM Sunil Pandey via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > On Fri, Nov 12, 2021 at 1:02 PM Noah Goldstein <goldstein.w.n@gmail.com>
> > wrote:
> >
> > > On Fri, Nov 12, 2021 at 1:19 PM Sunil K Pandey via Libc-alpha
> > > <libc-alpha@sourceware.org> wrote:
> > > >
> > > > Add python script to generate libmvec microbenchmark from the input
> > > > values for each libmvec function using skeleton benchmark template.
> > > >
> > > > Creates double and float benchmarks with vector length 1, 2, 4, 8,
> > > > and 16 for each libmvec function.  Vector length 1 corresponds to
> > > > scalar version of function and is included for vector function perf
> > > > comparison.
> > > > ---
> > > >  sysdeps/x86_64/fpu/Makeconfig               |  35 ++
> > > >  sysdeps/x86_64/fpu/Makefile                 |  40 ++
> > > >  sysdeps/x86_64/fpu/bench-libmvec-skeleton.c | 104 +++++
> > > >  sysdeps/x86_64/fpu/scripts/bench_libmvec.py | 464
> ++++++++++++++++++++
> > > >  4 files changed, 643 insertions(+)
> > > >  create mode 100644 sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> > > >  create mode 100755 sysdeps/x86_64/fpu/scripts/bench_libmvec.py
> > > >
> > > > diff --git a/sysdeps/x86_64/fpu/Makeconfig
> > > b/sysdeps/x86_64/fpu/Makeconfig
> > > > index 24aaee1a43..503e9b5ffa 100644
> > > > --- a/sysdeps/x86_64/fpu/Makeconfig
> > > > +++ b/sysdeps/x86_64/fpu/Makeconfig
> > > > @@ -29,6 +29,23 @@ libmvec-funcs = \
> > > >    sin \
> > > >    sincos \
> > > >
> > > > +# Define libmvec function for benchtests directory.
> > > > +libmvec-bench-funcs = \
> > > > +
> > > > +bench-libmvec-double = \
> > > > +  $(addprefix double-vlen1-, $(libmvec-bench-funcs)) \
> > > > +  $(addprefix double-vlen2-, $(libmvec-bench-funcs)) \
> > > > +  $(addprefix double-vlen4-, $(libmvec-bench-funcs)) \
> > > > +  $(addprefix double-vlen4-avx2-, $(libmvec-bench-funcs)) \
> > > > +  $(addprefix double-vlen8-, $(libmvec-bench-funcs)) \
> > > > +
> > > > +bench-libmvec-float = \
> > > > +  $(addsuffix f, $(addprefix float-vlen1-, $(libmvec-bench-funcs)))
> \
> > > > +  $(addsuffix f, $(addprefix float-vlen4-, $(libmvec-bench-funcs)))
> \
> > > > +  $(addsuffix f, $(addprefix float-vlen8-, $(libmvec-bench-funcs)))
> \
> > > > +  $(addsuffix f, $(addprefix float-vlen8-avx2-,
> > > $(libmvec-bench-funcs))) \
> > > > +  $(addsuffix f, $(addprefix float-vlen16-,
> $(libmvec-bench-funcs))) \
> > > > +
> > > >  # The base libmvec ABI tests.
> > > >  libmvec-abi-func-tests = \
> > > >    $(addprefix test-double-libmvec-,$(libmvec-funcs)) \
> > > > @@ -83,5 +100,23 @@ $(common-objpfx)libmvec.mk:
> > > $(common-objpfx)config.make
> > > >            echo "  \$$(float-vlen16-arch-ext-cflags)"; \
> > > >            echo; \
> > > >          done; \
> > > > +        echo "endif"; \
> > > > +        echo "ifeq (\$$(subdir),benchtests)"; \
> > > > +        for t in $(libmvec-bench-funcs); do \
> > > > +          echo "CFLAGS-bench-double-vlen4-$$t.c = \\"; \
> > > > +          echo "  \$$(double-vlen4-arch-ext-cflags)"; \
> > > > +          echo "CFLAGS-bench-double-vlen4-avx2-$$t.c = \\"; \
> > > > +          echo "  \$$(double-vlen4-arch-ext2-cflags)"; \
> > > > +          echo "CFLAGS-bench-double-vlen8-$$t.c = \\"; \
> > > > +          echo "  \$$(double-vlen8-arch-ext-cflags)"; \
> > > > +          echo; \
> > > > +          echo "CFLAGS-bench-float-vlen8-$${t}f.c = \\"; \
> > > > +          echo "  \$$(float-vlen8-arch-ext-cflags)"; \
> > > > +          echo "CFLAGS-bench-float-vlen8-avx2-$${t}f.c = \\"; \
> > > > +          echo "  \$$(float-vlen8-arch-ext2-cflags)"; \
> > > > +          echo "CFLAGS-bench-float-vlen16-$${t}f.c = \\"; \
> > > > +          echo "  \$$(float-vlen16-arch-ext-cflags)"; \
> > > > +          echo; \
> > > > +        done; \
> > > >          echo "endif") > $@T
> > > >         mv -f $@T $@
> > > > diff --git a/sysdeps/x86_64/fpu/Makefile
> b/sysdeps/x86_64/fpu/Makefile
> > > > index d172ae815d..9fb587cf8f 100644
> > > > --- a/sysdeps/x86_64/fpu/Makefile
> > > > +++ b/sysdeps/x86_64/fpu/Makefile
> > > > @@ -72,3 +72,43 @@ ifeq
> > > ($(subdir)$(config-cflags-mprefer-vector-width),mathyes)
> > > >  # performance of sin and cos by more than 40% on Skylake.
> > > >  CFLAGS-branred.c = -mprefer-vector-width=128
> > > >  endif
> > > > +
> > > > +ifeq ($(subdir),benchtests)
> > > > +double-vlen4-arch-ext-cflags = -mavx
> > > > +double-vlen4-arch-ext2-cflags = -mavx2
> > > > +double-vlen8-arch-ext-cflags = -mavx512f
> > > > +
> > > > +float-vlen8-arch-ext-cflags = -mavx
> > > > +float-vlen8-arch-ext2-cflags = -mavx2
> > > > +float-vlen16-arch-ext-cflags = -mavx512f
> > > > +
> > > > +bench-libmvec := $(bench-libmvec-double) $(bench-libmvec-float)
> > > > +
> > > > +ifeq (${BENCHSET},)
> > > > +bench += $(bench-libmvec)
> > > > +endif
> > > > +
> > > > +ifeq (${STATIC-BENCHTESTS},yes)
> > > > +libmvec-benchtests = $(common-objpfx)mathvec/libmvec.a
> > > $(common-objpfx)math/libm.a
> > > > +else
> > > > +libmvec-benchtests = $(libmvec) $(libm)
> > > > +endif
> > > > +
> > > > +$(addprefix $(objpfx)bench-,$(bench-libmvec-double)):
> > > $(libmvec-benchtests)
> > > > +$(addprefix $(objpfx)bench-,$(bench-libmvec-float)):
> > > $(libmvec-benchtests)
> > > > +bench-libmvec-deps =
> $(..)sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> > > bench-timing.h Makefile
> > > > +
> > > > +$(objpfx)bench-float-%.c: $(bench-libmvec-deps)
> > > > +       { if [ -n "$($*-INCLUDE)" ]; then \
> > > > +         cat $($*-INCLUDE); \
> > > > +       fi; \
> > > > +       $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py
> > > $(basename $(@F)); } > $@-tmp
> > > > +       mv -f $@-tmp $@
> > > > +
> > > > +$(objpfx)bench-double-%.c: $(bench-libmvec-deps)
> > > > +       { if [ -n "$($*-INCLUDE)" ]; then \
> > > > +         cat $($*-INCLUDE); \
> > > > +       fi; \
> > > > +       $(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py
> > > $(basename $(@F)); } > $@-tmp
> > > > +       mv -f $@-tmp $@
> > > > +endif
> > > > diff --git a/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> > > b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> > > > new file mode 100644
> > > > index 0000000000..d56a0c4462
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
> > > > @@ -0,0 +1,104 @@
> > > > +/* Skeleton for libmvec benchmark programs.
> > > > +   Copyright (C) 2021 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it
> and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later
> version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be
> useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +
> > > > +#include <string.h>
> > > > +#include <stdint.h>
> > > > +#include <stdbool.h>
> > > > +#include <stdio.h>
> > > > +#include <time.h>
> > > > +#include <inttypes.h>
> > > > +#include <bench-timing.h>
> > > > +#include <json-lib.h>
> > > > +#include <bench-util.h>
> > > > +
> > > > +#include <bench-util.c>
> > > > +#include <math-tests-arch.h>
> > > > +#define D_ITERS 10000
> > > > +
> > > > +int
> > > > +main (int argc, char **argv)
> > > > +{
> > > > +  unsigned long i, k;
> > > > +  timing_t start, end;
> > > > +  json_ctx_t json_ctx;
> > > > +
> > > > +#if defined REQUIRE_AVX
> > > > +  if (!CPU_FEATURE_ACTIVE (AVX))
> > > > +    {
> > > > +      printf ("AVX not supported.\n");
> > > > +      return 0;
> > > > +    }
> > > > +#elif defined REQUIRE_AVX2
> > > > +  if (!CPU_FEATURE_ACTIVE (AVX2))
> > > > +    {
> > > > +      printf ("AVX2 not supported.\n");
> > > > +      return 0;
> > > > +    }
> > > > +#elif defined REQUIRE_AVX512F
> > > > +  if (!CPU_FEATURE_ACTIVE (AVX512F))
> > > > +    {
> > > > +      printf ("AVX512F not supported.\n");
> > > > +      return 0;
> > > > +    }
> > > > +#endif
> > > > +
> > > > +  bench_start ();
> > > > +
> > > > +#ifdef BENCH_INIT
> > > > +  BENCH_INIT ();
> > > > +#endif
> > > > +
> > > > +  json_init (&json_ctx, 2, stdout);
> > > > +
> > > > +  /* Begin function.  */
> > > > +  json_attr_object_begin (&json_ctx, FUNCNAME);
> > > > +
> > > > +  for (int v = 0; v < NUM_VARIANTS; v++)
> > > > +    {
> > > > +      double d_total_time = 0;
> > > > +      uint64_t cur;
> > >
> > > Think these should also be type `timing_t`
> > >
> >
> > I do not see a difference if I use timing_t or uint64_t. In any case
> > variable cur stores the
> > difference between start and end time, not time.
> >
> >
> > >
> > > > +      for (k = 0; k < D_ITERS; k++)
> > > > +       {
> > > > +         TIMING_NOW (start);
> > > > +         for (i = 0; i < NUM_SAMPLES (v); i++)
> > >
> > > What is the rationale for both `D_ITERS` and `NUM_SAMPLES (v)`? Why not
> > > one loop that iterates for `D_ITERS * NUM_SAMPLES (v)`?
> > >
> >
> > D_ITERS define how many times each variant full data set will run.
> > NUM_SAMPLES(v)
> > represent the number of data sets in variant v. Index v and i select,
> i'th
> > data set from
> > variant v and call vector function.  Having two loops simplifies logic.
> >
> >
> > > > +           BENCH_FUNC (v, i);
> > > > +         TIMING_NOW (end);
> > > > +
> > > > +         TIMING_DIFF (cur, start, end);
> > > > +
> > > > +         d_total_time += cur;
> > >.> > Think this should be `TIMING_ACCUM(d_total_time, cur)`.
> > >
> >
> > Not much difference, if I use TIMING_ACCUM or simply add cur to
> > d_total_time.
> >
>
> Please use TIMING_ACCUM (d_total_time, cur) to be consistent with
> TIMING_DIFF (cur, start, end).
>

Sure, I will fix it in the next version.


>
> Thanks.
>
>
> --
> H.J.
>
  

Patch

diff --git a/sysdeps/x86_64/fpu/Makeconfig b/sysdeps/x86_64/fpu/Makeconfig
index 24aaee1a43..503e9b5ffa 100644
--- a/sysdeps/x86_64/fpu/Makeconfig
+++ b/sysdeps/x86_64/fpu/Makeconfig
@@ -29,6 +29,23 @@  libmvec-funcs = \
   sin \
   sincos \
 
+# Define libmvec function for benchtests directory.
+libmvec-bench-funcs = \
+
+bench-libmvec-double = \
+  $(addprefix double-vlen1-, $(libmvec-bench-funcs)) \
+  $(addprefix double-vlen2-, $(libmvec-bench-funcs)) \
+  $(addprefix double-vlen4-, $(libmvec-bench-funcs)) \
+  $(addprefix double-vlen4-avx2-, $(libmvec-bench-funcs)) \
+  $(addprefix double-vlen8-, $(libmvec-bench-funcs)) \
+
+bench-libmvec-float = \
+  $(addsuffix f, $(addprefix float-vlen1-, $(libmvec-bench-funcs))) \
+  $(addsuffix f, $(addprefix float-vlen4-, $(libmvec-bench-funcs))) \
+  $(addsuffix f, $(addprefix float-vlen8-, $(libmvec-bench-funcs))) \
+  $(addsuffix f, $(addprefix float-vlen8-avx2-, $(libmvec-bench-funcs))) \
+  $(addsuffix f, $(addprefix float-vlen16-, $(libmvec-bench-funcs))) \
+
 # The base libmvec ABI tests.
 libmvec-abi-func-tests = \
   $(addprefix test-double-libmvec-,$(libmvec-funcs)) \
@@ -83,5 +100,23 @@  $(common-objpfx)libmvec.mk: $(common-objpfx)config.make
 	   echo "  \$$(float-vlen16-arch-ext-cflags)"; \
 	   echo; \
 	 done; \
+	 echo "endif"; \
+	 echo "ifeq (\$$(subdir),benchtests)"; \
+	 for t in $(libmvec-bench-funcs); do \
+	   echo "CFLAGS-bench-double-vlen4-$$t.c = \\"; \
+	   echo "  \$$(double-vlen4-arch-ext-cflags)"; \
+	   echo "CFLAGS-bench-double-vlen4-avx2-$$t.c = \\"; \
+	   echo "  \$$(double-vlen4-arch-ext2-cflags)"; \
+	   echo "CFLAGS-bench-double-vlen8-$$t.c = \\"; \
+	   echo "  \$$(double-vlen8-arch-ext-cflags)"; \
+	   echo; \
+	   echo "CFLAGS-bench-float-vlen8-$${t}f.c = \\"; \
+	   echo "  \$$(float-vlen8-arch-ext-cflags)"; \
+	   echo "CFLAGS-bench-float-vlen8-avx2-$${t}f.c = \\"; \
+	   echo "  \$$(float-vlen8-arch-ext2-cflags)"; \
+	   echo "CFLAGS-bench-float-vlen16-$${t}f.c = \\"; \
+	   echo "  \$$(float-vlen16-arch-ext-cflags)"; \
+	   echo; \
+	 done; \
 	 echo "endif") > $@T
 	mv -f $@T $@
diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile
index d172ae815d..9fb587cf8f 100644
--- a/sysdeps/x86_64/fpu/Makefile
+++ b/sysdeps/x86_64/fpu/Makefile
@@ -72,3 +72,43 @@  ifeq ($(subdir)$(config-cflags-mprefer-vector-width),mathyes)
 # performance of sin and cos by more than 40% on Skylake.
 CFLAGS-branred.c = -mprefer-vector-width=128
 endif
+
+ifeq ($(subdir),benchtests)
+double-vlen4-arch-ext-cflags = -mavx
+double-vlen4-arch-ext2-cflags = -mavx2
+double-vlen8-arch-ext-cflags = -mavx512f
+
+float-vlen8-arch-ext-cflags = -mavx
+float-vlen8-arch-ext2-cflags = -mavx2
+float-vlen16-arch-ext-cflags = -mavx512f
+
+bench-libmvec := $(bench-libmvec-double) $(bench-libmvec-float)
+
+ifeq (${BENCHSET},)
+bench += $(bench-libmvec)
+endif
+
+ifeq (${STATIC-BENCHTESTS},yes)
+libmvec-benchtests = $(common-objpfx)mathvec/libmvec.a $(common-objpfx)math/libm.a
+else
+libmvec-benchtests = $(libmvec) $(libm)
+endif
+
+$(addprefix $(objpfx)bench-,$(bench-libmvec-double)): $(libmvec-benchtests)
+$(addprefix $(objpfx)bench-,$(bench-libmvec-float)): $(libmvec-benchtests)
+bench-libmvec-deps = $(..)sysdeps/x86_64/fpu/bench-libmvec-skeleton.c bench-timing.h Makefile
+
+$(objpfx)bench-float-%.c: $(bench-libmvec-deps)
+	{ if [ -n "$($*-INCLUDE)" ]; then \
+	  cat $($*-INCLUDE); \
+	fi; \
+	$(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py $(basename $(@F)); } > $@-tmp
+	mv -f $@-tmp $@
+
+$(objpfx)bench-double-%.c: $(bench-libmvec-deps)
+	{ if [ -n "$($*-INCLUDE)" ]; then \
+	  cat $($*-INCLUDE); \
+	fi; \
+	$(PYTHON) $(..)sysdeps/x86_64/fpu/scripts/bench_libmvec.py $(basename $(@F)); } > $@-tmp
+	mv -f $@-tmp $@
+endif
diff --git a/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
new file mode 100644
index 0000000000..d56a0c4462
--- /dev/null
+++ b/sysdeps/x86_64/fpu/bench-libmvec-skeleton.c
@@ -0,0 +1,104 @@ 
+/* Skeleton for libmvec benchmark programs.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <time.h>
+#include <inttypes.h>
+#include <bench-timing.h>
+#include <json-lib.h>
+#include <bench-util.h>
+
+#include <bench-util.c>
+#include <math-tests-arch.h>
+#define D_ITERS 10000
+
+int
+main (int argc, char **argv)
+{
+  unsigned long i, k;
+  timing_t start, end;
+  json_ctx_t json_ctx;
+
+#if defined REQUIRE_AVX
+  if (!CPU_FEATURE_ACTIVE (AVX))
+    {
+      printf ("AVX not supported.\n");
+      return 0;
+    }
+#elif defined REQUIRE_AVX2
+  if (!CPU_FEATURE_ACTIVE (AVX2))
+    {
+      printf ("AVX2 not supported.\n");
+      return 0;
+    }
+#elif defined REQUIRE_AVX512F
+  if (!CPU_FEATURE_ACTIVE (AVX512F))
+    {
+      printf ("AVX512F not supported.\n");
+      return 0;
+    }
+#endif
+
+  bench_start ();
+
+#ifdef BENCH_INIT
+  BENCH_INIT ();
+#endif
+
+  json_init (&json_ctx, 2, stdout);
+
+  /* Begin function.  */
+  json_attr_object_begin (&json_ctx, FUNCNAME);
+
+  for (int v = 0; v < NUM_VARIANTS; v++)
+    {
+      double d_total_time = 0;
+      uint64_t cur;
+      for (k = 0; k < D_ITERS; k++)
+	{
+	  TIMING_NOW (start);
+	  for (i = 0; i < NUM_SAMPLES (v); i++)
+	    BENCH_FUNC (v, i);
+	  TIMING_NOW (end);
+
+	  TIMING_DIFF (cur, start, end);
+
+	  d_total_time += cur;
+
+	}
+      double d_total_data_set = D_ITERS * NUM_SAMPLES (v) * STRIDE;
+
+      /* Begin variant.  */
+      json_attr_object_begin (&json_ctx, VARIANT (v));
+
+      json_attr_double (&json_ctx, "duration", d_total_time);
+      json_attr_double (&json_ctx, "iterations", d_total_data_set);
+      json_attr_double (&json_ctx, "mean", d_total_time / d_total_data_set);
+
+      /* End variant.  */
+      json_attr_object_end (&json_ctx);
+    }
+
+  /* End function.  */
+  json_attr_object_end (&json_ctx);
+
+  return 0;
+}
diff --git a/sysdeps/x86_64/fpu/scripts/bench_libmvec.py b/sysdeps/x86_64/fpu/scripts/bench_libmvec.py
new file mode 100755
index 0000000000..762865de8f
--- /dev/null
+++ b/sysdeps/x86_64/fpu/scripts/bench_libmvec.py
@@ -0,0 +1,464 @@ 
+#!/usr/bin/python3
+# Copyright (C) 2021 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+"""Benchmark program generator script
+
+This script takes a function name as input and generates a program using
+an libmvec input file located in the sysdeps/x86_64/fpu directory.  The
+name of the input file should be of the form libmvec-foo-inputs where
+'foo' is the name of the function.
+"""
+
+from __future__ import print_function
+import sys
+import os
+import itertools
+import re
+
+# Macro definitions for functions that take no arguments.  For functions
+# that take arguments, the STRUCT_TEMPLATE, ARGS_TEMPLATE and
+# VARIANTS_TEMPLATE are used instead.
+DEFINES_TEMPLATE = '''
+#define CALL_BENCH_FUNC(v, i) %(func)s ()
+#define NUM_VARIANTS (1)
+#define NUM_SAMPLES(v) (1)
+#define VARIANT(v) FUNCNAME "()"
+'''
+
+# Structures to store arguments for the function call.  A function may
+# have its inputs partitioned to represent distinct performance
+# characteristics or distinct flavors of the function.  Each such
+# variant is represented by the _VARIANT structure.  The ARGS structure
+# represents a single set of arguments.
+BENCH_VEC_TEMPLATE = '''
+#define CALL_BENCH_FUNC(v, i) (__extension__ ({ \\
+  %(defs)s mx0 = %(func)s (%(func_args)s); \\
+  mx0; }))
+'''
+
+BENCH_SCALAR_TEMPLATE = '''
+#define CALL_BENCH_FUNC(v, i) %(func)s (%(func_args)s)
+'''
+
+STRUCT_TEMPLATE = '''struct args
+{
+%(args)s
+  double timing;
+};
+
+struct _variants
+{
+  const char *name;
+  int count;
+  struct args *in;
+};
+'''
+
+# The actual input arguments.
+ARGS_TEMPLATE = '''struct args in%(argnum)d[%(num_args)d] = {
+%(args)s
+};
+'''
+
+# The actual variants, along with macros defined to access the variants.
+VARIANTS_TEMPLATE = '''struct _variants variants[%(num_variants)d] = {
+%(variants)s
+};
+
+#define NUM_VARIANTS %(num_variants)d
+#define NUM_SAMPLES(i) (variants[i].count)
+#define VARIANT(i) (variants[i].name)
+'''
+
+# Epilogue for the generated source file.
+EPILOGUE = '''
+#define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j);})
+#define FUNCNAME "%(func)s"
+#include <bench-libmvec-skeleton.c>'''
+
+
+def gen_source(func_types, directives, all_vals):
+  """Generate source for the function
+
+  Generate the C source for the function from the values and
+  directives.
+
+  Args:
+    func: The function name
+    directives: A dictionary of directives applicable to this function
+    all_vals: A dictionary input values
+  """
+  # The includes go in first.
+  for header in directives['includes']:
+    print('#include <%s>' % header)
+
+  for header in directives['include-sources']:
+    print('#include "%s"' % header)
+
+  argtype_vtable = {
+    2: '128',
+    4: '256',
+    8: '512'
+  }
+  prefix_vtable = {
+    2: 'b',
+    4: 'c',
+    8: 'e'
+  }
+
+  # Get all the function properties
+  funcname_argtype = ''
+  float_flag = False
+  if func_types[1] == 'float':
+    float_flag = True
+  avx_flag = False
+  if func_types[3] == 'avx2':
+    avx_flag = True
+  funcname_stride = int(func_types[2][4:])
+  funcname_origin = func_types[-1]
+  if float_flag:
+    funcname_origin = funcname_origin[:-1]
+
+  if funcname_stride == 1:
+    # Prepare for scalar functions file generation
+    funcname_prefix = ''
+    funcname_prefix_1 = ''
+    funcname_argtype = 'double'
+    if float_flag:
+      funcname_argtype = 'float'
+  else:
+    # Prepare for libmvec functions file generation
+    funcname_prefix_1 = len(directives['args']) * 'v' + '_'
+    aligned_stride = funcname_stride
+    if float_flag:
+      aligned_stride /= 2
+    funcname_prefix = '_ZGV'
+    if (avx_flag and (aligned_stride == 4)):
+      funcname_prefix += 'd'
+    else:
+      funcname_prefix += prefix_vtable[aligned_stride]
+    funcname_prefix = funcname_prefix + 'N' + func_types[2][4:]
+    funcname_argtype = '__m' + argtype_vtable[aligned_stride]
+    if not float_flag:
+      funcname_argtype += 'd'
+
+  # Include x86intrin.h for vector functions
+  if not funcname_stride == 1:
+    print('#include <x86intrin.h>')
+    if (avx_flag and (aligned_stride == 4)):
+      # For bench-float-vlen8-avx2* and bench-double-vlen4-avx2*
+      print('#define REQUIRE_AVX2')
+    elif aligned_stride == 8:
+      # For bench-float-vlen16* and bench-double-vlen8*
+      print('#define REQUIRE_AVX512F')
+    elif aligned_stride == 4:
+      # For bench-float-vlen8* and bench-double-vlen4* without avx2
+      print('#define REQUIRE_AVX')
+  else:
+    print('#define FUNCTYPE %s' % funcname_argtype)
+
+  print('#define STRIDE %d ' % funcname_stride)
+
+  funcname = funcname_prefix + funcname_prefix_1 + funcname_origin
+  if float_flag:
+    funcname += 'f'
+
+  funcname_rettype = funcname_argtype
+  if directives['ret'] == '':
+    funcname_rettype = 'void'
+
+  funcname_inputtype = []
+  for arg, i in zip(directives['args'], itertools.count()):
+    funcname_inputtype.append(funcname_argtype)
+    if arg[0] == '<' and arg[-1] == '>':
+      pos = arg.rfind('*')
+      if pos == -1:
+        die('Output argument must be a pointer type')
+      funcname_inputtype[i] += ' *'
+
+  if not funcname_stride == 1:
+    if len(directives['args']) == 2:
+      print('extern %s %s (%s, %s);' % (funcname_rettype, funcname, funcname_inputtype[0], funcname_inputtype[1]))
+    elif len(directives['args']) == 3:
+      print('extern %s %s (%s, %s, %s);' % (funcname_rettype, funcname, funcname_inputtype[0], funcname_inputtype[1], funcname_inputtype[2]))
+    else:
+      print('extern %s %s (%s);' % (funcname_rettype, funcname, funcname_inputtype[0]))
+
+  # Print macros.  This branches out to a separate routine if
+  # the function takes arguments.
+  if not directives['args']:
+    print(DEFINES_TEMPLATE % {'funcname': funcname})
+    outargs = []
+  else:
+    outargs = _print_arg_data(funcname, float_flag, funcname_argtype, funcname_stride, directives, all_vals)
+
+  # Print the output variable definitions if necessary.
+  for out in outargs:
+    print(out)
+
+  # If we have a return value from the function, make sure it is
+  # assigned to prevent the compiler from optimizing out the
+  # call.
+  getret = ''
+
+  if directives['ret']:
+    if funcname_argtype != '':
+      print('static %s volatile ret;' % funcname_argtype)
+      getret = 'ret ='
+    else:
+      print('static %s volatile ret;' % directives['ret'])
+      getret = 'ret ='
+
+  # Test initialization.
+  if directives['init']:
+    print('#define BENCH_INIT %s' % directives['init'])
+
+  print(EPILOGUE % {'getret': getret, 'func': funcname})
+
+
+def _print_arg_data(func, float_flag, funcname_argtype, funcname_stride, directives, all_vals):
+  """Print argument data
+
+  This is a helper function for gen_source that prints structure and
+  values for arguments and their variants and returns output arguments
+  if any are found.
+
+  Args:
+    func: Function name
+    float_flag: True if function is float type
+    funcname_argtype: Type for vector variants
+    funcname_stride: Vector Length
+    directives: A dictionary of directives applicable to this function
+    all_vals: A dictionary input values
+
+  Returns:
+    Returns a list of definitions for function arguments that act as
+    output parameters.
+  """
+  # First, all of the definitions.  We process writing of
+  # CALL_BENCH_FUNC, struct args and also the output arguments
+  # together in a single traversal of the arguments list.
+  func_args = []
+  _func_args = []
+  arg_struct = []
+  outargs = []
+  # Conversion function for each type
+  vtable = {
+    '__m128d': '_mm_loadu_pd',
+    '__m256d': '_mm256_loadu_pd',
+    '__m512d': '_mm512_loadu_pd',
+    '__m128': '_mm_loadu_ps',
+    '__m256': '_mm256_loadu_ps',
+    '__m512': '_mm512_loadu_ps',
+    'double': '',
+    'float': ''
+  }
+
+  # For double max_vlen=8, for float max_vlen=16.
+  if float_flag == True:
+    max_vlen = 16
+  else:
+    max_vlen = 8
+
+  for arg, i in zip(directives['args'], itertools.count()):
+    if arg[0] == '<' and arg[-1] == '>':
+      outargs.append('static %s out%d __attribute__((used));' % (funcname_argtype, i))
+      func_args.append('&out%d' % i)
+      _func_args.append('&out%d' % i)
+    else:
+      arg_struct.append('  %s arg%d[STRIDE];' % (arg, i))
+      func_args.append('%s (variants[v].in[i].arg%d)' %
+                       (vtable[funcname_argtype], i))
+      _func_args.append('variants[v].in[i].arg%d[0]' % i)
+
+  if funcname_stride == 1:
+    print(BENCH_SCALAR_TEMPLATE % {'func': func,
+                                   'func_args': ', '.join(_func_args)})
+  elif directives['ret'] == '':
+    print(BENCH_SCALAR_TEMPLATE % {'func': func,
+                                   'func_args': ', '.join(func_args)})
+  else:
+    print(BENCH_VEC_TEMPLATE % {'func': func, 'func_args': ', '.join(func_args),
+                                'defs': funcname_argtype})
+  print(STRUCT_TEMPLATE % {'args': '\n'.join(arg_struct)})
+
+  # Now print the values.
+  variants = []
+  for (k, _vals), i in zip(all_vals.items(), itertools.count()):
+    vals = []
+    temp_vals = []
+    j = 0
+    temp_j = 0
+    result_v = ['', '', '']
+    for _v in _vals:
+      nums = _v.split(',')
+      for l in range(0, len(nums)):
+        result_v[l] = result_v[l] + nums[l].strip() + ','
+      j += 1
+      temp_j += 1
+
+      if temp_j == funcname_stride:
+        final_result = ''
+        for l in range(0, len(nums)):
+          final_result = final_result + '{' + result_v[l][:-1] + '},'
+        temp_vals.append(final_result[:-1])
+        temp_j = 0
+        result_v = ['', '', '']
+
+      # Make sure amount of test data is multiple of max_vlen
+      # to keep data size same for all vector length.
+      if j == max_vlen:
+        vals.extend(temp_vals)
+        temp_vals = []
+        j = 0
+
+    out = ['  {%s, 0},' % v for v in vals]
+
+    # Members for the variants structure list that we will
+    # print later.
+    variants.append('  {"%s", %d, in%d},' % (k, len(vals), i))
+    print(ARGS_TEMPLATE % {'argnum': i, 'num_args': len(vals),
+                           'args': '\n'.join(out)})
+
+  # Print the variants and the last set of macros.
+  print(VARIANTS_TEMPLATE % {'num_variants': len(all_vals),
+                             'variants': '\n'.join(variants)})
+  return outargs
+
+
+def _process_directive(d_name, d_val, func_args):
+  """Process a directive.
+
+  Evaluate the directive name and value passed and return the
+  processed value. This is a helper function for parse_file.
+
+  Args:
+    d_name: Name of the directive
+    d_val: The string value to process
+
+  Returns:
+    The processed value, which may be the string as it is or an object
+    that describes the directive.
+  """
+  # Process the directive values if necessary.  name and ret don't
+  # need any processing.
+  if d_name.startswith('include'):
+    d_val = d_val.split(',')
+  elif d_name == 'args':
+    d_val = d_val.split(':')
+    # Check if args type match
+    if not d_val[0] == func_args:
+      die("Args mismatch, should be %s, but get %s" % (d_val[0], func_args))
+
+  # Return the values.
+  return d_val
+
+
+def parse_file(func_types):
+  """Parse an input file
+
+  Given a function name, open and parse an input file for the function
+  and get the necessary parameters for the generated code and the list
+  of inputs.
+
+  Args:
+    func: The function name
+
+  Returns:
+    A tuple of two elements, one a dictionary of directives and the
+    other a dictionary of all input values.
+  """
+  all_vals = {}
+  # Valid directives.
+  directives = {
+    'name': '',
+    'args': [],
+    'includes': [],
+    'include-sources': [],
+    'ret': '',
+    'init': ''
+  }
+
+  func = func_types[-1]
+  try:
+    with open('../sysdeps/x86_64/fpu/libmvec-%s-inputs' % func) as f:
+      for line in f:
+        # Look for directives and parse it if found.
+        if line.startswith('##'):
+          try:
+            d_name, d_val = line[2:].split(':', 1)
+            d_name = d_name.strip()
+            d_val = d_val.strip()
+            directives[d_name] = _process_directive(d_name, d_val, func_types[1])
+          except (IndexError, KeyError):
+            die('Invalid directive: %s' % line[2:])
+
+        # Skip blank lines and comments.
+        line = line.split('#', 1)[0].rstrip()
+        if not line:
+          continue
+
+        # Otherwise, we're an input.  Add to the appropriate
+        # input set.
+        cur_name = directives['name']
+        all_vals.setdefault(cur_name, [])
+        all_vals[cur_name].append(line)
+  except IOError as ex:
+    die("Failed to open input file (%s): %s" % (ex.filename, ex.strerror))
+
+  return directives, all_vals
+
+
+def die(msg):
+  """Exit with an error
+
+  Prints an error message to the standard error stream and exits with
+  a non-zero status.
+
+  Args:
+    msg: The error message to print to standard error
+  """
+  print('%s\n' % msg, file=sys.stderr)
+  sys.exit(os.EX_DATAERR)
+
+
+def main(args):
+  """Main function
+
+  Use the first command line argument as function name and parse its
+  input file to generate C source that calls the function repeatedly
+  for the input.
+
+  Args:
+    args: The command line arguments with the program name dropped
+
+  Returns:
+    os.EX_USAGE on error and os.EX_OK on success.
+  """
+  if len(args) != 1:
+    print('Usage: %s <function>' % sys.argv[0])
+    return os.EX_USAGE
+
+  func_types = args[0].split('-')
+  directives, all_vals = parse_file(func_types)
+  gen_source(func_types, directives, all_vals)
+  return os.EX_OK
+
+
+if __name__ == '__main__':
+  sys.exit(main(sys.argv[1:]))