[1/1] benchtest: Run benchmark only once for each input

Message ID 20170706085152.36466-2-ashwin.sekhar@caviumnetworks.com
State New, archived

Commit Message

Ashwin Sekhar T K July 6, 2017, 8:51 a.m. UTC
  Currently, for each input in the input file, the benchtest framework
runs the microbenchmark in a loop a fixed number of times before it
moves on to the next input. This has the drawback that it can eliminate
the cache and branch-prediction effects that we see in real-world
programs.

This patch changes the framework so that the microbenchmark is run
only once for each input before moving on to the next input.

	* benchtests/bench-skeleton.c: Change the benchmarking logic to run
	each input only once instead of multiple times.
	* benchtests/scripts/bench.py: Add a 'runs' member to the args struct
	to count the total number of test runs for an input. Add a RUNS macro
	to easily access this count. Modify the RESULT_ACCUM macro accordingly.
---
 benchtests/bench-skeleton.c | 47 +++++++++++++++++++++------------------------
 benchtests/scripts/bench.py | 12 ++++++++++--
 2 files changed, 32 insertions(+), 27 deletions(-)
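
[Editor's note: to make the behavioural change concrete, here is a
minimal, self-contained C sketch of the new per-input timing logic.
The names bench_func and now_ns are hypothetical stand-ins; the real
skeleton uses BENCH_FUNC and the TIMING_* macros, and CLOCK_MONOTONIC
is used here for portability where the skeleton uses
CLOCK_MONOTONIC_RAW. Previously the timed body was a fixed-count loop,
"for (k = 0; k < iters; k++) BENCH_FUNC (v, i);"; now each input runs
once, retrying only while the measurement is below clock resolution.]

#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Hypothetical stand-in for BENCH_FUNC (v, i).  */
static void
bench_func (void)
{
  __asm__ volatile ("" ::: "memory");	/* Keep the call from being elided.  */
}

static uint64_t
now_ns (void)
{
  struct timespec ts;
  clock_gettime (CLOCK_MONOTONIC, &ts);
  return (uint64_t) ts.tv_sec * 1000000000ull + (uint64_t) ts.tv_nsec;
}

int
main (void)
{
  /* New scheme: run once; if the run is too fast for the clock to
     resolve, keep re-running and average over the forced repeats.  */
  uint64_t start, end, cur;
  unsigned long iters = 0;

  start = now_ns ();
  do
    {
      bench_func ();
      iters++;
      end = now_ns ();
      cur = end - start;
    }
  while (cur == 0);	/* Below clock resolution; retry.  */

  printf ("%lu call(s), %.2f ns/call\n", iters, (double) cur / iters);
  return 0;
}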
  

Comments

Siddhesh Poyarekar July 6, 2017, 8:58 a.m. UTC | #1
On Thursday 06 July 2017 02:21 PM, Ashwin Sekhar T K wrote:
> Currently, for each input in the input file, the benchtest framework
> runs the microbenchmark in a loop a fixed number of times before it
> moves on to the next input. This has the drawback that it can eliminate
> the cache and branch-prediction effects that we see in real-world
> programs.
> 
> This patch changes the framework so that the microbenchmark is run
> only once for each input before moving on to the next input.

This is solved by Wilco's workload addition patch.  That is, for
workload tests it makes sense to have cache effects in place, since the
call sequence resembles a real workload; note that there is still a
non-trivial effect due to other code in the program that cannot be
emulated without actually running the program itself.

These inputs, on the other hand, simply exercise branches of the
program to measure the variance in performance across branches,
irrespective of caching.  I'm open to changing the constitution of the
inputs to a different model, but running them exactly once in that
sequence does not really emulate anything.

Siddhesh
  
Sekhar, Ashwin July 6, 2017, 9:17 a.m. UTC | #2
On Thu, 2017-07-06 at 14:28 +0530, Siddhesh Poyarekar wrote:
> On Thursday 06 July 2017 02:21 PM, Ashwin Sekhar T K wrote:
> > Currently, for each input in the input file, the benchtest framework
> > runs the microbenchmark in a loop a fixed number of times before it
> > moves on to the next input. This has the drawback that it can
> > eliminate the cache and branch-prediction effects that we see in
> > real-world programs.
> > 
> > This patch changes the framework so that the microbenchmark is run
> > only once for each input before moving on to the next input.
> This is solved by Wilco's workload addition patch.  That is, for
> workload tests it makes sense to have cache effects in place, since
> the call sequence resembles a real workload; note that there is still
> a non-trivial effect due to other code in the program that cannot be
> emulated without actually running the program itself.
> 

Apologies, I didn't notice that a patch for this had already been made.

> These inputs, on the other hand, simply exercise branches of the
> program to measure the variance in performance across branches,
> irrespective of caching.  I'm open to changing the constitution of
> the inputs to a different model, but running them exactly once in
> that sequence does not really emulate anything.
> 
> Siddhesh
  

Patch

diff --git a/benchtests/bench-skeleton.c b/benchtests/bench-skeleton.c
index 09eb78df1b..5707f1da65 100644
--- a/benchtests/bench-skeleton.c
+++ b/benchtests/bench-skeleton.c
@@ -35,7 +35,7 @@ 
 int
 main (int argc, char **argv)
 {
-  unsigned long i, k;
+  unsigned long i;
   struct timespec runtime;
   timing_t start, end;
   bool detailed = false;
@@ -48,15 +48,9 @@  main (int argc, char **argv)
 
   memset (&runtime, 0, sizeof (runtime));
 
-  unsigned long iters, res;
-
 #ifdef BENCH_INIT
   BENCH_INIT ();
 #endif
-  TIMING_INIT (res);
-
-  iters = 1000 * res;
-
   json_init (&json_ctx, 2, stdout);
 
   /* Begin function.  */
@@ -68,35 +62,40 @@  main (int argc, char **argv)
       clock_gettime (CLOCK_MONOTONIC_RAW, &runtime);
       runtime.tv_sec += DURATION;
 
-      double d_total_i = 0;
-      timing_t total = 0, max = 0, min = 0x7fffffffffffffff;
-      int64_t c = 0;
+      double d_total_i = 0, d_max = 0, d_min = 0x7fffffffffffffff;
+      timing_t total = 0;
       while (1)
 	{
 	  for (i = 0; i < NUM_SAMPLES (v); i++)
 	    {
-	      uint64_t cur;
+	      uint64_t cur, iters = 0;
+	      double d_cur;
+
 	      TIMING_NOW (start);
-	      for (k = 0; k < iters; k++)
-		BENCH_FUNC (v, i);
+	    run_bench:
+	      BENCH_FUNC (v, i);
 	      TIMING_NOW (end);
-
 	      TIMING_DIFF (cur, start, end);
+	      iters++;
+
+	      /* If the benchmark ran more quickly than the clock resolution,
+		 re-run until it can be captured by the clock.  */
+	      if (cur == 0)
+		goto run_bench;
 
-	      if (cur > max)
-		max = cur;
+	      d_cur = (double) cur / iters;
+	      if (d_cur > d_max)
+		d_max = d_cur;
 
-	      if (cur < min)
-		min = cur;
+	      if (d_cur < d_min)
+		d_min = d_cur;
 
 	      TIMING_ACCUM (total, cur);
 	      /* Accumulate timings for the value.  In the end we will divide
 	         by the total iterations.  */
-	      RESULT_ACCUM (cur, v, i, c * iters, (c + 1) * iters);
-
+	      RESULT_ACCUM (cur, v, i, iters);
 	      d_total_i += iters;
 	    }
-	  c++;
 	  struct timespec curtime;
 
 	  memset (&curtime, 0, sizeof (curtime));
@@ -106,19 +105,17 @@  main (int argc, char **argv)
 	}
 
       double d_total_s;
-      double d_iters;
 
     done:
       d_total_s = total;
-      d_iters = iters;
 
       /* Begin variant.  */
       json_attr_object_begin (&json_ctx, VARIANT (v));
 
       json_attr_double (&json_ctx, "duration", d_total_s);
       json_attr_double (&json_ctx, "iterations", d_total_i);
-      json_attr_double (&json_ctx, "max", max / d_iters);
-      json_attr_double (&json_ctx, "min", min / d_iters);
+      json_attr_double (&json_ctx, "max", d_max);
+      json_attr_double (&json_ctx, "min", d_min);
       json_attr_double (&json_ctx, "mean", d_total_s / d_total_i);
 
       if (detailed)
diff --git a/benchtests/scripts/bench.py b/benchtests/scripts/bench.py
index 8c1c9eeb2b..d9d27e70c7 100755
--- a/benchtests/scripts/bench.py
+++ b/benchtests/scripts/bench.py
@@ -51,6 +51,7 @@  struct args
 {
 %(args)s
   double timing;
+  int runs;
 };
 
 struct _variants
@@ -82,8 +83,15 @@  struct _variants variants[%(num_variants)d] = {
 # Epilogue for the generated source file.
 EPILOGUE = '''
 #define RESULT(__v, __i) (variants[(__v)].in[(__i)].timing)
-#define RESULT_ACCUM(r, v, i, old, new) \\
-        ((RESULT ((v), (i))) = (RESULT ((v), (i)) * (old) + (r)) / ((new) + 1))
+#define RUNS(__v, __i) (variants[(__v)].in[(__i)].runs)
+#define RESULT_ACCUM(r, v, i, c) \\
+  do \\
+  { \\
+    int old = RUNS ((v), (i)); \\
+    RESULT ((v), (i)) = (RESULT ((v), (i)) * old + (r)) / (old + c); \\
+    RUNS ((v), (i)) = old + c; \\
+  } \\
+  while (0)
 #define BENCH_FUNC(i, j) ({%(getret)s CALL_BENCH_FUNC (i, j);})
 #define FUNCNAME "%(func)s"
 #include "bench-skeleton.c"'''
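
[Editor's note: the reworked RESULT_ACCUM maintains a weighted running
mean: r is the total timing of a batch of c runs, so the update
(mean * old + r) / (old + c) folds the whole batch into the existing
per-input average. Below is a minimal stand-alone C sketch of that
update; the names timing, runs and result_accum are hypothetical
stand-ins for RESULT (v, i), RUNS (v, i) and the macro itself, and
this is not part of the patch.]

#include <stdio.h>

/* Stand-ins for RESULT (v, i) and RUNS (v, i) for a single input.  */
static double timing = 0.0;
static int runs = 0;

/* Fold a batch total R, measured over C runs, into the running mean,
   mirroring the updated RESULT_ACCUM macro.  */
static void
result_accum (double r, int c)
{
  int old = runs;
  timing = (timing * old + r) / (old + c);
  runs = old + c;
}

int
main (void)
{
  result_accum (20.0, 2);	/* Two runs totalling 20: mean 10.0.  */
  result_accum (36.0, 3);	/* Three more totalling 36: mean 11.2.  */
  printf ("mean over %d runs: %g\n", runs, timing);
  return 0;
}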