diff mbox series

[v1,1/2] x86: Optimize less_vec evex and avx512 memset-vec-unaligned-erms.S

Message ID 20210418220921.1868796-1-goldstein.w.n@gmail.com
State Superseded
Headers show
Series [v1,1/2] x86: Optimize less_vec evex and avx512 memset-vec-unaligned-erms.S | expand

Commit Message

Noah Goldstein April 18, 2021, 10:09 p.m. UTC
No bug. This commit adds an optimized case for the less_vec memset case
that uses the avx512vl/avx512bw mask store, avoiding the excessive
branches. test-memset and test-wmemset are passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
Tests were run on the following CPUs:

Icelake: https://ark.intel.com/content/www/us/en/ark/products/196597/intel-core-i7-1065g7-processor-8m-cache-up-to-3-90-ghz.html

Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html


All times are the geometric mean of N=20. The unit of time is
seconds.

"Cur" refers to the current implementation
"New" refers to this patch's implementation

There are 3 cases that matter for performance.

1) ptr is not within VEC_SIZE of a page
2) ptr is within VEC_SIZE of a page but length is small enough so that
   there is no page cross
3) page cross.

In case 1 (which should be the most common), the new implementation has
a near-universal improvement. The only exception is the avx512 case for
size = [0, 15], where I believe the downclocking from avx512 is causing
the slowdown. It's worth noting that because bench-memset.c repeats the
same size, the branch-heavy case should be favored, as the branches will
all be predicted correctly. In a setting with unpredictable lengths,
this version should perform significantly better. For example, I
implemented something similar to this change for memmove/memcpy and
saw a ~40% speedup in bench-memcpy-random (but for other reasons this
change isn't good there).

Case 2 has a slowdown with this patch (roughly equivalent to the
performance improvement for case 1), though I think this is probably
less important than the improvements for case 1, as page crosses are
probably rarer than non-page crosses.

Case 3 has a very slight slowdown with this patch. But for the same
reason as above I think this patch is still an improvement.

It's worth noting that the page cross check could be removed and
the mask store implementation would still be correct, but I'm finding
that the fault suppression is incredibly expensive from a performance
perspective, and without the branch I see a 2 orders of magnitude
performance regression on the Case 2 benchmarks.


All performance numbers for less_vec:


Results For Tigerlake memset-evex-erms
size, algn, Cur T , New T , Win , Dif 
1   , 0   , 4.043 , 3.424 , New , 0.62
2   , 0   , 4.599 , 3.423 , New , 1.18
4   , 0   , 4.591 , 3.422 , New , 1.17
8   , 0   , 4.641 , 3.424 , New , 1.22
16  , 0   , 3.989 , 3.423 , New , 0.57
1   , 1   , 3.971 , 3.365 , New , 0.61
2   , 2   , 4.585 , 3.357 , New , 1.23
3   , 3   , 4.583 , 3.362 , New , 1.22
3   , 0   , 4.584 , 3.359 , New , 1.22
4   , 4   , 4.583 , 3.358 , New , 1.23
5   , 5   , 4.58  , 3.356 , New , 1.22
5   , 0   , 4.581 , 3.359 , New , 1.22
6   , 6   , 4.583 , 3.357 , New , 1.23
6   , 0   , 4.582 , 3.357 , New , 1.22
7   , 7   , 4.581 , 3.357 , New , 1.22
7   , 0   , 4.581 , 3.359 , New , 1.22
8   , 8   , 4.586 , 3.356 , New , 1.23
9   , 9   , 4.584 , 3.358 , New , 1.23
9   , 0   , 4.585 , 3.355 , New , 1.23
10  , 10  , 4.582 , 3.354 , New , 1.23
10  , 0   , 4.582 , 3.358 , New , 1.22
11  , 11  , 4.585 , 3.358 , New , 1.23
11  , 0   , 4.582 , 3.356 , New , 1.23
12  , 12  , 4.582 , 3.357 , New , 1.22
12  , 0   , 4.582 , 3.358 , New , 1.22
13  , 13  , 4.583 , 3.357 , New , 1.23
13  , 0   , 4.582 , 3.357 , New , 1.22
14  , 14  , 4.582 , 3.356 , New , 1.23
14  , 0   , 4.582 , 3.36  , New , 1.22
15  , 15  , 4.582 , 3.355 , New , 1.23
15  , 0   , 4.582 , 3.357 , New , 1.22
16  , 16  , 3.972 , 3.368 , New , 0.6 
17  , 17  , 3.961 , 3.355 , New , 0.61
17  , 0   , 3.961 , 3.357 , New , 0.6 
18  , 18  , 3.959 , 3.359 , New , 0.6 
18  , 0   , 3.962 , 3.356 , New , 0.61
19  , 19  , 3.959 , 3.354 , New , 0.6 
19  , 0   , 3.962 , 3.355 , New , 0.61
20  , 20  , 3.959 , 3.355 , New , 0.6 
20  , 0   , 3.961 , 3.358 , New , 0.6 
21  , 21  , 3.961 , 3.356 , New , 0.6 
21  , 0   , 3.959 , 3.357 , New , 0.6 
22  , 22  , 3.959 , 3.354 , New , 0.6 
22  , 0   , 3.959 , 3.355 , New , 0.6 
23  , 23  , 3.958 , 3.355 , New , 0.6 
23  , 0   , 3.963 , 3.37  , New , 0.59
24  , 24  , 3.965 , 3.356 , New , 0.61
24  , 0   , 3.96  , 3.357 , New , 0.6 
25  , 25  , 3.96  , 3.356 , New , 0.6 
25  , 0   , 3.962 , 3.355 , New , 0.61
26  , 26  , 3.957 , 3.352 , New , 0.6 
26  , 0   , 3.961 , 3.353 , New , 0.61
27  , 27  , 3.959 , 3.355 , New , 0.6 
27  , 0   , 3.958 , 3.354 , New , 0.6 
28  , 28  , 3.959 , 3.356 , New , 0.6 
28  , 0   , 3.96  , 3.357 , New , 0.6 
29  , 29  , 3.959 , 3.357 , New , 0.6 
29  , 0   , 3.96  , 3.355 , New , 0.6 
30  , 30  , 3.96  , 3.354 , New , 0.61
30  , 0   , 3.957 , 3.355 , New , 0.6 
31  , 31  , 3.959 , 3.355 , New , 0.6 
31  , 0   , 3.96  , 3.356 , New , 0.6 
14  , 1   , 4.585 , 3.36  , New , 1.23
25  , 2   , 3.957 , 3.358 , New , 0.6 
1   , 4095, 3.964 , 4.088 , Cur , -0.12
2   , 4095, 32.221, 32.234, Cur , -0.01
3   , 4095, 17.237, 17.261, Cur , -0.02
4   , 4095, 32.223, 32.238, Cur , -0.02
5   , 4095, 17.241, 17.259, Cur , -0.02
6   , 4095, 17.237, 17.259, Cur , -0.02
7   , 4095, 17.239, 17.243, Cur , -0.0
8   , 4095, 32.226, 32.228, Cur , -0.0
9   , 4095, 17.241, 17.25 , Cur , -0.01
10  , 4095, 17.237, 17.26 , Cur , -0.02
11  , 4095, 17.238, 17.252, Cur , -0.01
12  , 4095, 17.238, 17.264, Cur , -0.03
13  , 4095, 17.237, 17.246, Cur , -0.01
14  , 4095, 17.238, 17.258, Cur , -0.02
15  , 4095, 17.237, 17.252, Cur , -0.02
16  , 4095, 32.221, 32.519, Cur , -0.3
17  , 4095, 17.24 , 17.26 , Cur , -0.02
18  , 4095, 17.243, 17.26 , Cur , -0.02
19  , 4095, 17.234, 17.245, Cur , -0.01
20  , 4095, 17.239, 17.257, Cur , -0.02
21  , 4095, 17.238, 17.245, Cur , -0.01
22  , 4095, 17.237, 17.249, Cur , -0.01
23  , 4095, 17.237, 17.246, Cur , -0.01
24  , 4095, 17.233, 17.241, Cur , -0.01
25  , 4095, 17.235, 17.248, Cur , -0.01
26  , 4095, 17.233, 17.24 , Cur , -0.01
27  , 4095, 17.236, 17.24 , Cur , -0.0
28  , 4095, 17.238, 17.241, Cur , -0.0
29  , 4095, 17.233, 17.247, Cur , -0.01
30  , 4095, 17.236, 17.244, Cur , -0.01
31  , 4095, 17.236, 17.249, Cur , -0.01
2   , 4094, 4.585 , 5.28  , Cur , -0.7
3   , 4093, 4.583 , 5.226 , Cur , -0.64
4   , 4092, 4.582 , 4.599 , Cur , -0.02
5   , 4091, 4.582 , 4.601 , Cur , -0.02
6   , 4090, 4.581 , 4.594 , Cur , -0.01
7   , 4089, 4.582 , 4.606 , Cur , -0.02
8   , 4088, 4.586 , 4.591 , Cur , -0.0
9   , 4087, 4.581 , 4.592 , Cur , -0.01
10  , 4086, 4.582 , 4.589 , Cur , -0.01
11  , 4085, 4.584 , 4.598 , Cur , -0.01
12  , 4084, 4.583 , 4.592 , Cur , -0.01
13  , 4083, 4.582 , 4.589 , Cur , -0.01
14  , 4082, 4.585 , 4.594 , Cur , -0.01
15  , 4081, 4.583 , 4.59  , Cur , -0.01
16  , 4080, 3.962 , 4.602 , Cur , -0.64
17  , 4079, 3.96  , 4.602 , Cur , -0.64
18  , 4078, 3.962 , 4.599 , Cur , -0.64
19  , 4077, 3.959 , 4.597 , Cur , -0.64
20  , 4076, 3.961 , 4.591 , Cur , -0.63
21  , 4075, 3.959 , 4.593 , Cur , -0.63
22  , 4074, 3.964 , 4.591 , Cur , -0.63
23  , 4073, 3.959 , 4.593 , Cur , -0.63
24  , 4072, 3.96  , 4.589 , Cur , -0.63
25  , 4071, 3.961 , 4.587 , Cur , -0.63
26  , 4070, 3.961 , 4.591 , Cur , -0.63
27  , 4069, 3.958 , 4.591 , Cur , -0.63
28  , 4068, 3.961 , 4.593 , Cur , -0.63
29  , 4067, 3.957 , 4.591 , Cur , -0.63
30  , 4066, 3.958 , 4.589 , Cur , -0.63
31  , 4065, 3.959 , 4.59  , Cur , -0.63

Results For Tigerlake memset-evex
size, algn, Cur T , New T , Win , Dif 
1   , 0   , 4.065 , 3.425 , New , 0.64
2   , 0   , 4.642 , 3.422 , New , 1.22
4   , 0   , 4.613 , 3.426 , New , 1.19
8   , 0   , 4.677 , 3.421 , New , 1.26
16  , 0   , 3.973 , 3.425 , New , 0.55
1   , 1   , 3.985 , 3.364 , New , 0.62
2   , 2   , 4.588 , 3.359 , New , 1.23
3   , 3   , 4.583 , 3.362 , New , 1.22
3   , 0   , 4.584 , 3.361 , New , 1.22
4   , 4   , 4.585 , 3.359 , New , 1.23
5   , 5   , 4.58  , 3.36  , New , 1.22
5   , 0   , 4.582 , 3.363 , New , 1.22
6   , 6   , 4.58  , 3.356 , New , 1.22
6   , 0   , 4.58  , 3.361 , New , 1.22
7   , 7   , 4.58  , 3.364 , New , 1.22
7   , 0   , 4.581 , 3.357 , New , 1.22
8   , 8   , 4.583 , 3.358 , New , 1.23
9   , 9   , 4.582 , 3.359 , New , 1.22
9   , 0   , 4.58  , 3.358 , New , 1.22
10  , 10  , 4.582 , 3.358 , New , 1.22
10  , 0   , 4.582 , 3.359 , New , 1.22
11  , 11  , 4.581 , 3.355 , New , 1.23
11  , 0   , 4.581 , 3.36  , New , 1.22
12  , 12  , 4.582 , 3.356 , New , 1.23
12  , 0   , 4.584 , 3.357 , New , 1.23
13  , 13  , 4.582 , 3.359 , New , 1.22
13  , 0   , 4.582 , 3.359 , New , 1.22
14  , 14  , 4.582 , 3.358 , New , 1.22
14  , 0   , 4.582 , 3.359 , New , 1.22
15  , 15  , 4.582 , 3.358 , New , 1.22
15  , 0   , 4.581 , 3.356 , New , 1.23
16  , 16  , 3.967 , 3.355 , New , 0.61
17  , 17  , 3.966 , 3.356 , New , 0.61
17  , 0   , 3.968 , 3.363 , New , 0.6 
18  , 18  , 3.967 , 3.356 , New , 0.61
18  , 0   , 3.966 , 3.361 , New , 0.6 
19  , 19  , 3.965 , 3.353 , New , 0.61
19  , 0   , 3.966 , 3.361 , New , 0.6 
20  , 20  , 3.964 , 3.355 , New , 0.61
20  , 0   , 3.965 , 3.357 , New , 0.61
21  , 21  , 3.966 , 3.357 , New , 0.61
21  , 0   , 3.963 , 3.357 , New , 0.61
22  , 22  , 3.964 , 3.355 , New , 0.61
22  , 0   , 3.966 , 3.359 , New , 0.61
23  , 23  , 3.967 , 3.355 , New , 0.61
23  , 0   , 3.965 , 3.356 , New , 0.61
24  , 24  , 3.964 , 3.355 , New , 0.61
24  , 0   , 3.962 , 3.359 , New , 0.6 
25  , 25  , 3.964 , 3.355 , New , 0.61
25  , 0   , 3.963 , 3.358 , New , 0.6 
26  , 26  , 3.967 , 3.352 , New , 0.62
26  , 0   , 3.962 , 3.357 , New , 0.6 
27  , 27  , 3.964 , 3.355 , New , 0.61
27  , 0   , 3.964 , 3.356 , New , 0.61
28  , 28  , 3.967 , 3.353 , New , 0.61
28  , 0   , 3.966 , 3.359 , New , 0.61
29  , 29  , 3.963 , 3.355 , New , 0.61
29  , 0   , 3.963 , 3.357 , New , 0.61
30  , 30  , 3.964 , 3.354 , New , 0.61
30  , 0   , 3.963 , 3.359 , New , 0.6 
31  , 31  , 3.963 , 3.355 , New , 0.61
31  , 0   , 3.964 , 3.356 , New , 0.61
14  , 1   , 4.582 , 3.359 , New , 1.22
25  , 2   , 3.962 , 3.369 , New , 0.59
1   , 4095, 3.964 , 4.099 , Cur , -0.14
2   , 4095, 32.236, 32.482, Cur , -0.25
3   , 4095, 17.238, 17.259, Cur , -0.02
4   , 4095, 32.22 , 32.457, Cur , -0.24
5   , 4095, 17.24 , 17.269, Cur , -0.03
6   , 4095, 17.238, 17.254, Cur , -0.02
7   , 4095, 17.239, 17.251, Cur , -0.01
8   , 4095, 32.222, 32.243, Cur , -0.02
9   , 4095, 17.237, 17.257, Cur , -0.02
10  , 4095, 17.239, 17.248, Cur , -0.01
11  , 4095, 17.241, 17.251, Cur , -0.01
12  , 4095, 17.238, 17.257, Cur , -0.02
13  , 4095, 17.239, 17.255, Cur , -0.02
14  , 4095, 17.238, 17.256, Cur , -0.02
15  , 4095, 17.236, 17.259, Cur , -0.02
16  , 4095, 32.22 , 32.263, Cur , -0.04
17  , 4095, 17.235, 17.254, Cur , -0.02
18  , 4095, 17.238, 17.25 , Cur , -0.01
19  , 4095, 17.238, 17.247, Cur , -0.01
20  , 4095, 17.238, 17.259, Cur , -0.02
21  , 4095, 17.236, 17.251, Cur , -0.02
22  , 4095, 17.237, 17.245, Cur , -0.01
23  , 4095, 17.238, 17.244, Cur , -0.01
24  , 4095, 17.235, 17.252, Cur , -0.02
25  , 4095, 17.237, 17.245, Cur , -0.01
26  , 4095, 17.236, 17.243, Cur , -0.01
27  , 4095, 17.234, 17.243, Cur , -0.01
28  , 4095, 17.234, 17.242, Cur , -0.01
29  , 4095, 17.233, 17.244, Cur , -0.01
30  , 4095, 17.236, 17.246, Cur , -0.01
31  , 4095, 17.235, 17.246, Cur , -0.01
2   , 4094, 4.584 , 5.283 , Cur , -0.7
3   , 4093, 4.584 , 5.231 , Cur , -0.65
4   , 4092, 4.581 , 4.609 , Cur , -0.03
5   , 4091, 4.583 , 4.599 , Cur , -0.02
6   , 4090, 4.581 , 4.597 , Cur , -0.02
7   , 4089, 4.578 , 4.591 , Cur , -0.01
8   , 4088, 4.584 , 4.597 , Cur , -0.01
9   , 4087, 4.583 , 4.591 , Cur , -0.01
10  , 4086, 4.581 , 4.593 , Cur , -0.01
11  , 4085, 4.581 , 4.596 , Cur , -0.01
12  , 4084, 4.58  , 4.59  , Cur , -0.01
13  , 4083, 4.582 , 4.59  , Cur , -0.01
14  , 4082, 4.581 , 4.601 , Cur , -0.02
15  , 4081, 4.581 , 4.591 , Cur , -0.01
16  , 4080, 3.969 , 4.614 , Cur , -0.65
17  , 4079, 3.968 , 4.602 , Cur , -0.63
18  , 4078, 3.965 , 4.601 , Cur , -0.64
19  , 4077, 3.964 , 4.602 , Cur , -0.64
20  , 4076, 3.968 , 4.6   , Cur , -0.63
21  , 4075, 3.965 , 4.599 , Cur , -0.63
22  , 4074, 3.965 , 4.604 , Cur , -0.64
23  , 4073, 3.967 , 4.601 , Cur , -0.63
24  , 4072, 3.963 , 4.6   , Cur , -0.64
25  , 4071, 3.965 , 4.601 , Cur , -0.64
26  , 4070, 3.964 , 4.597 , Cur , -0.63
27  , 4069, 3.967 , 4.598 , Cur , -0.63
28  , 4068, 3.964 , 4.595 , Cur , -0.63
29  , 4067, 3.964 , 4.598 , Cur , -0.63
30  , 4066, 3.966 , 4.596 , Cur , -0.63
31  , 4065, 3.966 , 4.597 , Cur , -0.63

Results For Tigerlake memset-avx512-erms
size, algn, Cur T , New T , Win , Dif 
1   , 0   , 4.071 , 4.065 , New , 0.01
2   , 0   , 4.014 , 4.064 , Cur , -0.05
4   , 0   , 3.974 , 4.063 , Cur , -0.09
8   , 0   , 4.01  , 4.059 , Cur , -0.05
16  , 0   , 4.596 , 4.061 , New , 0.54
32  , 0   , 3.964 , 3.987 , Cur , -0.02
1   , 1   , 3.989 , 3.981 , New , 0.01
2   , 2   , 3.967 , 3.981 , Cur , -0.01
3   , 3   , 3.961 , 3.984 , Cur , -0.02
3   , 0   , 3.958 , 3.993 , Cur , -0.03
4   , 4   , 3.96  , 3.98  , Cur , -0.02
5   , 5   , 3.958 , 3.981 , Cur , -0.02
5   , 0   , 3.959 , 3.993 , Cur , -0.03
6   , 6   , 3.956 , 3.979 , Cur , -0.02
6   , 0   , 3.956 , 3.985 , Cur , -0.03
7   , 7   , 3.958 , 3.979 , Cur , -0.02
7   , 0   , 3.957 , 3.984 , Cur , -0.03
8   , 8   , 3.963 , 3.984 , Cur , -0.02
9   , 9   , 3.959 , 3.985 , Cur , -0.03
9   , 0   , 3.958 , 3.989 , Cur , -0.03
10  , 10  , 3.958 , 3.982 , Cur , -0.02
10  , 0   , 3.956 , 3.984 , Cur , -0.03
11  , 11  , 3.954 , 3.981 , Cur , -0.03
11  , 0   , 3.954 , 3.985 , Cur , -0.03
12  , 12  , 3.956 , 3.979 , Cur , -0.02
12  , 0   , 3.956 , 3.984 , Cur , -0.03
13  , 13  , 3.958 , 3.979 , Cur , -0.02
13  , 0   , 3.958 , 3.986 , Cur , -0.03
14  , 14  , 3.957 , 3.98  , Cur , -0.02
14  , 0   , 3.959 , 3.993 , Cur , -0.03
15  , 15  , 3.958 , 3.981 , Cur , -0.02
15  , 0   , 3.957 , 3.98  , Cur , -0.02
16  , 16  , 4.581 , 3.978 , New , 0.6 
17  , 17  , 4.579 , 3.985 , New , 0.59
17  , 0   , 4.58  , 3.983 , New , 0.6 
18  , 18  , 4.581 , 3.981 , New , 0.6 
18  , 0   , 4.581 , 3.982 , New , 0.6 
19  , 19  , 4.582 , 3.98  , New , 0.6 
19  , 0   , 4.583 , 3.982 , New , 0.6 
20  , 20  , 4.581 , 3.979 , New , 0.6 
20  , 0   , 4.581 , 3.982 , New , 0.6 
21  , 21  , 4.58  , 3.979 , New , 0.6 
21  , 0   , 4.578 , 3.985 , New , 0.59
22  , 22  , 4.579 , 3.979 , New , 0.6 
22  , 0   , 4.581 , 3.981 , New , 0.6 
23  , 23  , 4.582 , 3.979 , New , 0.6 
23  , 0   , 4.582 , 3.976 , New , 0.61
24  , 24  , 4.579 , 3.98  , New , 0.6 
24  , 0   , 4.58  , 3.982 , New , 0.6 
25  , 25  , 4.58  , 3.978 , New , 0.6 
25  , 0   , 4.578 , 3.978 , New , 0.6 
26  , 26  , 4.576 , 3.978 , New , 0.6 
26  , 0   , 4.578 , 3.982 , New , 0.6 
27  , 27  , 4.581 , 3.978 , New , 0.6 
27  , 0   , 4.581 , 3.982 , New , 0.6 
28  , 28  , 4.58  , 3.977 , New , 0.6 
28  , 0   , 4.579 , 3.978 , New , 0.6 
29  , 29  , 4.58  , 3.979 , New , 0.6 
29  , 0   , 4.579 , 3.979 , New , 0.6 
30  , 30  , 4.579 , 3.976 , New , 0.6 
30  , 0   , 4.58  , 3.983 , New , 0.6 
31  , 31  , 4.579 , 3.976 , New , 0.6 
31  , 0   , 4.58  , 3.978 , New , 0.6 
32  , 32  , 3.957 , 3.977 , Cur , -0.02
33  , 33  , 3.957 , 3.978 , Cur , -0.02
33  , 0   , 3.954 , 3.979 , Cur , -0.02
34  , 34  , 3.955 , 3.979 , Cur , -0.02
34  , 0   , 3.956 , 3.978 , Cur , -0.02
35  , 35  , 3.955 , 3.977 , Cur , -0.02
35  , 0   , 3.956 , 3.981 , Cur , -0.02
36  , 36  , 3.954 , 3.977 , Cur , -0.02
36  , 0   , 3.955 , 3.979 , Cur , -0.02
37  , 37  , 3.955 , 3.982 , Cur , -0.03
37  , 0   , 3.954 , 3.978 , Cur , -0.02
38  , 38  , 3.952 , 3.976 , Cur , -0.02
38  , 0   , 3.956 , 3.979 , Cur , -0.02
39  , 39  , 3.952 , 3.977 , Cur , -0.02
39  , 0   , 3.954 , 3.978 , Cur , -0.02
40  , 40  , 3.953 , 3.977 , Cur , -0.02
40  , 0   , 3.956 , 3.978 , Cur , -0.02
41  , 41  , 3.955 , 3.979 , Cur , -0.02
41  , 0   , 3.956 , 3.979 , Cur , -0.02
42  , 42  , 3.955 , 3.978 , Cur , -0.02
42  , 0   , 3.957 , 3.978 , Cur , -0.02
43  , 43  , 3.956 , 3.978 , Cur , -0.02
43  , 0   , 3.952 , 3.977 , Cur , -0.02
44  , 44  , 3.954 , 3.976 , Cur , -0.02
44  , 0   , 3.952 , 3.979 , Cur , -0.03
45  , 45  , 3.952 , 3.979 , Cur , -0.03
45  , 0   , 3.953 , 3.979 , Cur , -0.03
46  , 46  , 3.954 , 3.979 , Cur , -0.02
46  , 0   , 3.95  , 3.98  , Cur , -0.03
47  , 47  , 3.952 , 3.979 , Cur , -0.03
47  , 0   , 3.951 , 3.979 , Cur , -0.03
48  , 48  , 3.951 , 3.977 , Cur , -0.03
48  , 0   , 3.952 , 3.979 , Cur , -0.03
49  , 49  , 3.954 , 3.979 , Cur , -0.02
49  , 0   , 3.951 , 3.976 , Cur , -0.02
50  , 50  , 3.95  , 3.977 , Cur , -0.03
50  , 0   , 3.952 , 3.977 , Cur , -0.02
51  , 51  , 3.954 , 3.979 , Cur , -0.02
51  , 0   , 3.953 , 3.977 , Cur , -0.02
52  , 52  , 3.953 , 3.978 , Cur , -0.03
52  , 0   , 3.953 , 3.977 , Cur , -0.02
53  , 53  , 3.952 , 3.976 , Cur , -0.02
53  , 0   , 3.953 , 3.979 , Cur , -0.03
54  , 54  , 3.954 , 3.976 , Cur , -0.02
54  , 0   , 3.953 , 3.977 , Cur , -0.02
55  , 55  , 3.952 , 3.978 , Cur , -0.03
55  , 0   , 3.954 , 3.975 , Cur , -0.02
56  , 56  , 3.955 , 3.978 , Cur , -0.02
56  , 0   , 3.951 , 3.979 , Cur , -0.03
57  , 57  , 3.953 , 3.978 , Cur , -0.03
57  , 0   , 3.951 , 3.976 , Cur , -0.02
58  , 58  , 3.953 , 3.978 , Cur , -0.03
58  , 0   , 3.954 , 3.98  , Cur , -0.03
59  , 59  , 3.952 , 3.976 , Cur , -0.02
59  , 0   , 3.952 , 3.977 , Cur , -0.02
60  , 60  , 3.955 , 3.979 , Cur , -0.02
60  , 0   , 3.953 , 3.977 , Cur , -0.02
61  , 61  , 3.952 , 3.979 , Cur , -0.03
61  , 0   , 3.951 , 3.978 , Cur , -0.03
62  , 62  , 3.949 , 3.98  , Cur , -0.03
62  , 0   , 3.949 , 3.979 , Cur , -0.03
63  , 63  , 3.951 , 3.977 , Cur , -0.03
63  , 0   , 3.949 , 3.977 , Cur , -0.03
14  , 1   , 3.968 , 3.979 , Cur , -0.01
25  , 2   , 4.592 , 3.978 , New , 0.61
1   , 4095, 10.829, 11.313, Cur , -0.48
2   , 4095, 32.223, 32.39 , Cur , -0.17
3   , 4095, 17.239, 17.256, Cur , -0.02
4   , 4095, 32.225, 32.244, Cur , -0.02
5   , 4095, 17.239, 17.272, Cur , -0.03
6   , 4095, 17.24 , 17.258, Cur , -0.02
7   , 4095, 17.242, 17.265, Cur , -0.02
8   , 4095, 32.227, 32.434, Cur , -0.21
9   , 4095, 17.239, 17.254, Cur , -0.02
10  , 4095, 17.238, 17.262, Cur , -0.02
11  , 4095, 17.238, 17.258, Cur , -0.02
12  , 4095, 17.241, 17.445, Cur , -0.2
13  , 4095, 17.238, 17.438, Cur , -0.2
14  , 4095, 17.238, 17.245, Cur , -0.01
15  , 4095, 17.24 , 17.25 , Cur , -0.01
16  , 4095, 32.22 , 32.599, Cur , -0.38
17  , 4095, 17.236, 17.251, Cur , -0.02
18  , 4095, 17.237, 17.258, Cur , -0.02
19  , 4095, 17.237, 17.255, Cur , -0.02
20  , 4095, 17.237, 17.248, Cur , -0.01
21  , 4095, 17.235, 17.256, Cur , -0.02
22  , 4095, 17.235, 17.245, Cur , -0.01
23  , 4095, 17.234, 17.257, Cur , -0.02
24  , 4095, 17.234, 17.251, Cur , -0.02
25  , 4095, 17.235, 17.252, Cur , -0.02
26  , 4095, 17.232, 17.244, Cur , -0.01
27  , 4095, 17.237, 17.244, Cur , -0.01
28  , 4095, 17.236, 17.25 , Cur , -0.01
29  , 4095, 17.236, 17.245, Cur , -0.01
30  , 4095, 17.235, 17.25 , Cur , -0.02
31  , 4095, 17.237, 17.237, Eq  , 0.0 
32  , 4095, 32.218, 32.226, Cur , -0.01
33  , 4095, 17.234, 17.239, Cur , -0.0
34  , 4095, 17.232, 17.248, Cur , -0.02
35  , 4095, 17.234, 17.249, Cur , -0.01
36  , 4095, 17.234, 17.241, Cur , -0.01
37  , 4095, 17.235, 17.246, Cur , -0.01
38  , 4095, 17.234, 17.247, Cur , -0.01
39  , 4095, 17.231, 17.242, Cur , -0.01
40  , 4095, 17.234, 17.245, Cur , -0.01
41  , 4095, 17.237, 17.25 , Cur , -0.01
42  , 4095, 17.233, 17.243, Cur , -0.01
43  , 4095, 17.234, 17.242, Cur , -0.01
44  , 4095, 17.234, 17.242, Cur , -0.01
45  , 4095, 17.234, 17.244, Cur , -0.01
46  , 4095, 17.231, 17.257, Cur , -0.03
47  , 4095, 17.236, 17.247, Cur , -0.01
48  , 4095, 17.234, 17.24 , Cur , -0.01
49  , 4095, 17.235, 17.244, Cur , -0.01
50  , 4095, 17.233, 17.245, Cur , -0.01
51  , 4095, 17.236, 17.25 , Cur , -0.01
52  , 4095, 17.232, 17.242, Cur , -0.01
53  , 4095, 17.233, 17.253, Cur , -0.02
54  , 4095, 17.233, 17.245, Cur , -0.01
55  , 4095, 17.234, 17.245, Cur , -0.01
56  , 4095, 17.235, 17.246, Cur , -0.01
57  , 4095, 17.234, 17.241, Cur , -0.01
58  , 4095, 17.234, 17.256, Cur , -0.02
59  , 4095, 17.235, 17.248, Cur , -0.01
60  , 4095, 17.235, 17.244, Cur , -0.01
61  , 4095, 17.23 , 17.24 , Cur , -0.01
62  , 4095, 17.232, 17.244, Cur , -0.01
63  , 4095, 17.231, 17.242, Cur , -0.01
2   , 4094, 10.475, 11.063, Cur , -0.59
3   , 4093, 10.867, 11.242, Cur , -0.38
4   , 4092, 10.614, 10.98 , Cur , -0.37
5   , 4091, 10.893, 11.457, Cur , -0.56
6   , 4090, 10.588, 11.216, Cur , -0.63
7   , 4089, 10.969, 11.24 , Cur , -0.27
8   , 4088, 10.359, 11.251, Cur , -0.89
9   , 4087, 10.532, 11.642, Cur , -1.11
10  , 4086, 10.402, 11.17 , Cur , -0.77
11  , 4085, 10.231, 11.153, Cur , -0.92
12  , 4084, 10.335, 10.755, Cur , -0.42
13  , 4083, 10.849, 11.2  , Cur , -0.35
14  , 4082, 10.575, 11.34 , Cur , -0.77
15  , 4081, 10.523, 10.993, Cur , -0.47
16  , 4080, 4.581 , 4.594 , Cur , -0.01
17  , 4079, 4.582 , 4.588 , Cur , -0.01
18  , 4078, 4.582 , 4.591 , Cur , -0.01
19  , 4077, 4.582 , 4.59  , Cur , -0.01
20  , 4076, 4.582 , 4.587 , Cur , -0.0
21  , 4075, 4.58  , 4.587 , Cur , -0.01
22  , 4074, 4.582 , 4.585 , Cur , -0.0
23  , 4073, 4.583 , 4.588 , Cur , -0.0
24  , 4072, 4.58  , 4.588 , Cur , -0.01
25  , 4071, 4.579 , 4.586 , Cur , -0.01
26  , 4070, 4.58  , 4.584 , Cur , -0.0
27  , 4069, 4.582 , 4.585 , Cur , -0.0
28  , 4068, 4.579 , 4.585 , Cur , -0.01
29  , 4067, 4.58  , 4.588 , Cur , -0.01
30  , 4066, 4.578 , 4.588 , Cur , -0.01
31  , 4065, 4.58  , 4.587 , Cur , -0.01
32  , 4064, 3.958 , 5.222 , Cur , -1.26
33  , 4063, 3.956 , 5.219 , Cur , -1.26
34  , 4062, 3.957 , 5.22  , Cur , -1.26
35  , 4061, 3.955 , 5.223 , Cur , -1.27
36  , 4060, 3.955 , 5.222 , Cur , -1.27
37  , 4059, 3.954 , 5.222 , Cur , -1.27
38  , 4058, 3.955 , 5.219 , Cur , -1.26
39  , 4057, 3.954 , 5.222 , Cur , -1.27
40  , 4056, 3.952 , 5.218 , Cur , -1.27
41  , 4055, 3.957 , 5.219 , Cur , -1.26
42  , 4054, 3.953 , 5.22  , Cur , -1.27
43  , 4053, 3.954 , 5.215 , Cur , -1.26
44  , 4052, 3.953 , 5.222 , Cur , -1.27
45  , 4051, 3.954 , 5.221 , Cur , -1.27
46  , 4050, 3.955 , 5.218 , Cur , -1.26
47  , 4049, 3.953 , 5.221 , Cur , -1.27
48  , 4048, 3.95  , 5.219 , Cur , -1.27
49  , 4047, 3.957 , 5.22  , Cur , -1.26
50  , 4046, 3.953 , 5.217 , Cur , -1.26
51  , 4045, 3.951 , 5.218 , Cur , -1.27
52  , 4044, 3.951 , 5.216 , Cur , -1.27
53  , 4043, 3.956 , 5.223 , Cur , -1.27
54  , 4042, 3.953 , 5.217 , Cur , -1.26
55  , 4041, 3.953 , 5.219 , Cur , -1.27
56  , 4040, 3.953 , 5.221 , Cur , -1.27
57  , 4039, 3.953 , 5.219 , Cur , -1.27
58  , 4038, 3.953 , 5.212 , Cur , -1.26
59  , 4037, 3.954 , 5.216 , Cur , -1.26
60  , 4036, 3.953 , 5.214 , Cur , -1.26
61  , 4035, 3.955 , 5.218 , Cur , -1.26
62  , 4034, 3.953 , 5.22  , Cur , -1.27
63  , 4033, 3.95  , 5.218 , Cur , -1.27

Results For Tigerlake memset-avx512
size, algn, Cur T , New T , Win , Dif 
1   , 0   , 4.043 , 4.066 , Cur , -0.02
2   , 0   , 4.038 , 4.06  , Cur , -0.02
4   , 0   , 4.006 , 4.059 , Cur , -0.05
8   , 0   , 4.006 , 4.06  , Cur , -0.05
16  , 0   , 4.637 , 4.062 , New , 0.57
32  , 0   , 3.956 , 3.992 , Cur , -0.04
1   , 1   , 3.974 , 3.989 , Cur , -0.01
2   , 2   , 3.965 , 3.991 , Cur , -0.03
3   , 3   , 3.965 , 3.99  , Cur , -0.03
3   , 0   , 3.962 , 3.985 , Cur , -0.02
4   , 4   , 3.96  , 3.982 , Cur , -0.02
5   , 5   , 3.957 , 3.986 , Cur , -0.03
5   , 0   , 3.957 , 3.982 , Cur , -0.03
6   , 6   , 3.959 , 3.98  , Cur , -0.02
6   , 0   , 3.956 , 3.982 , Cur , -0.03
7   , 7   , 3.958 , 3.981 , Cur , -0.02
7   , 0   , 3.958 , 3.985 , Cur , -0.03
8   , 8   , 3.961 , 3.981 , Cur , -0.02
9   , 9   , 3.957 , 3.983 , Cur , -0.03
9   , 0   , 3.956 , 3.981 , Cur , -0.02
10  , 10  , 3.956 , 3.985 , Cur , -0.03
10  , 0   , 3.956 , 3.984 , Cur , -0.03
11  , 11  , 3.957 , 3.983 , Cur , -0.03
11  , 0   , 3.954 , 3.986 , Cur , -0.03
12  , 12  , 3.956 , 3.989 , Cur , -0.03
12  , 0   , 3.955 , 3.98  , Cur , -0.02
13  , 13  , 3.96  , 3.994 , Cur , -0.03
13  , 0   , 3.956 , 3.987 , Cur , -0.03
14  , 14  , 3.956 , 3.981 , Cur , -0.02
14  , 0   , 3.96  , 3.996 , Cur , -0.04
15  , 15  , 3.956 , 3.986 , Cur , -0.03
15  , 0   , 3.955 , 3.983 , Cur , -0.03
16  , 16  , 4.582 , 3.98  , New , 0.6 
17  , 17  , 4.581 , 3.986 , New , 0.6 
17  , 0   , 4.583 , 3.982 , New , 0.6 
18  , 18  , 4.578 , 3.984 , New , 0.59
18  , 0   , 4.582 , 3.983 , New , 0.6 
19  , 19  , 4.58  , 3.979 , New , 0.6 
19  , 0   , 4.577 , 3.982 , New , 0.59
20  , 20  , 4.581 , 3.979 , New , 0.6 
20  , 0   , 4.58  , 3.981 , New , 0.6 
21  , 21  , 4.581 , 3.982 , New , 0.6 
21  , 0   , 4.581 , 3.98  , New , 0.6 
22  , 22  , 4.581 , 3.976 , New , 0.61
22  , 0   , 4.579 , 3.987 , New , 0.59
23  , 23  , 4.579 , 3.979 , New , 0.6 
23  , 0   , 4.577 , 3.981 , New , 0.6 
24  , 24  , 4.581 , 3.977 , New , 0.6 
24  , 0   , 4.579 , 3.981 , New , 0.6 
25  , 25  , 4.578 , 3.979 , New , 0.6 
25  , 0   , 4.581 , 3.979 , New , 0.6 
26  , 26  , 4.578 , 3.981 , New , 0.6 
26  , 0   , 4.58  , 3.978 , New , 0.6 
27  , 27  , 4.576 , 3.979 , New , 0.6 
27  , 0   , 4.577 , 3.979 , New , 0.6 
28  , 28  , 4.577 , 3.977 , New , 0.6 
28  , 0   , 4.577 , 3.978 , New , 0.6 
29  , 29  , 4.577 , 3.977 , New , 0.6 
29  , 0   , 4.579 , 3.979 , New , 0.6 
30  , 30  , 4.578 , 3.977 , New , 0.6 
30  , 0   , 4.581 , 3.988 , New , 0.59
31  , 31  , 4.58  , 3.979 , New , 0.6 
31  , 0   , 4.578 , 3.979 , New , 0.6 
32  , 32  , 3.955 , 3.978 , Cur , -0.02
33  , 33  , 3.956 , 3.979 , Cur , -0.02
33  , 0   , 3.956 , 3.978 , Cur , -0.02
34  , 34  , 3.953 , 3.982 , Cur , -0.03
34  , 0   , 3.956 , 3.978 , Cur , -0.02
35  , 35  , 3.954 , 3.979 , Cur , -0.02
35  , 0   , 3.952 , 3.977 , Cur , -0.02
36  , 36  , 3.957 , 3.978 , Cur , -0.02
36  , 0   , 3.955 , 3.979 , Cur , -0.02
37  , 37  , 3.951 , 3.979 , Cur , -0.03
37  , 0   , 3.955 , 3.977 , Cur , -0.02
38  , 38  , 3.956 , 3.979 , Cur , -0.02
38  , 0   , 3.953 , 3.98  , Cur , -0.03
39  , 39  , 3.957 , 3.979 , Cur , -0.02
39  , 0   , 3.957 , 3.985 , Cur , -0.03
40  , 40  , 3.953 , 3.981 , Cur , -0.03
40  , 0   , 3.954 , 3.978 , Cur , -0.02
41  , 41  , 3.952 , 3.978 , Cur , -0.03
41  , 0   , 3.952 , 3.98  , Cur , -0.03
42  , 42  , 3.951 , 3.982 , Cur , -0.03
42  , 0   , 3.952 , 3.978 , Cur , -0.03
43  , 43  , 3.95  , 3.986 , Cur , -0.04
43  , 0   , 3.952 , 3.979 , Cur , -0.03
44  , 44  , 3.952 , 3.979 , Cur , -0.03
44  , 0   , 3.952 , 3.98  , Cur , -0.03
45  , 45  , 3.951 , 3.979 , Cur , -0.03
45  , 0   , 3.954 , 3.978 , Cur , -0.02
46  , 46  , 3.951 , 3.982 , Cur , -0.03
46  , 0   , 3.953 , 3.978 , Cur , -0.03
47  , 47  , 3.953 , 3.978 , Cur , -0.03
47  , 0   , 3.951 , 3.986 , Cur , -0.04
48  , 48  , 3.953 , 3.979 , Cur , -0.03
48  , 0   , 3.953 , 3.977 , Cur , -0.02
49  , 49  , 3.952 , 3.976 , Cur , -0.02
49  , 0   , 3.953 , 3.979 , Cur , -0.03
50  , 50  , 3.95  , 3.98  , Cur , -0.03
50  , 0   , 3.955 , 3.978 , Cur , -0.02
51  , 51  , 3.951 , 3.978 , Cur , -0.03
51  , 0   , 3.953 , 3.979 , Cur , -0.03
52  , 52  , 3.952 , 3.977 , Cur , -0.02
52  , 0   , 3.953 , 3.98  , Cur , -0.03
53  , 53  , 3.955 , 3.977 , Cur , -0.02
53  , 0   , 3.951 , 3.979 , Cur , -0.03
54  , 54  , 3.951 , 3.978 , Cur , -0.03
54  , 0   , 3.952 , 3.978 , Cur , -0.03
55  , 55  , 3.954 , 3.98  , Cur , -0.03
55  , 0   , 3.952 , 3.98  , Cur , -0.03
56  , 56  , 3.951 , 3.976 , Cur , -0.02
56  , 0   , 3.954 , 3.982 , Cur , -0.03
57  , 57  , 3.952 , 3.979 , Cur , -0.03
57  , 0   , 3.953 , 3.98  , Cur , -0.03
58  , 58  , 3.952 , 3.979 , Cur , -0.03
58  , 0   , 3.955 , 3.98  , Cur , -0.02
59  , 59  , 3.951 , 3.976 , Cur , -0.02
59  , 0   , 3.953 , 3.979 , Cur , -0.03
60  , 60  , 3.953 , 3.976 , Cur , -0.02
60  , 0   , 3.952 , 3.979 , Cur , -0.03
61  , 61  , 3.951 , 3.98  , Cur , -0.03
61  , 0   , 3.953 , 3.977 , Cur , -0.02
62  , 62  , 3.951 , 3.978 , Cur , -0.03
62  , 0   , 3.95  , 3.979 , Cur , -0.03
63  , 63  , 3.952 , 3.977 , Cur , -0.02
63  , 0   , 3.948 , 3.977 , Cur , -0.03
14  , 1   , 3.959 , 3.98  , Cur , -0.02
25  , 2   , 4.577 , 3.979 , New , 0.6 
1   , 4095, 3.967 , 4.639 , Cur , -0.67
2   , 4095, 32.223, 32.253, Cur , -0.03
3   , 4095, 17.245, 17.277, Cur , -0.03
4   , 4095, 32.219, 32.387, Cur , -0.17
5   , 4095, 17.24 , 17.256, Cur , -0.02
6   , 4095, 17.239, 17.256, Cur , -0.02
7   , 4095, 17.241, 17.407, Cur , -0.17
8   , 4095, 32.222, 32.44 , Cur , -0.22
9   , 4095, 17.238, 17.246, Cur , -0.01
10  , 4095, 17.239, 17.257, Cur , -0.02
11  , 4095, 17.238, 17.261, Cur , -0.02
12  , 4095, 17.236, 17.441, Cur , -0.2
13  , 4095, 17.24 , 17.492, Cur , -0.25
14  , 4095, 17.238, 17.256, Cur , -0.02
15  , 4095, 17.236, 17.264, Cur , -0.03
16  , 4095, 32.22 , 32.24 , Cur , -0.02
17  , 4095, 17.235, 17.445, Cur , -0.21
18  , 4095, 17.236, 17.249, Cur , -0.01
19  , 4095, 17.236, 17.243, Cur , -0.01
20  , 4095, 17.238, 17.244, Cur , -0.01
21  , 4095, 17.237, 17.256, Cur , -0.02
22  , 4095, 17.239, 17.252, Cur , -0.01
23  , 4095, 17.238, 17.247, Cur , -0.01
24  , 4095, 17.236, 17.241, Cur , -0.0
25  , 4095, 17.236, 17.241, Cur , -0.0
26  , 4095, 17.236, 17.246, Cur , -0.01
27  , 4095, 17.238, 17.247, Cur , -0.01
28  , 4095, 17.232, 17.242, Cur , -0.01
29  , 4095, 17.233, 17.249, Cur , -0.02
30  , 4095, 17.236, 17.241, Cur , -0.0
31  , 4095, 17.237, 17.244, Cur , -0.01
32  , 4095, 32.219, 32.236, Cur , -0.02
33  , 4095, 17.236, 17.245, Cur , -0.01
34  , 4095, 17.234, 17.244, Cur , -0.01
35  , 4095, 17.236, 17.248, Cur , -0.01
36  , 4095, 17.233, 17.24 , Cur , -0.01
37  , 4095, 17.233, 17.241, Cur , -0.01
38  , 4095, 17.236, 17.24 , Cur , -0.0
39  , 4095, 17.236, 17.24 , Cur , -0.0
40  , 4095, 17.238, 17.244, Cur , -0.01
41  , 4095, 17.238, 17.244, Cur , -0.01
42  , 4095, 17.236, 17.243, Cur , -0.01
43  , 4095, 17.234, 17.241, Cur , -0.01
44  , 4095, 17.235, 17.241, Cur , -0.01
45  , 4095, 17.235, 17.246, Cur , -0.01
46  , 4095, 17.234, 17.253, Cur , -0.02
47  , 4095, 17.232, 17.239, Cur , -0.01
48  , 4095, 17.235, 17.253, Cur , -0.02
49  , 4095, 17.232, 17.242, Cur , -0.01
50  , 4095, 17.232, 17.248, Cur , -0.02
51  , 4095, 17.233, 17.249, Cur , -0.02
52  , 4095, 17.232, 17.244, Cur , -0.01
53  , 4095, 17.231, 17.242, Cur , -0.01
54  , 4095, 17.234, 17.239, Cur , -0.0
55  , 4095, 17.234, 17.245, Cur , -0.01
56  , 4095, 17.232, 17.241, Cur , -0.01
57  , 4095, 17.232, 17.243, Cur , -0.01
58  , 4095, 17.235, 17.242, Cur , -0.01
59  , 4095, 17.232, 17.246, Cur , -0.01
60  , 4095, 17.234, 17.24 , Cur , -0.01
61  , 4095, 17.232, 17.242, Cur , -0.01
62  , 4095, 17.233, 17.244, Cur , -0.01
63  , 4095, 17.235, 17.244, Cur , -0.01
2   , 4094, 3.965 , 4.609 , Cur , -0.64
3   , 4093, 3.966 , 4.598 , Cur , -0.63
4   , 4092, 3.959 , 4.604 , Cur , -0.65
5   , 4091, 3.971 , 4.623 , Cur , -0.65
6   , 4090, 3.956 , 4.591 , Cur , -0.64
7   , 4089, 3.958 , 4.594 , Cur , -0.64
8   , 4088, 3.959 , 4.607 , Cur , -0.65
9   , 4087, 3.959 , 4.605 , Cur , -0.65
10  , 4086, 3.957 , 4.6   , Cur , -0.64
11  , 4085, 3.958 , 4.599 , Cur , -0.64
12  , 4084, 3.955 , 4.597 , Cur , -0.64
13  , 4083, 3.957 , 4.602 , Cur , -0.65
14  , 4082, 3.957 , 4.599 , Cur , -0.64
15  , 4081, 3.956 , 4.601 , Cur , -0.65
16  , 4080, 4.581 , 4.596 , Cur , -0.01
17  , 4079, 4.581 , 4.588 , Cur , -0.01
18  , 4078, 4.582 , 4.589 , Cur , -0.01
19  , 4077, 4.581 , 4.587 , Cur , -0.01
20  , 4076, 4.582 , 4.589 , Cur , -0.01
21  , 4075, 4.582 , 4.589 , Cur , -0.01
22  , 4074, 4.582 , 4.583 , Cur , -0.0
23  , 4073, 4.586 , 4.599 , Cur , -0.01
24  , 4072, 4.58  , 4.586 , Cur , -0.01
25  , 4071, 4.58  , 4.585 , Cur , -0.0
26  , 4070, 4.581 , 4.587 , Cur , -0.01
27  , 4069, 4.576 , 4.586 , Cur , -0.01
28  , 4068, 4.578 , 4.585 , Cur , -0.01
29  , 4067, 4.578 , 4.585 , Cur , -0.01
30  , 4066, 4.578 , 4.587 , Cur , -0.01
31  , 4065, 4.578 , 4.583 , Cur , -0.0
32  , 4064, 3.955 , 5.225 , Cur , -1.27
33  , 4063, 3.952 , 5.221 , Cur , -1.27
34  , 4062, 3.952 , 5.218 , Cur , -1.27
35  , 4061, 3.955 , 5.22  , Cur , -1.26
36  , 4060, 3.955 , 5.221 , Cur , -1.27
37  , 4059, 3.956 , 5.219 , Cur , -1.26
38  , 4058, 3.955 , 5.219 , Cur , -1.26
39  , 4057, 3.956 , 5.224 , Cur , -1.27
40  , 4056, 3.956 , 5.221 , Cur , -1.27
41  , 4055, 3.952 , 5.22  , Cur , -1.27
42  , 4054, 3.956 , 5.217 , Cur , -1.26
43  , 4053, 3.953 , 5.22  , Cur , -1.27
44  , 4052, 3.951 , 5.222 , Cur , -1.27
45  , 4051, 3.949 , 5.22  , Cur , -1.27
46  , 4050, 3.952 , 5.222 , Cur , -1.27
47  , 4049, 3.953 , 5.222 , Cur , -1.27
48  , 4048, 3.953 , 5.218 , Cur , -1.27
49  , 4047, 3.956 , 5.226 , Cur , -1.27
50  , 4046, 3.953 , 5.221 , Cur , -1.27
51  , 4045, 3.953 , 5.218 , Cur , -1.27
52  , 4044, 3.954 , 5.221 , Cur , -1.27
53  , 4043, 3.948 , 5.219 , Cur , -1.27
54  , 4042, 3.951 , 5.22  , Cur , -1.27
55  , 4041, 3.951 , 5.218 , Cur , -1.27
56  , 4040, 3.957 , 5.224 , Cur , -1.27
57  , 4039, 3.953 , 5.221 , Cur , -1.27
58  , 4038, 3.95  , 5.222 , Cur , -1.27
59  , 4037, 3.953 , 5.22  , Cur , -1.27
60  , 4036, 3.953 , 5.221 , Cur , -1.27
61  , 4035, 3.951 , 5.22  , Cur , -1.27
62  , 4034, 3.951 , 5.22  , Cur , -1.27
63  , 4033, 3.949 , 5.22  , Cur , -1.27

Results For Tigerlake memset-evex-erms
size, algn, Cur T , New T , Win , Dif 
1   , 0   , 4.043 , 3.424 , New , 0.62
2   , 0   , 4.599 , 3.423 , New , 1.18
4   , 0   , 4.591 , 3.422 , New , 1.17
8   , 0   , 4.641 , 3.424 , New , 1.22
16  , 0   , 3.989 , 3.423 , New , 0.57
1   , 1   , 3.971 , 3.365 , New , 0.61
2   , 2   , 4.585 , 3.357 , New , 1.23
3   , 3   , 4.583 , 3.362 , New , 1.22
3   , 0   , 4.584 , 3.359 , New , 1.22
4   , 4   , 4.583 , 3.358 , New , 1.23
5   , 5   , 4.58  , 3.356 , New , 1.22
5   , 0   , 4.581 , 3.359 , New , 1.22
6   , 6   , 4.583 , 3.357 , New , 1.23
6   , 0   , 4.582 , 3.357 , New , 1.22
7   , 7   , 4.581 , 3.357 , New , 1.22
7   , 0   , 4.581 , 3.359 , New , 1.22
8   , 8   , 4.586 , 3.356 , New , 1.23
9   , 9   , 4.584 , 3.358 , New , 1.23
9   , 0   , 4.585 , 3.355 , New , 1.23
10  , 10  , 4.582 , 3.354 , New , 1.23
10  , 0   , 4.582 , 3.358 , New , 1.22
11  , 11  , 4.585 , 3.358 , New , 1.23
11  , 0   , 4.582 , 3.356 , New , 1.23
12  , 12  , 4.582 , 3.357 , New , 1.22
12  , 0   , 4.582 , 3.358 , New , 1.22
13  , 13  , 4.583 , 3.357 , New , 1.23
13  , 0   , 4.582 , 3.357 , New , 1.22
14  , 14  , 4.582 , 3.356 , New , 1.23
14  , 0   , 4.582 , 3.36  , New , 1.22
15  , 15  , 4.582 , 3.355 , New , 1.23
15  , 0   , 4.582 , 3.357 , New , 1.22
16  , 16  , 3.972 , 3.368 , New , 0.6 
17  , 17  , 3.961 , 3.355 , New , 0.61
17  , 0   , 3.961 , 3.357 , New , 0.6 
18  , 18  , 3.959 , 3.359 , New , 0.6 
18  , 0   , 3.962 , 3.356 , New , 0.61
19  , 19  , 3.959 , 3.354 , New , 0.6 
19  , 0   , 3.962 , 3.355 , New , 0.61
20  , 20  , 3.959 , 3.355 , New , 0.6 
20  , 0   , 3.961 , 3.358 , New , 0.6 
21  , 21  , 3.961 , 3.356 , New , 0.6 
21  , 0   , 3.959 , 3.357 , New , 0.6 
22  , 22  , 3.959 , 3.354 , New , 0.6 
22  , 0   , 3.959 , 3.355 , New , 0.6 
23  , 23  , 3.958 , 3.355 , New , 0.6 
23  , 0   , 3.963 , 3.37  , New , 0.59
24  , 24  , 3.965 , 3.356 , New , 0.61
24  , 0   , 3.96  , 3.357 , New , 0.6 
25  , 25  , 3.96  , 3.356 , New , 0.6 
25  , 0   , 3.962 , 3.355 , New , 0.61
26  , 26  , 3.957 , 3.352 , New , 0.6 
26  , 0   , 3.961 , 3.353 , New , 0.61
27  , 27  , 3.959 , 3.355 , New , 0.6 
27  , 0   , 3.958 , 3.354 , New , 0.6 
28  , 28  , 3.959 , 3.356 , New , 0.6 
28  , 0   , 3.96  , 3.357 , New , 0.6 
29  , 29  , 3.959 , 3.357 , New , 0.6 
29  , 0   , 3.96  , 3.355 , New , 0.6 
30  , 30  , 3.96  , 3.354 , New , 0.61
30  , 0   , 3.957 , 3.355 , New , 0.6 
31  , 31  , 3.959 , 3.355 , New , 0.6 
31  , 0   , 3.96  , 3.356 , New , 0.6 
14  , 1   , 4.585 , 3.36  , New , 1.23
25  , 2   , 3.957 , 3.358 , New , 0.6 
1   , 4095, 3.964 , 4.088 , Cur , -0.12
2   , 4095, 32.221, 32.234, Cur , -0.01
3   , 4095, 17.237, 17.261, Cur , -0.02
4   , 4095, 32.223, 32.238, Cur , -0.02
5   , 4095, 17.241, 17.259, Cur , -0.02
6   , 4095, 17.237, 17.259, Cur , -0.02
7   , 4095, 17.239, 17.243, Cur , -0.0
8   , 4095, 32.226, 32.228, Cur , -0.0
9   , 4095, 17.241, 17.25 , Cur , -0.01
10  , 4095, 17.237, 17.26 , Cur , -0.02
11  , 4095, 17.238, 17.252, Cur , -0.01
12  , 4095, 17.238, 17.264, Cur , -0.03
13  , 4095, 17.237, 17.246, Cur , -0.01
14  , 4095, 17.238, 17.258, Cur , -0.02
15  , 4095, 17.237, 17.252, Cur , -0.02
16  , 4095, 32.221, 32.519, Cur , -0.3
17  , 4095, 17.24 , 17.26 , Cur , -0.02
18  , 4095, 17.243, 17.26 , Cur , -0.02
19  , 4095, 17.234, 17.245, Cur , -0.01
20  , 4095, 17.239, 17.257, Cur , -0.02
21  , 4095, 17.238, 17.245, Cur , -0.01
22  , 4095, 17.237, 17.249, Cur , -0.01
23  , 4095, 17.237, 17.246, Cur , -0.01
24  , 4095, 17.233, 17.241, Cur , -0.01
25  , 4095, 17.235, 17.248, Cur , -0.01
26  , 4095, 17.233, 17.24 , Cur , -0.01
27  , 4095, 17.236, 17.24 , Cur , -0.0
28  , 4095, 17.238, 17.241, Cur , -0.0
29  , 4095, 17.233, 17.247, Cur , -0.01
30  , 4095, 17.236, 17.244, Cur , -0.01
31  , 4095, 17.236, 17.249, Cur , -0.01
2   , 4094, 4.585 , 5.28  , Cur , -0.7
3   , 4093, 4.583 , 5.226 , Cur , -0.64
4   , 4092, 4.582 , 4.599 , Cur , -0.02
5   , 4091, 4.582 , 4.601 , Cur , -0.02
6   , 4090, 4.581 , 4.594 , Cur , -0.01
7   , 4089, 4.582 , 4.606 , Cur , -0.02
8   , 4088, 4.586 , 4.591 , Cur , -0.0
9   , 4087, 4.581 , 4.592 , Cur , -0.01
10  , 4086, 4.582 , 4.589 , Cur , -0.01
11  , 4085, 4.584 , 4.598 , Cur , -0.01
12  , 4084, 4.583 , 4.592 , Cur , -0.01
13  , 4083, 4.582 , 4.589 , Cur , -0.01
14  , 4082, 4.585 , 4.594 , Cur , -0.01
15  , 4081, 4.583 , 4.59  , Cur , -0.01
16  , 4080, 3.962 , 4.602 , Cur , -0.64
17  , 4079, 3.96  , 4.602 , Cur , -0.64
18  , 4078, 3.962 , 4.599 , Cur , -0.64
19  , 4077, 3.959 , 4.597 , Cur , -0.64
20  , 4076, 3.961 , 4.591 , Cur , -0.63
21  , 4075, 3.959 , 4.593 , Cur , -0.63
22  , 4074, 3.964 , 4.591 , Cur , -0.63
23  , 4073, 3.959 , 4.593 , Cur , -0.63
24  , 4072, 3.96  , 4.589 , Cur , -0.63
25  , 4071, 3.961 , 4.587 , Cur , -0.63
26  , 4070, 3.961 , 4.591 , Cur , -0.63
27  , 4069, 3.958 , 4.591 , Cur , -0.63
28  , 4068, 3.961 , 4.593 , Cur , -0.63
29  , 4067, 3.957 , 4.591 , Cur , -0.63
30  , 4066, 3.958 , 4.589 , Cur , -0.63
31  , 4065, 3.959 , 4.59  , Cur , -0.63

Results For Tigerlake memset-evex
size, algn, Cur T , New T , Win , Dif 
1   , 0   , 4.065 , 3.425 , New , 0.64
2   , 0   , 4.642 , 3.422 , New , 1.22
4   , 0   , 4.613 , 3.426 , New , 1.19
8   , 0   , 4.677 , 3.421 , New , 1.26
16  , 0   , 3.973 , 3.425 , New , 0.55
1   , 1   , 3.985 , 3.364 , New , 0.62
2   , 2   , 4.588 , 3.359 , New , 1.23
3   , 3   , 4.583 , 3.362 , New , 1.22
3   , 0   , 4.584 , 3.361 , New , 1.22
4   , 4   , 4.585 , 3.359 , New , 1.23
5   , 5   , 4.58  , 3.36  , New , 1.22
5   , 0   , 4.582 , 3.363 , New , 1.22
6   , 6   , 4.58  , 3.356 , New , 1.22
6   , 0   , 4.58  , 3.361 , New , 1.22
7   , 7   , 4.58  , 3.364 , New , 1.22
7   , 0   , 4.581 , 3.357 , New , 1.22
8   , 8   , 4.583 , 3.358 , New , 1.23
9   , 9   , 4.582 , 3.359 , New , 1.22
9   , 0   , 4.58  , 3.358 , New , 1.22
10  , 10  , 4.582 , 3.358 , New , 1.22
10  , 0   , 4.582 , 3.359 , New , 1.22
11  , 11  , 4.581 , 3.355 , New , 1.23
11  , 0   , 4.581 , 3.36  , New , 1.22
12  , 12  , 4.582 , 3.356 , New , 1.23
12  , 0   , 4.584 , 3.357 , New , 1.23
13  , 13  , 4.582 , 3.359 , New , 1.22
13  , 0   , 4.582 , 3.359 , New , 1.22
14  , 14  , 4.582 , 3.358 , New , 1.22
14  , 0   , 4.582 , 3.359 , New , 1.22
15  , 15  , 4.582 , 3.358 , New , 1.22
15  , 0   , 4.581 , 3.356 , New , 1.23
16  , 16  , 3.967 , 3.355 , New , 0.61
17  , 17  , 3.966 , 3.356 , New , 0.61
17  , 0   , 3.968 , 3.363 , New , 0.6 
18  , 18  , 3.967 , 3.356 , New , 0.61
18  , 0   , 3.966 , 3.361 , New , 0.6 
19  , 19  , 3.965 , 3.353 , New , 0.61
19  , 0   , 3.966 , 3.361 , New , 0.6 
20  , 20  , 3.964 , 3.355 , New , 0.61
20  , 0   , 3.965 , 3.357 , New , 0.61
21  , 21  , 3.966 , 3.357 , New , 0.61
21  , 0   , 3.963 , 3.357 , New , 0.61
22  , 22  , 3.964 , 3.355 , New , 0.61
22  , 0   , 3.966 , 3.359 , New , 0.61
23  , 23  , 3.967 , 3.355 , New , 0.61
23  , 0   , 3.965 , 3.356 , New , 0.61
24  , 24  , 3.964 , 3.355 , New , 0.61
24  , 0   , 3.962 , 3.359 , New , 0.6 
25  , 25  , 3.964 , 3.355 , New , 0.61
25  , 0   , 3.963 , 3.358 , New , 0.6 
26  , 26  , 3.967 , 3.352 , New , 0.62
26  , 0   , 3.962 , 3.357 , New , 0.6 
27  , 27  , 3.964 , 3.355 , New , 0.61
27  , 0   , 3.964 , 3.356 , New , 0.61
28  , 28  , 3.967 , 3.353 , New , 0.61
28  , 0   , 3.966 , 3.359 , New , 0.61
29  , 29  , 3.963 , 3.355 , New , 0.61
29  , 0   , 3.963 , 3.357 , New , 0.61
30  , 30  , 3.964 , 3.354 , New , 0.61
30  , 0   , 3.963 , 3.359 , New , 0.6 
31  , 31  , 3.963 , 3.355 , New , 0.61
31  , 0   , 3.964 , 3.356 , New , 0.61
14  , 1   , 4.582 , 3.359 , New , 1.22
25  , 2   , 3.962 , 3.369 , New , 0.59
1   , 4095, 3.964 , 4.099 , Cur , -0.14
2   , 4095, 32.236, 32.482, Cur , -0.25
3   , 4095, 17.238, 17.259, Cur , -0.02
4   , 4095, 32.22 , 32.457, Cur , -0.24
5   , 4095, 17.24 , 17.269, Cur , -0.03
6   , 4095, 17.238, 17.254, Cur , -0.02
7   , 4095, 17.239, 17.251, Cur , -0.01
8   , 4095, 32.222, 32.243, Cur , -0.02
9   , 4095, 17.237, 17.257, Cur , -0.02
10  , 4095, 17.239, 17.248, Cur , -0.01
11  , 4095, 17.241, 17.251, Cur , -0.01
12  , 4095, 17.238, 17.257, Cur , -0.02
13  , 4095, 17.239, 17.255, Cur , -0.02
14  , 4095, 17.238, 17.256, Cur , -0.02
15  , 4095, 17.236, 17.259, Cur , -0.02
16  , 4095, 32.22 , 32.263, Cur , -0.04
17  , 4095, 17.235, 17.254, Cur , -0.02
18  , 4095, 17.238, 17.25 , Cur , -0.01
19  , 4095, 17.238, 17.247, Cur , -0.01
20  , 4095, 17.238, 17.259, Cur , -0.02
21  , 4095, 17.236, 17.251, Cur , -0.02
22  , 4095, 17.237, 17.245, Cur , -0.01
23  , 4095, 17.238, 17.244, Cur , -0.01
24  , 4095, 17.235, 17.252, Cur , -0.02
25  , 4095, 17.237, 17.245, Cur , -0.01
26  , 4095, 17.236, 17.243, Cur , -0.01
27  , 4095, 17.234, 17.243, Cur , -0.01
28  , 4095, 17.234, 17.242, Cur , -0.01
29  , 4095, 17.233, 17.244, Cur , -0.01
30  , 4095, 17.236, 17.246, Cur , -0.01
31  , 4095, 17.235, 17.246, Cur , -0.01
2   , 4094, 4.584 , 5.283 , Cur , -0.7
3   , 4093, 4.584 , 5.231 , Cur , -0.65
4   , 4092, 4.581 , 4.609 , Cur , -0.03
5   , 4091, 4.583 , 4.599 , Cur , -0.02
6   , 4090, 4.581 , 4.597 , Cur , -0.02
7   , 4089, 4.578 , 4.591 , Cur , -0.01
8   , 4088, 4.584 , 4.597 , Cur , -0.01
9   , 4087, 4.583 , 4.591 , Cur , -0.01
10  , 4086, 4.581 , 4.593 , Cur , -0.01
11  , 4085, 4.581 , 4.596 , Cur , -0.01
12  , 4084, 4.58  , 4.59  , Cur , -0.01
13  , 4083, 4.582 , 4.59  , Cur , -0.01
14  , 4082, 4.581 , 4.601 , Cur , -0.02
15  , 4081, 4.581 , 4.591 , Cur , -0.01
16  , 4080, 3.969 , 4.614 , Cur , -0.65
17  , 4079, 3.968 , 4.602 , Cur , -0.63
18  , 4078, 3.965 , 4.601 , Cur , -0.64
19  , 4077, 3.964 , 4.602 , Cur , -0.64
20  , 4076, 3.968 , 4.6   , Cur , -0.63
21  , 4075, 3.965 , 4.599 , Cur , -0.63
22  , 4074, 3.965 , 4.604 , Cur , -0.64
23  , 4073, 3.967 , 4.601 , Cur , -0.63
24  , 4072, 3.963 , 4.6   , Cur , -0.64
25  , 4071, 3.965 , 4.601 , Cur , -0.64
26  , 4070, 3.964 , 4.597 , Cur , -0.63
27  , 4069, 3.967 , 4.598 , Cur , -0.63
28  , 4068, 3.964 , 4.595 , Cur , -0.63
29  , 4067, 3.964 , 4.598 , Cur , -0.63
30  , 4066, 3.966 , 4.596 , Cur , -0.63
31  , 4065, 3.966 , 4.597 , Cur , -0.63

Results For Tigerlake memset-avx512-erms
size, algn, Cur T , New T , Win , Dif 
1   , 0   , 4.071 , 4.065 , New , 0.01
2   , 0   , 4.014 , 4.064 , Cur , -0.05
4   , 0   , 3.974 , 4.063 , Cur , -0.09
8   , 0   , 4.01  , 4.059 , Cur , -0.05
16  , 0   , 4.596 , 4.061 , New , 0.54
32  , 0   , 3.964 , 3.987 , Cur , -0.02
1   , 1   , 3.989 , 3.981 , New , 0.01
2   , 2   , 3.967 , 3.981 , Cur , -0.01
3   , 3   , 3.961 , 3.984 , Cur , -0.02
3   , 0   , 3.958 , 3.993 , Cur , -0.03
4   , 4   , 3.96  , 3.98  , Cur , -0.02
5   , 5   , 3.958 , 3.981 , Cur , -0.02
5   , 0   , 3.959 , 3.993 , Cur , -0.03
6   , 6   , 3.956 , 3.979 , Cur , -0.02
6   , 0   , 3.956 , 3.985 , Cur , -0.03
7   , 7   , 3.958 , 3.979 , Cur , -0.02
7   , 0   , 3.957 , 3.984 , Cur , -0.03
8   , 8   , 3.963 , 3.984 , Cur , -0.02
9   , 9   , 3.959 , 3.985 , Cur , -0.03
9   , 0   , 3.958 , 3.989 , Cur , -0.03
10  , 10  , 3.958 , 3.982 , Cur , -0.02
10  , 0   , 3.956 , 3.984 , Cur , -0.03
11  , 11  , 3.954 , 3.981 , Cur , -0.03
11  , 0   , 3.954 , 3.985 , Cur , -0.03
12  , 12  , 3.956 , 3.979 , Cur , -0.02
12  , 0   , 3.956 , 3.984 , Cur , -0.03
13  , 13  , 3.958 , 3.979 , Cur , -0.02
13  , 0   , 3.958 , 3.986 , Cur , -0.03
14  , 14  , 3.957 , 3.98  , Cur , -0.02
14  , 0   , 3.959 , 3.993 , Cur , -0.03
15  , 15  , 3.958 , 3.981 , Cur , -0.02
15  , 0   , 3.957 , 3.98  , Cur , -0.02
16  , 16  , 4.581 , 3.978 , New , 0.6 
17  , 17  , 4.579 , 3.985 , New , 0.59
17  , 0   , 4.58  , 3.983 , New , 0.6 
18  , 18  , 4.581 , 3.981 , New , 0.6 
18  , 0   , 4.581 , 3.982 , New , 0.6 
19  , 19  , 4.582 , 3.98  , New , 0.6 
19  , 0   , 4.583 , 3.982 , New , 0.6 
20  , 20  , 4.581 , 3.979 , New , 0.6 
20  , 0   , 4.581 , 3.982 , New , 0.6 
21  , 21  , 4.58  , 3.979 , New , 0.6 
21  , 0   , 4.578 , 3.985 , New , 0.59
22  , 22  , 4.579 , 3.979 , New , 0.6 
22  , 0   , 4.581 , 3.981 , New , 0.6 
23  , 23  , 4.582 , 3.979 , New , 0.6 
23  , 0   , 4.582 , 3.976 , New , 0.61
24  , 24  , 4.579 , 3.98  , New , 0.6 
24  , 0   , 4.58  , 3.982 , New , 0.6 
25  , 25  , 4.58  , 3.978 , New , 0.6 
25  , 0   , 4.578 , 3.978 , New , 0.6 
26  , 26  , 4.576 , 3.978 , New , 0.6 
26  , 0   , 4.578 , 3.982 , New , 0.6 
27  , 27  , 4.581 , 3.978 , New , 0.6 
27  , 0   , 4.581 , 3.982 , New , 0.6 
28  , 28  , 4.58  , 3.977 , New , 0.6 
28  , 0   , 4.579 , 3.978 , New , 0.6 
29  , 29  , 4.58  , 3.979 , New , 0.6 
29  , 0   , 4.579 , 3.979 , New , 0.6 
30  , 30  , 4.579 , 3.976 , New , 0.6 
30  , 0   , 4.58  , 3.983 , New , 0.6 
31  , 31  , 4.579 , 3.976 , New , 0.6 
31  , 0   , 4.58  , 3.978 , New , 0.6 
32  , 32  , 3.957 , 3.977 , Cur , -0.02
33  , 33  , 3.957 , 3.978 , Cur , -0.02
33  , 0   , 3.954 , 3.979 , Cur , -0.02
34  , 34  , 3.955 , 3.979 , Cur , -0.02
34  , 0   , 3.956 , 3.978 , Cur , -0.02
35  , 35  , 3.955 , 3.977 , Cur , -0.02
35  , 0   , 3.956 , 3.981 , Cur , -0.02
36  , 36  , 3.954 , 3.977 , Cur , -0.02
36  , 0   , 3.955 , 3.979 , Cur , -0.02
37  , 37  , 3.955 , 3.982 , Cur , -0.03
37  , 0   , 3.954 , 3.978 , Cur , -0.02
38  , 38  , 3.952 , 3.976 , Cur , -0.02
38  , 0   , 3.956 , 3.979 , Cur , -0.02
39  , 39  , 3.952 , 3.977 , Cur , -0.02
39  , 0   , 3.954 , 3.978 , Cur , -0.02
40  , 40  , 3.953 , 3.977 , Cur , -0.02
40  , 0   , 3.956 , 3.978 , Cur , -0.02
41  , 41  , 3.955 , 3.979 , Cur , -0.02
41  , 0   , 3.956 , 3.979 , Cur , -0.02
42  , 42  , 3.955 , 3.978 , Cur , -0.02
42  , 0   , 3.957 , 3.978 , Cur , -0.02
43  , 43  , 3.956 , 3.978 , Cur , -0.02
43  , 0   , 3.952 , 3.977 , Cur , -0.02
44  , 44  , 3.954 , 3.976 , Cur , -0.02
44  , 0   , 3.952 , 3.979 , Cur , -0.03
45  , 45  , 3.952 , 3.979 , Cur , -0.03
45  , 0   , 3.953 , 3.979 , Cur , -0.03
46  , 46  , 3.954 , 3.979 , Cur , -0.02
46  , 0   , 3.95  , 3.98  , Cur , -0.03
47  , 47  , 3.952 , 3.979 , Cur , -0.03
47  , 0   , 3.951 , 3.979 , Cur , -0.03
48  , 48  , 3.951 , 3.977 , Cur , -0.03
48  , 0   , 3.952 , 3.979 , Cur , -0.03
49  , 49  , 3.954 , 3.979 , Cur , -0.02
49  , 0   , 3.951 , 3.976 , Cur , -0.02
50  , 50  , 3.95  , 3.977 , Cur , -0.03
50  , 0   , 3.952 , 3.977 , Cur , -0.02
51  , 51  , 3.954 , 3.979 , Cur , -0.02
51  , 0   , 3.953 , 3.977 , Cur , -0.02
52  , 52  , 3.953 , 3.978 , Cur , -0.03
52  , 0   , 3.953 , 3.977 , Cur , -0.02
53  , 53  , 3.952 , 3.976 , Cur , -0.02
53  , 0   , 3.953 , 3.979 , Cur , -0.03
54  , 54  , 3.954 , 3.976 , Cur , -0.02
54  , 0   , 3.953 , 3.977 , Cur , -0.02
55  , 55  , 3.952 , 3.978 , Cur , -0.03
55  , 0   , 3.954 , 3.975 , Cur , -0.02
56  , 56  , 3.955 , 3.978 , Cur , -0.02
56  , 0   , 3.951 , 3.979 , Cur , -0.03
57  , 57  , 3.953 , 3.978 , Cur , -0.03
57  , 0   , 3.951 , 3.976 , Cur , -0.02
58  , 58  , 3.953 , 3.978 , Cur , -0.03
58  , 0   , 3.954 , 3.98  , Cur , -0.03
59  , 59  , 3.952 , 3.976 , Cur , -0.02
59  , 0   , 3.952 , 3.977 , Cur , -0.02
60  , 60  , 3.955 , 3.979 , Cur , -0.02
60  , 0   , 3.953 , 3.977 , Cur , -0.02
61  , 61  , 3.952 , 3.979 , Cur , -0.03
61  , 0   , 3.951 , 3.978 , Cur , -0.03
62  , 62  , 3.949 , 3.98  , Cur , -0.03
62  , 0   , 3.949 , 3.979 , Cur , -0.03
63  , 63  , 3.951 , 3.977 , Cur , -0.03
63  , 0   , 3.949 , 3.977 , Cur , -0.03
14  , 1   , 3.968 , 3.979 , Cur , -0.01
25  , 2   , 4.592 , 3.978 , New , 0.61
1   , 4095, 10.829, 11.313, Cur , -0.48
2   , 4095, 32.223, 32.39 , Cur , -0.17
3   , 4095, 17.239, 17.256, Cur , -0.02
4   , 4095, 32.225, 32.244, Cur , -0.02
5   , 4095, 17.239, 17.272, Cur , -0.03
6   , 4095, 17.24 , 17.258, Cur , -0.02
7   , 4095, 17.242, 17.265, Cur , -0.02
8   , 4095, 32.227, 32.434, Cur , -0.21
9   , 4095, 17.239, 17.254, Cur , -0.02
10  , 4095, 17.238, 17.262, Cur , -0.02
11  , 4095, 17.238, 17.258, Cur , -0.02
12  , 4095, 17.241, 17.445, Cur , -0.2
13  , 4095, 17.238, 17.438, Cur , -0.2
14  , 4095, 17.238, 17.245, Cur , -0.01
15  , 4095, 17.24 , 17.25 , Cur , -0.01
16  , 4095, 32.22 , 32.599, Cur , -0.38
17  , 4095, 17.236, 17.251, Cur , -0.02
18  , 4095, 17.237, 17.258, Cur , -0.02
19  , 4095, 17.237, 17.255, Cur , -0.02
20  , 4095, 17.237, 17.248, Cur , -0.01
21  , 4095, 17.235, 17.256, Cur , -0.02
22  , 4095, 17.235, 17.245, Cur , -0.01
23  , 4095, 17.234, 17.257, Cur , -0.02
24  , 4095, 17.234, 17.251, Cur , -0.02
25  , 4095, 17.235, 17.252, Cur , -0.02
26  , 4095, 17.232, 17.244, Cur , -0.01
27  , 4095, 17.237, 17.244, Cur , -0.01
28  , 4095, 17.236, 17.25 , Cur , -0.01
29  , 4095, 17.236, 17.245, Cur , -0.01
30  , 4095, 17.235, 17.25 , Cur , -0.02
31  , 4095, 17.237, 17.237, Eq  , 0.0 
32  , 4095, 32.218, 32.226, Cur , -0.01
33  , 4095, 17.234, 17.239, Cur , -0.0
34  , 4095, 17.232, 17.248, Cur , -0.02
35  , 4095, 17.234, 17.249, Cur , -0.01
36  , 4095, 17.234, 17.241, Cur , -0.01
37  , 4095, 17.235, 17.246, Cur , -0.01
38  , 4095, 17.234, 17.247, Cur , -0.01
39  , 4095, 17.231, 17.242, Cur , -0.01
40  , 4095, 17.234, 17.245, Cur , -0.01
41  , 4095, 17.237, 17.25 , Cur , -0.01
42  , 4095, 17.233, 17.243, Cur , -0.01
43  , 4095, 17.234, 17.242, Cur , -0.01
44  , 4095, 17.234, 17.242, Cur , -0.01
45  , 4095, 17.234, 17.244, Cur , -0.01
46  , 4095, 17.231, 17.257, Cur , -0.03
47  , 4095, 17.236, 17.247, Cur , -0.01
48  , 4095, 17.234, 17.24 , Cur , -0.01
49  , 4095, 17.235, 17.244, Cur , -0.01
50  , 4095, 17.233, 17.245, Cur , -0.01
51  , 4095, 17.236, 17.25 , Cur , -0.01
52  , 4095, 17.232, 17.242, Cur , -0.01
53  , 4095, 17.233, 17.253, Cur , -0.02
54  , 4095, 17.233, 17.245, Cur , -0.01
55  , 4095, 17.234, 17.245, Cur , -0.01
56  , 4095, 17.235, 17.246, Cur , -0.01
57  , 4095, 17.234, 17.241, Cur , -0.01
58  , 4095, 17.234, 17.256, Cur , -0.02
59  , 4095, 17.235, 17.248, Cur , -0.01
60  , 4095, 17.235, 17.244, Cur , -0.01
61  , 4095, 17.23 , 17.24 , Cur , -0.01
62  , 4095, 17.232, 17.244, Cur , -0.01
63  , 4095, 17.231, 17.242, Cur , -0.01
2   , 4094, 10.475, 11.063, Cur , -0.59
3   , 4093, 10.867, 11.242, Cur , -0.38
4   , 4092, 10.614, 10.98 , Cur , -0.37
5   , 4091, 10.893, 11.457, Cur , -0.56
6   , 4090, 10.588, 11.216, Cur , -0.63
7   , 4089, 10.969, 11.24 , Cur , -0.27
8   , 4088, 10.359, 11.251, Cur , -0.89
9   , 4087, 10.532, 11.642, Cur , -1.11
10  , 4086, 10.402, 11.17 , Cur , -0.77
11  , 4085, 10.231, 11.153, Cur , -0.92
12  , 4084, 10.335, 10.755, Cur , -0.42
13  , 4083, 10.849, 11.2  , Cur , -0.35
14  , 4082, 10.575, 11.34 , Cur , -0.77
15  , 4081, 10.523, 10.993, Cur , -0.47
16  , 4080, 4.581 , 4.594 , Cur , -0.01
17  , 4079, 4.582 , 4.588 , Cur , -0.01
18  , 4078, 4.582 , 4.591 , Cur , -0.01
19  , 4077, 4.582 , 4.59  , Cur , -0.01
20  , 4076, 4.582 , 4.587 , Cur , -0.0
21  , 4075, 4.58  , 4.587 , Cur , -0.01
22  , 4074, 4.582 , 4.585 , Cur , -0.0
23  , 4073, 4.583 , 4.588 , Cur , -0.0
24  , 4072, 4.58  , 4.588 , Cur , -0.01
25  , 4071, 4.579 , 4.586 , Cur , -0.01
26  , 4070, 4.58  , 4.584 , Cur , -0.0
27  , 4069, 4.582 , 4.585 , Cur , -0.0
28  , 4068, 4.579 , 4.585 , Cur , -0.01
29  , 4067, 4.58  , 4.588 , Cur , -0.01
30  , 4066, 4.578 , 4.588 , Cur , -0.01
31  , 4065, 4.58  , 4.587 , Cur , -0.01
32  , 4064, 3.958 , 5.222 , Cur , -1.26
33  , 4063, 3.956 , 5.219 , Cur , -1.26
34  , 4062, 3.957 , 5.22  , Cur , -1.26
35  , 4061, 3.955 , 5.223 , Cur , -1.27
36  , 4060, 3.955 , 5.222 , Cur , -1.27
37  , 4059, 3.954 , 5.222 , Cur , -1.27
38  , 4058, 3.955 , 5.219 , Cur , -1.26
39  , 4057, 3.954 , 5.222 , Cur , -1.27
40  , 4056, 3.952 , 5.218 , Cur , -1.27
41  , 4055, 3.957 , 5.219 , Cur , -1.26
42  , 4054, 3.953 , 5.22  , Cur , -1.27
43  , 4053, 3.954 , 5.215 , Cur , -1.26
44  , 4052, 3.953 , 5.222 , Cur , -1.27
45  , 4051, 3.954 , 5.221 , Cur , -1.27
46  , 4050, 3.955 , 5.218 , Cur , -1.26
47  , 4049, 3.953 , 5.221 , Cur , -1.27
48  , 4048, 3.95  , 5.219 , Cur , -1.27
49  , 4047, 3.957 , 5.22  , Cur , -1.26
50  , 4046, 3.953 , 5.217 , Cur , -1.26
51  , 4045, 3.951 , 5.218 , Cur , -1.27
52  , 4044, 3.951 , 5.216 , Cur , -1.27
53  , 4043, 3.956 , 5.223 , Cur , -1.27
54  , 4042, 3.953 , 5.217 , Cur , -1.26
55  , 4041, 3.953 , 5.219 , Cur , -1.27
56  , 4040, 3.953 , 5.221 , Cur , -1.27
57  , 4039, 3.953 , 5.219 , Cur , -1.27
58  , 4038, 3.953 , 5.212 , Cur , -1.26
59  , 4037, 3.954 , 5.216 , Cur , -1.26
60  , 4036, 3.953 , 5.214 , Cur , -1.26
61  , 4035, 3.955 , 5.218 , Cur , -1.26
62  , 4034, 3.953 , 5.22  , Cur , -1.27
63  , 4033, 3.95  , 5.218 , Cur , -1.27

Results For Tigerlake memset-avx512
size, algn, Cur T , New T , Win , Dif 
1   , 0   , 4.043 , 4.066 , Cur , -0.02
2   , 0   , 4.038 , 4.06  , Cur , -0.02
4   , 0   , 4.006 , 4.059 , Cur , -0.05
8   , 0   , 4.006 , 4.06  , Cur , -0.05
16  , 0   , 4.637 , 4.062 , New , 0.57
32  , 0   , 3.956 , 3.992 , Cur , -0.04
1   , 1   , 3.974 , 3.989 , Cur , -0.01
2   , 2   , 3.965 , 3.991 , Cur , -0.03
3   , 3   , 3.965 , 3.99  , Cur , -0.03
3   , 0   , 3.962 , 3.985 , Cur , -0.02
4   , 4   , 3.96  , 3.982 , Cur , -0.02
5   , 5   , 3.957 , 3.986 , Cur , -0.03
5   , 0   , 3.957 , 3.982 , Cur , -0.03
6   , 6   , 3.959 , 3.98  , Cur , -0.02
6   , 0   , 3.956 , 3.982 , Cur , -0.03
7   , 7   , 3.958 , 3.981 , Cur , -0.02
7   , 0   , 3.958 , 3.985 , Cur , -0.03
8   , 8   , 3.961 , 3.981 , Cur , -0.02
9   , 9   , 3.957 , 3.983 , Cur , -0.03
9   , 0   , 3.956 , 3.981 , Cur , -0.02
10  , 10  , 3.956 , 3.985 , Cur , -0.03
10  , 0   , 3.956 , 3.984 , Cur , -0.03
11  , 11  , 3.957 , 3.983 , Cur , -0.03
11  , 0   , 3.954 , 3.986 , Cur , -0.03
12  , 12  , 3.956 , 3.989 , Cur , -0.03
12  , 0   , 3.955 , 3.98  , Cur , -0.02
13  , 13  , 3.96  , 3.994 , Cur , -0.03
13  , 0   , 3.956 , 3.987 , Cur , -0.03
14  , 14  , 3.956 , 3.981 , Cur , -0.02
14  , 0   , 3.96  , 3.996 , Cur , -0.04
15  , 15  , 3.956 , 3.986 , Cur , -0.03
15  , 0   , 3.955 , 3.983 , Cur , -0.03
16  , 16  , 4.582 , 3.98  , New , 0.6 
17  , 17  , 4.581 , 3.986 , New , 0.6 
17  , 0   , 4.583 , 3.982 , New , 0.6 
18  , 18  , 4.578 , 3.984 , New , 0.59
18  , 0   , 4.582 , 3.983 , New , 0.6 
19  , 19  , 4.58  , 3.979 , New , 0.6 
19  , 0   , 4.577 , 3.982 , New , 0.59
20  , 20  , 4.581 , 3.979 , New , 0.6 
20  , 0   , 4.58  , 3.981 , New , 0.6 
21  , 21  , 4.581 , 3.982 , New , 0.6 
21  , 0   , 4.581 , 3.98  , New , 0.6 
22  , 22  , 4.581 , 3.976 , New , 0.61
22  , 0   , 4.579 , 3.987 , New , 0.59
23  , 23  , 4.579 , 3.979 , New , 0.6 
23  , 0   , 4.577 , 3.981 , New , 0.6 
24  , 24  , 4.581 , 3.977 , New , 0.6 
24  , 0   , 4.579 , 3.981 , New , 0.6 
25  , 25  , 4.578 , 3.979 , New , 0.6 
25  , 0   , 4.581 , 3.979 , New , 0.6 
26  , 26  , 4.578 , 3.981 , New , 0.6 
26  , 0   , 4.58  , 3.978 , New , 0.6 
27  , 27  , 4.576 , 3.979 , New , 0.6 
27  , 0   , 4.577 , 3.979 , New , 0.6 
28  , 28  , 4.577 , 3.977 , New , 0.6 
28  , 0   , 4.577 , 3.978 , New , 0.6 
29  , 29  , 4.577 , 3.977 , New , 0.6 
29  , 0   , 4.579 , 3.979 , New , 0.6 
30  , 30  , 4.578 , 3.977 , New , 0.6 
30  , 0   , 4.581 , 3.988 , New , 0.59
31  , 31  , 4.58  , 3.979 , New , 0.6 
31  , 0   , 4.578 , 3.979 , New , 0.6 
32  , 32  , 3.955 , 3.978 , Cur , -0.02
33  , 33  , 3.956 , 3.979 , Cur , -0.02
33  , 0   , 3.956 , 3.978 , Cur , -0.02
34  , 34  , 3.953 , 3.982 , Cur , -0.03
34  , 0   , 3.956 , 3.978 , Cur , -0.02
35  , 35  , 3.954 , 3.979 , Cur , -0.02
35  , 0   , 3.952 , 3.977 , Cur , -0.02
36  , 36  , 3.957 , 3.978 , Cur , -0.02
36  , 0   , 3.955 , 3.979 , Cur , -0.02
37  , 37  , 3.951 , 3.979 , Cur , -0.03
37  , 0   , 3.955 , 3.977 , Cur , -0.02
38  , 38  , 3.956 , 3.979 , Cur , -0.02
38  , 0   , 3.953 , 3.98  , Cur , -0.03
39  , 39  , 3.957 , 3.979 , Cur , -0.02
39  , 0   , 3.957 , 3.985 , Cur , -0.03
40  , 40  , 3.953 , 3.981 , Cur , -0.03
40  , 0   , 3.954 , 3.978 , Cur , -0.02
41  , 41  , 3.952 , 3.978 , Cur , -0.03
41  , 0   , 3.952 , 3.98  , Cur , -0.03
42  , 42  , 3.951 , 3.982 , Cur , -0.03
42  , 0   , 3.952 , 3.978 , Cur , -0.03
43  , 43  , 3.95  , 3.986 , Cur , -0.04
43  , 0   , 3.952 , 3.979 , Cur , -0.03
44  , 44  , 3.952 , 3.979 , Cur , -0.03
44  , 0   , 3.952 , 3.98  , Cur , -0.03
45  , 45  , 3.951 , 3.979 , Cur , -0.03
45  , 0   , 3.954 , 3.978 , Cur , -0.02
46  , 46  , 3.951 , 3.982 , Cur , -0.03
46  , 0   , 3.953 , 3.978 , Cur , -0.03
47  , 47  , 3.953 , 3.978 , Cur , -0.03
47  , 0   , 3.951 , 3.986 , Cur , -0.04
48  , 48  , 3.953 , 3.979 , Cur , -0.03
48  , 0   , 3.953 , 3.977 , Cur , -0.02
49  , 49  , 3.952 , 3.976 , Cur , -0.02
49  , 0   , 3.953 , 3.979 , Cur , -0.03
50  , 50  , 3.95  , 3.98  , Cur , -0.03
50  , 0   , 3.955 , 3.978 , Cur , -0.02
51  , 51  , 3.951 , 3.978 , Cur , -0.03
51  , 0   , 3.953 , 3.979 , Cur , -0.03
52  , 52  , 3.952 , 3.977 , Cur , -0.02
52  , 0   , 3.953 , 3.98  , Cur , -0.03
53  , 53  , 3.955 , 3.977 , Cur , -0.02
53  , 0   , 3.951 , 3.979 , Cur , -0.03
54  , 54  , 3.951 , 3.978 , Cur , -0.03
54  , 0   , 3.952 , 3.978 , Cur , -0.03
55  , 55  , 3.954 , 3.98  , Cur , -0.03
55  , 0   , 3.952 , 3.98  , Cur , -0.03
56  , 56  , 3.951 , 3.976 , Cur , -0.02
56  , 0   , 3.954 , 3.982 , Cur , -0.03
57  , 57  , 3.952 , 3.979 , Cur , -0.03
57  , 0   , 3.953 , 3.98  , Cur , -0.03
58  , 58  , 3.952 , 3.979 , Cur , -0.03
58  , 0   , 3.955 , 3.98  , Cur , -0.02
59  , 59  , 3.951 , 3.976 , Cur , -0.02
59  , 0   , 3.953 , 3.979 , Cur , -0.03
60  , 60  , 3.953 , 3.976 , Cur , -0.02
60  , 0   , 3.952 , 3.979 , Cur , -0.03
61  , 61  , 3.951 , 3.98  , Cur , -0.03
61  , 0   , 3.953 , 3.977 , Cur , -0.02
62  , 62  , 3.951 , 3.978 , Cur , -0.03
62  , 0   , 3.95  , 3.979 , Cur , -0.03
63  , 63  , 3.952 , 3.977 , Cur , -0.02
63  , 0   , 3.948 , 3.977 , Cur , -0.03
14  , 1   , 3.959 , 3.98  , Cur , -0.02
25  , 2   , 4.577 , 3.979 , New , 0.6 
1   , 4095, 3.967 , 4.639 , Cur , -0.67
2   , 4095, 32.223, 32.253, Cur , -0.03
3   , 4095, 17.245, 17.277, Cur , -0.03
4   , 4095, 32.219, 32.387, Cur , -0.17
5   , 4095, 17.24 , 17.256, Cur , -0.02
6   , 4095, 17.239, 17.256, Cur , -0.02
7   , 4095, 17.241, 17.407, Cur , -0.17
8   , 4095, 32.222, 32.44 , Cur , -0.22
9   , 4095, 17.238, 17.246, Cur , -0.01
10  , 4095, 17.239, 17.257, Cur , -0.02
11  , 4095, 17.238, 17.261, Cur , -0.02
12  , 4095, 17.236, 17.441, Cur , -0.2
13  , 4095, 17.24 , 17.492, Cur , -0.25
14  , 4095, 17.238, 17.256, Cur , -0.02
15  , 4095, 17.236, 17.264, Cur , -0.03
16  , 4095, 32.22 , 32.24 , Cur , -0.02
17  , 4095, 17.235, 17.445, Cur , -0.21
18  , 4095, 17.236, 17.249, Cur , -0.01
19  , 4095, 17.236, 17.243, Cur , -0.01
20  , 4095, 17.238, 17.244, Cur , -0.01
21  , 4095, 17.237, 17.256, Cur , -0.02
22  , 4095, 17.239, 17.252, Cur , -0.01
23  , 4095, 17.238, 17.247, Cur , -0.01
24  , 4095, 17.236, 17.241, Cur , -0.0
25  , 4095, 17.236, 17.241, Cur , -0.0
26  , 4095, 17.236, 17.246, Cur , -0.01
27  , 4095, 17.238, 17.247, Cur , -0.01
28  , 4095, 17.232, 17.242, Cur , -0.01
29  , 4095, 17.233, 17.249, Cur , -0.02
30  , 4095, 17.236, 17.241, Cur , -0.0
31  , 4095, 17.237, 17.244, Cur , -0.01
32  , 4095, 32.219, 32.236, Cur , -0.02
33  , 4095, 17.236, 17.245, Cur , -0.01
34  , 4095, 17.234, 17.244, Cur , -0.01
35  , 4095, 17.236, 17.248, Cur , -0.01
36  , 4095, 17.233, 17.24 , Cur , -0.01
37  , 4095, 17.233, 17.241, Cur , -0.01
38  , 4095, 17.236, 17.24 , Cur , -0.0
39  , 4095, 17.236, 17.24 , Cur , -0.0
40  , 4095, 17.238, 17.244, Cur , -0.01
41  , 4095, 17.238, 17.244, Cur , -0.01
42  , 4095, 17.236, 17.243, Cur , -0.01
43  , 4095, 17.234, 17.241, Cur , -0.01
44  , 4095, 17.235, 17.241, Cur , -0.01
45  , 4095, 17.235, 17.246, Cur , -0.01
46  , 4095, 17.234, 17.253, Cur , -0.02
47  , 4095, 17.232, 17.239, Cur , -0.01
48  , 4095, 17.235, 17.253, Cur , -0.02
49  , 4095, 17.232, 17.242, Cur , -0.01
50  , 4095, 17.232, 17.248, Cur , -0.02
51  , 4095, 17.233, 17.249, Cur , -0.02
52  , 4095, 17.232, 17.244, Cur , -0.01
53  , 4095, 17.231, 17.242, Cur , -0.01
54  , 4095, 17.234, 17.239, Cur , -0.0
55  , 4095, 17.234, 17.245, Cur , -0.01
56  , 4095, 17.232, 17.241, Cur , -0.01
57  , 4095, 17.232, 17.243, Cur , -0.01
58  , 4095, 17.235, 17.242, Cur , -0.01
59  , 4095, 17.232, 17.246, Cur , -0.01
60  , 4095, 17.234, 17.24 , Cur , -0.01
61  , 4095, 17.232, 17.242, Cur , -0.01
62  , 4095, 17.233, 17.244, Cur , -0.01
63  , 4095, 17.235, 17.244, Cur , -0.01
2   , 4094, 3.965 , 4.609 , Cur , -0.64
3   , 4093, 3.966 , 4.598 , Cur , -0.63
4   , 4092, 3.959 , 4.604 , Cur , -0.65
5   , 4091, 3.971 , 4.623 , Cur , -0.65
6   , 4090, 3.956 , 4.591 , Cur , -0.64
7   , 4089, 3.958 , 4.594 , Cur , -0.64
8   , 4088, 3.959 , 4.607 , Cur , -0.65
9   , 4087, 3.959 , 4.605 , Cur , -0.65
10  , 4086, 3.957 , 4.6   , Cur , -0.64
11  , 4085, 3.958 , 4.599 , Cur , -0.64
12  , 4084, 3.955 , 4.597 , Cur , -0.64
13  , 4083, 3.957 , 4.602 , Cur , -0.65
14  , 4082, 3.957 , 4.599 , Cur , -0.64
15  , 4081, 3.956 , 4.601 , Cur , -0.65
16  , 4080, 4.581 , 4.596 , Cur , -0.01
17  , 4079, 4.581 , 4.588 , Cur , -0.01
18  , 4078, 4.582 , 4.589 , Cur , -0.01
19  , 4077, 4.581 , 4.587 , Cur , -0.01
20  , 4076, 4.582 , 4.589 , Cur , -0.01
21  , 4075, 4.582 , 4.589 , Cur , -0.01
22  , 4074, 4.582 , 4.583 , Cur , -0.0
23  , 4073, 4.586 , 4.599 , Cur , -0.01
24  , 4072, 4.58  , 4.586 , Cur , -0.01
25  , 4071, 4.58  , 4.585 , Cur , -0.0
26  , 4070, 4.581 , 4.587 , Cur , -0.01
27  , 4069, 4.576 , 4.586 , Cur , -0.01
28  , 4068, 4.578 , 4.585 , Cur , -0.01
29  , 4067, 4.578 , 4.585 , Cur , -0.01
30  , 4066, 4.578 , 4.587 , Cur , -0.01
31  , 4065, 4.578 , 4.583 , Cur , -0.0
32  , 4064, 3.955 , 5.225 , Cur , -1.27
33  , 4063, 3.952 , 5.221 , Cur , -1.27
34  , 4062, 3.952 , 5.218 , Cur , -1.27
35  , 4061, 3.955 , 5.22  , Cur , -1.26
36  , 4060, 3.955 , 5.221 , Cur , -1.27
37  , 4059, 3.956 , 5.219 , Cur , -1.26
38  , 4058, 3.955 , 5.219 , Cur , -1.26
39  , 4057, 3.956 , 5.224 , Cur , -1.27
40  , 4056, 3.956 , 5.221 , Cur , -1.27
41  , 4055, 3.952 , 5.22  , Cur , -1.27
42  , 4054, 3.956 , 5.217 , Cur , -1.26
43  , 4053, 3.953 , 5.22  , Cur , -1.27
44  , 4052, 3.951 , 5.222 , Cur , -1.27
45  , 4051, 3.949 , 5.22  , Cur , -1.27
46  , 4050, 3.952 , 5.222 , Cur , -1.27
47  , 4049, 3.953 , 5.222 , Cur , -1.27
48  , 4048, 3.953 , 5.218 , Cur , -1.27
49  , 4047, 3.956 , 5.226 , Cur , -1.27
50  , 4046, 3.953 , 5.221 , Cur , -1.27
51  , 4045, 3.953 , 5.218 , Cur , -1.27
52  , 4044, 3.954 , 5.221 , Cur , -1.27
53  , 4043, 3.948 , 5.219 , Cur , -1.27
54  , 4042, 3.951 , 5.22  , Cur , -1.27
55  , 4041, 3.951 , 5.218 , Cur , -1.27
56  , 4040, 3.957 , 5.224 , Cur , -1.27
57  , 4039, 3.953 , 5.221 , Cur , -1.27
58  , 4038, 3.95  , 5.222 , Cur , -1.27
59  , 4037, 3.953 , 5.22  , Cur , -1.27
60  , 4036, 3.953 , 5.221 , Cur , -1.27
61  , 4035, 3.951 , 5.22  , Cur , -1.27
62  , 4034, 3.951 , 5.22  , Cur , -1.27
63  , 4033, 3.949 , 5.22  , Cur , -1.27

Results For Icelake memset-evex-erms
size, algn, Cur T , New T , Win , Dif 
1   , 0   , 2.885 , 2.896 , Cur , -0.01
2   , 0   , 3.341 , 2.896 , New , 0.45
4   , 0   , 2.883 , 2.893 , Cur , -0.01
8   , 0   , 2.883 , 2.893 , Cur , -0.01
16  , 0   , 2.881 , 2.897 , Cur , -0.02
1   , 1   , 2.574 , 2.582 , Cur , -0.01
2   , 2   , 2.976 , 2.581 , New , 0.4 
3   , 3   , 3.054 , 2.651 , New , 0.4 
3   , 0   , 3.054 , 2.65  , New , 0.4 
4   , 4   , 2.64  , 2.649 , Cur , -0.01
5   , 5   , 2.639 , 2.653 , Cur , -0.01
5   , 0   , 2.64  , 2.651 , Cur , -0.01
6   , 6   , 2.639 , 2.652 , Cur , -0.01
6   , 0   , 2.638 , 2.651 , Cur , -0.01
7   , 7   , 2.64  , 2.652 , Cur , -0.01
7   , 0   , 2.64  , 2.651 , Cur , -0.01
8   , 8   , 2.64  , 2.652 , Cur , -0.01
9   , 9   , 2.64  , 2.653 , Cur , -0.01
9   , 0   , 2.639 , 2.651 , Cur , -0.01
10  , 10  , 2.639 , 2.652 , Cur , -0.01
10  , 0   , 2.638 , 2.651 , Cur , -0.01
11  , 11  , 2.639 , 2.653 , Cur , -0.01
11  , 0   , 2.639 , 2.653 , Cur , -0.01
12  , 12  , 2.639 , 2.652 , Cur , -0.01
12  , 0   , 2.638 , 2.653 , Cur , -0.02
13  , 13  , 2.64  , 2.654 , Cur , -0.01
13  , 0   , 2.638 , 2.653 , Cur , -0.02
14  , 14  , 2.64  , 2.652 , Cur , -0.01
14  , 0   , 2.639 , 2.652 , Cur , -0.01
15  , 15  , 2.639 , 2.651 , Cur , -0.01
15  , 0   , 2.639 , 2.652 , Cur , -0.01
16  , 16  , 2.645 , 2.656 , Cur , -0.01
17  , 17  , 2.639 , 2.652 , Cur , -0.01
17  , 0   , 2.639 , 2.652 , Cur , -0.01
18  , 18  , 2.64  , 2.652 , Cur , -0.01
18  , 0   , 2.64  , 2.652 , Cur , -0.01
19  , 19  , 2.64  , 2.651 , Cur , -0.01
19  , 0   , 2.639 , 2.652 , Cur , -0.01
20  , 20  , 2.639 , 2.652 , Cur , -0.01
20  , 0   , 2.64  , 2.652 , Cur , -0.01
21  , 21  , 2.64  , 2.651 , Cur , -0.01
21  , 0   , 2.64  , 2.652 , Cur , -0.01
22  , 22  , 2.64  , 2.652 , Cur , -0.01
22  , 0   , 2.64  , 2.652 , Cur , -0.01
23  , 23  , 2.64  , 2.652 , Cur , -0.01
23  , 0   , 2.643 , 2.655 , Cur , -0.01
24  , 24  , 2.639 , 2.653 , Cur , -0.01
24  , 0   , 2.64  , 2.652 , Cur , -0.01
25  , 25  , 2.639 , 2.653 , Cur , -0.01
25  , 0   , 2.639 , 2.653 , Cur , -0.01
26  , 26  , 2.639 , 2.653 , Cur , -0.01
26  , 0   , 2.639 , 2.652 , Cur , -0.01
27  , 27  , 2.639 , 2.652 , Cur , -0.01
27  , 0   , 2.639 , 2.651 , Cur , -0.01
28  , 28  , 2.64  , 2.651 , Cur , -0.01
28  , 0   , 2.64  , 2.652 , Cur , -0.01
29  , 29  , 2.639 , 2.652 , Cur , -0.01
29  , 0   , 2.639 , 2.653 , Cur , -0.01
30  , 30  , 2.638 , 2.653 , Cur , -0.02
30  , 0   , 2.639 , 2.652 , Cur , -0.01
31  , 31  , 2.64  , 2.651 , Cur , -0.01
31  , 0   , 2.641 , 2.653 , Cur , -0.01
14  , 1   , 2.882 , 2.893 , Cur , -0.01
25  , 2   , 2.882 , 2.894 , Cur , -0.01
1   , 4095, 2.571 , 3.383 , Cur , -0.81
2   , 4095, 21.509, 21.507, New , 0.0 
3   , 4095, 11.499, 11.21 , New , 0.29
4   , 4095, 21.512, 21.513, Cur , -0.0
5   , 4095, 11.501, 11.213, New , 0.29
6   , 4095, 11.505, 11.507, Cur , -0.0
7   , 4095, 11.504, 11.504, Eq  , 0.0 
8   , 4095, 21.512, 21.513, Cur , -0.0
9   , 4095, 11.5  , 11.505, Cur , -0.01
10  , 4095, 11.506, 11.508, Cur , -0.0
11  , 4095, 11.504, 11.506, Cur , -0.0
12  , 4095, 11.504, 11.505, Cur , -0.0
13  , 4095, 11.503, 11.507, Cur , -0.0
14  , 4095, 11.505, 11.506, Cur , -0.0
15  , 4095, 11.504, 11.508, Cur , -0.0
16  , 4095, 21.511, 21.516, Cur , -0.0
17  , 4095, 11.502, 11.504, Cur , -0.0
18  , 4095, 11.505, 11.507, Cur , -0.0
19  , 4095, 11.505, 11.506, Cur , -0.0
20  , 4095, 11.504, 11.507, Cur , -0.0
21  , 4095, 11.506, 11.506, Eq  , 0.0 
22  , 4095, 11.504, 11.507, Cur , -0.0
23  , 4095, 11.506, 11.507, Cur , -0.0
24  , 4095, 11.505, 11.509, Cur , -0.0
25  , 4095, 11.506, 11.508, Cur , -0.0
26  , 4095, 11.505, 11.506, Cur , -0.0
27  , 4095, 11.505, 11.507, Cur , -0.0
28  , 4095, 11.507, 11.506, New , 0.0 
29  , 4095, 11.505, 11.505, Eq  , 0.0 
30  , 4095, 11.506, 11.507, Cur , -0.0
31  , 4095, 11.504, 11.506, Cur , -0.0
2   , 4094, 2.983 , 3.788 , Cur , -0.8
3   , 4093, 2.978 , 3.787 , Cur , -0.81
4   , 4092, 2.635 , 3.805 , Cur , -1.17
5   , 4091, 2.571 , 3.789 , Cur , -1.22
6   , 4090, 2.632 , 3.888 , Cur , -1.26
7   , 4089, 2.638 , 3.888 , Cur , -1.25
8   , 4088, 2.64  , 3.474 , Cur , -0.83
9   , 4087, 2.571 , 3.427 , Cur , -0.86
10  , 4086, 2.637 , 3.471 , Cur , -0.83
11  , 4085, 2.636 , 3.473 , Cur , -0.84
12  , 4084, 2.638 , 3.473 , Cur , -0.83
13  , 4083, 2.637 , 3.474 , Cur , -0.84
14  , 4082, 2.639 , 3.474 , Cur , -0.84
15  , 4081, 2.638 , 3.474 , Cur , -0.84
16  , 4080, 2.639 , 3.474 , Cur , -0.84
17  , 4079, 2.637 , 3.468 , Cur , -0.83
18  , 4078, 2.638 , 3.47  , Cur , -0.83
19  , 4077, 2.637 , 3.472 , Cur , -0.83
20  , 4076, 2.638 , 3.471 , Cur , -0.83
21  , 4075, 2.638 , 3.473 , Cur , -0.83
22  , 4074, 2.638 , 3.473 , Cur , -0.83
23  , 4073, 2.639 , 3.472 , Cur , -0.83
24  , 4072, 2.637 , 3.472 , Cur , -0.83
25  , 4071, 2.638 , 3.472 , Cur , -0.83
26  , 4070, 2.637 , 3.473 , Cur , -0.84
27  , 4069, 2.639 , 3.472 , Cur , -0.83
28  , 4068, 2.638 , 3.472 , Cur , -0.83
29  , 4067, 2.638 , 3.473 , Cur , -0.83
30  , 4066, 2.637 , 3.472 , Cur , -0.83
31  , 4065, 2.639 , 3.472 , Cur , -0.83

Results For Icelake memset-evex
size, algn, Cur T , New T , Win , Dif 
1   , 0   , 3.351 , 2.442 , New , 0.91
2   , 0   , 3.798 , 2.44  , New , 1.36
4   , 0   , 3.338 , 2.441 , New , 0.9 
8   , 0   , 3.335 , 2.441 , New , 0.89
16  , 0   , 3.338 , 2.443 , New , 0.9 
1   , 1   , 2.983 , 2.178 , New , 0.81
2   , 2   , 3.383 , 2.178 , New , 1.21
3   , 3   , 3.472 , 2.235 , New , 1.24
3   , 0   , 3.47  , 2.236 , New , 1.23
4   , 4   , 3.057 , 2.235 , New , 0.82
5   , 5   , 3.055 , 2.236 , New , 0.82
5   , 0   , 3.056 , 2.236 , New , 0.82
6   , 6   , 3.055 , 2.236 , New , 0.82
6   , 0   , 3.056 , 2.235 , New , 0.82
7   , 7   , 3.057 , 2.238 , New , 0.82
7   , 0   , 3.057 , 2.237 , New , 0.82
8   , 8   , 3.059 , 2.236 , New , 0.82
9   , 9   , 3.056 , 2.236 , New , 0.82
9   , 0   , 3.055 , 2.236 , New , 0.82
10  , 10  , 3.057 , 2.237 , New , 0.82
10  , 0   , 3.055 , 2.237 , New , 0.82
11  , 11  , 3.056 , 2.236 , New , 0.82
11  , 0   , 3.056 , 2.236 , New , 0.82
12  , 12  , 3.056 , 2.237 , New , 0.82
12  , 0   , 3.056 , 2.235 , New , 0.82
13  , 13  , 3.055 , 2.236 , New , 0.82
13  , 0   , 3.056 , 2.237 , New , 0.82
14  , 14  , 3.056 , 2.237 , New , 0.82
14  , 0   , 3.057 , 2.237 , New , 0.82
15  , 15  , 3.055 , 2.236 , New , 0.82
15  , 0   , 3.055 , 2.237 , New , 0.82
16  , 16  , 3.057 , 2.237 , New , 0.82
17  , 17  , 3.055 , 2.237 , New , 0.82
17  , 0   , 3.056 , 2.237 , New , 0.82
18  , 18  , 3.055 , 2.237 , New , 0.82
18  , 0   , 3.055 , 2.236 , New , 0.82
19  , 19  , 3.056 , 2.237 , New , 0.82
19  , 0   , 3.055 , 2.238 , New , 0.82
20  , 20  , 3.056 , 2.236 , New , 0.82
20  , 0   , 3.056 , 2.238 , New , 0.82
21  , 21  , 3.056 , 2.238 , New , 0.82
21  , 0   , 3.055 , 2.236 , New , 0.82
22  , 22  , 3.055 , 2.237 , New , 0.82
22  , 0   , 3.056 , 2.238 , New , 0.82
23  , 23  , 3.055 , 2.237 , New , 0.82
23  , 0   , 3.055 , 2.237 , New , 0.82
24  , 24  , 3.056 , 2.237 , New , 0.82
24  , 0   , 3.055 , 2.237 , New , 0.82
25  , 25  , 3.056 , 2.237 , New , 0.82
25  , 0   , 3.056 , 2.235 , New , 0.82
26  , 26  , 3.056 , 2.237 , New , 0.82
26  , 0   , 3.056 , 2.237 , New , 0.82
27  , 27  , 3.056 , 2.236 , New , 0.82
27  , 0   , 3.056 , 2.236 , New , 0.82
28  , 28  , 3.056 , 2.236 , New , 0.82
28  , 0   , 3.055 , 2.237 , New , 0.82
29  , 29  , 3.056 , 2.236 , New , 0.82
29  , 0   , 3.056 , 2.236 , New , 0.82
30  , 30  , 3.056 , 2.237 , New , 0.82
30  , 0   , 3.056 , 2.237 , New , 0.82
31  , 31  , 3.057 , 2.237 , New , 0.82
31  , 0   , 3.057 , 2.237 , New , 0.82
14  , 1   , 3.338 , 2.441 , New , 0.9 
25  , 2   , 3.336 , 2.441 , New , 0.9 
1   , 4095, 2.977 , 2.981 , Cur , -0.0
2   , 4095, 22.514, 21.963, New , 0.55
3   , 4095, 11.212, 11.2  , New , 0.01
4   , 4095, 22.522, 22.538, Cur , -0.02
5   , 4095, 11.501, 11.199, New , 0.3 
6   , 4095, 11.504, 11.506, Cur , -0.0
7   , 4095, 11.503, 11.504, Cur , -0.0
8   , 4095, 21.968, 22.047, Cur , -0.08
9   , 4095, 11.363, 11.503, Cur , -0.14
10  , 4095, 11.505, 11.505, Eq  , 0.0 
11  , 4095, 11.504, 11.505, Cur , -0.0
12  , 4095, 11.504, 11.506, Cur , -0.0
13  , 4095, 11.504, 11.507, Cur , -0.0
14  , 4095, 11.503, 11.507, Cur , -0.0
15  , 4095, 11.504, 11.505, Cur , -0.0
16  , 4095, 21.511, 21.513, Cur , -0.0
17  , 4095, 11.503, 11.503, Eq  , 0.0 
18  , 4095, 11.504, 11.506, Cur , -0.0
19  , 4095, 11.504, 11.506, Cur , -0.0
20  , 4095, 11.504, 11.508, Cur , -0.0
21  , 4095, 11.505, 11.507, Cur , -0.0
22  , 4095, 11.503, 11.508, Cur , -0.0
23  , 4095, 11.505, 11.506, Cur , -0.0
24  , 4095, 11.504, 11.506, Cur , -0.0
25  , 4095, 11.504, 11.507, Cur , -0.0
26  , 4095, 11.504, 11.507, Cur , -0.0
27  , 4095, 11.504, 11.506, Cur , -0.0
28  , 4095, 11.505, 11.507, Cur , -0.0
29  , 4095, 11.504, 11.508, Cur , -0.0
30  , 4095, 11.506, 11.507, Cur , -0.0
31  , 4095, 11.504, 11.505, Cur , -0.0
2   , 4094, 3.384 , 3.387 , Cur , -0.0
3   , 4093, 3.38  , 3.383 , Cur , -0.0
4   , 4092, 3.052 , 3.399 , Cur , -0.35
5   , 4091, 2.978 , 3.384 , Cur , -0.41
6   , 4090, 3.05  , 3.473 , Cur , -0.42
7   , 4089, 3.055 , 3.474 , Cur , -0.42
8   , 4088, 3.056 , 3.061 , Cur , -0.0
9   , 4087, 2.984 , 3.052 , Cur , -0.07
10  , 4086, 3.055 , 3.055 , Eq  , 0.0 
11  , 4085, 3.054 , 3.057 , Cur , -0.0
12  , 4084, 3.055 , 3.057 , Cur , -0.0
13  , 4083, 3.056 , 3.058 , Cur , -0.0
14  , 4082, 3.056 , 3.06  , Cur , -0.0
15  , 4081, 3.054 , 3.058 , Cur , -0.0
16  , 4080, 3.057 , 3.061 , Cur , -0.0
17  , 4079, 3.051 , 3.056 , Cur , -0.0
18  , 4078, 3.055 , 3.056 , Cur , -0.0
19  , 4077, 3.056 , 3.057 , Cur , -0.0
20  , 4076, 3.055 , 3.057 , Cur , -0.0
21  , 4075, 3.055 , 3.058 , Cur , -0.0
22  , 4074, 3.056 , 3.059 , Cur , -0.0
23  , 4073, 3.056 , 3.058 , Cur , -0.0
24  , 4072, 3.055 , 3.058 , Cur , -0.0
25  , 4071, 3.056 , 3.058 , Cur , -0.0
26  , 4070, 3.055 , 3.057 , Cur , -0.0
27  , 4069, 3.055 , 3.058 , Cur , -0.0
28  , 4068, 3.056 , 3.056 , Eq  , 0.0 
29  , 4067, 3.056 , 3.057 , Cur , -0.0
30  , 4066, 3.055 , 3.057 , Cur , -0.0
31  , 4065, 3.056 , 3.059 , Cur , -0.0

Results For Icelake memset-avx512-erms
size, algn, Cur T , New T , Win , Dif 
1   , 0   , 2.982 , 2.9   , New , 0.08
2   , 0   , 3.344 , 2.895 , New , 0.45
4   , 0   , 3.794 , 2.896 , New , 0.9 
8   , 0   , 3.338 , 2.894 , New , 0.44
16  , 0   , 2.885 , 2.897 , Cur , -0.01
32  , 0   , 2.799 , 2.809 , Cur , -0.01
1   , 1   , 2.596 , 2.585 , New , 0.01
2   , 2   , 2.979 , 2.582 , New , 0.4 
3   , 3   , 3.053 , 2.65  , New , 0.4 
3   , 0   , 3.056 , 2.651 , New , 0.41
4   , 4   , 3.475 , 2.652 , New , 0.82
5   , 5   , 3.473 , 2.653 , New , 0.82
5   , 0   , 3.473 , 2.654 , New , 0.82
6   , 6   , 3.472 , 2.652 , New , 0.82
6   , 0   , 3.473 , 2.652 , New , 0.82
7   , 7   , 3.474 , 2.652 , New , 0.82
7   , 0   , 3.474 , 2.653 , New , 0.82
8   , 8   , 3.057 , 2.652 , New , 0.4 
9   , 9   , 3.056 , 2.653 , New , 0.4 
9   , 0   , 3.055 , 2.653 , New , 0.4 
10  , 10  , 3.056 , 2.653 , New , 0.4 
10  , 0   , 3.057 , 2.654 , New , 0.4 
11  , 11  , 3.056 , 2.653 , New , 0.4 
11  , 0   , 3.056 , 2.654 , New , 0.4 
12  , 12  , 3.056 , 2.653 , New , 0.4 
12  , 0   , 3.057 , 2.652 , New , 0.4 
13  , 13  , 3.057 , 2.654 , New , 0.4 
13  , 0   , 3.057 , 2.653 , New , 0.4 
14  , 14  , 3.056 , 2.652 , New , 0.4 
14  , 0   , 3.057 , 2.653 , New , 0.4 
15  , 15  , 3.056 , 2.653 , New , 0.4 
15  , 0   , 3.057 , 2.653 , New , 0.4 
16  , 16  , 2.644 , 2.652 , Cur , -0.01
17  , 17  , 2.64  , 2.653 , Cur , -0.01
17  , 0   , 2.64  , 2.652 , Cur , -0.01
18  , 18  , 2.639 , 2.652 , Cur , -0.01
18  , 0   , 2.639 , 2.652 , Cur , -0.01
19  , 19  , 2.64  , 2.653 , Cur , -0.01
19  , 0   , 2.639 , 2.652 , Cur , -0.01
20  , 20  , 2.639 , 2.652 , Cur , -0.01
20  , 0   , 2.639 , 2.651 , Cur , -0.01
21  , 21  , 2.639 , 2.652 , Cur , -0.01
21  , 0   , 2.639 , 2.652 , Cur , -0.01
22  , 22  , 2.639 , 2.652 , Cur , -0.01
22  , 0   , 2.64  , 2.654 , Cur , -0.01
23  , 23  , 2.64  , 2.654 , Cur , -0.01
23  , 0   , 2.641 , 2.652 , Cur , -0.01
24  , 24  , 2.639 , 2.653 , Cur , -0.01
24  , 0   , 2.64  , 2.653 , Cur , -0.01
25  , 25  , 2.64  , 2.653 , Cur , -0.01
25  , 0   , 2.639 , 2.652 , Cur , -0.01
26  , 26  , 2.64  , 2.654 , Cur , -0.01
26  , 0   , 2.64  , 2.653 , Cur , -0.01
27  , 27  , 2.64  , 2.653 , Cur , -0.01
27  , 0   , 2.639 , 2.652 , Cur , -0.01
28  , 28  , 2.639 , 2.652 , Cur , -0.01
28  , 0   , 2.64  , 2.653 , Cur , -0.01
29  , 29  , 2.64  , 2.653 , Cur , -0.01
29  , 0   , 2.64  , 2.653 , Cur , -0.01
30  , 30  , 2.64  , 2.653 , Cur , -0.01
30  , 0   , 2.641 , 2.654 , Cur , -0.01
31  , 31  , 2.64  , 2.653 , Cur , -0.01
31  , 0   , 2.64  , 2.653 , Cur , -0.01
32  , 32  , 2.643 , 2.652 , Cur , -0.01
33  , 33  , 2.641 , 2.652 , Cur , -0.01
33  , 0   , 2.64  , 2.653 , Cur , -0.01
34  , 34  , 2.641 , 2.653 , Cur , -0.01
34  , 0   , 2.64  , 2.651 , Cur , -0.01
35  , 35  , 2.64  , 2.652 , Cur , -0.01
35  , 0   , 2.64  , 2.651 , Cur , -0.01
36  , 36  , 2.642 , 2.653 , Cur , -0.01
36  , 0   , 2.639 , 2.652 , Cur , -0.01
37  , 37  , 2.641 , 2.652 , Cur , -0.01
37  , 0   , 2.641 , 2.652 , Cur , -0.01
38  , 38  , 2.64  , 2.652 , Cur , -0.01
38  , 0   , 2.64  , 2.652 , Cur , -0.01
39  , 39  , 2.641 , 2.652 , Cur , -0.01
39  , 0   , 2.64  , 2.653 , Cur , -0.01
40  , 40  , 2.64  , 2.652 , Cur , -0.01
40  , 0   , 2.641 , 2.652 , Cur , -0.01
41  , 41  , 2.64  , 2.654 , Cur , -0.01
41  , 0   , 2.64  , 2.653 , Cur , -0.01
42  , 42  , 2.641 , 2.653 , Cur , -0.01
42  , 0   , 2.641 , 2.652 , Cur , -0.01
43  , 43  , 2.641 , 2.653 , Cur , -0.01
43  , 0   , 2.641 , 2.653 , Cur , -0.01
44  , 44  , 2.64  , 2.653 , Cur , -0.01
44  , 0   , 2.642 , 2.653 , Cur , -0.01
45  , 45  , 2.641 , 2.654 , Cur , -0.01
45  , 0   , 2.64  , 2.652 , Cur , -0.01
46  , 46  , 2.642 , 2.653 , Cur , -0.01
46  , 0   , 2.642 , 2.651 , Cur , -0.01
47  , 47  , 2.64  , 2.652 , Cur , -0.01
47  , 0   , 2.64  , 2.653 , Cur , -0.01
48  , 48  , 2.64  , 2.651 , Cur , -0.01
48  , 0   , 2.639 , 2.651 , Cur , -0.01
49  , 49  , 2.642 , 2.653 , Cur , -0.01
49  , 0   , 2.641 , 2.651 , Cur , -0.01
50  , 50  , 2.64  , 2.651 , Cur , -0.01
50  , 0   , 2.641 , 2.652 , Cur , -0.01
51  , 51  , 2.641 , 2.652 , Cur , -0.01
51  , 0   , 2.641 , 2.652 , Cur , -0.01
52  , 52  , 2.641 , 2.653 , Cur , -0.01
52  , 0   , 2.642 , 2.652 , Cur , -0.01
53  , 53  , 2.64  , 2.652 , Cur , -0.01
53  , 0   , 2.64  , 2.651 , Cur , -0.01
54  , 54  , 2.641 , 2.651 , Cur , -0.01
54  , 0   , 2.64  , 2.653 , Cur , -0.01
55  , 55  , 2.642 , 2.652 , Cur , -0.01
55  , 0   , 2.64  , 2.652 , Cur , -0.01
56  , 56  , 2.64  , 2.653 , Cur , -0.01
56  , 0   , 2.641 , 2.653 , Cur , -0.01
57  , 57  , 2.642 , 2.652 , Cur , -0.01
57  , 0   , 2.64  , 2.652 , Cur , -0.01
58  , 58  , 2.641 , 2.651 , Cur , -0.01
58  , 0   , 2.64  , 2.652 , Cur , -0.01
59  , 59  , 2.64  , 2.653 , Cur , -0.01
59  , 0   , 2.64  , 2.652 , Cur , -0.01
60  , 60  , 2.64  , 2.652 , Cur , -0.01
60  , 0   , 2.64  , 2.652 , Cur , -0.01
61  , 61  , 2.64  , 2.652 , Cur , -0.01
61  , 0   , 2.64  , 2.653 , Cur , -0.01
62  , 62  , 2.641 , 2.653 , Cur , -0.01
62  , 0   , 2.639 , 2.652 , Cur , -0.01
63  , 63  , 2.641 , 2.653 , Cur , -0.01
63  , 0   , 2.64  , 2.653 , Cur , -0.01
14  , 1   , 3.34  , 2.895 , New , 0.44
25  , 2   , 2.885 , 2.896 , Cur , -0.01
1   , 4095, 5.438 , 6.523 , Cur , -1.08
2   , 4095, 22.522, 22.242, New , 0.28
3   , 4095, 11.502, 11.431, New , 0.07
4   , 4095, 22.539, 22.544, Cur , -0.0
5   , 4095, 11.503, 11.503, Eq  , 0.0 
6   , 4095, 11.504, 11.507, Cur , -0.0
7   , 4095, 11.505, 11.505, Eq  , 0.0 
8   , 4095, 22.539, 22.548, Cur , -0.01
9   , 4095, 11.502, 11.504, Cur , -0.0
10  , 4095, 11.504, 11.507, Cur , -0.0
11  , 4095, 11.504, 11.507, Cur , -0.0
12  , 4095, 11.504, 11.508, Cur , -0.0
13  , 4095, 11.503, 11.507, Cur , -0.0
14  , 4095, 11.504, 11.506, Cur , -0.0
15  , 4095, 11.504, 11.507, Cur , -0.0
16  , 4095, 21.511, 21.513, Cur , -0.0
17  , 4095, 11.503, 11.506, Cur , -0.0
18  , 4095, 11.506, 11.506, Eq  , 0.0 
19  , 4095, 11.504, 11.507, Cur , -0.0
20  , 4095, 11.504, 11.507, Cur , -0.0
21  , 4095, 11.505, 11.507, Cur , -0.0
22  , 4095, 11.504, 11.508, Cur , -0.0
23  , 4095, 11.504, 11.507, Cur , -0.0
24  , 4095, 11.507, 11.509, Cur , -0.0
25  , 4095, 11.505, 11.51 , Cur , -0.0
26  , 4095, 11.505, 11.508, Cur , -0.0
27  , 4095, 11.505, 11.508, Cur , -0.0
28  , 4095, 11.506, 11.508, Cur , -0.0
29  , 4095, 11.504, 11.506, Cur , -0.0
30  , 4095, 11.506, 11.507, Cur , -0.0
31  , 4095, 11.504, 11.509, Cur , -0.01
32  , 4095, 21.511, 21.517, Cur , -0.01
33  , 4095, 11.504, 11.507, Cur , -0.0
34  , 4095, 11.505, 11.507, Cur , -0.0
35  , 4095, 11.506, 11.505, New , 0.0 
36  , 4095, 11.506, 11.508, Cur , -0.0
37  , 4095, 11.505, 11.508, Cur , -0.0
38  , 4095, 11.506, 11.51 , Cur , -0.0
39  , 4095, 11.505, 11.507, Cur , -0.0
40  , 4095, 11.504, 11.511, Cur , -0.01
41  , 4095, 11.509, 11.51 , Cur , -0.0
42  , 4095, 11.506, 11.509, Cur , -0.0
43  , 4095, 11.506, 11.51 , Cur , -0.0
44  , 4095, 11.505, 11.507, Cur , -0.0
45  , 4095, 11.505, 11.508, Cur , -0.0
46  , 4095, 11.504, 11.507, Cur , -0.0
47  , 4095, 11.508, 11.507, New , 0.0 
48  , 4095, 11.505, 11.509, Cur , -0.0
49  , 4095, 11.506, 11.508, Cur , -0.0
50  , 4095, 11.505, 11.509, Cur , -0.0
51  , 4095, 11.507, 11.508, Cur , -0.0
52  , 4095, 11.504, 11.507, Cur , -0.0
53  , 4095, 11.505, 11.507, Cur , -0.0
54  , 4095, 11.504, 11.509, Cur , -0.01
55  , 4095, 11.505, 11.507, Cur , -0.0
56  , 4095, 11.505, 11.508, Cur , -0.0
57  , 4095, 11.505, 11.509, Cur , -0.0
58  , 4095, 11.505, 11.509, Cur , -0.0
59  , 4095, 11.504, 11.507, Cur , -0.0
60  , 4095, 11.505, 11.508, Cur , -0.0
61  , 4095, 11.505, 11.508, Cur , -0.0
62  , 4095, 11.508, 11.508, Eq  , 0.0 
63  , 4095, 11.505, 11.509, Cur , -0.0
2   , 4094, 5.959 , 6.508 , Cur , -0.55
3   , 4093, 6.442 , 6.849 , Cur , -0.41
4   , 4092, 6.032 , 6.786 , Cur , -0.75
5   , 4091, 6.791 , 7.028 , Cur , -0.24
6   , 4090, 6.553 , 6.298 , New , 0.25
7   , 4089, 6.85  , 6.866 , Cur , -0.02
8   , 4088, 6.502 , 6.414 , New , 0.09
9   , 4087, 6.421 , 6.912 , Cur , -0.49
10  , 4086, 6.214 , 7.047 , Cur , -0.83
11  , 4085, 5.943 , 7.059 , Cur , -1.12
12  , 4084, 5.986 , 6.743 , Cur , -0.76
13  , 4083, 6.468 , 6.412 , New , 0.06
14  , 4082, 6.46  , 6.851 , Cur , -0.39
15  , 4081, 6.466 , 7.184 , Cur , -0.72
16  , 4080, 2.64  , 3.892 , Cur , -1.25
17  , 4079, 2.637 , 3.886 , Cur , -1.25
18  , 4078, 2.64  , 3.889 , Cur , -1.25
19  , 4077, 2.639 , 3.891 , Cur , -1.25
20  , 4076, 2.638 , 3.891 , Cur , -1.25
21  , 4075, 2.638 , 3.89  , Cur , -1.25
22  , 4074, 2.639 , 3.891 , Cur , -1.25
23  , 4073, 2.639 , 3.891 , Cur , -1.25
24  , 4072, 2.64  , 3.889 , Cur , -1.25
25  , 4071, 2.639 , 3.891 , Cur , -1.25
26  , 4070, 2.639 , 3.89  , Cur , -1.25
27  , 4069, 2.64  , 3.89  , Cur , -1.25
28  , 4068, 2.64  , 3.89  , Cur , -1.25
29  , 4067, 2.641 , 3.891 , Cur , -1.25
30  , 4066, 2.639 , 3.891 , Cur , -1.25
31  , 4065, 2.641 , 3.89  , Cur , -1.25
32  , 4064, 2.644 , 3.478 , Cur , -0.83
33  , 4063, 2.639 , 3.473 , Cur , -0.83
34  , 4062, 2.638 , 3.473 , Cur , -0.83
35  , 4061, 2.639 , 3.474 , Cur , -0.84
36  , 4060, 2.641 , 3.474 , Cur , -0.83
37  , 4059, 2.64  , 3.474 , Cur , -0.83
38  , 4058, 2.64  , 3.474 , Cur , -0.83
39  , 4057, 2.641 , 3.473 , Cur , -0.83
40  , 4056, 2.641 , 3.474 , Cur , -0.83
41  , 4055, 2.641 , 3.475 , Cur , -0.83
42  , 4054, 2.64  , 3.474 , Cur , -0.83
43  , 4053, 2.641 , 3.474 , Cur , -0.83
44  , 4052, 2.64  , 3.474 , Cur , -0.83
45  , 4051, 2.641 , 3.475 , Cur , -0.83
46  , 4050, 2.641 , 3.474 , Cur , -0.83
47  , 4049, 2.64  , 3.474 , Cur , -0.83
48  , 4048, 2.642 , 3.475 , Cur , -0.83
49  , 4047, 2.641 , 3.474 , Cur , -0.83
50  , 4046, 2.64  , 3.475 , Cur , -0.83
51  , 4045, 2.64  , 3.474 , Cur , -0.83
52  , 4044, 2.641 , 3.473 , Cur , -0.83
53  , 4043, 2.64  , 3.474 , Cur , -0.83
54  , 4042, 2.641 , 3.474 , Cur , -0.83
55  , 4041, 2.641 , 3.475 , Cur , -0.83
56  , 4040, 2.641 , 3.474 , Cur , -0.83
57  , 4039, 2.642 , 3.474 , Cur , -0.83
58  , 4038, 2.641 , 3.475 , Cur , -0.83
59  , 4037, 2.641 , 3.475 , Cur , -0.83
60  , 4036, 2.641 , 3.474 , Cur , -0.83
61  , 4035, 2.64  , 3.475 , Cur , -0.83
62  , 4034, 2.64  , 3.474 , Cur , -0.83
63  , 4033, 2.641 , 3.474 , Cur , -0.83

Results For Icelake memset-avx512
size, algn, Cur T , New T , Win , Dif 
1   , 0   , 3.348 , 2.441 , New , 0.91
2   , 0   , 3.795 , 2.439 , New , 1.36
4   , 0   , 4.245 , 2.439 , New , 1.81
8   , 0   , 3.792 , 2.441 , New , 1.35
16  , 0   , 3.337 , 2.443 , New , 0.89
32  , 0   , 3.227 , 2.376 , New , 0.85
1   , 1   , 2.981 , 2.179 , New , 0.8 
2   , 2   , 3.385 , 2.178 , New , 1.21
3   , 3   , 3.472 , 2.235 , New , 1.24
3   , 0   , 3.472 , 2.236 , New , 1.24
4   , 4   , 3.891 , 2.236 , New , 1.65
5   , 5   , 3.889 , 2.237 , New , 1.65
5   , 0   , 3.889 , 2.235 , New , 1.65
6   , 6   , 3.89  , 2.237 , New , 1.65
6   , 0   , 3.89  , 2.235 , New , 1.66
7   , 7   , 3.89  , 2.237 , New , 1.65
7   , 0   , 3.89  , 2.237 , New , 1.65
8   , 8   , 3.474 , 2.238 , New , 1.24
9   , 9   , 3.472 , 2.237 , New , 1.23
9   , 0   , 3.472 , 2.235 , New , 1.24
10  , 10  , 3.472 , 2.238 , New , 1.23
10  , 0   , 3.472 , 2.236 , New , 1.24
11  , 11  , 3.474 , 2.238 , New , 1.24
11  , 0   , 3.471 , 2.235 , New , 1.24
12  , 12  , 3.472 , 2.24  , New , 1.23
12  , 0   , 3.473 , 2.236 , New , 1.24
13  , 13  , 3.473 , 2.237 , New , 1.24
13  , 0   , 3.471 , 2.236 , New , 1.23
14  , 14  , 3.474 , 2.237 , New , 1.24
14  , 0   , 3.476 , 2.238 , New , 1.24
15  , 15  , 3.473 , 2.238 , New , 1.23
15  , 0   , 3.473 , 2.236 , New , 1.24
16  , 16  , 3.057 , 2.238 , New , 0.82
17  , 17  , 3.057 , 2.241 , New , 0.82
17  , 0   , 3.056 , 2.235 , New , 0.82
18  , 18  , 3.055 , 2.239 , New , 0.82
18  , 0   , 3.056 , 2.236 , New , 0.82
19  , 19  , 3.057 , 2.237 , New , 0.82
19  , 0   , 3.055 , 2.236 , New , 0.82
20  , 20  , 3.056 , 2.239 , New , 0.82
20  , 0   , 3.057 , 2.236 , New , 0.82
21  , 21  , 3.056 , 2.239 , New , 0.82
21  , 0   , 3.055 , 2.237 , New , 0.82
22  , 22  , 3.057 , 2.239 , New , 0.82
22  , 0   , 3.056 , 2.235 , New , 0.82
23  , 23  , 3.056 , 2.239 , New , 0.82
23  , 0   , 3.057 , 2.236 , New , 0.82
24  , 24  , 3.055 , 2.239 , New , 0.82
24  , 0   , 3.056 , 2.236 , New , 0.82
25  , 25  , 3.057 , 2.238 , New , 0.82
25  , 0   , 3.056 , 2.237 , New , 0.82
26  , 26  , 3.057 , 2.239 , New , 0.82
26  , 0   , 3.056 , 2.237 , New , 0.82
27  , 27  , 3.055 , 2.238 , New , 0.82
27  , 0   , 3.056 , 2.236 , New , 0.82
28  , 28  , 3.058 , 2.239 , New , 0.82
28  , 0   , 3.055 , 2.236 , New , 0.82
29  , 29  , 3.056 , 2.237 , New , 0.82
29  , 0   , 3.057 , 2.237 , New , 0.82
30  , 30  , 3.056 , 2.239 , New , 0.82
30  , 0   , 3.058 , 2.238 , New , 0.82
31  , 31  , 3.058 , 2.238 , New , 0.82
31  , 0   , 3.056 , 2.237 , New , 0.82
32  , 32  , 3.057 , 2.236 , New , 0.82
33  , 33  , 3.057 , 2.237 , New , 0.82
33  , 0   , 3.055 , 2.236 , New , 0.82
34  , 34  , 3.055 , 2.236 , New , 0.82
34  , 0   , 3.056 , 2.235 , New , 0.82
35  , 35  , 3.056 , 2.235 , New , 0.82
35  , 0   , 3.056 , 2.236 , New , 0.82
36  , 36  , 3.056 , 2.237 , New , 0.82
36  , 0   , 3.056 , 2.236 , New , 0.82
37  , 37  , 3.056 , 2.237 , New , 0.82
37  , 0   , 3.056 , 2.237 , New , 0.82
38  , 38  , 3.056 , 2.237 , New , 0.82
38  , 0   , 3.056 , 2.236 , New , 0.82
39  , 39  , 3.057 , 2.235 , New , 0.82
39  , 0   , 3.056 , 2.236 , New , 0.82
40  , 40  , 3.057 , 2.237 , New , 0.82
40  , 0   , 3.057 , 2.235 , New , 0.82
41  , 41  , 3.055 , 2.236 , New , 0.82
41  , 0   , 3.056 , 2.236 , New , 0.82
42  , 42  , 3.057 , 2.236 , New , 0.82
42  , 0   , 3.055 , 2.236 , New , 0.82
43  , 43  , 3.056 , 2.236 , New , 0.82
43  , 0   , 3.056 , 2.236 , New , 0.82
44  , 44  , 3.055 , 2.237 , New , 0.82
44  , 0   , 3.056 , 2.237 , New , 0.82
45  , 45  , 3.059 , 2.237 , New , 0.82
45  , 0   , 3.056 , 2.236 , New , 0.82
46  , 46  , 3.057 , 2.236 , New , 0.82
46  , 0   , 3.056 , 2.237 , New , 0.82
47  , 47  , 3.056 , 2.236 , New , 0.82
47  , 0   , 3.055 , 2.235 , New , 0.82
48  , 48  , 3.057 , 2.238 , New , 0.82
48  , 0   , 3.056 , 2.236 , New , 0.82
49  , 49  , 3.056 , 2.236 , New , 0.82
49  , 0   , 3.057 , 2.236 , New , 0.82
50  , 50  , 3.056 , 2.237 , New , 0.82
50  , 0   , 3.055 , 2.237 , New , 0.82
51  , 51  , 3.058 , 2.236 , New , 0.82
51  , 0   , 3.055 , 2.235 , New , 0.82
52  , 52  , 3.056 , 2.236 , New , 0.82
52  , 0   , 3.057 , 2.236 , New , 0.82
53  , 53  , 3.056 , 2.236 , New , 0.82
53  , 0   , 3.056 , 2.236 , New , 0.82
54  , 54  , 3.058 , 2.236 , New , 0.82
54  , 0   , 3.056 , 2.236 , New , 0.82
55  , 55  , 3.057 , 2.235 , New , 0.82
55  , 0   , 3.057 , 2.236 , New , 0.82
56  , 56  , 3.056 , 2.237 , New , 0.82
56  , 0   , 3.055 , 2.237 , New , 0.82
57  , 57  , 3.057 , 2.235 , New , 0.82
57  , 0   , 3.056 , 2.236 , New , 0.82
58  , 58  , 3.055 , 2.236 , New , 0.82
58  , 0   , 3.056 , 2.236 , New , 0.82
59  , 59  , 3.056 , 2.236 , New , 0.82
59  , 0   , 3.055 , 2.237 , New , 0.82
60  , 60  , 3.058 , 2.236 , New , 0.82
60  , 0   , 3.056 , 2.237 , New , 0.82
61  , 61  , 3.056 , 2.237 , New , 0.82
61  , 0   , 3.058 , 2.236 , New , 0.82
62  , 62  , 3.057 , 2.236 , New , 0.82
62  , 0   , 3.056 , 2.236 , New , 0.82
63  , 63  , 3.058 , 2.235 , New , 0.82
63  , 0   , 3.057 , 2.238 , New , 0.82
14  , 1   , 3.791 , 2.443 , New , 1.35
25  , 2   , 3.336 , 2.441 , New , 0.9 
1   , 4095, 2.974 , 2.976 , Cur , -0.0
2   , 4095, 21.512, 21.507, New , 0.0 
3   , 4095, 11.505, 11.502, New , 0.0 
4   , 4095, 21.508, 21.511, Cur , -0.0
5   , 4095, 11.503, 11.505, Cur , -0.0
6   , 4095, 11.503, 11.505, Cur , -0.0
7   , 4095, 11.505, 11.504, New , 0.0 
8   , 4095, 21.51 , 21.511, Cur , -0.0
9   , 4095, 11.502, 11.504, Cur , -0.0
10  , 4095, 11.503, 11.507, Cur , -0.0
11  , 4095, 11.504, 11.506, Cur , -0.0
12  , 4095, 11.504, 11.505, Cur , -0.0
13  , 4095, 11.503, 11.507, Cur , -0.0
14  , 4095, 11.504, 11.506, Cur , -0.0
15  , 4095, 11.503, 11.505, Cur , -0.0
16  , 4095, 21.51 , 21.511, Cur , -0.0
17  , 4095, 11.503, 11.505, Cur , -0.0
18  , 4095, 11.504, 11.504, Eq  , 0.0 
19  , 4095, 11.505, 11.506, Cur , -0.0
20  , 4095, 11.505, 11.507, Cur , -0.0
21  , 4095, 11.505, 11.507, Cur , -0.0
22  , 4095, 11.504, 11.506, Cur , -0.0
23  , 4095, 11.505, 11.505, Eq  , 0.0 
24  , 4095, 11.505, 11.506, Cur , -0.0
25  , 4095, 11.504, 11.506, Cur , -0.0
26  , 4095, 11.506, 11.507, Cur , -0.0
27  , 4095, 11.505, 11.506, Cur , -0.0
28  , 4095, 11.507, 11.506, New , 0.0 
29  , 4095, 11.505, 11.506, Cur , -0.0
30  , 4095, 11.506, 11.507, Cur , -0.0
31  , 4095, 11.505, 11.507, Cur , -0.0
32  , 4095, 21.517, 21.517, Eq  , 0.0 
33  , 4095, 11.505, 11.505, Eq  , 0.0 
34  , 4095, 11.507, 11.507, Eq  , 0.0 
35  , 4095, 11.505, 11.507, Cur , -0.0
36  , 4095, 11.507, 11.509, Cur , -0.0
37  , 4095, 11.505, 11.506, Cur , -0.0
38  , 4095, 11.505, 11.509, Cur , -0.0
39  , 4095, 11.507, 11.506, New , 0.0 
40  , 4095, 11.506, 11.506, Eq  , 0.0 
41  , 4095, 11.507, 11.507, Eq  , 0.0 
42  , 4095, 11.507, 11.507, Eq  , 0.0 
43  , 4095, 11.507, 11.508, Cur , -0.0
44  , 4095, 11.506, 11.509, Cur , -0.0
45  , 4095, 11.506, 11.509, Cur , -0.0
46  , 4095, 11.507, 11.508, Cur , -0.0
47  , 4095, 11.507, 11.507, Eq  , 0.0 
48  , 4095, 11.509, 11.512, Cur , -0.0
49  , 4095, 11.507, 11.508, Cur , -0.0
50  , 4095, 11.506, 11.508, Cur , -0.0
51  , 4095, 11.506, 11.506, Eq  , 0.0 
52  , 4095, 11.507, 11.507, Eq  , 0.0 
53  , 4095, 11.506, 11.509, Cur , -0.0
54  , 4095, 11.506, 11.506, Eq  , 0.0 
55  , 4095, 11.507, 11.507, Eq  , 0.0 
56  , 4095, 11.507, 11.507, Eq  , 0.0 
57  , 4095, 11.506, 11.506, Eq  , 0.0 
58  , 4095, 11.507, 11.507, Eq  , 0.0 
59  , 4095, 11.504, 11.507, Cur , -0.0
60  , 4095, 11.507, 11.507, Eq  , 0.0 
61  , 4095, 11.506, 11.508, Cur , -0.0
62  , 4095, 11.507, 11.507, Eq  , 0.0 
63  , 4095, 11.507, 11.507, Eq  , 0.0 
2   , 4094, 3.383 , 2.978 , New , 0.4 
3   , 4093, 3.381 , 2.977 , New , 0.4 
4   , 4092, 3.795 , 2.984 , New , 0.81
5   , 4091, 3.793 , 2.985 , New , 0.81
6   , 4090, 3.883 , 3.053 , New , 0.83
7   , 4089, 3.888 , 3.053 , New , 0.83
8   , 4088, 3.47  , 3.057 , New , 0.41
9   , 4087, 3.385 , 2.985 , New , 0.4 
10  , 4086, 3.47  , 3.053 , New , 0.42
11  , 4085, 3.469 , 3.056 , New , 0.41
12  , 4084, 3.471 , 3.055 , New , 0.42
13  , 4083, 3.47  , 3.056 , New , 0.41
14  , 4082, 3.47  , 3.055 , New , 0.42
15  , 4081, 3.471 , 3.056 , New , 0.42
16  , 4080, 3.055 , 3.473 , Cur , -0.42
17  , 4079, 3.053 , 3.47  , Cur , -0.42
18  , 4078, 3.052 , 3.47  , Cur , -0.42
19  , 4077, 3.054 , 3.472 , Cur , -0.42
20  , 4076, 3.056 , 3.473 , Cur , -0.42
21  , 4075, 3.054 , 3.473 , Cur , -0.42
22  , 4074, 3.054 , 3.472 , Cur , -0.42
23  , 4073, 3.056 , 3.474 , Cur , -0.42
24  , 4072, 3.054 , 3.472 , Cur , -0.42
25  , 4071, 3.053 , 3.472 , Cur , -0.42
26  , 4070, 3.054 , 3.473 , Cur , -0.42
27  , 4069, 3.054 , 3.473 , Cur , -0.42
28  , 4068, 3.056 , 3.471 , Cur , -0.42
29  , 4067, 3.055 , 3.473 , Cur , -0.42
30  , 4066, 3.053 , 3.473 , Cur , -0.42
31  , 4065, 3.054 , 3.473 , Cur , -0.42
32  , 4064, 3.056 , 3.058 , Cur , -0.0
33  , 4063, 3.053 , 3.055 , Cur , -0.0
34  , 4062, 3.054 , 3.055 , Cur , -0.0
35  , 4061, 3.053 , 3.057 , Cur , -0.0
36  , 4060, 3.051 , 3.057 , Cur , -0.01
37  , 4059, 3.056 , 3.057 , Cur , -0.0
38  , 4058, 3.054 , 3.056 , Cur , -0.0
39  , 4057, 3.059 , 3.06  , Cur , -0.0
40  , 4056, 3.055 , 3.057 , Cur , -0.0
41  , 4055, 3.054 , 3.056 , Cur , -0.0
42  , 4054, 3.055 , 3.057 , Cur , -0.0
43  , 4053, 3.055 , 3.055 , Eq  , 0.0 
44  , 4052, 3.054 , 3.056 , Cur , -0.0
45  , 4051, 3.054 , 3.057 , Cur , -0.0
46  , 4050, 3.055 , 3.057 , Cur , -0.0
47  , 4049, 3.054 , 3.057 , Cur , -0.0
48  , 4048, 3.054 , 3.056 , Cur , -0.0
49  , 4047, 3.056 , 3.057 , Cur , -0.0
50  , 4046, 3.054 , 3.056 , Cur , -0.0
51  , 4045, 3.054 , 3.056 , Cur , -0.0
52  , 4044, 3.057 , 3.057 , Eq  , 0.0 
53  , 4043, 3.056 , 3.057 , Cur , -0.0
54  , 4042, 3.055 , 3.057 , Cur , -0.0
55  , 4041, 3.056 , 3.057 , Cur , -0.0
56  , 4040, 3.057 , 3.058 , Cur , -0.0
57  , 4039, 3.056 , 3.057 , Cur , -0.0
58  , 4038, 3.056 , 3.057 , Cur , -0.0
59  , 4037, 3.055 , 3.056 , Cur , -0.0
60  , 4036, 3.054 , 3.056 , Cur , -0.0
61  , 4035, 3.056 , 3.057 , Cur , -0.0
62  , 4034, 3.054 , 3.057 , Cur , -0.0
63  , 4033, 3.055 , 3.056 , Cur , -0.0        

 sysdeps/x86_64/multiarch/ifunc-memset.h       |  6 ++-
 .../multiarch/memset-avx512-unaligned-erms.S  |  2 +-
 .../multiarch/memset-evex-unaligned-erms.S    |  2 +-
 .../multiarch/memset-vec-unaligned-erms.S     | 52 +++++++++++++++----
 4 files changed, 47 insertions(+), 15 deletions(-)

Comments

H.J. Lu April 19, 2021, 2:50 p.m. UTC | #1
On Sun, Apr 18, 2021 at 06:09:21PM -0400, Noah Goldstein wrote:
> No bug. This commit adds an optimized case for the less_vec memset path
> that uses the avx512vl/avx512bw mask store, avoiding the excessive
> branches. test-memset and test-wmemset are passing.
> 
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
> Tests were run on the following CPUs:
> 
> Icelake: https://ark.intel.com/content/www/us/en/ark/products/196597/intel-core-i7-1065g7-processor-8m-cache-up-to-3-90-ghz.html
> 
> Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
> 
> 
> All times are the geometric mean of N=20. The unit of time is
> seconds.
> 
> "Cur" refers to the current implementation
> "New" refers to this patches implementation
> 
> There are 3 cases that matter for performance.
> 
> 1) ptr is not within VEC_SIZE of a page
> 2) ptr is within VEC_SIZE of a page but length is small enough so that
>    there is not page cross
> 3) page cross.
> 
> For case 1 (which should be the most common) the new implementation has
> a near universal improvement. The only exception is the avx512 case for
> size = [0, 15] where I believe the downclocking from avx512 is causing
> slowdown. It's worth noting that because bench-memset.c repeats the
> same size the branch heavy case should be favored as the branches will
> all be predicted correctly. In a setting with unpredictable length
> this version should perform significantly better. For example I
> implemented something similar to this change for memmove/memcpy and
> saw ~40% speedup in bench-memcpy-random (but for other reasons this
> change isn't good there).
> 
> Case 2 has a slowdown with this patch (roughly equivalent to the
> performance improvement for case 1). Though I think this is probably
> less important than the improvements for case 1 as page crosses are
> probably rarer than non-page crosses.
> 
> Case 3 has a very slight slowdown with this patch. But for the same
> reason as above I think this patch is still an improvement.
> 
> It's worth noting that the page cross check could be removed and
> the mask store implementation would still be correct, but I'm finding
> that the fault suppression is incredibly expensive from a performance
> perspective and without the branch I see a two-orders-of-magnitude
> performance regression on the Case 2 benchmarks.
> 
> 
...
> 
>  sysdeps/x86_64/multiarch/ifunc-memset.h       |  6 ++-
>  .../multiarch/memset-avx512-unaligned-erms.S  |  2 +-
>  .../multiarch/memset-evex-unaligned-erms.S    |  2 +-
>  .../multiarch/memset-vec-unaligned-erms.S     | 52 +++++++++++++++----
>  4 files changed, 47 insertions(+), 15 deletions(-)
> 
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
> index 502f946a84..eda5640541 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
> @@ -54,7 +54,8 @@ IFUNC_SELECTOR (void)
>        && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
>      {
>        if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> -	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
>  	{
>  	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
>  	    return OPTIMIZE (avx512_unaligned_erms);
> @@ -68,7 +69,8 @@ IFUNC_SELECTOR (void)
>    if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
>      {
>        if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> -	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
>  	{
>  	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
>  	    return OPTIMIZE (evex_unaligned_erms);

Please also update ifunc-impl-list.c.

> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> index 22e7b187c8..d03460be93 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> @@ -19,6 +19,6 @@
>  # define SECTION(p)		p##.evex512
>  # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
>  # define WMEMSET_SYMBOL(p,s)	p##_avx512_##s
> -
> +# define USE_LESS_VEC_MASKMOV	1
>  # include "memset-vec-unaligned-erms.S"
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> index ae0a4d6e46..eb3541ef60 100644
> --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> @@ -19,6 +19,6 @@
>  # define SECTION(p)		p##.evex
>  # define MEMSET_SYMBOL(p,s)	p##_evex_##s
>  # define WMEMSET_SYMBOL(p,s)	p##_evex_##s
> -
> +# define USE_LESS_VEC_MASKMOV	1
>  # include "memset-vec-unaligned-erms.S"
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index 584747f1a1..6b02e87f48 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -63,6 +63,9 @@
>  # endif
>  #endif
>  
> +#define PAGE_SIZE 4096
> +#define LOG_PAGE_SIZE 12
> +
>  #ifndef SECTION
>  # error SECTION is not defined!
>  #endif
> @@ -213,11 +216,38 @@ L(loop):
>  	cmpq	%rcx, %rdx
>  	jne	L(loop)
>  	VZEROUPPER_SHORT_RETURN
> +
> +	.p2align 4
>  L(less_vec):
>  	/* Less than 1 VEC.  */
>  # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
>  #  error Unsupported VEC_SIZE!
>  # endif
> +# ifdef USE_LESS_VEC_MASKMOV
> +	/* Clear high bits from edi. Only keeping bits relevant to page
> +	   cross check. Using sall instead of andl saves 3 bytes. Note
> +	   that we are using rax which is set in
> +	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
> +	sall	$(32 - LOG_PAGE_SIZE), %edi

Why is SAL needed here?

> +	/* Check if VEC_SIZE load cross page. Mask loads suffer serious
> +	   performance degradation when it has to fault supress.  */
> +	cmpl	$((PAGE_SIZE - VEC_SIZE) << (32 - LOG_PAGE_SIZE)), %edi

Can you use only CMP, not SAL, for masked store?

> +	ja	L(cross_page)
> +# if VEC_SIZE > 32
> +	movq	$-1, %rcx
> +	bzhiq	%rdx, %rcx, %rcx
> +	kmovq	%rcx, %k1
> +# else
> +	movl	$-1, %ecx
> +	bzhil	%edx, %ecx, %ecx
> +	kmovd	%ecx, %k1
> +# endif
> +	vmovdqu8	%VEC(0), (%rax) {%k1}
> +	VZEROUPPER_RETURN
> +
> +	.p2align 4
> +L(cross_page):
> +# endif
>  # if VEC_SIZE > 32
>  	cmpb	$32, %dl
>  	jae	L(between_32_63)
> @@ -234,36 +264,36 @@ L(less_vec):
>  	cmpb	$1, %dl
>  	ja	L(between_2_3)
>  	jb	1f
> -	movb	%cl, (%rdi)
> +	movb	%cl, (%rax)
>  1:
>  	VZEROUPPER_RETURN
>  # if VEC_SIZE > 32
>  	/* From 32 to 63.  No branch when size == 32.  */
>  L(between_32_63):
> -	VMOVU	%YMM0, -32(%rdi,%rdx)
> -	VMOVU	%YMM0, (%rdi)
> +	VMOVU	%YMM0, -32(%rax,%rdx)
> +	VMOVU	%YMM0, (%rax)
>  	VZEROUPPER_RETURN
>  # endif
>  # if VEC_SIZE > 16
>  	/* From 16 to 31.  No branch when size == 16.  */
>  L(between_16_31):
> -	VMOVU	%XMM0, -16(%rdi,%rdx)
> -	VMOVU	%XMM0, (%rdi)
> +	VMOVU	%XMM0, -16(%rax,%rdx)
> +	VMOVU	%XMM0, (%rax)
>  	VZEROUPPER_RETURN
>  # endif
>  	/* From 8 to 15.  No branch when size == 8.  */
>  L(between_8_15):
> -	movq	%rcx, -8(%rdi,%rdx)
> -	movq	%rcx, (%rdi)
> +	movq	%rcx, -8(%rax,%rdx)
> +	movq	%rcx, (%rax)
>  	VZEROUPPER_RETURN
>  L(between_4_7):
>  	/* From 4 to 7.  No branch when size == 4.  */
> -	movl	%ecx, -4(%rdi,%rdx)
> -	movl	%ecx, (%rdi)
> +	movl	%ecx, -4(%rax,%rdx)
> +	movl	%ecx, (%rax)
>  	VZEROUPPER_RETURN
>  L(between_2_3):
>  	/* From 2 to 3.  No branch when size == 2.  */
> -	movw	%cx, -2(%rdi,%rdx)
> -	movw	%cx, (%rdi)
> +	movw	%cx, -2(%rax,%rdx)
> +	movw	%cx, (%rax)
>  	VZEROUPPER_RETURN
>  END (MEMSET_SYMBOL (__memset, unaligned_erms))
> -- 
> 2.29.2
> 

H.J.
Noah Goldstein April 19, 2021, 4:27 p.m. UTC | #2
On Mon, Apr 19, 2021 at 10:50 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sun, Apr 18, 2021 at 06:09:21PM -0400, Noah Goldstein wrote:
> > No bug. This commit adds optimized cased for less_vec memset case that
> > uses the avx512vl/avx512bw mask store avoiding the excessive
> > branches. test-memset and test-wmemset are passing.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> > Tests were run on the following CPUs:
> >
> > Icelake: https://ark.intel.com/content/www/us/en/ark/products/196597/intel-core-i7-1065g7-processor-8m-cache-up-to-3-90-ghz.html
> >
> > Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
> >
> >
> > All times are the geometric mean of N=20. The unit of time is
> > seconds.
> >
> > "Cur" refers to the current implementation
> > "New" refers to this patches implementation
> >
> > There are 3 cases that matter for performance.
> >
> > 1) ptr is not within VEC_SIZE of a page
> > 2) ptr is within VEC_SIZE of a page but length is small enough so that
> >    there is not page cross
> > 3) page cross.
> >
> > In case 1 (which should be the most common) the new implementation has
> > a near universal improvement. The only exception is the avx512 case for
> > size = [0, 15] where I believe the downclocking from avx512 is causing
> > slowdown. It's worth noting that because bench-memset.c repeats the
> > same size the branch heavy case should be favored as the branches will
> > all be predicted correctly. In a setting with unpredictable length
> > this version should perform significantly better. For example I
> > implemented something similar to this change for memmove/memcpy and
> > saw ~40% speedup in bench-memcpy-random (but for other reasons this
> > change isn't good there).
> >
> > Case 2 has a slowdown with this patch (roughly equivalent to the
> > performance improvement for case 1). Though I think this is probably
> > less important than the improvements for case 1 as page crosses are
> > probably rarer than non-page crosses.
> >
> > Case 3 has a very slight slowdown with this patch. But for the same
> > reason as above I think this patch is still an improvement.
> >
> > It's worth noting that the page cross check could be removed and
> > the mask store implementation would still be correct, but I'm finding
> > that the fault suppression is incredibly expensive from a performance
> > perspective and without the branch I see a 2 orders of magnitude
> > performance regression on the Case 2 benchmarks.
> >
> >
> ...
> >
> >  sysdeps/x86_64/multiarch/ifunc-memset.h       |  6 ++-
> >  .../multiarch/memset-avx512-unaligned-erms.S  |  2 +-
> >  .../multiarch/memset-evex-unaligned-erms.S    |  2 +-
> >  .../multiarch/memset-vec-unaligned-erms.S     | 52 +++++++++++++++----
> >  4 files changed, 47 insertions(+), 15 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
> > index 502f946a84..eda5640541 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
> > @@ -54,7 +54,8 @@ IFUNC_SELECTOR (void)
> >        && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
> >      {
> >        if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > -       && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> > +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> >       {
> >         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> >           return OPTIMIZE (avx512_unaligned_erms);
> > @@ -68,7 +69,8 @@ IFUNC_SELECTOR (void)
> >    if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
> >      {
> >        if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > -       && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
> > +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> >       {
> >         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> >           return OPTIMIZE (evex_unaligned_erms);
>
> Please also update ifunc-impl-list.c.
>
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > index 22e7b187c8..d03460be93 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > @@ -19,6 +19,6 @@
> >  # define SECTION(p)          p##.evex512
> >  # define MEMSET_SYMBOL(p,s)  p##_avx512_##s
> >  # define WMEMSET_SYMBOL(p,s) p##_avx512_##s
> > -
> > +# define USE_LESS_VEC_MASKMOV        1
> >  # include "memset-vec-unaligned-erms.S"
> >  #endif
> > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > index ae0a4d6e46..eb3541ef60 100644
> > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > @@ -19,6 +19,6 @@
> >  # define SECTION(p)          p##.evex
> >  # define MEMSET_SYMBOL(p,s)  p##_evex_##s
> >  # define WMEMSET_SYMBOL(p,s) p##_evex_##s
> > -
> > +# define USE_LESS_VEC_MASKMOV        1
> >  # include "memset-vec-unaligned-erms.S"
> >  #endif
> > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > index 584747f1a1..6b02e87f48 100644
> > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > @@ -63,6 +63,9 @@
> >  # endif
> >  #endif
> >
> > +#define PAGE_SIZE 4096
> > +#define LOG_PAGE_SIZE 12
> > +
> >  #ifndef SECTION
> >  # error SECTION is not defined!
> >  #endif
> > @@ -213,11 +216,38 @@ L(loop):
> >       cmpq    %rcx, %rdx
> >       jne     L(loop)
> >       VZEROUPPER_SHORT_RETURN
> > +
> > +     .p2align 4
> >  L(less_vec):
> >       /* Less than 1 VEC.  */
> >  # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
> >  #  error Unsupported VEC_SIZE!
> >  # endif
> > +# ifdef USE_LESS_VEC_MASKMOV
> > +     /* Clear high bits from edi. Only keeping bits relevant to page
> > +        cross check. Using sall instead of andl saves 3 bytes. Note
> > +        that we are using rax which is set in
> > +        MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
> > +     sall    $(32 - LOG_PAGE_SIZE), %edi
>
> Why is SAL needed here?

To clear the upper bits of edi before checking for a page cross. We only
want to check edi[11:0] to see if one VEC_SIZE store will cross a page.
AFAIK this is the most efficient way to do that.

>
> > +     /* Check if VEC_SIZE load cross page. Mask loads suffer serious
> > +        performance degradation when it has to fault supress.  */
> > +     cmpl    $((PAGE_SIZE - VEC_SIZE) << (32 - LOG_PAGE_SIZE)), %edi
>
> Can you use only CMP, not SAL, for masked store?

What do you mean? This comparison is to check if there will be a page cross
on the store. It's not necessary for correctness but is for performance as
fault suppression appears to be really expensive. The code would still be
correct with no SAL and no CMP.

>
> > +     ja      L(cross_page)
> > +# if VEC_SIZE > 32
> > +     movq    $-1, %rcx
> > +     bzhiq   %rdx, %rcx, %rcx
> > +     kmovq   %rcx, %k1
> > +# else
> > +     movl    $-1, %ecx
> > +     bzhil   %edx, %ecx, %ecx
> > +     kmovd   %ecx, %k1
> > +# endif
> > +     vmovdqu8        %VEC(0), (%rax) {%k1}
> > +     VZEROUPPER_RETURN
> > +
> > +     .p2align 4
> > +L(cross_page):
> > +# endif
> >  # if VEC_SIZE > 32
> >       cmpb    $32, %dl
> >       jae     L(between_32_63)
> > @@ -234,36 +264,36 @@ L(less_vec):
> >       cmpb    $1, %dl
> >       ja      L(between_2_3)
> >       jb      1f
> > -     movb    %cl, (%rdi)
> > +     movb    %cl, (%rax)
> >  1:
> >       VZEROUPPER_RETURN
> >  # if VEC_SIZE > 32
> >       /* From 32 to 63.  No branch when size == 32.  */
> >  L(between_32_63):
> > -     VMOVU   %YMM0, -32(%rdi,%rdx)
> > -     VMOVU   %YMM0, (%rdi)
> > +     VMOVU   %YMM0, -32(%rax,%rdx)
> > +     VMOVU   %YMM0, (%rax)
> >       VZEROUPPER_RETURN
> >  # endif
> >  # if VEC_SIZE > 16
> >       /* From 16 to 31.  No branch when size == 16.  */
> >  L(between_16_31):
> > -     VMOVU   %XMM0, -16(%rdi,%rdx)
> > -     VMOVU   %XMM0, (%rdi)
> > +     VMOVU   %XMM0, -16(%rax,%rdx)
> > +     VMOVU   %XMM0, (%rax)
> >       VZEROUPPER_RETURN
> >  # endif
> >       /* From 8 to 15.  No branch when size == 8.  */
> >  L(between_8_15):
> > -     movq    %rcx, -8(%rdi,%rdx)
> > -     movq    %rcx, (%rdi)
> > +     movq    %rcx, -8(%rax,%rdx)
> > +     movq    %rcx, (%rax)
> >       VZEROUPPER_RETURN
> >  L(between_4_7):
> >       /* From 4 to 7.  No branch when size == 4.  */
> > -     movl    %ecx, -4(%rdi,%rdx)
> > -     movl    %ecx, (%rdi)
> > +     movl    %ecx, -4(%rax,%rdx)
> > +     movl    %ecx, (%rax)
> >       VZEROUPPER_RETURN
> >  L(between_2_3):
> >       /* From 2 to 3.  No branch when size == 2.  */
> > -     movw    %cx, -2(%rdi,%rdx)
> > -     movw    %cx, (%rdi)
> > +     movw    %cx, -2(%rax,%rdx)
> > +     movw    %cx, (%rax)
> >       VZEROUPPER_RETURN
> >  END (MEMSET_SYMBOL (__memset, unaligned_erms))
> > --
> > 2.29.2
> >
>
> H.J.
diff mbox series

Patch

diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 502f946a84..eda5640541 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -54,7 +54,8 @@  IFUNC_SELECTOR (void)
       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
     {
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 	{
 	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 	    return OPTIMIZE (avx512_unaligned_erms);
@@ -68,7 +69,8 @@  IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
     {
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 	{
 	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 	    return OPTIMIZE (evex_unaligned_erms);
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 22e7b187c8..d03460be93 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -19,6 +19,6 @@ 
 # define SECTION(p)		p##.evex512
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
 # define WMEMSET_SYMBOL(p,s)	p##_avx512_##s
-
+# define USE_LESS_VEC_MASKMOV	1
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index ae0a4d6e46..eb3541ef60 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -19,6 +19,6 @@ 
 # define SECTION(p)		p##.evex
 # define MEMSET_SYMBOL(p,s)	p##_evex_##s
 # define WMEMSET_SYMBOL(p,s)	p##_evex_##s
-
+# define USE_LESS_VEC_MASKMOV	1
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 584747f1a1..6b02e87f48 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -63,6 +63,9 @@ 
 # endif
 #endif
 
+#define PAGE_SIZE 4096
+#define LOG_PAGE_SIZE 12
+
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -213,11 +216,38 @@  L(loop):
 	cmpq	%rcx, %rdx
 	jne	L(loop)
 	VZEROUPPER_SHORT_RETURN
+
+	.p2align 4
 L(less_vec):
 	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 #  error Unsupported VEC_SIZE!
 # endif
+# ifdef USE_LESS_VEC_MASKMOV
+	/* Clear high bits from edi. Only keeping bits relevant to page
+	   cross check. Using sall instead of andl saves 3 bytes. Note
+	   that we are using rax which is set in
+	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.  */
+	sall	$(32 - LOG_PAGE_SIZE), %edi
+	/* Check if VEC_SIZE store cross page. Mask stores suffer serious
+	   performance degradation when it has to fault suppress.  */
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << (32 - LOG_PAGE_SIZE)), %edi
+	ja	L(cross_page)
+# if VEC_SIZE > 32
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
+	kmovq	%rcx, %k1
+# else
+	movl	$-1, %ecx
+	bzhil	%edx, %ecx, %ecx
+	kmovd	%ecx, %k1
+# endif
+	vmovdqu8	%VEC(0), (%rax) {%k1}
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(cross_page):
+# endif
 # if VEC_SIZE > 32
 	cmpb	$32, %dl
 	jae	L(between_32_63)
@@ -234,36 +264,36 @@  L(less_vec):
 	cmpb	$1, %dl
 	ja	L(between_2_3)
 	jb	1f
-	movb	%cl, (%rdi)
+	movb	%cl, (%rax)
 1:
 	VZEROUPPER_RETURN
 # if VEC_SIZE > 32
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
-	VMOVU	%YMM0, -32(%rdi,%rdx)
-	VMOVU	%YMM0, (%rdi)
+	VMOVU	%YMM0, -32(%rax,%rdx)
+	VMOVU	%YMM0, (%rax)
 	VZEROUPPER_RETURN
 # endif
 # if VEC_SIZE > 16
 	/* From 16 to 31.  No branch when size == 16.  */
 L(between_16_31):
-	VMOVU	%XMM0, -16(%rdi,%rdx)
-	VMOVU	%XMM0, (%rdi)
+	VMOVU	%XMM0, -16(%rax,%rdx)
+	VMOVU	%XMM0, (%rax)
 	VZEROUPPER_RETURN
 # endif
 	/* From 8 to 15.  No branch when size == 8.  */
 L(between_8_15):
-	movq	%rcx, -8(%rdi,%rdx)
-	movq	%rcx, (%rdi)
+	movq	%rcx, -8(%rax,%rdx)
+	movq	%rcx, (%rax)
 	VZEROUPPER_RETURN
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
-	movl	%ecx, -4(%rdi,%rdx)
-	movl	%ecx, (%rdi)
+	movl	%ecx, -4(%rax,%rdx)
+	movl	%ecx, (%rax)
 	VZEROUPPER_RETURN
 L(between_2_3):
 	/* From 2 to 3.  No branch when size == 2.  */
-	movw	%cx, -2(%rdi,%rdx)
-	movw	%cx, (%rdi)
+	movw	%cx, -2(%rax,%rdx)
+	movw	%cx, (%rax)
 	VZEROUPPER_RETURN
 END (MEMSET_SYMBOL (__memset, unaligned_erms))