The new code unrolls the main loop slightly without adding too much
overhead and minimizes the number of comparisons against the search
CHAR.
Geometric Mean of all benchmarks New / Old: 0.755
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
---
Results For: strrchr
Each reported time is the geometric mean of N=30 runs.
Geometric Mean of all benchmarks New / Old: 0.755
Benchmark performance on Tiger Lake:
https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
len, align, pos, seek, max_char, freq, New Time / Old Time
2048, 0, 32, 0, 127, 1, 0.669
2048, 1, 32, 0, 127, 1, 0.672
2048, 0, 64, 0, 127, 1, 0.579
2048, 2, 64, 0, 127, 1, 0.579
2048, 0, 128, 0, 127, 1, 0.828
2048, 3, 128, 0, 127, 1, 0.827
2048, 0, 256, 0, 127, 1, 0.693
2048, 4, 256, 0, 127, 1, 0.692
2048, 0, 512, 0, 127, 1, 0.619
2048, 5, 512, 0, 127, 1, 0.622
2048, 0, 1024, 0, 127, 1, 0.626
2048, 6, 1024, 0, 127, 1, 0.627
2048, 0, 2048, 0, 127, 1, 0.85
2048, 7, 2048, 0, 127, 1, 0.855
2048, 0, 4096, 0, 127, 1, 0.849
2048, 8, 4096, 0, 127, 1, 0.848
256, 1, 64, 0, 127, 1, 0.579
256, 15, 64, 0, 127, 1, 0.579
256, 2, 64, 0, 127, 1, 0.579
256, 30, 64, 0, 127, 1, 0.579
256, 3, 64, 0, 127, 1, 0.579
256, 45, 64, 0, 127, 1, 0.551
256, 4, 64, 0, 127, 1, 0.579
256, 60, 64, 0, 127, 1, 0.553
256, 5, 64, 0, 127, 1, 0.579
256, 75, 64, 0, 127, 1, 0.578
256, 6, 64, 0, 127, 1, 0.578
256, 90, 64, 0, 127, 1, 0.579
256, 7, 64, 0, 127, 1, 0.579
256, 105, 64, 0, 127, 1, 0.55
1, 0, 0, 0, 127, 1, 0.795
2, 0, 1, 0, 127, 1, 0.797
3, 0, 2, 0, 127, 1, 0.796
4, 0, 3, 0, 127, 1, 0.792
5, 0, 4, 0, 127, 1, 0.789
6, 0, 5, 0, 127, 1, 0.791
7, 0, 6, 0, 127, 1, 0.793
8, 0, 7, 0, 127, 1, 0.789
9, 0, 8, 0, 127, 1, 0.797
10, 0, 9, 0, 127, 1, 0.788
11, 0, 10, 0, 127, 1, 0.796
12, 0, 11, 0, 127, 1, 0.793
13, 0, 12, 0, 127, 1, 0.797
14, 0, 13, 0, 127, 1, 0.795
15, 0, 14, 0, 127, 1, 0.795
16, 0, 15, 0, 127, 1, 0.791
17, 0, 16, 0, 127, 1, 0.798
18, 0, 17, 0, 127, 1, 0.8
19, 0, 18, 0, 127, 1, 0.797
20, 0, 19, 0, 127, 1, 0.798
21, 0, 20, 0, 127, 1, 0.797
22, 0, 21, 0, 127, 1, 0.796
23, 0, 22, 0, 127, 1, 0.792
24, 0, 23, 0, 127, 1, 0.791
25, 0, 24, 0, 127, 1, 0.794
26, 0, 25, 0, 127, 1, 0.797
27, 0, 26, 0, 127, 1, 0.793
28, 0, 27, 0, 127, 1, 0.79
29, 0, 28, 0, 127, 1, 0.79
30, 0, 29, 0, 127, 1, 0.791
31, 0, 30, 0, 127, 1, 0.791
32, 0, 31, 0, 127, 1, 0.79
2048, 0, 32, 23, 127, 1, 0.734
2048, 1, 32, 23, 127, 1, 0.748
2048, 0, 64, 23, 127, 1, 0.759
2048, 2, 64, 23, 127, 1, 0.753
2048, 0, 128, 23, 127, 1, 0.834
2048, 3, 128, 23, 127, 1, 0.835
2048, 0, 256, 23, 127, 1, 0.789
2048, 4, 256, 23, 127, 1, 0.791
2048, 0, 512, 23, 127, 1, 0.882
2048, 5, 512, 23, 127, 1, 0.861
2048, 0, 1024, 23, 127, 1, 0.643
2048, 6, 1024, 23, 127, 1, 0.643
2048, 0, 2048, 23, 127, 1, 0.931
2048, 7, 2048, 23, 127, 1, 0.929
2048, 0, 4096, 23, 127, 1, 0.922
2048, 8, 4096, 23, 127, 1, 0.934
256, 1, 64, 23, 127, 1, 0.73
256, 15, 64, 23, 127, 1, 0.729
256, 2, 64, 23, 127, 1, 0.725
256, 30, 64, 23, 127, 1, 0.728
256, 3, 64, 23, 127, 1, 0.727
256, 45, 64, 23, 127, 1, 0.749
256, 4, 64, 23, 127, 1, 0.73
256, 60, 64, 23, 127, 1, 0.752
256, 5, 64, 23, 127, 1, 0.729
256, 75, 64, 23, 127, 1, 0.727
256, 6, 64, 23, 127, 1, 0.693
256, 90, 64, 23, 127, 1, 0.73
256, 7, 64, 23, 127, 1, 0.73
256, 105, 64, 23, 127, 1, 0.751
1, 0, 0, 23, 127, 1, 0.797
2, 0, 1, 23, 127, 1, 0.794
3, 0, 2, 23, 127, 1, 0.797
4, 0, 3, 23, 127, 1, 0.792
5, 0, 4, 23, 127, 1, 0.781
6, 0, 5, 23, 127, 1, 0.783
7, 0, 6, 23, 127, 1, 0.79
8, 0, 7, 23, 127, 1, 0.791
9, 0, 8, 23, 127, 1, 0.794
10, 0, 9, 23, 127, 1, 0.795
11, 0, 10, 23, 127, 1, 0.795
12, 0, 11, 23, 127, 1, 0.795
13, 0, 12, 23, 127, 1, 0.794
14, 0, 13, 23, 127, 1, 0.792
15, 0, 14, 23, 127, 1, 0.79
16, 0, 15, 23, 127, 1, 0.793
17, 0, 16, 23, 127, 1, 0.795
18, 0, 17, 23, 127, 1, 0.797
19, 0, 18, 23, 127, 1, 0.796
20, 0, 19, 23, 127, 1, 0.796
21, 0, 20, 23, 127, 1, 0.794
22, 0, 21, 23, 127, 1, 0.794
23, 0, 22, 23, 127, 1, 0.793
24, 0, 23, 23, 127, 1, 0.792
25, 0, 24, 23, 127, 1, 0.795
26, 0, 25, 23, 127, 1, 0.792
27, 0, 26, 23, 127, 1, 0.789
28, 0, 27, 23, 127, 1, 0.794
29, 0, 28, 23, 127, 1, 0.793
30, 0, 29, 23, 127, 1, 0.795
31, 0, 30, 23, 127, 1, 0.797
32, 0, 31, 23, 127, 1, 0.775
2048, 0, 32, 23, 127, 2, 0.736
2048, 1, 32, 23, 127, 2, 0.738
2048, 0, 64, 23, 127, 2, 0.895
2048, 2, 64, 23, 127, 2, 0.897
2048, 0, 128, 23, 127, 2, 0.852
2048, 3, 128, 23, 127, 2, 0.845
2048, 0, 256, 23, 127, 2, 0.755
2048, 4, 256, 23, 127, 2, 0.712
2048, 0, 512, 23, 127, 2, 0.857
2048, 5, 512, 23, 127, 2, 0.849
2048, 0, 1024, 23, 127, 2, 0.626
2048, 6, 1024, 23, 127, 2, 0.661
2048, 0, 2048, 23, 127, 2, 0.67
2048, 7, 2048, 23, 127, 2, 0.67
2048, 0, 4096, 23, 127, 2, 0.928
2048, 8, 4096, 23, 127, 2, 0.935
256, 1, 64, 23, 127, 2, 0.693
256, 15, 64, 23, 127, 2, 0.692
256, 2, 64, 23, 127, 2, 0.693
256, 30, 64, 23, 127, 2, 0.692
256, 3, 64, 23, 127, 2, 0.692
256, 45, 64, 23, 127, 2, 0.701
256, 4, 64, 23, 127, 2, 0.692
256, 60, 64, 23, 127, 2, 0.701
256, 5, 64, 23, 127, 2, 0.69
256, 75, 64, 23, 127, 2, 0.693
256, 6, 64, 23, 127, 2, 0.691
256, 90, 64, 23, 127, 2, 0.692
256, 7, 64, 23, 127, 2, 0.693
256, 105, 64, 23, 127, 2, 0.701
1, 0, 0, 23, 127, 2, 0.797
2, 0, 1, 23, 127, 2, 0.787
3, 0, 2, 23, 127, 2, 0.797
4, 0, 3, 23, 127, 2, 0.793
5, 0, 4, 23, 127, 2, 0.792
6, 0, 5, 23, 127, 2, 0.795
7, 0, 6, 23, 127, 2, 0.791
8, 0, 7, 23, 127, 2, 0.792
9, 0, 8, 23, 127, 2, 0.796
10, 0, 9, 23, 127, 2, 0.797
11, 0, 10, 23, 127, 2, 0.797
12, 0, 11, 23, 127, 2, 0.798
13, 0, 12, 23, 127, 2, 0.799
14, 0, 13, 23, 127, 2, 0.796
15, 0, 14, 23, 127, 2, 0.796
16, 0, 15, 23, 127, 2, 0.794
17, 0, 16, 23, 127, 2, 0.795
18, 0, 17, 23, 127, 2, 0.797
19, 0, 18, 23, 127, 2, 0.793
20, 0, 19, 23, 127, 2, 0.795
21, 0, 20, 23, 127, 2, 0.794
22, 0, 21, 23, 127, 2, 0.794
23, 0, 22, 23, 127, 2, 0.796
24, 0, 23, 23, 127, 2, 0.794
25, 0, 24, 23, 127, 2, 0.794
26, 0, 25, 23, 127, 2, 0.794
27, 0, 26, 23, 127, 2, 0.788
28, 0, 27, 23, 127, 2, 0.791
29, 0, 28, 23, 127, 2, 0.791
30, 0, 29, 23, 127, 2, 0.793
31, 0, 30, 23, 127, 2, 0.796
32, 0, 31, 23, 127, 2, 0.628
2048, 0, 32, 23, 127, 4, 0.742
2048, 1, 32, 23, 127, 4, 0.742
2048, 0, 64, 23, 127, 4, 0.899
2048, 2, 64, 23, 127, 4, 0.912
2048, 0, 128, 23, 127, 4, 0.783
2048, 3, 128, 23, 127, 4, 0.815
2048, 0, 256, 23, 127, 4, 0.854
2048, 4, 256, 23, 127, 4, 0.858
2048, 0, 512, 23, 127, 4, 0.907
2048, 5, 512, 23, 127, 4, 0.873
2048, 0, 1024, 23, 127, 4, 0.657
2048, 6, 1024, 23, 127, 4, 0.653
2048, 0, 2048, 23, 127, 4, 0.666
2048, 7, 2048, 23, 127, 4, 0.667
2048, 0, 4096, 23, 127, 4, 0.67
2048, 8, 4096, 23, 127, 4, 0.67
256, 1, 64, 23, 127, 4, 0.686
256, 15, 64, 23, 127, 4, 0.687
256, 2, 64, 23, 127, 4, 0.687
256, 30, 64, 23, 127, 4, 0.687
256, 3, 64, 23, 127, 4, 0.687
256, 45, 64, 23, 127, 4, 0.672
256, 4, 64, 23, 127, 4, 0.687
256, 60, 64, 23, 127, 4, 0.701
256, 5, 64, 23, 127, 4, 0.687
256, 75, 64, 23, 127, 4, 0.686
256, 6, 64, 23, 127, 4, 0.687
256, 90, 64, 23, 127, 4, 0.686
256, 7, 64, 23, 127, 4, 0.69
256, 105, 64, 23, 127, 4, 0.672
1, 0, 0, 23, 127, 4, 0.798
2, 0, 1, 23, 127, 4, 0.791
3, 0, 2, 23, 127, 4, 0.792
4, 0, 3, 23, 127, 4, 0.795
5, 0, 4, 23, 127, 4, 0.791
6, 0, 5, 23, 127, 4, 0.793
7, 0, 6, 23, 127, 4, 0.78
8, 0, 7, 23, 127, 4, 0.791
9, 0, 8, 23, 127, 4, 0.788
10, 0, 9, 23, 127, 4, 0.798
11, 0, 10, 23, 127, 4, 0.796
12, 0, 11, 23, 127, 4, 0.794
13, 0, 12, 23, 127, 4, 0.795
14, 0, 13, 23, 127, 4, 0.793
15, 0, 14, 23, 127, 4, 0.8
16, 0, 15, 23, 127, 4, 0.796
17, 0, 16, 23, 127, 4, 0.796
18, 0, 17, 23, 127, 4, 0.796
19, 0, 18, 23, 127, 4, 0.798
20, 0, 19, 23, 127, 4, 0.796
21, 0, 20, 23, 127, 4, 0.796
22, 0, 21, 23, 127, 4, 0.796
23, 0, 22, 23, 127, 4, 0.801
24, 0, 23, 23, 127, 4, 0.799
25, 0, 24, 23, 127, 4, 0.795
26, 0, 25, 23, 127, 4, 0.793
27, 0, 26, 23, 127, 4, 0.796
28, 0, 27, 23, 127, 4, 0.794
29, 0, 28, 23, 127, 4, 0.798
30, 0, 29, 23, 127, 4, 0.795
31, 0, 30, 23, 127, 4, 0.797
32, 0, 31, 23, 127, 4, 0.628
2048, 0, 32, 23, 127, 8, 0.738
2048, 1, 32, 23, 127, 8, 0.747
2048, 0, 64, 23, 127, 8, 0.905
2048, 2, 64, 23, 127, 8, 0.906
2048, 0, 128, 23, 127, 8, 0.822
2048, 3, 128, 23, 127, 8, 0.827
2048, 0, 256, 23, 127, 8, 0.825
2048, 4, 256, 23, 127, 8, 0.825
2048, 0, 512, 23, 127, 8, 0.851
2048, 5, 512, 23, 127, 8, 0.855
2048, 0, 1024, 23, 127, 8, 0.653
2048, 6, 1024, 23, 127, 8, 0.651
2048, 0, 2048, 23, 127, 8, 0.644
2048, 7, 2048, 23, 127, 8, 0.643
2048, 0, 4096, 23, 127, 8, 0.67
2048, 8, 4096, 23, 127, 8, 0.67
256, 1, 64, 23, 127, 8, 0.686
256, 15, 64, 23, 127, 8, 0.686
256, 2, 64, 23, 127, 8, 0.686
256, 30, 64, 23, 127, 8, 0.687
256, 3, 64, 23, 127, 8, 0.686
256, 45, 64, 23, 127, 8, 0.671
256, 4, 64, 23, 127, 8, 0.69
256, 60, 64, 23, 127, 8, 0.705
256, 5, 64, 23, 127, 8, 0.688
256, 75, 64, 23, 127, 8, 0.687
256, 6, 64, 23, 127, 8, 0.692
256, 90, 64, 23, 127, 8, 0.689
256, 7, 64, 23, 127, 8, 0.69
256, 105, 64, 23, 127, 8, 0.674
1, 0, 0, 23, 127, 8, 0.798
2, 0, 1, 23, 127, 8, 0.798
3, 0, 2, 23, 127, 8, 0.797
4, 0, 3, 23, 127, 8, 0.792
5, 0, 4, 23, 127, 8, 0.795
6, 0, 5, 23, 127, 8, 0.792
7, 0, 6, 23, 127, 8, 0.792
8, 0, 7, 23, 127, 8, 0.795
9, 0, 8, 23, 127, 8, 0.799
10, 0, 9, 23, 127, 8, 0.798
11, 0, 10, 23, 127, 8, 0.795
12, 0, 11, 23, 127, 8, 0.795
13, 0, 12, 23, 127, 8, 0.797
14, 0, 13, 23, 127, 8, 0.796
15, 0, 14, 23, 127, 8, 0.795
16, 0, 15, 23, 127, 8, 0.796
17, 0, 16, 23, 127, 8, 0.798
18, 0, 17, 23, 127, 8, 0.798
19, 0, 18, 23, 127, 8, 0.795
20, 0, 19, 23, 127, 8, 0.797
21, 0, 20, 23, 127, 8, 0.797
22, 0, 21, 23, 127, 8, 0.793
23, 0, 22, 23, 127, 8, 0.797
24, 0, 23, 23, 127, 8, 0.8
25, 0, 24, 23, 127, 8, 0.796
26, 0, 25, 23, 127, 8, 0.796
27, 0, 26, 23, 127, 8, 0.791
28, 0, 27, 23, 127, 8, 0.795
29, 0, 28, 23, 127, 8, 0.786
30, 0, 29, 23, 127, 8, 0.797
31, 0, 30, 23, 127, 8, 0.791
32, 0, 31, 23, 127, 8, 0.628
2048, 0, 32, 23, 127, 16, 0.736
2048, 1, 32, 23, 127, 16, 0.737
2048, 0, 64, 23, 127, 16, 0.905
2048, 2, 64, 23, 127, 16, 0.908
2048, 0, 128, 23, 127, 16, 0.829
2048, 3, 128, 23, 127, 16, 0.824
2048, 0, 256, 23, 127, 16, 0.827
2048, 4, 256, 23, 127, 16, 0.825
2048, 0, 512, 23, 127, 16, 0.694
2048, 5, 512, 23, 127, 16, 0.687
2048, 0, 1024, 23, 127, 16, 0.568
2048, 6, 1024, 23, 127, 16, 0.667
2048, 0, 2048, 23, 127, 16, 0.766
2048, 7, 2048, 23, 127, 16, 0.781
2048, 0, 4096, 23, 127, 16, 0.646
2048, 8, 4096, 23, 127, 16, 0.646
256, 1, 64, 23, 127, 16, 0.697
256, 15, 64, 23, 127, 16, 0.686
256, 2, 64, 23, 127, 16, 0.687
256, 30, 64, 23, 127, 16, 0.687
256, 3, 64, 23, 127, 16, 0.686
256, 45, 64, 23, 127, 16, 0.672
256, 4, 64, 23, 127, 16, 0.686
256, 60, 64, 23, 127, 16, 0.701
256, 5, 64, 23, 127, 16, 0.686
256, 75, 64, 23, 127, 16, 0.686
256, 6, 64, 23, 127, 16, 0.691
256, 90, 64, 23, 127, 16, 0.687
256, 7, 64, 23, 127, 16, 0.688
256, 105, 64, 23, 127, 16, 0.674
1, 0, 0, 23, 127, 16, 0.797
2, 0, 1, 23, 127, 16, 0.798
3, 0, 2, 23, 127, 16, 0.786
4, 0, 3, 23, 127, 16, 0.792
5, 0, 4, 23, 127, 16, 0.792
6, 0, 5, 23, 127, 16, 0.795
7, 0, 6, 23, 127, 16, 0.796
8, 0, 7, 23, 127, 16, 0.798
9, 0, 8, 23, 127, 16, 0.795
10, 0, 9, 23, 127, 16, 0.797
11, 0, 10, 23, 127, 16, 0.797
12, 0, 11, 23, 127, 16, 0.797
13, 0, 12, 23, 127, 16, 0.799
14, 0, 13, 23, 127, 16, 0.798
15, 0, 14, 23, 127, 16, 0.798
16, 0, 15, 23, 127, 16, 0.796
17, 0, 16, 23, 127, 16, 0.798
18, 0, 17, 23, 127, 16, 0.796
19, 0, 18, 23, 127, 16, 0.797
20, 0, 19, 23, 127, 16, 0.797
21, 0, 20, 23, 127, 16, 0.798
22, 0, 21, 23, 127, 16, 0.797
23, 0, 22, 23, 127, 16, 0.797
24, 0, 23, 23, 127, 16, 0.797
25, 0, 24, 23, 127, 16, 0.798
26, 0, 25, 23, 127, 16, 0.794
27, 0, 26, 23, 127, 16, 0.796
28, 0, 27, 23, 127, 16, 0.796
29, 0, 28, 23, 127, 16, 0.792
30, 0, 29, 23, 127, 16, 0.788
31, 0, 30, 23, 127, 16, 0.79
32, 0, 31, 23, 127, 16, 0.628
sysdeps/x86_64/multiarch/strrchr-evex.S | 441 ++++++++++++++----------
1 file changed, 259 insertions(+), 182 deletions(-)