The new code modestly unrolls the main loop without adding much
overhead and minimizes the number of comparisons against the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.832
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
---
Results For: strrchr
Geometric Mean of N=30 runs.
Geometric Mean of all benchmarks New / Old: 0.832
Benchmark performance on Tiger Lake:
https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
len, align, pos, seek, max_char, freq, New Time / Old Time
2048, 0, 32, 0, 127, 1, 0.673
2048, 1, 32, 0, 127, 1, 0.68
2048, 0, 64, 0, 127, 1, 0.566
2048, 2, 64, 0, 127, 1, 0.574
2048, 0, 128, 0, 127, 1, 0.976
2048, 3, 128, 0, 127, 1, 0.967
2048, 0, 256, 0, 127, 1, 0.931
2048, 4, 256, 0, 127, 1, 0.921
2048, 0, 512, 0, 127, 1, 0.792
2048, 5, 512, 0, 127, 1, 0.78
2048, 0, 1024, 0, 127, 1, 0.733
2048, 6, 1024, 0, 127, 1, 0.729
2048, 0, 2048, 0, 127, 1, 0.795
2048, 7, 2048, 0, 127, 1, 0.805
2048, 0, 4096, 0, 127, 1, 0.803
2048, 8, 4096, 0, 127, 1, 0.794
256, 1, 64, 0, 127, 1, 0.584
256, 15, 64, 0, 127, 1, 0.587
256, 2, 64, 0, 127, 1, 0.586
256, 30, 64, 0, 127, 1, 0.592
256, 3, 64, 0, 127, 1, 0.586
256, 45, 64, 0, 127, 1, 0.505
256, 4, 64, 0, 127, 1, 0.59
256, 60, 64, 0, 127, 1, 0.501
256, 5, 64, 0, 127, 1, 0.595
256, 75, 64, 0, 127, 1, 0.588
256, 6, 64, 0, 127, 1, 0.593
256, 90, 64, 0, 127, 1, 0.594
256, 7, 64, 0, 127, 1, 0.596
256, 105, 64, 0, 127, 1, 0.506
1, 0, 0, 0, 127, 1, 0.872
2, 0, 1, 0, 127, 1, 0.861
3, 0, 2, 0, 127, 1, 0.862
4, 0, 3, 0, 127, 1, 0.884
5, 0, 4, 0, 127, 1, 0.869
6, 0, 5, 0, 127, 1, 0.861
7, 0, 6, 0, 127, 1, 0.865
8, 0, 7, 0, 127, 1, 0.884
9, 0, 8, 0, 127, 1, 0.862
10, 0, 9, 0, 127, 1, 0.889
11, 0, 10, 0, 127, 1, 0.9
12, 0, 11, 0, 127, 1, 0.897
13, 0, 12, 0, 127, 1, 0.909
14, 0, 13, 0, 127, 1, 0.885
15, 0, 14, 0, 127, 1, 0.929
16, 0, 15, 0, 127, 1, 0.871
17, 0, 16, 0, 127, 1, 0.875
18, 0, 17, 0, 127, 1, 0.878
19, 0, 18, 0, 127, 1, 0.889
20, 0, 19, 0, 127, 1, 0.89
21, 0, 20, 0, 127, 1, 0.901
22, 0, 21, 0, 127, 1, 0.91
23, 0, 22, 0, 127, 1, 0.912
24, 0, 23, 0, 127, 1, 0.907
25, 0, 24, 0, 127, 1, 0.947
26, 0, 25, 0, 127, 1, 0.904
27, 0, 26, 0, 127, 1, 0.921
28, 0, 27, 0, 127, 1, 0.899
29, 0, 28, 0, 127, 1, 0.923
30, 0, 29, 0, 127, 1, 0.918
31, 0, 30, 0, 127, 1, 0.943
32, 0, 31, 0, 127, 1, 0.914
2048, 0, 32, 23, 127, 1, 0.815
2048, 1, 32, 23, 127, 1, 0.829
2048, 0, 64, 23, 127, 1, 0.884
2048, 2, 64, 23, 127, 1, 0.882
2048, 0, 128, 23, 127, 1, 0.884
2048, 3, 128, 23, 127, 1, 0.851
2048, 0, 256, 23, 127, 1, 0.843
2048, 4, 256, 23, 127, 1, 0.867
2048, 0, 512, 23, 127, 1, 0.746
2048, 5, 512, 23, 127, 1, 0.863
2048, 0, 1024, 23, 127, 1, 0.662
2048, 6, 1024, 23, 127, 1, 0.683
2048, 0, 2048, 23, 127, 1, 0.852
2048, 7, 2048, 23, 127, 1, 0.837
2048, 0, 4096, 23, 127, 1, 0.837
2048, 8, 4096, 23, 127, 1, 0.829
256, 1, 64, 23, 127, 1, 0.934
256, 15, 64, 23, 127, 1, 0.936
256, 2, 64, 23, 127, 1, 0.931
256, 30, 64, 23, 127, 1, 0.938
256, 3, 64, 23, 127, 1, 0.927
256, 45, 64, 23, 127, 1, 0.863
256, 4, 64, 23, 127, 1, 0.939
256, 60, 64, 23, 127, 1, 0.871
256, 5, 64, 23, 127, 1, 0.94
256, 75, 64, 23, 127, 1, 0.933
256, 6, 64, 23, 127, 1, 0.915
256, 90, 64, 23, 127, 1, 0.934
256, 7, 64, 23, 127, 1, 0.938
256, 105, 64, 23, 127, 1, 0.871
1, 0, 0, 23, 127, 1, 0.865
2, 0, 1, 23, 127, 1, 0.87
3, 0, 2, 23, 127, 1, 0.882
4, 0, 3, 23, 127, 1, 0.901
5, 0, 4, 23, 127, 1, 0.879
6, 0, 5, 23, 127, 1, 0.934
7, 0, 6, 23, 127, 1, 0.874
8, 0, 7, 23, 127, 1, 0.895
9, 0, 8, 23, 127, 1, 0.873
10, 0, 9, 23, 127, 1, 0.861
11, 0, 10, 23, 127, 1, 0.865
12, 0, 11, 23, 127, 1, 0.875
13, 0, 12, 23, 127, 1, 0.878
14, 0, 13, 23, 127, 1, 0.86
15, 0, 14, 23, 127, 1, 0.889
16, 0, 15, 23, 127, 1, 0.875
17, 0, 16, 23, 127, 1, 0.911
18, 0, 17, 23, 127, 1, 0.891
19, 0, 18, 23, 127, 1, 0.921
20, 0, 19, 23, 127, 1, 0.898
21, 0, 20, 23, 127, 1, 0.895
22, 0, 21, 23, 127, 1, 0.906
23, 0, 22, 23, 127, 1, 0.911
24, 0, 23, 23, 127, 1, 0.877
25, 0, 24, 23, 127, 1, 0.9
26, 0, 25, 23, 127, 1, 0.911
27, 0, 26, 23, 127, 1, 0.926
28, 0, 27, 23, 127, 1, 0.918
29, 0, 28, 23, 127, 1, 0.952
30, 0, 29, 23, 127, 1, 0.943
31, 0, 30, 23, 127, 1, 0.934
32, 0, 31, 23, 127, 1, 0.8
2048, 0, 32, 23, 127, 2, 0.872
2048, 1, 32, 23, 127, 2, 0.819
2048, 0, 64, 23, 127, 2, 0.815
2048, 2, 64, 23, 127, 2, 0.805
2048, 0, 128, 23, 127, 2, 0.884
2048, 3, 128, 23, 127, 2, 0.852
2048, 0, 256, 23, 127, 2, 0.873
2048, 4, 256, 23, 127, 2, 0.871
2048, 0, 512, 23, 127, 2, 0.654
2048, 5, 512, 23, 127, 2, 0.762
2048, 0, 1024, 23, 127, 2, 0.646
2048, 6, 1024, 23, 127, 2, 0.665
2048, 0, 2048, 23, 127, 2, 0.678
2048, 7, 2048, 23, 127, 2, 0.675
2048, 0, 4096, 23, 127, 2, 0.849
2048, 8, 4096, 23, 127, 2, 0.835
256, 1, 64, 23, 127, 2, 0.917
256, 15, 64, 23, 127, 2, 0.915
256, 2, 64, 23, 127, 2, 0.911
256, 30, 64, 23, 127, 2, 0.907
256, 3, 64, 23, 127, 2, 0.9
256, 45, 64, 23, 127, 2, 0.816
256, 4, 64, 23, 127, 2, 0.912
256, 60, 64, 23, 127, 2, 0.81
256, 5, 64, 23, 127, 2, 0.904
256, 75, 64, 23, 127, 2, 0.911
256, 6, 64, 23, 127, 2, 0.898
256, 90, 64, 23, 127, 2, 0.912
256, 7, 64, 23, 127, 2, 0.909
256, 105, 64, 23, 127, 2, 0.81
1, 0, 0, 23, 127, 2, 0.858
2, 0, 1, 23, 127, 2, 0.89
3, 0, 2, 23, 127, 2, 0.877
4, 0, 3, 23, 127, 2, 0.863
5, 0, 4, 23, 127, 2, 0.863
6, 0, 5, 23, 127, 2, 0.889
7, 0, 6, 23, 127, 2, 0.898
8, 0, 7, 23, 127, 2, 0.885
9, 0, 8, 23, 127, 2, 0.863
10, 0, 9, 23, 127, 2, 0.902
11, 0, 10, 23, 127, 2, 0.865
12, 0, 11, 23, 127, 2, 0.864
13, 0, 12, 23, 127, 2, 0.87
14, 0, 13, 23, 127, 2, 0.862
15, 0, 14, 23, 127, 2, 0.861
16, 0, 15, 23, 127, 2, 0.859
17, 0, 16, 23, 127, 2, 0.87
18, 0, 17, 23, 127, 2, 0.892
19, 0, 18, 23, 127, 2, 0.874
20, 0, 19, 23, 127, 2, 0.866
21, 0, 20, 23, 127, 2, 0.877
22, 0, 21, 23, 127, 2, 0.868
23, 0, 22, 23, 127, 2, 0.884
24, 0, 23, 23, 127, 2, 0.881
25, 0, 24, 23, 127, 2, 0.872
26, 0, 25, 23, 127, 2, 0.866
27, 0, 26, 23, 127, 2, 0.881
28, 0, 27, 23, 127, 2, 0.93
29, 0, 28, 23, 127, 2, 0.886
30, 0, 29, 23, 127, 2, 0.869
31, 0, 30, 23, 127, 2, 0.869
32, 0, 31, 23, 127, 2, 0.667
2048, 0, 32, 23, 127, 4, 0.858
2048, 1, 32, 23, 127, 4, 0.858
2048, 0, 64, 23, 127, 4, 0.838
2048, 2, 64, 23, 127, 4, 0.834
2048, 0, 128, 23, 127, 4, 0.85
2048, 3, 128, 23, 127, 4, 0.762
2048, 0, 256, 23, 127, 4, 0.874
2048, 4, 256, 23, 127, 4, 0.796
2048, 0, 512, 23, 127, 4, 0.691
2048, 5, 512, 23, 127, 4, 0.755
2048, 0, 1024, 23, 127, 4, 0.676
2048, 6, 1024, 23, 127, 4, 0.661
2048, 0, 2048, 23, 127, 4, 0.678
2048, 7, 2048, 23, 127, 4, 0.678
2048, 0, 4096, 23, 127, 4, 0.676
2048, 8, 4096, 23, 127, 4, 0.677
256, 1, 64, 23, 127, 4, 0.875
256, 15, 64, 23, 127, 4, 0.877
256, 2, 64, 23, 127, 4, 0.875
256, 30, 64, 23, 127, 4, 0.875
256, 3, 64, 23, 127, 4, 0.878
256, 45, 64, 23, 127, 4, 0.829
256, 4, 64, 23, 127, 4, 0.876
256, 60, 64, 23, 127, 4, 0.807
256, 5, 64, 23, 127, 4, 0.874
256, 75, 64, 23, 127, 4, 0.872
256, 6, 64, 23, 127, 4, 0.874
256, 90, 64, 23, 127, 4, 0.874
256, 7, 64, 23, 127, 4, 0.873
256, 105, 64, 23, 127, 4, 0.826
1, 0, 0, 23, 127, 4, 0.863
2, 0, 1, 23, 127, 4, 0.861
3, 0, 2, 23, 127, 4, 0.863
4, 0, 3, 23, 127, 4, 0.867
5, 0, 4, 23, 127, 4, 0.866
6, 0, 5, 23, 127, 4, 0.873
7, 0, 6, 23, 127, 4, 0.873
8, 0, 7, 23, 127, 4, 0.866
9, 0, 8, 23, 127, 4, 0.861
10, 0, 9, 23, 127, 4, 0.861
11, 0, 10, 23, 127, 4, 0.857
12, 0, 11, 23, 127, 4, 0.864
13, 0, 12, 23, 127, 4, 0.86
14, 0, 13, 23, 127, 4, 0.859
15, 0, 14, 23, 127, 4, 0.854
16, 0, 15, 23, 127, 4, 0.857
17, 0, 16, 23, 127, 4, 0.881
18, 0, 17, 23, 127, 4, 0.863
19, 0, 18, 23, 127, 4, 0.86
20, 0, 19, 23, 127, 4, 0.906
21, 0, 20, 23, 127, 4, 0.924
22, 0, 21, 23, 127, 4, 0.885
23, 0, 22, 23, 127, 4, 0.861
24, 0, 23, 23, 127, 4, 0.907
25, 0, 24, 23, 127, 4, 0.909
26, 0, 25, 23, 127, 4, 0.863
27, 0, 26, 23, 127, 4, 0.862
28, 0, 27, 23, 127, 4, 0.887
29, 0, 28, 23, 127, 4, 0.879
30, 0, 29, 23, 127, 4, 0.932
31, 0, 30, 23, 127, 4, 0.895
32, 0, 31, 23, 127, 4, 0.666
2048, 0, 32, 23, 127, 8, 0.865
2048, 1, 32, 23, 127, 8, 0.892
2048, 0, 64, 23, 127, 8, 0.85
2048, 2, 64, 23, 127, 8, 0.834
2048, 0, 128, 23, 127, 8, 0.823
2048, 3, 128, 23, 127, 8, 0.809
2048, 0, 256, 23, 127, 8, 0.84
2048, 4, 256, 23, 127, 8, 0.738
2048, 0, 512, 23, 127, 8, 0.656
2048, 5, 512, 23, 127, 8, 0.644
2048, 0, 1024, 23, 127, 8, 0.705
2048, 6, 1024, 23, 127, 8, 0.708
2048, 0, 2048, 23, 127, 8, 0.701
2048, 7, 2048, 23, 127, 8, 0.7
2048, 0, 4096, 23, 127, 8, 0.68
2048, 8, 4096, 23, 127, 8, 0.678
256, 1, 64, 23, 127, 8, 0.881
256, 15, 64, 23, 127, 8, 0.879
256, 2, 64, 23, 127, 8, 0.878
256, 30, 64, 23, 127, 8, 0.877
256, 3, 64, 23, 127, 8, 0.88
256, 45, 64, 23, 127, 8, 0.829
256, 4, 64, 23, 127, 8, 0.883
256, 60, 64, 23, 127, 8, 0.808
256, 5, 64, 23, 127, 8, 0.875
256, 75, 64, 23, 127, 8, 0.877
256, 6, 64, 23, 127, 8, 0.874
256, 90, 64, 23, 127, 8, 0.874
256, 7, 64, 23, 127, 8, 0.874
256, 105, 64, 23, 127, 8, 0.83
1, 0, 0, 23, 127, 8, 0.862
2, 0, 1, 23, 127, 8, 0.865
3, 0, 2, 23, 127, 8, 0.866
4, 0, 3, 23, 127, 8, 0.863
5, 0, 4, 23, 127, 8, 0.874
6, 0, 5, 23, 127, 8, 0.87
7, 0, 6, 23, 127, 8, 0.87
8, 0, 7, 23, 127, 8, 0.864
9, 0, 8, 23, 127, 8, 0.87
10, 0, 9, 23, 127, 8, 0.861
11, 0, 10, 23, 127, 8, 0.862
12, 0, 11, 23, 127, 8, 0.87
13, 0, 12, 23, 127, 8, 0.858
14, 0, 13, 23, 127, 8, 0.86
15, 0, 14, 23, 127, 8, 0.863
16, 0, 15, 23, 127, 8, 0.866
17, 0, 16, 23, 127, 8, 0.86
18, 0, 17, 23, 127, 8, 0.887
19, 0, 18, 23, 127, 8, 0.858
20, 0, 19, 23, 127, 8, 0.891
21, 0, 20, 23, 127, 8, 0.874
22, 0, 21, 23, 127, 8, 0.891
23, 0, 22, 23, 127, 8, 0.873
24, 0, 23, 23, 127, 8, 0.895
25, 0, 24, 23, 127, 8, 0.884
26, 0, 25, 23, 127, 8, 0.878
27, 0, 26, 23, 127, 8, 0.878
28, 0, 27, 23, 127, 8, 0.891
29, 0, 28, 23, 127, 8, 0.91
30, 0, 29, 23, 127, 8, 0.881
31, 0, 30, 23, 127, 8, 0.917
32, 0, 31, 23, 127, 8, 0.667
2048, 0, 32, 23, 127, 16, 0.86
2048, 1, 32, 23, 127, 16, 0.847
2048, 0, 64, 23, 127, 16, 0.846
2048, 2, 64, 23, 127, 16, 0.852
2048, 0, 128, 23, 127, 16, 0.82
2048, 3, 128, 23, 127, 16, 0.751
2048, 0, 256, 23, 127, 16, 0.788
2048, 4, 256, 23, 127, 16, 0.712
2048, 0, 512, 23, 127, 16, 0.524
2048, 5, 512, 23, 127, 16, 0.517
2048, 0, 1024, 23, 127, 16, 0.583
2048, 6, 1024, 23, 127, 16, 0.682
2048, 0, 2048, 23, 127, 16, 0.77
2048, 7, 2048, 23, 127, 16, 0.659
2048, 0, 4096, 23, 127, 16, 0.7
2048, 8, 4096, 23, 127, 16, 0.7
256, 1, 64, 23, 127, 16, 0.798
256, 15, 64, 23, 127, 16, 0.873
256, 2, 64, 23, 127, 16, 0.875
256, 30, 64, 23, 127, 16, 0.877
256, 3, 64, 23, 127, 16, 0.875
256, 45, 64, 23, 127, 16, 0.834
256, 4, 64, 23, 127, 16, 0.873
256, 60, 64, 23, 127, 16, 0.809
256, 5, 64, 23, 127, 16, 0.879
256, 75, 64, 23, 127, 16, 0.884
256, 6, 64, 23, 127, 16, 0.874
256, 90, 64, 23, 127, 16, 0.876
256, 7, 64, 23, 127, 16, 0.876
256, 105, 64, 23, 127, 16, 0.827
1, 0, 0, 23, 127, 16, 0.859
2, 0, 1, 23, 127, 16, 0.864
3, 0, 2, 23, 127, 16, 0.871
4, 0, 3, 23, 127, 16, 0.869
5, 0, 4, 23, 127, 16, 0.881
6, 0, 5, 23, 127, 16, 0.869
7, 0, 6, 23, 127, 16, 0.867
8, 0, 7, 23, 127, 16, 0.877
9, 0, 8, 23, 127, 16, 0.862
10, 0, 9, 23, 127, 16, 0.861
11, 0, 10, 23, 127, 16, 0.859
12, 0, 11, 23, 127, 16, 0.858
13, 0, 12, 23, 127, 16, 0.867
14, 0, 13, 23, 127, 16, 0.857
15, 0, 14, 23, 127, 16, 0.858
16, 0, 15, 23, 127, 16, 0.857
17, 0, 16, 23, 127, 16, 0.858
18, 0, 17, 23, 127, 16, 0.867
19, 0, 18, 23, 127, 16, 0.875
20, 0, 19, 23, 127, 16, 0.868
21, 0, 20, 23, 127, 16, 0.861
22, 0, 21, 23, 127, 16, 0.868
23, 0, 22, 23, 127, 16, 0.866
24, 0, 23, 23, 127, 16, 0.858
25, 0, 24, 23, 127, 16, 0.859
26, 0, 25, 23, 127, 16, 0.857
27, 0, 26, 23, 127, 16, 0.866
28, 0, 27, 23, 127, 16, 0.875
29, 0, 28, 23, 127, 16, 0.896
30, 0, 29, 23, 127, 16, 0.889
31, 0, 30, 23, 127, 16, 0.903
32, 0, 31, 23, 127, 16, 0.667
sysdeps/x86_64/multiarch/strrchr-avx2.S | 415 +++++++++++++++---------
1 file changed, 258 insertions(+), 157 deletions(-)