[v1,2/5] x86: Optimize {str|wcs}rchr-sse2
Checks
Context |
Check |
Description |
dj/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
Commit Message
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.741
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
---
Results For: strrchr
Geometric Mean of N=30 runs.
Geometric Mean of all benchmarks New / Old: 0.741
Benchmarks performance on Tigerlake:
https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
len, align, pos, seek, max_char, freq, New Time / Old Time
2048, 0, 32, 0, 127, 1, 0.647
2048, 1, 32, 0, 127, 1, 0.621
2048, 0, 64, 0, 127, 1, 0.661
2048, 2, 64, 0, 127, 1, 0.655
2048, 0, 128, 0, 127, 1, 0.69
2048, 3, 128, 0, 127, 1, 0.689
2048, 0, 256, 0, 127, 1, 0.718
2048, 4, 256, 0, 127, 1, 0.718
2048, 0, 512, 0, 127, 1, 0.758
2048, 5, 512, 0, 127, 1, 0.754
2048, 0, 1024, 0, 127, 1, 1.029
2048, 6, 1024, 0, 127, 1, 1.032
2048, 0, 2048, 0, 127, 1, 0.826
2048, 7, 2048, 0, 127, 1, 0.834
2048, 0, 4096, 0, 127, 1, 0.825
2048, 8, 4096, 0, 127, 1, 0.83
256, 1, 64, 0, 127, 1, 0.657
256, 15, 64, 0, 127, 1, 0.657
256, 2, 64, 0, 127, 1, 0.657
256, 30, 64, 0, 127, 1, 0.523
256, 3, 64, 0, 127, 1, 0.657
256, 45, 64, 0, 127, 1, 0.654
256, 4, 64, 0, 127, 1, 0.657
256, 60, 64, 0, 127, 1, 0.526
256, 5, 64, 0, 127, 1, 0.658
256, 75, 64, 0, 127, 1, 0.658
256, 6, 64, 0, 127, 1, 0.655
256, 90, 64, 0, 127, 1, 0.523
256, 7, 64, 0, 127, 1, 0.655
256, 105, 64, 0, 127, 1, 0.654
1, 0, 0, 0, 127, 1, 0.98
2, 0, 1, 0, 127, 1, 0.978
3, 0, 2, 0, 127, 1, 0.975
4, 0, 3, 0, 127, 1, 0.976
5, 0, 4, 0, 127, 1, 0.977
6, 0, 5, 0, 127, 1, 0.981
7, 0, 6, 0, 127, 1, 0.982
8, 0, 7, 0, 127, 1, 0.98
9, 0, 8, 0, 127, 1, 0.978
10, 0, 9, 0, 127, 1, 0.981
11, 0, 10, 0, 127, 1, 0.984
12, 0, 11, 0, 127, 1, 0.982
13, 0, 12, 0, 127, 1, 0.98
14, 0, 13, 0, 127, 1, 0.978
15, 0, 14, 0, 127, 1, 0.979
16, 0, 15, 0, 127, 1, 0.986
17, 0, 16, 0, 127, 1, 0.529
18, 0, 17, 0, 127, 1, 0.566
19, 0, 18, 0, 127, 1, 0.575
20, 0, 19, 0, 127, 1, 0.573
21, 0, 20, 0, 127, 1, 0.579
22, 0, 21, 0, 127, 1, 0.595
23, 0, 22, 0, 127, 1, 0.585
24, 0, 23, 0, 127, 1, 0.586
25, 0, 24, 0, 127, 1, 0.587
26, 0, 25, 0, 127, 1, 0.592
27, 0, 26, 0, 127, 1, 0.595
28, 0, 27, 0, 127, 1, 0.592
29, 0, 28, 0, 127, 1, 0.6
30, 0, 29, 0, 127, 1, 0.598
31, 0, 30, 0, 127, 1, 0.595
32, 0, 31, 0, 127, 1, 0.592
2048, 0, 32, 23, 127, 1, 0.827
2048, 1, 32, 23, 127, 1, 0.826
2048, 0, 64, 23, 127, 1, 0.824
2048, 2, 64, 23, 127, 1, 0.825
2048, 0, 128, 23, 127, 1, 0.829
2048, 3, 128, 23, 127, 1, 0.824
2048, 0, 256, 23, 127, 1, 0.832
2048, 4, 256, 23, 127, 1, 0.825
2048, 0, 512, 23, 127, 1, 0.831
2048, 5, 512, 23, 127, 1, 0.837
2048, 0, 1024, 23, 127, 1, 0.721
2048, 6, 1024, 23, 127, 1, 0.757
2048, 0, 2048, 23, 127, 1, 0.825
2048, 7, 2048, 23, 127, 1, 0.824
2048, 0, 4096, 23, 127, 1, 0.828
2048, 8, 4096, 23, 127, 1, 0.823
256, 1, 64, 23, 127, 1, 0.665
256, 15, 64, 23, 127, 1, 0.661
256, 2, 64, 23, 127, 1, 0.674
256, 30, 64, 23, 127, 1, 0.605
256, 3, 64, 23, 127, 1, 0.668
256, 45, 64, 23, 127, 1, 0.661
256, 4, 64, 23, 127, 1, 0.657
256, 60, 64, 23, 127, 1, 0.594
256, 5, 64, 23, 127, 1, 0.654
256, 75, 64, 23, 127, 1, 0.673
256, 6, 64, 23, 127, 1, 0.688
256, 90, 64, 23, 127, 1, 0.6
256, 7, 64, 23, 127, 1, 0.66
256, 105, 64, 23, 127, 1, 0.654
1, 0, 0, 23, 127, 1, 0.981
2, 0, 1, 23, 127, 1, 0.976
3, 0, 2, 23, 127, 1, 0.983
4, 0, 3, 23, 127, 1, 0.984
5, 0, 4, 23, 127, 1, 0.973
6, 0, 5, 23, 127, 1, 0.987
7, 0, 6, 23, 127, 1, 0.977
8, 0, 7, 23, 127, 1, 0.979
9, 0, 8, 23, 127, 1, 0.981
10, 0, 9, 23, 127, 1, 0.98
11, 0, 10, 23, 127, 1, 0.983
12, 0, 11, 23, 127, 1, 0.98
13, 0, 12, 23, 127, 1, 0.98
14, 0, 13, 23, 127, 1, 0.977
15, 0, 14, 23, 127, 1, 0.982
16, 0, 15, 23, 127, 1, 0.581
17, 0, 16, 23, 127, 1, 0.551
18, 0, 17, 23, 127, 1, 0.555
19, 0, 18, 23, 127, 1, 0.586
20, 0, 19, 23, 127, 1, 0.585
21, 0, 20, 23, 127, 1, 0.582
22, 0, 21, 23, 127, 1, 0.571
23, 0, 22, 23, 127, 1, 0.576
24, 0, 23, 23, 127, 1, 0.581
25, 0, 24, 23, 127, 1, 0.589
26, 0, 25, 23, 127, 1, 0.593
27, 0, 26, 23, 127, 1, 0.595
28, 0, 27, 23, 127, 1, 0.583
29, 0, 28, 23, 127, 1, 0.595
30, 0, 29, 23, 127, 1, 0.58
31, 0, 30, 23, 127, 1, 0.594
32, 0, 31, 23, 127, 1, 0.665
2048, 0, 32, 23, 127, 2, 0.825
2048, 1, 32, 23, 127, 2, 0.818
2048, 0, 64, 23, 127, 2, 0.829
2048, 2, 64, 23, 127, 2, 0.828
2048, 0, 128, 23, 127, 2, 0.823
2048, 3, 128, 23, 127, 2, 0.825
2048, 0, 256, 23, 127, 2, 0.819
2048, 4, 256, 23, 127, 2, 0.828
2048, 0, 512, 23, 127, 2, 0.824
2048, 5, 512, 23, 127, 2, 0.827
2048, 0, 1024, 23, 127, 2, 0.813
2048, 6, 1024, 23, 127, 2, 0.834
2048, 0, 2048, 23, 127, 2, 0.927
2048, 7, 2048, 23, 127, 2, 0.923
2048, 0, 4096, 23, 127, 2, 0.818
2048, 8, 4096, 23, 127, 2, 0.82
256, 1, 64, 23, 127, 2, 0.693
256, 15, 64, 23, 127, 2, 0.686
256, 2, 64, 23, 127, 2, 0.69
256, 30, 64, 23, 127, 2, 0.611
256, 3, 64, 23, 127, 2, 0.692
256, 45, 64, 23, 127, 2, 0.685
256, 4, 64, 23, 127, 2, 0.688
256, 60, 64, 23, 127, 2, 0.6
256, 5, 64, 23, 127, 2, 0.69
256, 75, 64, 23, 127, 2, 0.689
256, 6, 64, 23, 127, 2, 0.688
256, 90, 64, 23, 127, 2, 0.611
256, 7, 64, 23, 127, 2, 0.69
256, 105, 64, 23, 127, 2, 0.686
1, 0, 0, 23, 127, 2, 0.982
2, 0, 1, 23, 127, 2, 0.987
3, 0, 2, 23, 127, 2, 0.978
4, 0, 3, 23, 127, 2, 0.977
5, 0, 4, 23, 127, 2, 0.979
6, 0, 5, 23, 127, 2, 0.985
7, 0, 6, 23, 127, 2, 0.975
8, 0, 7, 23, 127, 2, 0.981
9, 0, 8, 23, 127, 2, 0.984
10, 0, 9, 23, 127, 2, 0.983
11, 0, 10, 23, 127, 2, 0.982
12, 0, 11, 23, 127, 2, 0.976
13, 0, 12, 23, 127, 2, 0.985
14, 0, 13, 23, 127, 2, 0.984
15, 0, 14, 23, 127, 2, 0.98
16, 0, 15, 23, 127, 2, 0.583
17, 0, 16, 23, 127, 2, 0.552
18, 0, 17, 23, 127, 2, 0.564
19, 0, 18, 23, 127, 2, 0.585
20, 0, 19, 23, 127, 2, 0.578
21, 0, 20, 23, 127, 2, 0.578
22, 0, 21, 23, 127, 2, 0.571
23, 0, 22, 23, 127, 2, 0.587
24, 0, 23, 23, 127, 2, 0.589
25, 0, 24, 23, 127, 2, 0.593
26, 0, 25, 23, 127, 2, 0.589
27, 0, 26, 23, 127, 2, 0.588
28, 0, 27, 23, 127, 2, 0.593
29, 0, 28, 23, 127, 2, 0.579
30, 0, 29, 23, 127, 2, 0.572
31, 0, 30, 23, 127, 2, 0.582
32, 0, 31, 23, 127, 2, 0.659
2048, 0, 32, 23, 127, 4, 0.822
2048, 1, 32, 23, 127, 4, 0.818
2048, 0, 64, 23, 127, 4, 0.826
2048, 2, 64, 23, 127, 4, 0.824
2048, 0, 128, 23, 127, 4, 0.833
2048, 3, 128, 23, 127, 4, 0.831
2048, 0, 256, 23, 127, 4, 0.826
2048, 4, 256, 23, 127, 4, 0.831
2048, 0, 512, 23, 127, 4, 0.834
2048, 5, 512, 23, 127, 4, 0.83
2048, 0, 1024, 23, 127, 4, 0.836
2048, 6, 1024, 23, 127, 4, 0.844
2048, 0, 2048, 23, 127, 4, 0.696
2048, 7, 2048, 23, 127, 4, 0.704
2048, 0, 4096, 23, 127, 4, 0.936
2048, 8, 4096, 23, 127, 4, 0.925
256, 1, 64, 23, 127, 4, 0.694
256, 15, 64, 23, 127, 4, 0.69
256, 2, 64, 23, 127, 4, 0.687
256, 30, 64, 23, 127, 4, 0.612
256, 3, 64, 23, 127, 4, 0.685
256, 45, 64, 23, 127, 4, 0.685
256, 4, 64, 23, 127, 4, 0.684
256, 60, 64, 23, 127, 4, 0.606
256, 5, 64, 23, 127, 4, 0.69
256, 75, 64, 23, 127, 4, 0.688
256, 6, 64, 23, 127, 4, 0.69
256, 90, 64, 23, 127, 4, 0.615
256, 7, 64, 23, 127, 4, 0.691
256, 105, 64, 23, 127, 4, 0.688
1, 0, 0, 23, 127, 4, 0.982
2, 0, 1, 23, 127, 4, 0.983
3, 0, 2, 23, 127, 4, 0.981
4, 0, 3, 23, 127, 4, 0.984
5, 0, 4, 23, 127, 4, 0.963
6, 0, 5, 23, 127, 4, 0.978
7, 0, 6, 23, 127, 4, 0.985
8, 0, 7, 23, 127, 4, 0.986
9, 0, 8, 23, 127, 4, 0.978
10, 0, 9, 23, 127, 4, 0.985
11, 0, 10, 23, 127, 4, 0.986
12, 0, 11, 23, 127, 4, 0.983
13, 0, 12, 23, 127, 4, 0.986
14, 0, 13, 23, 127, 4, 0.98
15, 0, 14, 23, 127, 4, 0.979
16, 0, 15, 23, 127, 4, 0.582
17, 0, 16, 23, 127, 4, 0.542
18, 0, 17, 23, 127, 4, 0.564
19, 0, 18, 23, 127, 4, 0.571
20, 0, 19, 23, 127, 4, 0.582
21, 0, 20, 23, 127, 4, 0.573
22, 0, 21, 23, 127, 4, 0.575
23, 0, 22, 23, 127, 4, 0.578
24, 0, 23, 23, 127, 4, 0.58
25, 0, 24, 23, 127, 4, 0.592
26, 0, 25, 23, 127, 4, 0.588
27, 0, 26, 23, 127, 4, 0.574
28, 0, 27, 23, 127, 4, 0.589
29, 0, 28, 23, 127, 4, 0.56
30, 0, 29, 23, 127, 4, 0.587
31, 0, 30, 23, 127, 4, 0.584
32, 0, 31, 23, 127, 4, 0.664
2048, 0, 32, 23, 127, 8, 0.826
2048, 1, 32, 23, 127, 8, 0.821
2048, 0, 64, 23, 127, 8, 0.828
2048, 2, 64, 23, 127, 8, 0.827
2048, 0, 128, 23, 127, 8, 0.833
2048, 3, 128, 23, 127, 8, 0.83
2048, 0, 256, 23, 127, 8, 0.855
2048, 4, 256, 23, 127, 8, 0.849
2048, 0, 512, 23, 127, 8, 0.849
2048, 5, 512, 23, 127, 8, 0.851
2048, 0, 1024, 23, 127, 8, 0.856
2048, 6, 1024, 23, 127, 8, 0.862
2048, 0, 2048, 23, 127, 8, 0.709
2048, 7, 2048, 23, 127, 8, 0.712
2048, 0, 4096, 23, 127, 8, 0.702
2048, 8, 4096, 23, 127, 8, 0.701
256, 1, 64, 23, 127, 8, 0.689
256, 15, 64, 23, 127, 8, 0.688
256, 2, 64, 23, 127, 8, 0.691
256, 30, 64, 23, 127, 8, 0.612
256, 3, 64, 23, 127, 8, 0.688
256, 45, 64, 23, 127, 8, 0.686
256, 4, 64, 23, 127, 8, 0.694
256, 60, 64, 23, 127, 8, 0.609
256, 5, 64, 23, 127, 8, 0.69
256, 75, 64, 23, 127, 8, 0.69
256, 6, 64, 23, 127, 8, 0.691
256, 90, 64, 23, 127, 8, 0.612
256, 7, 64, 23, 127, 8, 0.689
256, 105, 64, 23, 127, 8, 0.688
1, 0, 0, 23, 127, 8, 0.98
2, 0, 1, 23, 127, 8, 0.978
3, 0, 2, 23, 127, 8, 0.98
4, 0, 3, 23, 127, 8, 0.978
5, 0, 4, 23, 127, 8, 0.977
6, 0, 5, 23, 127, 8, 0.984
7, 0, 6, 23, 127, 8, 0.982
8, 0, 7, 23, 127, 8, 0.983
9, 0, 8, 23, 127, 8, 0.987
10, 0, 9, 23, 127, 8, 0.979
11, 0, 10, 23, 127, 8, 0.985
12, 0, 11, 23, 127, 8, 0.981
13, 0, 12, 23, 127, 8, 0.98
14, 0, 13, 23, 127, 8, 0.982
15, 0, 14, 23, 127, 8, 0.981
16, 0, 15, 23, 127, 8, 0.579
17, 0, 16, 23, 127, 8, 0.531
18, 0, 17, 23, 127, 8, 0.577
19, 0, 18, 23, 127, 8, 0.588
20, 0, 19, 23, 127, 8, 0.571
21, 0, 20, 23, 127, 8, 0.576
22, 0, 21, 23, 127, 8, 0.59
23, 0, 22, 23, 127, 8, 0.574
24, 0, 23, 23, 127, 8, 0.583
25, 0, 24, 23, 127, 8, 0.581
26, 0, 25, 23, 127, 8, 0.592
27, 0, 26, 23, 127, 8, 0.586
28, 0, 27, 23, 127, 8, 0.588
29, 0, 28, 23, 127, 8, 0.578
30, 0, 29, 23, 127, 8, 0.573
31, 0, 30, 23, 127, 8, 0.588
32, 0, 31, 23, 127, 8, 0.664
2048, 0, 32, 23, 127, 16, 0.825
2048, 1, 32, 23, 127, 16, 0.823
2048, 0, 64, 23, 127, 16, 0.831
2048, 2, 64, 23, 127, 16, 0.822
2048, 0, 128, 23, 127, 16, 0.831
2048, 3, 128, 23, 127, 16, 0.831
2048, 0, 256, 23, 127, 16, 0.849
2048, 4, 256, 23, 127, 16, 0.85
2048, 0, 512, 23, 127, 16, 0.751
2048, 5, 512, 23, 127, 16, 0.75
2048, 0, 1024, 23, 127, 16, 0.913
2048, 6, 1024, 23, 127, 16, 0.895
2048, 0, 2048, 23, 127, 16, 0.736
2048, 7, 2048, 23, 127, 16, 0.741
2048, 0, 4096, 23, 127, 16, 0.712
2048, 8, 4096, 23, 127, 16, 0.711
256, 1, 64, 23, 127, 16, 0.758
256, 15, 64, 23, 127, 16, 0.692
256, 2, 64, 23, 127, 16, 0.692
256, 30, 64, 23, 127, 16, 0.613
256, 3, 64, 23, 127, 16, 0.69
256, 45, 64, 23, 127, 16, 0.687
256, 4, 64, 23, 127, 16, 0.69
256, 60, 64, 23, 127, 16, 0.604
256, 5, 64, 23, 127, 16, 0.687
256, 75, 64, 23, 127, 16, 0.687
256, 6, 64, 23, 127, 16, 0.69
256, 90, 64, 23, 127, 16, 0.61
256, 7, 64, 23, 127, 16, 0.69
256, 105, 64, 23, 127, 16, 0.685
1, 0, 0, 23, 127, 16, 0.981
2, 0, 1, 23, 127, 16, 0.985
3, 0, 2, 23, 127, 16, 0.985
4, 0, 3, 23, 127, 16, 0.981
5, 0, 4, 23, 127, 16, 0.979
6, 0, 5, 23, 127, 16, 0.986
7, 0, 6, 23, 127, 16, 0.986
8, 0, 7, 23, 127, 16, 0.982
9, 0, 8, 23, 127, 16, 0.982
10, 0, 9, 23, 127, 16, 0.98
11, 0, 10, 23, 127, 16, 0.983
12, 0, 11, 23, 127, 16, 0.982
13, 0, 12, 23, 127, 16, 0.982
14, 0, 13, 23, 127, 16, 0.982
15, 0, 14, 23, 127, 16, 0.982
16, 0, 15, 23, 127, 16, 0.582
17, 0, 16, 23, 127, 16, 0.542
18, 0, 17, 23, 127, 16, 0.554
19, 0, 18, 23, 127, 16, 0.562
20, 0, 19, 23, 127, 16, 0.587
21, 0, 20, 23, 127, 16, 0.584
22, 0, 21, 23, 127, 16, 0.587
23, 0, 22, 23, 127, 16, 0.594
24, 0, 23, 23, 127, 16, 0.581
25, 0, 24, 23, 127, 16, 0.577
26, 0, 25, 23, 127, 16, 0.588
27, 0, 26, 23, 127, 16, 0.589
28, 0, 27, 23, 127, 16, 0.596
29, 0, 28, 23, 127, 16, 0.591
30, 0, 29, 23, 127, 16, 0.585
31, 0, 30, 23, 127, 16, 0.59
32, 0, 31, 23, 127, 16, 0.669
sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
sysdeps/x86_64/strrchr.S | 505 +++++++++++++++---------
sysdeps/x86_64/wcsrchr.S | 268 +------------
4 files changed, 334 insertions(+), 444 deletions(-)
Comments
On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The new code unrolls the main loop slightly without adding too much
> overhead and minimizes the comparisons for the search CHAR.
>
> Geometric Mean of all benchmarks New / Old: 0.741
> See email for all results.
>
> Full xcheck passes on x86_64 with and without multiarch enabled.
> ---
> Results For: strrchr
>
> Geometric Mean of N=30 runs.
>
> Geometric Mean of all benchmarks New / Old: 0.741
> Benchmarks performance on Tigerlake:
> https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
>
> len, align, pos, seek, max_char, freq, New Time / Old Time
> 2048, 0, 32, 0, 127, 1, 0.647
> 2048, 1, 32, 0, 127, 1, 0.621
> 2048, 0, 64, 0, 127, 1, 0.661
> 2048, 2, 64, 0, 127, 1, 0.655
> 2048, 0, 128, 0, 127, 1, 0.69
> 2048, 3, 128, 0, 127, 1, 0.689
> 2048, 0, 256, 0, 127, 1, 0.718
> 2048, 4, 256, 0, 127, 1, 0.718
> 2048, 0, 512, 0, 127, 1, 0.758
> 2048, 5, 512, 0, 127, 1, 0.754
> 2048, 0, 1024, 0, 127, 1, 1.029
> 2048, 6, 1024, 0, 127, 1, 1.032
> 2048, 0, 2048, 0, 127, 1, 0.826
> 2048, 7, 2048, 0, 127, 1, 0.834
> 2048, 0, 4096, 0, 127, 1, 0.825
> 2048, 8, 4096, 0, 127, 1, 0.83
> 256, 1, 64, 0, 127, 1, 0.657
> 256, 15, 64, 0, 127, 1, 0.657
> 256, 2, 64, 0, 127, 1, 0.657
> 256, 30, 64, 0, 127, 1, 0.523
> 256, 3, 64, 0, 127, 1, 0.657
> 256, 45, 64, 0, 127, 1, 0.654
> 256, 4, 64, 0, 127, 1, 0.657
> 256, 60, 64, 0, 127, 1, 0.526
> 256, 5, 64, 0, 127, 1, 0.658
> 256, 75, 64, 0, 127, 1, 0.658
> 256, 6, 64, 0, 127, 1, 0.655
> 256, 90, 64, 0, 127, 1, 0.523
> 256, 7, 64, 0, 127, 1, 0.655
> 256, 105, 64, 0, 127, 1, 0.654
> 1, 0, 0, 0, 127, 1, 0.98
> 2, 0, 1, 0, 127, 1, 0.978
> 3, 0, 2, 0, 127, 1, 0.975
> 4, 0, 3, 0, 127, 1, 0.976
> 5, 0, 4, 0, 127, 1, 0.977
> 6, 0, 5, 0, 127, 1, 0.981
> 7, 0, 6, 0, 127, 1, 0.982
> 8, 0, 7, 0, 127, 1, 0.98
> 9, 0, 8, 0, 127, 1, 0.978
> 10, 0, 9, 0, 127, 1, 0.981
> 11, 0, 10, 0, 127, 1, 0.984
> 12, 0, 11, 0, 127, 1, 0.982
> 13, 0, 12, 0, 127, 1, 0.98
> 14, 0, 13, 0, 127, 1, 0.978
> 15, 0, 14, 0, 127, 1, 0.979
> 16, 0, 15, 0, 127, 1, 0.986
> 17, 0, 16, 0, 127, 1, 0.529
> 18, 0, 17, 0, 127, 1, 0.566
> 19, 0, 18, 0, 127, 1, 0.575
> 20, 0, 19, 0, 127, 1, 0.573
> 21, 0, 20, 0, 127, 1, 0.579
> 22, 0, 21, 0, 127, 1, 0.595
> 23, 0, 22, 0, 127, 1, 0.585
> 24, 0, 23, 0, 127, 1, 0.586
> 25, 0, 24, 0, 127, 1, 0.587
> 26, 0, 25, 0, 127, 1, 0.592
> 27, 0, 26, 0, 127, 1, 0.595
> 28, 0, 27, 0, 127, 1, 0.592
> 29, 0, 28, 0, 127, 1, 0.6
> 30, 0, 29, 0, 127, 1, 0.598
> 31, 0, 30, 0, 127, 1, 0.595
> 32, 0, 31, 0, 127, 1, 0.592
> 2048, 0, 32, 23, 127, 1, 0.827
> 2048, 1, 32, 23, 127, 1, 0.826
> 2048, 0, 64, 23, 127, 1, 0.824
> 2048, 2, 64, 23, 127, 1, 0.825
> 2048, 0, 128, 23, 127, 1, 0.829
> 2048, 3, 128, 23, 127, 1, 0.824
> 2048, 0, 256, 23, 127, 1, 0.832
> 2048, 4, 256, 23, 127, 1, 0.825
> 2048, 0, 512, 23, 127, 1, 0.831
> 2048, 5, 512, 23, 127, 1, 0.837
> 2048, 0, 1024, 23, 127, 1, 0.721
> 2048, 6, 1024, 23, 127, 1, 0.757
> 2048, 0, 2048, 23, 127, 1, 0.825
> 2048, 7, 2048, 23, 127, 1, 0.824
> 2048, 0, 4096, 23, 127, 1, 0.828
> 2048, 8, 4096, 23, 127, 1, 0.823
> 256, 1, 64, 23, 127, 1, 0.665
> 256, 15, 64, 23, 127, 1, 0.661
> 256, 2, 64, 23, 127, 1, 0.674
> 256, 30, 64, 23, 127, 1, 0.605
> 256, 3, 64, 23, 127, 1, 0.668
> 256, 45, 64, 23, 127, 1, 0.661
> 256, 4, 64, 23, 127, 1, 0.657
> 256, 60, 64, 23, 127, 1, 0.594
> 256, 5, 64, 23, 127, 1, 0.654
> 256, 75, 64, 23, 127, 1, 0.673
> 256, 6, 64, 23, 127, 1, 0.688
> 256, 90, 64, 23, 127, 1, 0.6
> 256, 7, 64, 23, 127, 1, 0.66
> 256, 105, 64, 23, 127, 1, 0.654
> 1, 0, 0, 23, 127, 1, 0.981
> 2, 0, 1, 23, 127, 1, 0.976
> 3, 0, 2, 23, 127, 1, 0.983
> 4, 0, 3, 23, 127, 1, 0.984
> 5, 0, 4, 23, 127, 1, 0.973
> 6, 0, 5, 23, 127, 1, 0.987
> 7, 0, 6, 23, 127, 1, 0.977
> 8, 0, 7, 23, 127, 1, 0.979
> 9, 0, 8, 23, 127, 1, 0.981
> 10, 0, 9, 23, 127, 1, 0.98
> 11, 0, 10, 23, 127, 1, 0.983
> 12, 0, 11, 23, 127, 1, 0.98
> 13, 0, 12, 23, 127, 1, 0.98
> 14, 0, 13, 23, 127, 1, 0.977
> 15, 0, 14, 23, 127, 1, 0.982
> 16, 0, 15, 23, 127, 1, 0.581
> 17, 0, 16, 23, 127, 1, 0.551
> 18, 0, 17, 23, 127, 1, 0.555
> 19, 0, 18, 23, 127, 1, 0.586
> 20, 0, 19, 23, 127, 1, 0.585
> 21, 0, 20, 23, 127, 1, 0.582
> 22, 0, 21, 23, 127, 1, 0.571
> 23, 0, 22, 23, 127, 1, 0.576
> 24, 0, 23, 23, 127, 1, 0.581
> 25, 0, 24, 23, 127, 1, 0.589
> 26, 0, 25, 23, 127, 1, 0.593
> 27, 0, 26, 23, 127, 1, 0.595
> 28, 0, 27, 23, 127, 1, 0.583
> 29, 0, 28, 23, 127, 1, 0.595
> 30, 0, 29, 23, 127, 1, 0.58
> 31, 0, 30, 23, 127, 1, 0.594
> 32, 0, 31, 23, 127, 1, 0.665
> 2048, 0, 32, 23, 127, 2, 0.825
> 2048, 1, 32, 23, 127, 2, 0.818
> 2048, 0, 64, 23, 127, 2, 0.829
> 2048, 2, 64, 23, 127, 2, 0.828
> 2048, 0, 128, 23, 127, 2, 0.823
> 2048, 3, 128, 23, 127, 2, 0.825
> 2048, 0, 256, 23, 127, 2, 0.819
> 2048, 4, 256, 23, 127, 2, 0.828
> 2048, 0, 512, 23, 127, 2, 0.824
> 2048, 5, 512, 23, 127, 2, 0.827
> 2048, 0, 1024, 23, 127, 2, 0.813
> 2048, 6, 1024, 23, 127, 2, 0.834
> 2048, 0, 2048, 23, 127, 2, 0.927
> 2048, 7, 2048, 23, 127, 2, 0.923
> 2048, 0, 4096, 23, 127, 2, 0.818
> 2048, 8, 4096, 23, 127, 2, 0.82
> 256, 1, 64, 23, 127, 2, 0.693
> 256, 15, 64, 23, 127, 2, 0.686
> 256, 2, 64, 23, 127, 2, 0.69
> 256, 30, 64, 23, 127, 2, 0.611
> 256, 3, 64, 23, 127, 2, 0.692
> 256, 45, 64, 23, 127, 2, 0.685
> 256, 4, 64, 23, 127, 2, 0.688
> 256, 60, 64, 23, 127, 2, 0.6
> 256, 5, 64, 23, 127, 2, 0.69
> 256, 75, 64, 23, 127, 2, 0.689
> 256, 6, 64, 23, 127, 2, 0.688
> 256, 90, 64, 23, 127, 2, 0.611
> 256, 7, 64, 23, 127, 2, 0.69
> 256, 105, 64, 23, 127, 2, 0.686
> 1, 0, 0, 23, 127, 2, 0.982
> 2, 0, 1, 23, 127, 2, 0.987
> 3, 0, 2, 23, 127, 2, 0.978
> 4, 0, 3, 23, 127, 2, 0.977
> 5, 0, 4, 23, 127, 2, 0.979
> 6, 0, 5, 23, 127, 2, 0.985
> 7, 0, 6, 23, 127, 2, 0.975
> 8, 0, 7, 23, 127, 2, 0.981
> 9, 0, 8, 23, 127, 2, 0.984
> 10, 0, 9, 23, 127, 2, 0.983
> 11, 0, 10, 23, 127, 2, 0.982
> 12, 0, 11, 23, 127, 2, 0.976
> 13, 0, 12, 23, 127, 2, 0.985
> 14, 0, 13, 23, 127, 2, 0.984
> 15, 0, 14, 23, 127, 2, 0.98
> 16, 0, 15, 23, 127, 2, 0.583
> 17, 0, 16, 23, 127, 2, 0.552
> 18, 0, 17, 23, 127, 2, 0.564
> 19, 0, 18, 23, 127, 2, 0.585
> 20, 0, 19, 23, 127, 2, 0.578
> 21, 0, 20, 23, 127, 2, 0.578
> 22, 0, 21, 23, 127, 2, 0.571
> 23, 0, 22, 23, 127, 2, 0.587
> 24, 0, 23, 23, 127, 2, 0.589
> 25, 0, 24, 23, 127, 2, 0.593
> 26, 0, 25, 23, 127, 2, 0.589
> 27, 0, 26, 23, 127, 2, 0.588
> 28, 0, 27, 23, 127, 2, 0.593
> 29, 0, 28, 23, 127, 2, 0.579
> 30, 0, 29, 23, 127, 2, 0.572
> 31, 0, 30, 23, 127, 2, 0.582
> 32, 0, 31, 23, 127, 2, 0.659
> 2048, 0, 32, 23, 127, 4, 0.822
> 2048, 1, 32, 23, 127, 4, 0.818
> 2048, 0, 64, 23, 127, 4, 0.826
> 2048, 2, 64, 23, 127, 4, 0.824
> 2048, 0, 128, 23, 127, 4, 0.833
> 2048, 3, 128, 23, 127, 4, 0.831
> 2048, 0, 256, 23, 127, 4, 0.826
> 2048, 4, 256, 23, 127, 4, 0.831
> 2048, 0, 512, 23, 127, 4, 0.834
> 2048, 5, 512, 23, 127, 4, 0.83
> 2048, 0, 1024, 23, 127, 4, 0.836
> 2048, 6, 1024, 23, 127, 4, 0.844
> 2048, 0, 2048, 23, 127, 4, 0.696
> 2048, 7, 2048, 23, 127, 4, 0.704
> 2048, 0, 4096, 23, 127, 4, 0.936
> 2048, 8, 4096, 23, 127, 4, 0.925
> 256, 1, 64, 23, 127, 4, 0.694
> 256, 15, 64, 23, 127, 4, 0.69
> 256, 2, 64, 23, 127, 4, 0.687
> 256, 30, 64, 23, 127, 4, 0.612
> 256, 3, 64, 23, 127, 4, 0.685
> 256, 45, 64, 23, 127, 4, 0.685
> 256, 4, 64, 23, 127, 4, 0.684
> 256, 60, 64, 23, 127, 4, 0.606
> 256, 5, 64, 23, 127, 4, 0.69
> 256, 75, 64, 23, 127, 4, 0.688
> 256, 6, 64, 23, 127, 4, 0.69
> 256, 90, 64, 23, 127, 4, 0.615
> 256, 7, 64, 23, 127, 4, 0.691
> 256, 105, 64, 23, 127, 4, 0.688
> 1, 0, 0, 23, 127, 4, 0.982
> 2, 0, 1, 23, 127, 4, 0.983
> 3, 0, 2, 23, 127, 4, 0.981
> 4, 0, 3, 23, 127, 4, 0.984
> 5, 0, 4, 23, 127, 4, 0.963
> 6, 0, 5, 23, 127, 4, 0.978
> 7, 0, 6, 23, 127, 4, 0.985
> 8, 0, 7, 23, 127, 4, 0.986
> 9, 0, 8, 23, 127, 4, 0.978
> 10, 0, 9, 23, 127, 4, 0.985
> 11, 0, 10, 23, 127, 4, 0.986
> 12, 0, 11, 23, 127, 4, 0.983
> 13, 0, 12, 23, 127, 4, 0.986
> 14, 0, 13, 23, 127, 4, 0.98
> 15, 0, 14, 23, 127, 4, 0.979
> 16, 0, 15, 23, 127, 4, 0.582
> 17, 0, 16, 23, 127, 4, 0.542
> 18, 0, 17, 23, 127, 4, 0.564
> 19, 0, 18, 23, 127, 4, 0.571
> 20, 0, 19, 23, 127, 4, 0.582
> 21, 0, 20, 23, 127, 4, 0.573
> 22, 0, 21, 23, 127, 4, 0.575
> 23, 0, 22, 23, 127, 4, 0.578
> 24, 0, 23, 23, 127, 4, 0.58
> 25, 0, 24, 23, 127, 4, 0.592
> 26, 0, 25, 23, 127, 4, 0.588
> 27, 0, 26, 23, 127, 4, 0.574
> 28, 0, 27, 23, 127, 4, 0.589
> 29, 0, 28, 23, 127, 4, 0.56
> 30, 0, 29, 23, 127, 4, 0.587
> 31, 0, 30, 23, 127, 4, 0.584
> 32, 0, 31, 23, 127, 4, 0.664
> 2048, 0, 32, 23, 127, 8, 0.826
> 2048, 1, 32, 23, 127, 8, 0.821
> 2048, 0, 64, 23, 127, 8, 0.828
> 2048, 2, 64, 23, 127, 8, 0.827
> 2048, 0, 128, 23, 127, 8, 0.833
> 2048, 3, 128, 23, 127, 8, 0.83
> 2048, 0, 256, 23, 127, 8, 0.855
> 2048, 4, 256, 23, 127, 8, 0.849
> 2048, 0, 512, 23, 127, 8, 0.849
> 2048, 5, 512, 23, 127, 8, 0.851
> 2048, 0, 1024, 23, 127, 8, 0.856
> 2048, 6, 1024, 23, 127, 8, 0.862
> 2048, 0, 2048, 23, 127, 8, 0.709
> 2048, 7, 2048, 23, 127, 8, 0.712
> 2048, 0, 4096, 23, 127, 8, 0.702
> 2048, 8, 4096, 23, 127, 8, 0.701
> 256, 1, 64, 23, 127, 8, 0.689
> 256, 15, 64, 23, 127, 8, 0.688
> 256, 2, 64, 23, 127, 8, 0.691
> 256, 30, 64, 23, 127, 8, 0.612
> 256, 3, 64, 23, 127, 8, 0.688
> 256, 45, 64, 23, 127, 8, 0.686
> 256, 4, 64, 23, 127, 8, 0.694
> 256, 60, 64, 23, 127, 8, 0.609
> 256, 5, 64, 23, 127, 8, 0.69
> 256, 75, 64, 23, 127, 8, 0.69
> 256, 6, 64, 23, 127, 8, 0.691
> 256, 90, 64, 23, 127, 8, 0.612
> 256, 7, 64, 23, 127, 8, 0.689
> 256, 105, 64, 23, 127, 8, 0.688
> 1, 0, 0, 23, 127, 8, 0.98
> 2, 0, 1, 23, 127, 8, 0.978
> 3, 0, 2, 23, 127, 8, 0.98
> 4, 0, 3, 23, 127, 8, 0.978
> 5, 0, 4, 23, 127, 8, 0.977
> 6, 0, 5, 23, 127, 8, 0.984
> 7, 0, 6, 23, 127, 8, 0.982
> 8, 0, 7, 23, 127, 8, 0.983
> 9, 0, 8, 23, 127, 8, 0.987
> 10, 0, 9, 23, 127, 8, 0.979
> 11, 0, 10, 23, 127, 8, 0.985
> 12, 0, 11, 23, 127, 8, 0.981
> 13, 0, 12, 23, 127, 8, 0.98
> 14, 0, 13, 23, 127, 8, 0.982
> 15, 0, 14, 23, 127, 8, 0.981
> 16, 0, 15, 23, 127, 8, 0.579
> 17, 0, 16, 23, 127, 8, 0.531
> 18, 0, 17, 23, 127, 8, 0.577
> 19, 0, 18, 23, 127, 8, 0.588
> 20, 0, 19, 23, 127, 8, 0.571
> 21, 0, 20, 23, 127, 8, 0.576
> 22, 0, 21, 23, 127, 8, 0.59
> 23, 0, 22, 23, 127, 8, 0.574
> 24, 0, 23, 23, 127, 8, 0.583
> 25, 0, 24, 23, 127, 8, 0.581
> 26, 0, 25, 23, 127, 8, 0.592
> 27, 0, 26, 23, 127, 8, 0.586
> 28, 0, 27, 23, 127, 8, 0.588
> 29, 0, 28, 23, 127, 8, 0.578
> 30, 0, 29, 23, 127, 8, 0.573
> 31, 0, 30, 23, 127, 8, 0.588
> 32, 0, 31, 23, 127, 8, 0.664
> 2048, 0, 32, 23, 127, 16, 0.825
> 2048, 1, 32, 23, 127, 16, 0.823
> 2048, 0, 64, 23, 127, 16, 0.831
> 2048, 2, 64, 23, 127, 16, 0.822
> 2048, 0, 128, 23, 127, 16, 0.831
> 2048, 3, 128, 23, 127, 16, 0.831
> 2048, 0, 256, 23, 127, 16, 0.849
> 2048, 4, 256, 23, 127, 16, 0.85
> 2048, 0, 512, 23, 127, 16, 0.751
> 2048, 5, 512, 23, 127, 16, 0.75
> 2048, 0, 1024, 23, 127, 16, 0.913
> 2048, 6, 1024, 23, 127, 16, 0.895
> 2048, 0, 2048, 23, 127, 16, 0.736
> 2048, 7, 2048, 23, 127, 16, 0.741
> 2048, 0, 4096, 23, 127, 16, 0.712
> 2048, 8, 4096, 23, 127, 16, 0.711
> 256, 1, 64, 23, 127, 16, 0.758
> 256, 15, 64, 23, 127, 16, 0.692
> 256, 2, 64, 23, 127, 16, 0.692
> 256, 30, 64, 23, 127, 16, 0.613
> 256, 3, 64, 23, 127, 16, 0.69
> 256, 45, 64, 23, 127, 16, 0.687
> 256, 4, 64, 23, 127, 16, 0.69
> 256, 60, 64, 23, 127, 16, 0.604
> 256, 5, 64, 23, 127, 16, 0.687
> 256, 75, 64, 23, 127, 16, 0.687
> 256, 6, 64, 23, 127, 16, 0.69
> 256, 90, 64, 23, 127, 16, 0.61
> 256, 7, 64, 23, 127, 16, 0.69
> 256, 105, 64, 23, 127, 16, 0.685
> 1, 0, 0, 23, 127, 16, 0.981
> 2, 0, 1, 23, 127, 16, 0.985
> 3, 0, 2, 23, 127, 16, 0.985
> 4, 0, 3, 23, 127, 16, 0.981
> 5, 0, 4, 23, 127, 16, 0.979
> 6, 0, 5, 23, 127, 16, 0.986
> 7, 0, 6, 23, 127, 16, 0.986
> 8, 0, 7, 23, 127, 16, 0.982
> 9, 0, 8, 23, 127, 16, 0.982
> 10, 0, 9, 23, 127, 16, 0.98
> 11, 0, 10, 23, 127, 16, 0.983
> 12, 0, 11, 23, 127, 16, 0.982
> 13, 0, 12, 23, 127, 16, 0.982
> 14, 0, 13, 23, 127, 16, 0.982
> 15, 0, 14, 23, 127, 16, 0.982
> 16, 0, 15, 23, 127, 16, 0.582
> 17, 0, 16, 23, 127, 16, 0.542
> 18, 0, 17, 23, 127, 16, 0.554
> 19, 0, 18, 23, 127, 16, 0.562
> 20, 0, 19, 23, 127, 16, 0.587
> 21, 0, 20, 23, 127, 16, 0.584
> 22, 0, 21, 23, 127, 16, 0.587
> 23, 0, 22, 23, 127, 16, 0.594
> 24, 0, 23, 23, 127, 16, 0.581
> 25, 0, 24, 23, 127, 16, 0.577
> 26, 0, 25, 23, 127, 16, 0.588
> 27, 0, 26, 23, 127, 16, 0.589
> 28, 0, 27, 23, 127, 16, 0.596
> 29, 0, 28, 23, 127, 16, 0.591
> 30, 0, 29, 23, 127, 16, 0.585
> 31, 0, 30, 23, 127, 16, 0.59
> 32, 0, 31, 23, 127, 16, 0.669
>
> sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
> sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
> sysdeps/x86_64/strrchr.S | 505 +++++++++++++++---------
> sysdeps/x86_64/wcsrchr.S | 268 +------------
> 4 files changed, 334 insertions(+), 444 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> index db1b44c23c..866396e947 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> @@ -17,7 +17,7 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -# define strrchr __strrchr_sse2
> +# define STRRCHR __strrchr_sse2
>
> # undef weak_alias
> # define weak_alias(strrchr, rindex)
> diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> index 78d1ca6553..69d2f3cdb1 100644
> --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> @@ -17,7 +17,6 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -# define wcsrchr __wcsrchr_sse2
> +# define STRRCHR __wcsrchr_sse2
> #endif
> -
> #include "../wcsrchr.S"
> diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> index 50d886713e..94449ad806 100644
> --- a/sysdeps/x86_64/strrchr.S
> +++ b/sysdeps/x86_64/strrchr.S
> @@ -19,210 +19,355 @@
>
> #include <sysdep.h>
>
> +#ifndef STRRCHR
> +# define STRRCHR strrchr
> +#endif
> +
> +#ifdef USE_AS_WCSRCHR
> +# define PCMPEQ pcmpeqd
> +# define CHAR_SIZE 4
> +# define PMINU pminud
> +#else
> +# define PCMPEQ pcmpeqb
> +# define CHAR_SIZE 1
> +# define PMINU pminub
> +#endif
> +
> +#define PAGE_SIZE 4096
> +#define VEC_SIZE 16
> +
> .text
> -ENTRY (strrchr)
> - movd %esi, %xmm1
> +ENTRY(STRRCHR)
> + movd %esi, %xmm0
> movq %rdi, %rax
> - andl $4095, %eax
> - punpcklbw %xmm1, %xmm1
> - cmpq $4032, %rax
> - punpcklwd %xmm1, %xmm1
> - pshufd $0, %xmm1, %xmm1
> + andl $(PAGE_SIZE - 1), %eax
> +#ifndef USE_AS_WCSRCHR
> + punpcklbw %xmm0, %xmm0
> + punpcklwd %xmm0, %xmm0
> +#endif
> + pshufd $0, %xmm0, %xmm0
> + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> ja L(cross_page)
> - movdqu (%rdi), %xmm0
> +
> +L(cross_page_continue):
> + movups (%rdi), %xmm1
> pxor %xmm2, %xmm2
> - movdqa %xmm0, %xmm3
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm3
> - pmovmskb %xmm0, %ecx
> - pmovmskb %xmm3, %edx
> - testq %rdx, %rdx
> - je L(next_48_bytes)
> - leaq -1(%rdx), %rax
> - xorq %rdx, %rax
> - andq %rcx, %rax
> - je L(exit)
> - bsrq %rax, %rax
> + PCMPEQ %xmm1, %xmm2
> + pmovmskb %xmm2, %ecx
> + testl %ecx, %ecx
> + jz L(aligned_more)
> +
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + leal -1(%rcx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(ret0)
> + bsrl %eax, %eax
> addq %rdi, %rax
> + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> + search CHAR is zero we are correct. Either way `andq
> + -CHAR_SIZE, %rax` gets the correct result. */
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> +L(ret0):
> ret
>
> + /* Returns for first vec x1/x2 have hard coded backward search
> + path for earlier matches. */
> .p2align 4
> -L(next_48_bytes):
> - movdqu 16(%rdi), %xmm4
> - movdqa %xmm4, %xmm5
> - movdqu 32(%rdi), %xmm3
> - pcmpeqb %xmm1, %xmm4
> - pcmpeqb %xmm2, %xmm5
> - movdqu 48(%rdi), %xmm0
> - pmovmskb %xmm5, %edx
> - movdqa %xmm3, %xmm5
> - pcmpeqb %xmm1, %xmm3
> - pcmpeqb %xmm2, %xmm5
> - pcmpeqb %xmm0, %xmm2
> - salq $16, %rdx
> - pmovmskb %xmm3, %r8d
> - pmovmskb %xmm5, %eax
> - pmovmskb %xmm2, %esi
> - salq $32, %r8
> - salq $32, %rax
> - pcmpeqb %xmm1, %xmm0
> - orq %rdx, %rax
> - movq %rsi, %rdx
> - pmovmskb %xmm4, %esi
> - salq $48, %rdx
> - salq $16, %rsi
> - orq %r8, %rsi
> - orq %rcx, %rsi
> - pmovmskb %xmm0, %ecx
> - salq $48, %rcx
> - orq %rcx, %rsi
> - orq %rdx, %rax
> - je L(loop_header2)
> - leaq -1(%rax), %rcx
> - xorq %rax, %rcx
> - andq %rcx, %rsi
> - je L(exit)
> - bsrq %rsi, %rsi
> - leaq (%rdi,%rsi), %rax
> +L(first_vec_x0_test):
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + testl %eax, %eax
> + jz L(ret0)
> + bsrl %eax, %eax
> + addq %r8, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> ret
>
> .p2align 4
> -L(loop_header2):
> - testq %rsi, %rsi
> - movq %rdi, %rcx
> - je L(no_c_found)
> -L(loop_header):
> - addq $64, %rdi
> - pxor %xmm7, %xmm7
> - andq $-64, %rdi
> - jmp L(loop_entry)
> +L(first_vec_x1):
> + PCMPEQ %xmm0, %xmm2
> + pmovmskb %xmm2, %eax
> + leal -1(%rcx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(first_vec_x0_test)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
>
> .p2align 4
> -L(loop64):
> - testq %rdx, %rdx
> - cmovne %rdx, %rsi
> - cmovne %rdi, %rcx
> - addq $64, %rdi
> -L(loop_entry):
> - movdqa 32(%rdi), %xmm3
> - pxor %xmm6, %xmm6
> - movdqa 48(%rdi), %xmm2
> - movdqa %xmm3, %xmm0
> - movdqa 16(%rdi), %xmm4
> - pminub %xmm2, %xmm0
> - movdqa (%rdi), %xmm5
> - pminub %xmm4, %xmm0
> - pminub %xmm5, %xmm0
> - pcmpeqb %xmm7, %xmm0
> - pmovmskb %xmm0, %eax
> - movdqa %xmm5, %xmm0
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %r9d
> - movdqa %xmm4, %xmm0
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %edx
> - movdqa %xmm3, %xmm0
> - pcmpeqb %xmm1, %xmm0
> - salq $16, %rdx
> - pmovmskb %xmm0, %r10d
> - movdqa %xmm2, %xmm0
> - pcmpeqb %xmm1, %xmm0
> - salq $32, %r10
> - orq %r10, %rdx
> - pmovmskb %xmm0, %r8d
> - orq %r9, %rdx
> - salq $48, %r8
> - orq %r8, %rdx
> +L(first_vec_x1_test):
> + PCMPEQ %xmm0, %xmm2
> + pmovmskb %xmm2, %eax
> testl %eax, %eax
> - je L(loop64)
> - pcmpeqb %xmm6, %xmm4
> - pcmpeqb %xmm6, %xmm3
> - pcmpeqb %xmm6, %xmm5
> - pmovmskb %xmm4, %eax
> - pmovmskb %xmm3, %r10d
> - pcmpeqb %xmm6, %xmm2
> - pmovmskb %xmm5, %r9d
> - salq $32, %r10
> - salq $16, %rax
> - pmovmskb %xmm2, %r8d
> - orq %r10, %rax
> - orq %r9, %rax
> - salq $48, %r8
> - orq %r8, %rax
> - leaq -1(%rax), %r8
> - xorq %rax, %r8
> - andq %r8, %rdx
> - cmovne %rdi, %rcx
> - cmovne %rdx, %rsi
> - bsrq %rsi, %rsi
> - leaq (%rcx,%rsi), %rax
> + jz L(first_vec_x0_test)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4
> +L(first_vec_x2):
> + PCMPEQ %xmm0, %xmm3
> + pmovmskb %xmm3, %eax
> + leal -1(%rcx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(first_vec_x1_test)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4
> +L(aligned_more):
> + /* Save original pointer if match was in VEC 0. */
> + movq %rdi, %r8
> + andq $-VEC_SIZE, %rdi
> +
> + movaps VEC_SIZE(%rdi), %xmm2
> + pxor %xmm3, %xmm3
> + PCMPEQ %xmm2, %xmm3
> + pmovmskb %xmm3, %ecx
> + testl %ecx, %ecx
> + jnz L(first_vec_x1)
> +
> + movaps (VEC_SIZE * 2)(%rdi), %xmm3
> + pxor %xmm4, %xmm4
> + PCMPEQ %xmm3, %xmm4
> + pmovmskb %xmm4, %ecx
> + testl %ecx, %ecx
> + jnz L(first_vec_x2)
> +
> + addq $VEC_SIZE, %rdi
> + /* Save pointer again before realigning. */
> + movq %rdi, %rsi
> + andq $-(VEC_SIZE * 2), %rdi
> + .p2align 4
> +L(first_loop):
> + /* Do 2x VEC at a time. */
> + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> + /* Plain SSE2 has no pminud.  */
> +#ifdef NO_PMINU
Do we really need SSE4.1 wcsrchr? I think we should focus on AVX2 and
above.
> + movaps %xmm5, %xmm6
> + pxor %xmm8, %xmm8
> +
> + PCMPEQ %xmm8, %xmm5
> + PCMPEQ %xmm4, %xmm8
> + por %xmm5, %xmm8
> +#else
> + movaps %xmm5, %xmm6
> + PMINU %xmm4, %xmm5
> +#endif
> +
> + movaps %xmm4, %xmm9
> + PCMPEQ %xmm0, %xmm4
> + PCMPEQ %xmm0, %xmm6
> + movaps %xmm6, %xmm7
> + por %xmm4, %xmm6
> +#ifndef NO_PMINU
> + pxor %xmm8, %xmm8
> + PCMPEQ %xmm5, %xmm8
> +#endif
> + pmovmskb %xmm8, %ecx
> + pmovmskb %xmm6, %eax
> +
> + addq $(VEC_SIZE * 2), %rdi
> + /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> + macro-fuse with `jz`. */
> + addl %ecx, %eax
> + jz L(first_loop)
> +
> + /* Check if there is zero match. */
> + testl %ecx, %ecx
> + jz L(second_loop_match)
> +
> + /* Check if there was a match in last iteration. */
> + subl %ecx, %eax
> + jnz L(new_match)
> +
> +L(first_loop_old_match):
> + PCMPEQ %xmm0, %xmm2
> + PCMPEQ %xmm0, %xmm3
> + pmovmskb %xmm2, %ecx
> + pmovmskb %xmm3, %eax
> + addl %eax, %ecx
> + jz L(first_vec_x0_test)
> + /* NB: We could move this shift to before the branch and save a
> + bit of code size / performance on the fall through. The
> + branch leads to the null case which generally seems hotter
> + than char in first 3x VEC. */
> + sall $16, %eax
> + orl %ecx, %eax
> +
> + bsrl %eax, %eax
> + addq %rsi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4
> +L(new_match):
> + pxor %xmm6, %xmm6
> + PCMPEQ %xmm9, %xmm6
> + pmovmskb %xmm6, %eax
> + sall $16, %ecx
> + orl %eax, %ecx
> +
> +	/* We can't reuse either of the old comparisons because we mask
> +	   off zeros after the first zero (instead of using the full
> +	   comparison), so we can't guarantee no interference between a
> +	   match past the end of the string and a valid match.  */
> + pmovmskb %xmm4, %eax
> + pmovmskb %xmm7, %edx
> + sall $16, %edx
> + orl %edx, %eax
> +
> + leal -1(%ecx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(first_loop_old_match)
> + bsrl %eax, %eax
> + addq %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> ret
>
> + /* Save minimum state for getting most recent match. We can
> + throw out all previous work. */
> .p2align 4
> -L(no_c_found):
> - movl $1, %esi
> - xorl %ecx, %ecx
> - jmp L(loop_header)
> +L(second_loop_match):
> + movq %rdi, %rsi
> + movaps %xmm4, %xmm2
> + movaps %xmm7, %xmm3
>
> .p2align 4
> -L(exit):
> - xorl %eax, %eax
> +L(second_loop):
> + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> +#ifdef NO_PMINU
> + movaps %xmm5, %xmm6
> + pxor %xmm8, %xmm8
> +
> + PCMPEQ %xmm8, %xmm5
> + PCMPEQ %xmm4, %xmm8
> + por %xmm5, %xmm8
> +#else
> + movaps %xmm5, %xmm6
> + PMINU %xmm4, %xmm5
> +#endif
> +
> + movaps %xmm4, %xmm9
> + PCMPEQ %xmm0, %xmm4
> + PCMPEQ %xmm0, %xmm6
> + movaps %xmm6, %xmm7
> + por %xmm4, %xmm6
> +#ifndef NO_PMINU
> + pxor %xmm8, %xmm8
> + PCMPEQ %xmm5, %xmm8
> +#endif
> +
> + pmovmskb %xmm8, %ecx
> + pmovmskb %xmm6, %eax
> +
> + addq $(VEC_SIZE * 2), %rdi
> +	/* Either null terminator or new occurrence of CHAR.  */
> + addl %ecx, %eax
> + jz L(second_loop)
> +
> +	/* No null terminator, so it must be a new occurrence of CHAR.  */
> + testl %ecx, %ecx
> + jz L(second_loop_match)
> +
> +
> + subl %ecx, %eax
> + jnz L(second_loop_new_match)
> +
> +L(second_loop_old_match):
> + pmovmskb %xmm2, %ecx
> + pmovmskb %xmm3, %eax
> + sall $16, %eax
> + orl %ecx, %eax
> + bsrl %eax, %eax
> + addq %rsi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> ret
>
> .p2align 4
> +L(second_loop_new_match):
> + pxor %xmm6, %xmm6
> + PCMPEQ %xmm9, %xmm6
> + pmovmskb %xmm6, %eax
> + sall $16, %ecx
> + orl %eax, %ecx
> +
> +	/* We can't reuse either of the old comparisons because we mask
> +	   off zeros after the first zero (instead of using the full
> +	   comparison), so we can't guarantee no interference between a
> +	   match past the end of the string and a valid match.  */
> + pmovmskb %xmm4, %eax
> + pmovmskb %xmm7, %edx
> + sall $16, %edx
> + orl %edx, %eax
> +
> + leal -1(%ecx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(second_loop_old_match)
> + bsrl %eax, %eax
> + addq %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4,, 4
> L(cross_page):
> - movq %rdi, %rax
> - pxor %xmm0, %xmm0
> - andq $-64, %rax
> - movdqu (%rax), %xmm5
> - movdqa %xmm5, %xmm6
> - movdqu 16(%rax), %xmm4
> - pcmpeqb %xmm1, %xmm5
> - pcmpeqb %xmm0, %xmm6
> - movdqu 32(%rax), %xmm3
> - pmovmskb %xmm6, %esi
> - movdqa %xmm4, %xmm6
> - movdqu 48(%rax), %xmm2
> - pcmpeqb %xmm1, %xmm4
> - pcmpeqb %xmm0, %xmm6
> - pmovmskb %xmm6, %edx
> - movdqa %xmm3, %xmm6
> - pcmpeqb %xmm1, %xmm3
> - pcmpeqb %xmm0, %xmm6
> - pcmpeqb %xmm2, %xmm0
> - salq $16, %rdx
> - pmovmskb %xmm3, %r9d
> - pmovmskb %xmm6, %r8d
> - pmovmskb %xmm0, %ecx
> - salq $32, %r9
> - salq $32, %r8
> - pcmpeqb %xmm1, %xmm2
> - orq %r8, %rdx
> - salq $48, %rcx
> - pmovmskb %xmm5, %r8d
> - orq %rsi, %rdx
> - pmovmskb %xmm4, %esi
> - orq %rcx, %rdx
> - pmovmskb %xmm2, %ecx
> - salq $16, %rsi
> - salq $48, %rcx
> - orq %r9, %rsi
> - orq %r8, %rsi
> - orq %rcx, %rsi
> + movq %rdi, %rsi
> + andq $-VEC_SIZE, %rsi
> + movaps (%rsi), %xmm1
> + pxor %xmm2, %xmm2
> + PCMPEQ %xmm1, %xmm2
> + pmovmskb %xmm2, %edx
> movl %edi, %ecx
> - subl %eax, %ecx
> - shrq %cl, %rdx
> - shrq %cl, %rsi
> - testq %rdx, %rdx
> - je L(loop_header2)
> - leaq -1(%rdx), %rax
> - xorq %rdx, %rax
> - andq %rax, %rsi
> - je L(exit)
> - bsrq %rsi, %rax
> + andl $(VEC_SIZE - 1), %ecx
> + sarl %cl, %edx
> + jz L(cross_page_continue)
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + sarl %cl, %eax
> + leal -1(%rdx), %ecx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(ret1)
> + bsrl %eax, %eax
> addq %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> +L(ret1):
> ret
> -END (strrchr)
> +END(STRRCHR)
>
> -weak_alias (strrchr, rindex)
> -libc_hidden_builtin_def (strrchr)
> +#ifndef USE_AS_WCSRCHR
> + weak_alias (STRRCHR, rindex)
> + libc_hidden_builtin_def (STRRCHR)
> +#endif
> diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> index 61552954de..2b80efc5ef 100644
> --- a/sysdeps/x86_64/wcsrchr.S
> +++ b/sysdeps/x86_64/wcsrchr.S
> @@ -1,4 +1,4 @@
> -/* wcsrchr with SSSE3
> +/* wcsrchr optimized with SSE2.
> Copyright (C) 2011-2022 Free Software Foundation, Inc.
> This file is part of the GNU C Library.
>
> @@ -16,266 +16,12 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#include <sysdep.h>
>
> - .text
> -ENTRY (wcsrchr)
> +#define USE_AS_WCSRCHR 1
> +#define NO_PMINU 1
>
> - movd %rsi, %xmm1
> - mov %rdi, %rcx
> - punpckldq %xmm1, %xmm1
> - pxor %xmm2, %xmm2
> - punpckldq %xmm1, %xmm1
> - and $63, %rcx
> - cmp $48, %rcx
> - ja L(crosscache)
> +#ifndef STRRCHR
> +# define STRRCHR wcsrchr
> +#endif
>
> - movdqu (%rdi), %xmm0
> - pcmpeqd %xmm0, %xmm2
> - pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm0, %rax
> - add $16, %rdi
> -
> - test %rax, %rax
> - jnz L(unaligned_match1)
> -
> - test %rcx, %rcx
> - jnz L(return_null)
> -
> - and $-16, %rdi
> - xor %r8, %r8
> - jmp L(loop)
> -
> - .p2align 4
> -L(unaligned_match1):
> - test %rcx, %rcx
> - jnz L(prolog_find_zero_1)
> -
> - mov %rax, %r8
> - mov %rdi, %rsi
> - and $-16, %rdi
> - jmp L(loop)
> -
> - .p2align 4
> -L(crosscache):
> - and $15, %rcx
> - and $-16, %rdi
> - pxor %xmm3, %xmm3
> - movdqa (%rdi), %xmm0
> - pcmpeqd %xmm0, %xmm3
> - pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm3, %rdx
> - pmovmskb %xmm0, %rax
> - shr %cl, %rdx
> - shr %cl, %rax
> - add $16, %rdi
> -
> - test %rax, %rax
> - jnz L(unaligned_match)
> -
> - test %rdx, %rdx
> - jnz L(return_null)
> -
> - xor %r8, %r8
> - jmp L(loop)
> -
> - .p2align 4
> -L(unaligned_match):
> - test %rdx, %rdx
> - jnz L(prolog_find_zero)
> -
> - mov %rax, %r8
> - lea (%rdi, %rcx), %rsi
> -
> -/* Loop start on aligned string. */
> - .p2align 4
> -L(loop):
> - movdqa (%rdi), %xmm0
> - pcmpeqd %xmm0, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm0, %rax
> - or %rax, %rcx
> - jnz L(matches)
> -
> - movdqa (%rdi), %xmm3
> - pcmpeqd %xmm3, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm3
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm3, %rax
> - or %rax, %rcx
> - jnz L(matches)
> -
> - movdqa (%rdi), %xmm4
> - pcmpeqd %xmm4, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm4
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm4, %rax
> - or %rax, %rcx
> - jnz L(matches)
> -
> - movdqa (%rdi), %xmm5
> - pcmpeqd %xmm5, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm5
> - pmovmskb %xmm2, %rcx
> - pmovmskb %xmm5, %rax
> - or %rax, %rcx
> - jz L(loop)
> -
> - .p2align 4
> -L(matches):
> - test %rax, %rax
> - jnz L(match)
> -L(return_value):
> - test %r8, %r8
> - jz L(return_null)
> - mov %r8, %rax
> - mov %rsi, %rdi
> -
> - test $15 << 4, %ah
> - jnz L(match_fourth_wchar)
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(match):
> - pmovmskb %xmm2, %rcx
> - test %rcx, %rcx
> - jnz L(find_zero)
> - mov %rax, %r8
> - mov %rdi, %rsi
> - jmp L(loop)
> -
> - .p2align 4
> -L(find_zero):
> - test $15, %cl
> - jnz L(find_zero_in_first_wchar)
> - test %cl, %cl
> - jnz L(find_zero_in_second_wchar)
> - test $15, %ch
> - jnz L(find_zero_in_third_wchar)
> -
> - and $1 << 13 - 1, %rax
> - jz L(return_value)
> -
> - test $15 << 4, %ah
> - jnz L(match_fourth_wchar)
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(find_zero_in_first_wchar):
> - test $1, %rax
> - jz L(return_value)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(find_zero_in_second_wchar):
> - and $1 << 5 - 1, %rax
> - jz L(return_value)
> -
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(find_zero_in_third_wchar):
> - and $1 << 9 - 1, %rax
> - jz L(return_value)
> -
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero):
> - add %rcx, %rdi
> - mov %rdx, %rcx
> -L(prolog_find_zero_1):
> - test $15, %cl
> - jnz L(prolog_find_zero_in_first_wchar)
> - test %cl, %cl
> - jnz L(prolog_find_zero_in_second_wchar)
> - test $15, %ch
> - jnz L(prolog_find_zero_in_third_wchar)
> -
> - and $1 << 13 - 1, %rax
> - jz L(return_null)
> -
> - test $15 << 4, %ah
> - jnz L(match_fourth_wchar)
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero_in_first_wchar):
> - test $1, %rax
> - jz L(return_null)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero_in_second_wchar):
> - and $1 << 5 - 1, %rax
> - jz L(return_null)
> -
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(prolog_find_zero_in_third_wchar):
> - and $1 << 9 - 1, %rax
> - jz L(return_null)
> -
> - test %ah, %ah
> - jnz L(match_third_wchar)
> - test $15 << 4, %al
> - jnz L(match_second_wchar)
> - lea -16(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(match_second_wchar):
> - lea -12(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(match_third_wchar):
> - lea -8(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(match_fourth_wchar):
> - lea -4(%rdi), %rax
> - ret
> -
> - .p2align 4
> -L(return_null):
> - xor %rax, %rax
> - ret
> -
> -END (wcsrchr)
> +#include "../strrchr.S"
> --
> 2.25.1
>
--
H.J.
On Thu, Apr 21, 2022 at 3:27 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > The new code unrolls the main loop slightly without adding too much
> > overhead and minimizes the comparisons for the search CHAR.
> >
> > Geometric Mean of all benchmarks New / Old: 0.741
> > See email for all results.
> >
> > Full xcheck passes on x86_64 with and without multiarch enabled.
> > ---
> > Results For: strrchr
> >
> > Geometric Mean of N=30 runs.
> >
> > Geometric Mean of all benchmarks New / Old: 0.741
> > Benchmarks performance on Tigerlake:
> > https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
> >
> > len, align, pos, seek, max_char, freq, New Time / Old Time
> > 2048, 0, 32, 0, 127, 1, 0.647
> > 2048, 1, 32, 0, 127, 1, 0.621
> > 2048, 0, 64, 0, 127, 1, 0.661
> > 2048, 2, 64, 0, 127, 1, 0.655
> > 2048, 0, 128, 0, 127, 1, 0.69
> > 2048, 3, 128, 0, 127, 1, 0.689
> > 2048, 0, 256, 0, 127, 1, 0.718
> > 2048, 4, 256, 0, 127, 1, 0.718
> > 2048, 0, 512, 0, 127, 1, 0.758
> > 2048, 5, 512, 0, 127, 1, 0.754
> > 2048, 0, 1024, 0, 127, 1, 1.029
> > 2048, 6, 1024, 0, 127, 1, 1.032
> > 2048, 0, 2048, 0, 127, 1, 0.826
> > 2048, 7, 2048, 0, 127, 1, 0.834
> > 2048, 0, 4096, 0, 127, 1, 0.825
> > 2048, 8, 4096, 0, 127, 1, 0.83
> > 256, 1, 64, 0, 127, 1, 0.657
> > 256, 15, 64, 0, 127, 1, 0.657
> > 256, 2, 64, 0, 127, 1, 0.657
> > 256, 30, 64, 0, 127, 1, 0.523
> > 256, 3, 64, 0, 127, 1, 0.657
> > 256, 45, 64, 0, 127, 1, 0.654
> > 256, 4, 64, 0, 127, 1, 0.657
> > 256, 60, 64, 0, 127, 1, 0.526
> > 256, 5, 64, 0, 127, 1, 0.658
> > 256, 75, 64, 0, 127, 1, 0.658
> > 256, 6, 64, 0, 127, 1, 0.655
> > 256, 90, 64, 0, 127, 1, 0.523
> > 256, 7, 64, 0, 127, 1, 0.655
> > 256, 105, 64, 0, 127, 1, 0.654
> > 1, 0, 0, 0, 127, 1, 0.98
> > 2, 0, 1, 0, 127, 1, 0.978
> > 3, 0, 2, 0, 127, 1, 0.975
> > 4, 0, 3, 0, 127, 1, 0.976
> > 5, 0, 4, 0, 127, 1, 0.977
> > 6, 0, 5, 0, 127, 1, 0.981
> > 7, 0, 6, 0, 127, 1, 0.982
> > 8, 0, 7, 0, 127, 1, 0.98
> > 9, 0, 8, 0, 127, 1, 0.978
> > 10, 0, 9, 0, 127, 1, 0.981
> > 11, 0, 10, 0, 127, 1, 0.984
> > 12, 0, 11, 0, 127, 1, 0.982
> > 13, 0, 12, 0, 127, 1, 0.98
> > 14, 0, 13, 0, 127, 1, 0.978
> > 15, 0, 14, 0, 127, 1, 0.979
> > 16, 0, 15, 0, 127, 1, 0.986
> > 17, 0, 16, 0, 127, 1, 0.529
> > 18, 0, 17, 0, 127, 1, 0.566
> > 19, 0, 18, 0, 127, 1, 0.575
> > 20, 0, 19, 0, 127, 1, 0.573
> > 21, 0, 20, 0, 127, 1, 0.579
> > 22, 0, 21, 0, 127, 1, 0.595
> > 23, 0, 22, 0, 127, 1, 0.585
> > 24, 0, 23, 0, 127, 1, 0.586
> > 25, 0, 24, 0, 127, 1, 0.587
> > 26, 0, 25, 0, 127, 1, 0.592
> > 27, 0, 26, 0, 127, 1, 0.595
> > 28, 0, 27, 0, 127, 1, 0.592
> > 29, 0, 28, 0, 127, 1, 0.6
> > 30, 0, 29, 0, 127, 1, 0.598
> > 31, 0, 30, 0, 127, 1, 0.595
> > 32, 0, 31, 0, 127, 1, 0.592
> > 2048, 0, 32, 23, 127, 1, 0.827
> > 2048, 1, 32, 23, 127, 1, 0.826
> > 2048, 0, 64, 23, 127, 1, 0.824
> > 2048, 2, 64, 23, 127, 1, 0.825
> > 2048, 0, 128, 23, 127, 1, 0.829
> > 2048, 3, 128, 23, 127, 1, 0.824
> > 2048, 0, 256, 23, 127, 1, 0.832
> > 2048, 4, 256, 23, 127, 1, 0.825
> > 2048, 0, 512, 23, 127, 1, 0.831
> > 2048, 5, 512, 23, 127, 1, 0.837
> > 2048, 0, 1024, 23, 127, 1, 0.721
> > 2048, 6, 1024, 23, 127, 1, 0.757
> > 2048, 0, 2048, 23, 127, 1, 0.825
> > 2048, 7, 2048, 23, 127, 1, 0.824
> > 2048, 0, 4096, 23, 127, 1, 0.828
> > 2048, 8, 4096, 23, 127, 1, 0.823
> > 256, 1, 64, 23, 127, 1, 0.665
> > 256, 15, 64, 23, 127, 1, 0.661
> > 256, 2, 64, 23, 127, 1, 0.674
> > 256, 30, 64, 23, 127, 1, 0.605
> > 256, 3, 64, 23, 127, 1, 0.668
> > 256, 45, 64, 23, 127, 1, 0.661
> > 256, 4, 64, 23, 127, 1, 0.657
> > 256, 60, 64, 23, 127, 1, 0.594
> > 256, 5, 64, 23, 127, 1, 0.654
> > 256, 75, 64, 23, 127, 1, 0.673
> > 256, 6, 64, 23, 127, 1, 0.688
> > 256, 90, 64, 23, 127, 1, 0.6
> > 256, 7, 64, 23, 127, 1, 0.66
> > 256, 105, 64, 23, 127, 1, 0.654
> > 1, 0, 0, 23, 127, 1, 0.981
> > 2, 0, 1, 23, 127, 1, 0.976
> > 3, 0, 2, 23, 127, 1, 0.983
> > 4, 0, 3, 23, 127, 1, 0.984
> > 5, 0, 4, 23, 127, 1, 0.973
> > 6, 0, 5, 23, 127, 1, 0.987
> > 7, 0, 6, 23, 127, 1, 0.977
> > 8, 0, 7, 23, 127, 1, 0.979
> > 9, 0, 8, 23, 127, 1, 0.981
> > 10, 0, 9, 23, 127, 1, 0.98
> > 11, 0, 10, 23, 127, 1, 0.983
> > 12, 0, 11, 23, 127, 1, 0.98
> > 13, 0, 12, 23, 127, 1, 0.98
> > 14, 0, 13, 23, 127, 1, 0.977
> > 15, 0, 14, 23, 127, 1, 0.982
> > 16, 0, 15, 23, 127, 1, 0.581
> > 17, 0, 16, 23, 127, 1, 0.551
> > 18, 0, 17, 23, 127, 1, 0.555
> > 19, 0, 18, 23, 127, 1, 0.586
> > 20, 0, 19, 23, 127, 1, 0.585
> > 21, 0, 20, 23, 127, 1, 0.582
> > 22, 0, 21, 23, 127, 1, 0.571
> > 23, 0, 22, 23, 127, 1, 0.576
> > 24, 0, 23, 23, 127, 1, 0.581
> > 25, 0, 24, 23, 127, 1, 0.589
> > 26, 0, 25, 23, 127, 1, 0.593
> > 27, 0, 26, 23, 127, 1, 0.595
> > 28, 0, 27, 23, 127, 1, 0.583
> > 29, 0, 28, 23, 127, 1, 0.595
> > 30, 0, 29, 23, 127, 1, 0.58
> > 31, 0, 30, 23, 127, 1, 0.594
> > 32, 0, 31, 23, 127, 1, 0.665
> > 2048, 0, 32, 23, 127, 2, 0.825
> > 2048, 1, 32, 23, 127, 2, 0.818
> > 2048, 0, 64, 23, 127, 2, 0.829
> > 2048, 2, 64, 23, 127, 2, 0.828
> > 2048, 0, 128, 23, 127, 2, 0.823
> > 2048, 3, 128, 23, 127, 2, 0.825
> > 2048, 0, 256, 23, 127, 2, 0.819
> > 2048, 4, 256, 23, 127, 2, 0.828
> > 2048, 0, 512, 23, 127, 2, 0.824
> > 2048, 5, 512, 23, 127, 2, 0.827
> > 2048, 0, 1024, 23, 127, 2, 0.813
> > 2048, 6, 1024, 23, 127, 2, 0.834
> > 2048, 0, 2048, 23, 127, 2, 0.927
> > 2048, 7, 2048, 23, 127, 2, 0.923
> > 2048, 0, 4096, 23, 127, 2, 0.818
> > 2048, 8, 4096, 23, 127, 2, 0.82
> > 256, 1, 64, 23, 127, 2, 0.693
> > 256, 15, 64, 23, 127, 2, 0.686
> > 256, 2, 64, 23, 127, 2, 0.69
> > 256, 30, 64, 23, 127, 2, 0.611
> > 256, 3, 64, 23, 127, 2, 0.692
> > 256, 45, 64, 23, 127, 2, 0.685
> > 256, 4, 64, 23, 127, 2, 0.688
> > 256, 60, 64, 23, 127, 2, 0.6
> > 256, 5, 64, 23, 127, 2, 0.69
> > 256, 75, 64, 23, 127, 2, 0.689
> > 256, 6, 64, 23, 127, 2, 0.688
> > 256, 90, 64, 23, 127, 2, 0.611
> > 256, 7, 64, 23, 127, 2, 0.69
> > 256, 105, 64, 23, 127, 2, 0.686
> > 1, 0, 0, 23, 127, 2, 0.982
> > 2, 0, 1, 23, 127, 2, 0.987
> > 3, 0, 2, 23, 127, 2, 0.978
> > 4, 0, 3, 23, 127, 2, 0.977
> > 5, 0, 4, 23, 127, 2, 0.979
> > 6, 0, 5, 23, 127, 2, 0.985
> > 7, 0, 6, 23, 127, 2, 0.975
> > 8, 0, 7, 23, 127, 2, 0.981
> > 9, 0, 8, 23, 127, 2, 0.984
> > 10, 0, 9, 23, 127, 2, 0.983
> > 11, 0, 10, 23, 127, 2, 0.982
> > 12, 0, 11, 23, 127, 2, 0.976
> > 13, 0, 12, 23, 127, 2, 0.985
> > 14, 0, 13, 23, 127, 2, 0.984
> > 15, 0, 14, 23, 127, 2, 0.98
> > 16, 0, 15, 23, 127, 2, 0.583
> > 17, 0, 16, 23, 127, 2, 0.552
> > 18, 0, 17, 23, 127, 2, 0.564
> > 19, 0, 18, 23, 127, 2, 0.585
> > 20, 0, 19, 23, 127, 2, 0.578
> > 21, 0, 20, 23, 127, 2, 0.578
> > 22, 0, 21, 23, 127, 2, 0.571
> > 23, 0, 22, 23, 127, 2, 0.587
> > 24, 0, 23, 23, 127, 2, 0.589
> > 25, 0, 24, 23, 127, 2, 0.593
> > 26, 0, 25, 23, 127, 2, 0.589
> > 27, 0, 26, 23, 127, 2, 0.588
> > 28, 0, 27, 23, 127, 2, 0.593
> > 29, 0, 28, 23, 127, 2, 0.579
> > 30, 0, 29, 23, 127, 2, 0.572
> > 31, 0, 30, 23, 127, 2, 0.582
> > 32, 0, 31, 23, 127, 2, 0.659
> > 2048, 0, 32, 23, 127, 4, 0.822
> > 2048, 1, 32, 23, 127, 4, 0.818
> > 2048, 0, 64, 23, 127, 4, 0.826
> > 2048, 2, 64, 23, 127, 4, 0.824
> > 2048, 0, 128, 23, 127, 4, 0.833
> > 2048, 3, 128, 23, 127, 4, 0.831
> > 2048, 0, 256, 23, 127, 4, 0.826
> > 2048, 4, 256, 23, 127, 4, 0.831
> > 2048, 0, 512, 23, 127, 4, 0.834
> > 2048, 5, 512, 23, 127, 4, 0.83
> > 2048, 0, 1024, 23, 127, 4, 0.836
> > 2048, 6, 1024, 23, 127, 4, 0.844
> > 2048, 0, 2048, 23, 127, 4, 0.696
> > 2048, 7, 2048, 23, 127, 4, 0.704
> > 2048, 0, 4096, 23, 127, 4, 0.936
> > 2048, 8, 4096, 23, 127, 4, 0.925
> > 256, 1, 64, 23, 127, 4, 0.694
> > 256, 15, 64, 23, 127, 4, 0.69
> > 256, 2, 64, 23, 127, 4, 0.687
> > 256, 30, 64, 23, 127, 4, 0.612
> > 256, 3, 64, 23, 127, 4, 0.685
> > 256, 45, 64, 23, 127, 4, 0.685
> > 256, 4, 64, 23, 127, 4, 0.684
> > 256, 60, 64, 23, 127, 4, 0.606
> > 256, 5, 64, 23, 127, 4, 0.69
> > 256, 75, 64, 23, 127, 4, 0.688
> > 256, 6, 64, 23, 127, 4, 0.69
> > 256, 90, 64, 23, 127, 4, 0.615
> > 256, 7, 64, 23, 127, 4, 0.691
> > 256, 105, 64, 23, 127, 4, 0.688
> > 1, 0, 0, 23, 127, 4, 0.982
> > 2, 0, 1, 23, 127, 4, 0.983
> > 3, 0, 2, 23, 127, 4, 0.981
> > 4, 0, 3, 23, 127, 4, 0.984
> > 5, 0, 4, 23, 127, 4, 0.963
> > 6, 0, 5, 23, 127, 4, 0.978
> > 7, 0, 6, 23, 127, 4, 0.985
> > 8, 0, 7, 23, 127, 4, 0.986
> > 9, 0, 8, 23, 127, 4, 0.978
> > 10, 0, 9, 23, 127, 4, 0.985
> > 11, 0, 10, 23, 127, 4, 0.986
> > 12, 0, 11, 23, 127, 4, 0.983
> > 13, 0, 12, 23, 127, 4, 0.986
> > 14, 0, 13, 23, 127, 4, 0.98
> > 15, 0, 14, 23, 127, 4, 0.979
> > 16, 0, 15, 23, 127, 4, 0.582
> > 17, 0, 16, 23, 127, 4, 0.542
> > 18, 0, 17, 23, 127, 4, 0.564
> > 19, 0, 18, 23, 127, 4, 0.571
> > 20, 0, 19, 23, 127, 4, 0.582
> > 21, 0, 20, 23, 127, 4, 0.573
> > 22, 0, 21, 23, 127, 4, 0.575
> > 23, 0, 22, 23, 127, 4, 0.578
> > 24, 0, 23, 23, 127, 4, 0.58
> > 25, 0, 24, 23, 127, 4, 0.592
> > 26, 0, 25, 23, 127, 4, 0.588
> > 27, 0, 26, 23, 127, 4, 0.574
> > 28, 0, 27, 23, 127, 4, 0.589
> > 29, 0, 28, 23, 127, 4, 0.56
> > 30, 0, 29, 23, 127, 4, 0.587
> > 31, 0, 30, 23, 127, 4, 0.584
> > 32, 0, 31, 23, 127, 4, 0.664
> > 2048, 0, 32, 23, 127, 8, 0.826
> > 2048, 1, 32, 23, 127, 8, 0.821
> > 2048, 0, 64, 23, 127, 8, 0.828
> > 2048, 2, 64, 23, 127, 8, 0.827
> > 2048, 0, 128, 23, 127, 8, 0.833
> > 2048, 3, 128, 23, 127, 8, 0.83
> > 2048, 0, 256, 23, 127, 8, 0.855
> > 2048, 4, 256, 23, 127, 8, 0.849
> > 2048, 0, 512, 23, 127, 8, 0.849
> > 2048, 5, 512, 23, 127, 8, 0.851
> > 2048, 0, 1024, 23, 127, 8, 0.856
> > 2048, 6, 1024, 23, 127, 8, 0.862
> > 2048, 0, 2048, 23, 127, 8, 0.709
> > 2048, 7, 2048, 23, 127, 8, 0.712
> > 2048, 0, 4096, 23, 127, 8, 0.702
> > 2048, 8, 4096, 23, 127, 8, 0.701
> > 256, 1, 64, 23, 127, 8, 0.689
> > 256, 15, 64, 23, 127, 8, 0.688
> > 256, 2, 64, 23, 127, 8, 0.691
> > 256, 30, 64, 23, 127, 8, 0.612
> > 256, 3, 64, 23, 127, 8, 0.688
> > 256, 45, 64, 23, 127, 8, 0.686
> > 256, 4, 64, 23, 127, 8, 0.694
> > 256, 60, 64, 23, 127, 8, 0.609
> > 256, 5, 64, 23, 127, 8, 0.69
> > 256, 75, 64, 23, 127, 8, 0.69
> > 256, 6, 64, 23, 127, 8, 0.691
> > 256, 90, 64, 23, 127, 8, 0.612
> > 256, 7, 64, 23, 127, 8, 0.689
> > 256, 105, 64, 23, 127, 8, 0.688
> > 1, 0, 0, 23, 127, 8, 0.98
> > 2, 0, 1, 23, 127, 8, 0.978
> > 3, 0, 2, 23, 127, 8, 0.98
> > 4, 0, 3, 23, 127, 8, 0.978
> > 5, 0, 4, 23, 127, 8, 0.977
> > 6, 0, 5, 23, 127, 8, 0.984
> > 7, 0, 6, 23, 127, 8, 0.982
> > 8, 0, 7, 23, 127, 8, 0.983
> > 9, 0, 8, 23, 127, 8, 0.987
> > 10, 0, 9, 23, 127, 8, 0.979
> > 11, 0, 10, 23, 127, 8, 0.985
> > 12, 0, 11, 23, 127, 8, 0.981
> > 13, 0, 12, 23, 127, 8, 0.98
> > 14, 0, 13, 23, 127, 8, 0.982
> > 15, 0, 14, 23, 127, 8, 0.981
> > 16, 0, 15, 23, 127, 8, 0.579
> > 17, 0, 16, 23, 127, 8, 0.531
> > 18, 0, 17, 23, 127, 8, 0.577
> > 19, 0, 18, 23, 127, 8, 0.588
> > 20, 0, 19, 23, 127, 8, 0.571
> > 21, 0, 20, 23, 127, 8, 0.576
> > 22, 0, 21, 23, 127, 8, 0.59
> > 23, 0, 22, 23, 127, 8, 0.574
> > 24, 0, 23, 23, 127, 8, 0.583
> > 25, 0, 24, 23, 127, 8, 0.581
> > 26, 0, 25, 23, 127, 8, 0.592
> > 27, 0, 26, 23, 127, 8, 0.586
> > 28, 0, 27, 23, 127, 8, 0.588
> > 29, 0, 28, 23, 127, 8, 0.578
> > 30, 0, 29, 23, 127, 8, 0.573
> > 31, 0, 30, 23, 127, 8, 0.588
> > 32, 0, 31, 23, 127, 8, 0.664
> > 2048, 0, 32, 23, 127, 16, 0.825
> > 2048, 1, 32, 23, 127, 16, 0.823
> > 2048, 0, 64, 23, 127, 16, 0.831
> > 2048, 2, 64, 23, 127, 16, 0.822
> > 2048, 0, 128, 23, 127, 16, 0.831
> > 2048, 3, 128, 23, 127, 16, 0.831
> > 2048, 0, 256, 23, 127, 16, 0.849
> > 2048, 4, 256, 23, 127, 16, 0.85
> > 2048, 0, 512, 23, 127, 16, 0.751
> > 2048, 5, 512, 23, 127, 16, 0.75
> > 2048, 0, 1024, 23, 127, 16, 0.913
> > 2048, 6, 1024, 23, 127, 16, 0.895
> > 2048, 0, 2048, 23, 127, 16, 0.736
> > 2048, 7, 2048, 23, 127, 16, 0.741
> > 2048, 0, 4096, 23, 127, 16, 0.712
> > 2048, 8, 4096, 23, 127, 16, 0.711
> > 256, 1, 64, 23, 127, 16, 0.758
> > 256, 15, 64, 23, 127, 16, 0.692
> > 256, 2, 64, 23, 127, 16, 0.692
> > 256, 30, 64, 23, 127, 16, 0.613
> > 256, 3, 64, 23, 127, 16, 0.69
> > 256, 45, 64, 23, 127, 16, 0.687
> > 256, 4, 64, 23, 127, 16, 0.69
> > 256, 60, 64, 23, 127, 16, 0.604
> > 256, 5, 64, 23, 127, 16, 0.687
> > 256, 75, 64, 23, 127, 16, 0.687
> > 256, 6, 64, 23, 127, 16, 0.69
> > 256, 90, 64, 23, 127, 16, 0.61
> > 256, 7, 64, 23, 127, 16, 0.69
> > 256, 105, 64, 23, 127, 16, 0.685
> > 1, 0, 0, 23, 127, 16, 0.981
> > 2, 0, 1, 23, 127, 16, 0.985
> > 3, 0, 2, 23, 127, 16, 0.985
> > 4, 0, 3, 23, 127, 16, 0.981
> > 5, 0, 4, 23, 127, 16, 0.979
> > 6, 0, 5, 23, 127, 16, 0.986
> > 7, 0, 6, 23, 127, 16, 0.986
> > 8, 0, 7, 23, 127, 16, 0.982
> > 9, 0, 8, 23, 127, 16, 0.982
> > 10, 0, 9, 23, 127, 16, 0.98
> > 11, 0, 10, 23, 127, 16, 0.983
> > 12, 0, 11, 23, 127, 16, 0.982
> > 13, 0, 12, 23, 127, 16, 0.982
> > 14, 0, 13, 23, 127, 16, 0.982
> > 15, 0, 14, 23, 127, 16, 0.982
> > 16, 0, 15, 23, 127, 16, 0.582
> > 17, 0, 16, 23, 127, 16, 0.542
> > 18, 0, 17, 23, 127, 16, 0.554
> > 19, 0, 18, 23, 127, 16, 0.562
> > 20, 0, 19, 23, 127, 16, 0.587
> > 21, 0, 20, 23, 127, 16, 0.584
> > 22, 0, 21, 23, 127, 16, 0.587
> > 23, 0, 22, 23, 127, 16, 0.594
> > 24, 0, 23, 23, 127, 16, 0.581
> > 25, 0, 24, 23, 127, 16, 0.577
> > 26, 0, 25, 23, 127, 16, 0.588
> > 27, 0, 26, 23, 127, 16, 0.589
> > 28, 0, 27, 23, 127, 16, 0.596
> > 29, 0, 28, 23, 127, 16, 0.591
> > 30, 0, 29, 23, 127, 16, 0.585
> > 31, 0, 30, 23, 127, 16, 0.59
> > 32, 0, 31, 23, 127, 16, 0.669
> >
> > sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
> > sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
> > sysdeps/x86_64/strrchr.S | 505 +++++++++++++++---------
> > sysdeps/x86_64/wcsrchr.S | 268 +------------
> > 4 files changed, 334 insertions(+), 444 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > index db1b44c23c..866396e947 100644
> > --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > @@ -17,7 +17,7 @@
> > <https://www.gnu.org/licenses/>. */
> >
> > #if IS_IN (libc)
> > -# define strrchr __strrchr_sse2
> > +# define STRRCHR __strrchr_sse2
> >
> > # undef weak_alias
> > # define weak_alias(strrchr, rindex)
> > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > index 78d1ca6553..69d2f3cdb1 100644
> > --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > @@ -17,7 +17,6 @@
> > <https://www.gnu.org/licenses/>. */
> >
> > #if IS_IN (libc)
> > -# define wcsrchr __wcsrchr_sse2
> > +# define STRRCHR __wcsrchr_sse2
> > #endif
> > -
> > #include "../wcsrchr.S"
> > diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> > index 50d886713e..94449ad806 100644
> > --- a/sysdeps/x86_64/strrchr.S
> > +++ b/sysdeps/x86_64/strrchr.S
> > @@ -19,210 +19,355 @@
> >
> > #include <sysdep.h>
> >
> > +#ifndef STRRCHR
> > +# define STRRCHR strrchr
> > +#endif
> > +
> > +#ifdef USE_AS_WCSRCHR
> > +# define PCMPEQ pcmpeqd
> > +# define CHAR_SIZE 4
> > +# define PMINU pminud
> > +#else
> > +# define PCMPEQ pcmpeqb
> > +# define CHAR_SIZE 1
> > +# define PMINU pminub
> > +#endif
> > +
> > +#define PAGE_SIZE 4096
> > +#define VEC_SIZE 16
> > +
> > .text
> > -ENTRY (strrchr)
> > - movd %esi, %xmm1
> > +ENTRY(STRRCHR)
> > + movd %esi, %xmm0
> > movq %rdi, %rax
> > - andl $4095, %eax
> > - punpcklbw %xmm1, %xmm1
> > - cmpq $4032, %rax
> > - punpcklwd %xmm1, %xmm1
> > - pshufd $0, %xmm1, %xmm1
> > + andl $(PAGE_SIZE - 1), %eax
> > +#ifndef USE_AS_WCSRCHR
> > + punpcklbw %xmm0, %xmm0
> > + punpcklwd %xmm0, %xmm0
> > +#endif
> > + pshufd $0, %xmm0, %xmm0
> > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > ja L(cross_page)
> > - movdqu (%rdi), %xmm0
> > +
> > +L(cross_page_continue):
> > + movups (%rdi), %xmm1
> > pxor %xmm2, %xmm2
> > - movdqa %xmm0, %xmm3
> > - pcmpeqb %xmm1, %xmm0
> > - pcmpeqb %xmm2, %xmm3
> > - pmovmskb %xmm0, %ecx
> > - pmovmskb %xmm3, %edx
> > - testq %rdx, %rdx
> > - je L(next_48_bytes)
> > - leaq -1(%rdx), %rax
> > - xorq %rdx, %rax
> > - andq %rcx, %rax
> > - je L(exit)
> > - bsrq %rax, %rax
> > + PCMPEQ %xmm1, %xmm2
> > + pmovmskb %xmm2, %ecx
> > + testl %ecx, %ecx
> > + jz L(aligned_more)
> > +
> > + PCMPEQ %xmm0, %xmm1
> > + pmovmskb %xmm1, %eax
> > + leal -1(%rcx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(ret0)
> > + bsrl %eax, %eax
> > addq %rdi, %rax
> > + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> > + search CHAR is zero we are correct. Either way `andq
> > + -CHAR_SIZE, %rax` gets the correct result. */
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > +L(ret0):
> > ret
> >
> > + /* Returns for first vec x1/x2 have hard coded backward search
> > + path for earlier matches. */
> > .p2align 4
> > -L(next_48_bytes):
> > - movdqu 16(%rdi), %xmm4
> > - movdqa %xmm4, %xmm5
> > - movdqu 32(%rdi), %xmm3
> > - pcmpeqb %xmm1, %xmm4
> > - pcmpeqb %xmm2, %xmm5
> > - movdqu 48(%rdi), %xmm0
> > - pmovmskb %xmm5, %edx
> > - movdqa %xmm3, %xmm5
> > - pcmpeqb %xmm1, %xmm3
> > - pcmpeqb %xmm2, %xmm5
> > - pcmpeqb %xmm0, %xmm2
> > - salq $16, %rdx
> > - pmovmskb %xmm3, %r8d
> > - pmovmskb %xmm5, %eax
> > - pmovmskb %xmm2, %esi
> > - salq $32, %r8
> > - salq $32, %rax
> > - pcmpeqb %xmm1, %xmm0
> > - orq %rdx, %rax
> > - movq %rsi, %rdx
> > - pmovmskb %xmm4, %esi
> > - salq $48, %rdx
> > - salq $16, %rsi
> > - orq %r8, %rsi
> > - orq %rcx, %rsi
> > - pmovmskb %xmm0, %ecx
> > - salq $48, %rcx
> > - orq %rcx, %rsi
> > - orq %rdx, %rax
> > - je L(loop_header2)
> > - leaq -1(%rax), %rcx
> > - xorq %rax, %rcx
> > - andq %rcx, %rsi
> > - je L(exit)
> > - bsrq %rsi, %rsi
> > - leaq (%rdi,%rsi), %rax
> > +L(first_vec_x0_test):
> > + PCMPEQ %xmm0, %xmm1
> > + pmovmskb %xmm1, %eax
> > + testl %eax, %eax
> > + jz L(ret0)
> > + bsrl %eax, %eax
> > + addq %r8, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > ret
> >
> > .p2align 4
> > -L(loop_header2):
> > - testq %rsi, %rsi
> > - movq %rdi, %rcx
> > - je L(no_c_found)
> > -L(loop_header):
> > - addq $64, %rdi
> > - pxor %xmm7, %xmm7
> > - andq $-64, %rdi
> > - jmp L(loop_entry)
> > +L(first_vec_x1):
> > + PCMPEQ %xmm0, %xmm2
> > + pmovmskb %xmm2, %eax
> > + leal -1(%rcx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(first_vec_x0_test)
> > + bsrl %eax, %eax
> > + leaq (VEC_SIZE)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> >
> > .p2align 4
> > -L(loop64):
> > - testq %rdx, %rdx
> > - cmovne %rdx, %rsi
> > - cmovne %rdi, %rcx
> > - addq $64, %rdi
> > -L(loop_entry):
> > - movdqa 32(%rdi), %xmm3
> > - pxor %xmm6, %xmm6
> > - movdqa 48(%rdi), %xmm2
> > - movdqa %xmm3, %xmm0
> > - movdqa 16(%rdi), %xmm4
> > - pminub %xmm2, %xmm0
> > - movdqa (%rdi), %xmm5
> > - pminub %xmm4, %xmm0
> > - pminub %xmm5, %xmm0
> > - pcmpeqb %xmm7, %xmm0
> > - pmovmskb %xmm0, %eax
> > - movdqa %xmm5, %xmm0
> > - pcmpeqb %xmm1, %xmm0
> > - pmovmskb %xmm0, %r9d
> > - movdqa %xmm4, %xmm0
> > - pcmpeqb %xmm1, %xmm0
> > - pmovmskb %xmm0, %edx
> > - movdqa %xmm3, %xmm0
> > - pcmpeqb %xmm1, %xmm0
> > - salq $16, %rdx
> > - pmovmskb %xmm0, %r10d
> > - movdqa %xmm2, %xmm0
> > - pcmpeqb %xmm1, %xmm0
> > - salq $32, %r10
> > - orq %r10, %rdx
> > - pmovmskb %xmm0, %r8d
> > - orq %r9, %rdx
> > - salq $48, %r8
> > - orq %r8, %rdx
> > +L(first_vec_x1_test):
> > + PCMPEQ %xmm0, %xmm2
> > + pmovmskb %xmm2, %eax
> > testl %eax, %eax
> > - je L(loop64)
> > - pcmpeqb %xmm6, %xmm4
> > - pcmpeqb %xmm6, %xmm3
> > - pcmpeqb %xmm6, %xmm5
> > - pmovmskb %xmm4, %eax
> > - pmovmskb %xmm3, %r10d
> > - pcmpeqb %xmm6, %xmm2
> > - pmovmskb %xmm5, %r9d
> > - salq $32, %r10
> > - salq $16, %rax
> > - pmovmskb %xmm2, %r8d
> > - orq %r10, %rax
> > - orq %r9, %rax
> > - salq $48, %r8
> > - orq %r8, %rax
> > - leaq -1(%rax), %r8
> > - xorq %rax, %r8
> > - andq %r8, %rdx
> > - cmovne %rdi, %rcx
> > - cmovne %rdx, %rsi
> > - bsrq %rsi, %rsi
> > - leaq (%rcx,%rsi), %rax
> > + jz L(first_vec_x0_test)
> > + bsrl %eax, %eax
> > + leaq (VEC_SIZE)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> > +
> > + .p2align 4
> > +L(first_vec_x2):
> > + PCMPEQ %xmm0, %xmm3
> > + pmovmskb %xmm3, %eax
> > + leal -1(%rcx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(first_vec_x1_test)
> > + bsrl %eax, %eax
> > + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> > +
> > + .p2align 4
> > +L(aligned_more):
> > + /* Save original pointer if match was in VEC 0. */
> > + movq %rdi, %r8
> > + andq $-VEC_SIZE, %rdi
> > +
> > + movaps VEC_SIZE(%rdi), %xmm2
> > + pxor %xmm3, %xmm3
> > + PCMPEQ %xmm2, %xmm3
> > + pmovmskb %xmm3, %ecx
> > + testl %ecx, %ecx
> > + jnz L(first_vec_x1)
> > +
> > + movaps (VEC_SIZE * 2)(%rdi), %xmm3
> > + pxor %xmm4, %xmm4
> > + PCMPEQ %xmm3, %xmm4
> > + pmovmskb %xmm4, %ecx
> > + testl %ecx, %ecx
> > + jnz L(first_vec_x2)
> > +
> > + addq $VEC_SIZE, %rdi
> > + /* Save pointer again before realigning. */
> > + movq %rdi, %rsi
> > + andq $-(VEC_SIZE * 2), %rdi
> > + .p2align 4
> > +L(first_loop):
> > + /* Do 2x VEC at a time. */
> > + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> > + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> > +	/* Plain SSE2 has no pminud, so the zero check is emulated with compares.  */
> > +#ifdef NO_PMINU
>
> Do we really need SSE4.1 wcsrchr? I think we should focus on AVX2 and
> above.
It seems like freebie performance that can make a difference in the loop
cases. (see the SSE4.1 commit for numbers).
Imo there is little harm but if you feel strongly I'll drop. (In V2 will
change the .text section for SSE4_1).
What do you think?
>
> > + movaps %xmm5, %xmm6
> > + pxor %xmm8, %xmm8
> > +
> > + PCMPEQ %xmm8, %xmm5
> > + PCMPEQ %xmm4, %xmm8
> > + por %xmm5, %xmm8
> > +#else
> > + movaps %xmm5, %xmm6
> > + PMINU %xmm4, %xmm5
> > +#endif
> > +
> > + movaps %xmm4, %xmm9
> > + PCMPEQ %xmm0, %xmm4
> > + PCMPEQ %xmm0, %xmm6
> > + movaps %xmm6, %xmm7
> > + por %xmm4, %xmm6
> > +#ifndef NO_PMINU
> > + pxor %xmm8, %xmm8
> > + PCMPEQ %xmm5, %xmm8
> > +#endif
> > + pmovmskb %xmm8, %ecx
> > + pmovmskb %xmm6, %eax
> > +
> > + addq $(VEC_SIZE * 2), %rdi
> > + /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> > + macro-fuse with `jz`. */
> > + addl %ecx, %eax
> > + jz L(first_loop)
> > +
> > + /* Check if there is zero match. */
> > + testl %ecx, %ecx
> > + jz L(second_loop_match)
> > +
> > + /* Check if there was a match in last iteration. */
> > + subl %ecx, %eax
> > + jnz L(new_match)
> > +
> > +L(first_loop_old_match):
> > + PCMPEQ %xmm0, %xmm2
> > + PCMPEQ %xmm0, %xmm3
> > + pmovmskb %xmm2, %ecx
> > + pmovmskb %xmm3, %eax
> > + addl %eax, %ecx
> > + jz L(first_vec_x0_test)
> > + /* NB: We could move this shift to before the branch and save a
> > + bit of code size / performance on the fall through. The
> > + branch leads to the null case which generally seems hotter
> > + than char in first 3x VEC. */
> > + sall $16, %eax
> > + orl %ecx, %eax
> > +
> > + bsrl %eax, %eax
> > + addq %rsi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> > +
> > + .p2align 4
> > +L(new_match):
> > + pxor %xmm6, %xmm6
> > + PCMPEQ %xmm9, %xmm6
> > + pmovmskb %xmm6, %eax
> > + sall $16, %ecx
> > + orl %eax, %ecx
> > +
> > +	/* We can't reuse either of the old comparisons because we mask
> > +	   off zeros after the first zero (instead of using the full
> > +	   comparison), so we can't guarantee no interference between a
> > +	   match after the end of the string and a valid match.  */
> > + pmovmskb %xmm4, %eax
> > + pmovmskb %xmm7, %edx
> > + sall $16, %edx
> > + orl %edx, %eax
> > +
> > + leal -1(%ecx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(first_loop_old_match)
> > + bsrl %eax, %eax
> > + addq %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > ret
> >
> > + /* Save minimum state for getting most recent match. We can
> > + throw out all previous work. */
> > .p2align 4
> > -L(no_c_found):
> > - movl $1, %esi
> > - xorl %ecx, %ecx
> > - jmp L(loop_header)
> > +L(second_loop_match):
> > + movq %rdi, %rsi
> > + movaps %xmm4, %xmm2
> > + movaps %xmm7, %xmm3
> >
> > .p2align 4
> > -L(exit):
> > - xorl %eax, %eax
> > +L(second_loop):
> > + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> > + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> > +#ifdef NO_PMINU
> > + movaps %xmm5, %xmm6
> > + pxor %xmm8, %xmm8
> > +
> > + PCMPEQ %xmm8, %xmm5
> > + PCMPEQ %xmm4, %xmm8
> > + por %xmm5, %xmm8
> > +#else
> > + movaps %xmm5, %xmm6
> > + PMINU %xmm4, %xmm5
> > +#endif
> > +
> > + movaps %xmm4, %xmm9
> > + PCMPEQ %xmm0, %xmm4
> > + PCMPEQ %xmm0, %xmm6
> > + movaps %xmm6, %xmm7
> > + por %xmm4, %xmm6
> > +#ifndef NO_PMINU
> > + pxor %xmm8, %xmm8
> > + PCMPEQ %xmm5, %xmm8
> > +#endif
> > +
> > + pmovmskb %xmm8, %ecx
> > + pmovmskb %xmm6, %eax
> > +
> > + addq $(VEC_SIZE * 2), %rdi
> > +	/* Either null terminator or a new occurrence of CHAR.  */
> > + addl %ecx, %eax
> > + jz L(second_loop)
> > +
> > +	/* No null terminator, so it must be a new occurrence of CHAR.  */
> > + testl %ecx, %ecx
> > + jz L(second_loop_match)
> > +
> > +
> > + subl %ecx, %eax
> > + jnz L(second_loop_new_match)
> > +
> > +L(second_loop_old_match):
> > + pmovmskb %xmm2, %ecx
> > + pmovmskb %xmm3, %eax
> > + sall $16, %eax
> > + orl %ecx, %eax
> > + bsrl %eax, %eax
> > + addq %rsi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > ret
> >
> > .p2align 4
> > +L(second_loop_new_match):
> > + pxor %xmm6, %xmm6
> > + PCMPEQ %xmm9, %xmm6
> > + pmovmskb %xmm6, %eax
> > + sall $16, %ecx
> > + orl %eax, %ecx
> > +
> > +	/* We can't reuse either of the old comparisons because we mask
> > +	   off zeros after the first zero (instead of using the full
> > +	   comparison), so we can't guarantee no interference between a
> > +	   match after the end of the string and a valid match.  */
> > + pmovmskb %xmm4, %eax
> > + pmovmskb %xmm7, %edx
> > + sall $16, %edx
> > + orl %edx, %eax
> > +
> > + leal -1(%ecx), %edx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(second_loop_old_match)
> > + bsrl %eax, %eax
> > + addq %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > + ret
> > +
> > + .p2align 4,, 4
> > L(cross_page):
> > - movq %rdi, %rax
> > - pxor %xmm0, %xmm0
> > - andq $-64, %rax
> > - movdqu (%rax), %xmm5
> > - movdqa %xmm5, %xmm6
> > - movdqu 16(%rax), %xmm4
> > - pcmpeqb %xmm1, %xmm5
> > - pcmpeqb %xmm0, %xmm6
> > - movdqu 32(%rax), %xmm3
> > - pmovmskb %xmm6, %esi
> > - movdqa %xmm4, %xmm6
> > - movdqu 48(%rax), %xmm2
> > - pcmpeqb %xmm1, %xmm4
> > - pcmpeqb %xmm0, %xmm6
> > - pmovmskb %xmm6, %edx
> > - movdqa %xmm3, %xmm6
> > - pcmpeqb %xmm1, %xmm3
> > - pcmpeqb %xmm0, %xmm6
> > - pcmpeqb %xmm2, %xmm0
> > - salq $16, %rdx
> > - pmovmskb %xmm3, %r9d
> > - pmovmskb %xmm6, %r8d
> > - pmovmskb %xmm0, %ecx
> > - salq $32, %r9
> > - salq $32, %r8
> > - pcmpeqb %xmm1, %xmm2
> > - orq %r8, %rdx
> > - salq $48, %rcx
> > - pmovmskb %xmm5, %r8d
> > - orq %rsi, %rdx
> > - pmovmskb %xmm4, %esi
> > - orq %rcx, %rdx
> > - pmovmskb %xmm2, %ecx
> > - salq $16, %rsi
> > - salq $48, %rcx
> > - orq %r9, %rsi
> > - orq %r8, %rsi
> > - orq %rcx, %rsi
> > + movq %rdi, %rsi
> > + andq $-VEC_SIZE, %rsi
> > + movaps (%rsi), %xmm1
> > + pxor %xmm2, %xmm2
> > + PCMPEQ %xmm1, %xmm2
> > + pmovmskb %xmm2, %edx
> > movl %edi, %ecx
> > - subl %eax, %ecx
> > - shrq %cl, %rdx
> > - shrq %cl, %rsi
> > - testq %rdx, %rdx
> > - je L(loop_header2)
> > - leaq -1(%rdx), %rax
> > - xorq %rdx, %rax
> > - andq %rax, %rsi
> > - je L(exit)
> > - bsrq %rsi, %rax
> > + andl $(VEC_SIZE - 1), %ecx
> > + sarl %cl, %edx
> > + jz L(cross_page_continue)
> > + PCMPEQ %xmm0, %xmm1
> > + pmovmskb %xmm1, %eax
> > + sarl %cl, %eax
> > + leal -1(%rdx), %ecx
> > + xorl %edx, %ecx
> > + andl %ecx, %eax
> > + jz L(ret1)
> > + bsrl %eax, %eax
> > addq %rdi, %rax
> > +#ifdef USE_AS_WCSRCHR
> > + andq $-CHAR_SIZE, %rax
> > +#endif
> > +L(ret1):
> > ret
> > -END (strrchr)
> > +END(STRRCHR)
> >
> > -weak_alias (strrchr, rindex)
> > -libc_hidden_builtin_def (strrchr)
> > +#ifndef USE_AS_WCSRCHR
> > + weak_alias (STRRCHR, rindex)
> > + libc_hidden_builtin_def (STRRCHR)
> > +#endif
> > diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> > index 61552954de..2b80efc5ef 100644
> > --- a/sysdeps/x86_64/wcsrchr.S
> > +++ b/sysdeps/x86_64/wcsrchr.S
> > @@ -1,4 +1,4 @@
> > -/* wcsrchr with SSSE3
> > +/* wcsrchr optimized with SSE2.
> > Copyright (C) 2011-2022 Free Software Foundation, Inc.
> > This file is part of the GNU C Library.
> >
> > @@ -16,266 +16,12 @@
> > License along with the GNU C Library; if not, see
> > <https://www.gnu.org/licenses/>. */
> >
> > -#include <sysdep.h>
> >
> > - .text
> > -ENTRY (wcsrchr)
> > +#define USE_AS_WCSRCHR 1
> > +#define NO_PMINU 1
> >
> > - movd %rsi, %xmm1
> > - mov %rdi, %rcx
> > - punpckldq %xmm1, %xmm1
> > - pxor %xmm2, %xmm2
> > - punpckldq %xmm1, %xmm1
> > - and $63, %rcx
> > - cmp $48, %rcx
> > - ja L(crosscache)
> > +#ifndef STRRCHR
> > +# define STRRCHR wcsrchr
> > +#endif
> >
> > - movdqu (%rdi), %xmm0
> > - pcmpeqd %xmm0, %xmm2
> > - pcmpeqd %xmm1, %xmm0
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm0, %rax
> > - add $16, %rdi
> > -
> > - test %rax, %rax
> > - jnz L(unaligned_match1)
> > -
> > - test %rcx, %rcx
> > - jnz L(return_null)
> > -
> > - and $-16, %rdi
> > - xor %r8, %r8
> > - jmp L(loop)
> > -
> > - .p2align 4
> > -L(unaligned_match1):
> > - test %rcx, %rcx
> > - jnz L(prolog_find_zero_1)
> > -
> > - mov %rax, %r8
> > - mov %rdi, %rsi
> > - and $-16, %rdi
> > - jmp L(loop)
> > -
> > - .p2align 4
> > -L(crosscache):
> > - and $15, %rcx
> > - and $-16, %rdi
> > - pxor %xmm3, %xmm3
> > - movdqa (%rdi), %xmm0
> > - pcmpeqd %xmm0, %xmm3
> > - pcmpeqd %xmm1, %xmm0
> > - pmovmskb %xmm3, %rdx
> > - pmovmskb %xmm0, %rax
> > - shr %cl, %rdx
> > - shr %cl, %rax
> > - add $16, %rdi
> > -
> > - test %rax, %rax
> > - jnz L(unaligned_match)
> > -
> > - test %rdx, %rdx
> > - jnz L(return_null)
> > -
> > - xor %r8, %r8
> > - jmp L(loop)
> > -
> > - .p2align 4
> > -L(unaligned_match):
> > - test %rdx, %rdx
> > - jnz L(prolog_find_zero)
> > -
> > - mov %rax, %r8
> > - lea (%rdi, %rcx), %rsi
> > -
> > -/* Loop start on aligned string. */
> > - .p2align 4
> > -L(loop):
> > - movdqa (%rdi), %xmm0
> > - pcmpeqd %xmm0, %xmm2
> > - add $16, %rdi
> > - pcmpeqd %xmm1, %xmm0
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm0, %rax
> > - or %rax, %rcx
> > - jnz L(matches)
> > -
> > - movdqa (%rdi), %xmm3
> > - pcmpeqd %xmm3, %xmm2
> > - add $16, %rdi
> > - pcmpeqd %xmm1, %xmm3
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm3, %rax
> > - or %rax, %rcx
> > - jnz L(matches)
> > -
> > - movdqa (%rdi), %xmm4
> > - pcmpeqd %xmm4, %xmm2
> > - add $16, %rdi
> > - pcmpeqd %xmm1, %xmm4
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm4, %rax
> > - or %rax, %rcx
> > - jnz L(matches)
> > -
> > - movdqa (%rdi), %xmm5
> > - pcmpeqd %xmm5, %xmm2
> > - add $16, %rdi
> > - pcmpeqd %xmm1, %xmm5
> > - pmovmskb %xmm2, %rcx
> > - pmovmskb %xmm5, %rax
> > - or %rax, %rcx
> > - jz L(loop)
> > -
> > - .p2align 4
> > -L(matches):
> > - test %rax, %rax
> > - jnz L(match)
> > -L(return_value):
> > - test %r8, %r8
> > - jz L(return_null)
> > - mov %r8, %rax
> > - mov %rsi, %rdi
> > -
> > - test $15 << 4, %ah
> > - jnz L(match_fourth_wchar)
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(match):
> > - pmovmskb %xmm2, %rcx
> > - test %rcx, %rcx
> > - jnz L(find_zero)
> > - mov %rax, %r8
> > - mov %rdi, %rsi
> > - jmp L(loop)
> > -
> > - .p2align 4
> > -L(find_zero):
> > - test $15, %cl
> > - jnz L(find_zero_in_first_wchar)
> > - test %cl, %cl
> > - jnz L(find_zero_in_second_wchar)
> > - test $15, %ch
> > - jnz L(find_zero_in_third_wchar)
> > -
> > - and $1 << 13 - 1, %rax
> > - jz L(return_value)
> > -
> > - test $15 << 4, %ah
> > - jnz L(match_fourth_wchar)
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(find_zero_in_first_wchar):
> > - test $1, %rax
> > - jz L(return_value)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(find_zero_in_second_wchar):
> > - and $1 << 5 - 1, %rax
> > - jz L(return_value)
> > -
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(find_zero_in_third_wchar):
> > - and $1 << 9 - 1, %rax
> > - jz L(return_value)
> > -
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(prolog_find_zero):
> > - add %rcx, %rdi
> > - mov %rdx, %rcx
> > -L(prolog_find_zero_1):
> > - test $15, %cl
> > - jnz L(prolog_find_zero_in_first_wchar)
> > - test %cl, %cl
> > - jnz L(prolog_find_zero_in_second_wchar)
> > - test $15, %ch
> > - jnz L(prolog_find_zero_in_third_wchar)
> > -
> > - and $1 << 13 - 1, %rax
> > - jz L(return_null)
> > -
> > - test $15 << 4, %ah
> > - jnz L(match_fourth_wchar)
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(prolog_find_zero_in_first_wchar):
> > - test $1, %rax
> > - jz L(return_null)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(prolog_find_zero_in_second_wchar):
> > - and $1 << 5 - 1, %rax
> > - jz L(return_null)
> > -
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(prolog_find_zero_in_third_wchar):
> > - and $1 << 9 - 1, %rax
> > - jz L(return_null)
> > -
> > - test %ah, %ah
> > - jnz L(match_third_wchar)
> > - test $15 << 4, %al
> > - jnz L(match_second_wchar)
> > - lea -16(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(match_second_wchar):
> > - lea -12(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(match_third_wchar):
> > - lea -8(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(match_fourth_wchar):
> > - lea -4(%rdi), %rax
> > - ret
> > -
> > - .p2align 4
> > -L(return_null):
> > - xor %rax, %rax
> > - ret
> > -
> > -END (wcsrchr)
> > +#include "../strrchr.S"
> > --
> > 2.25.1
> >
>
>
> --
> H.J.
On Thu, Apr 21, 2022 at 1:57 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Apr 21, 2022 at 3:27 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > The new code unrolls the main loop slightly without adding too much
> > > overhead and minimizes the comparisons for the search CHAR.
> > >
> > > Geometric Mean of all benchmarks New / Old: 0.741
> > > See email for all results.
> > >
> > > Full xcheck passes on x86_64 with and without multiarch enabled.
> > > ---
> > > Results For: strrchr
> > >
> > > Geometric Mean of N=30 runs.
> > >
> > > Geometric Mean of all benchmarks New / Old: 0.741
> > > Benchmarks performance on Tigerlake:
> > > https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
> > >
> > > len, align, pos, seek, max_char, freq, New Time / Old Time
> > > 2048, 0, 32, 0, 127, 1, 0.647
> > > 2048, 1, 32, 0, 127, 1, 0.621
> > > 2048, 0, 64, 0, 127, 1, 0.661
> > > 2048, 2, 64, 0, 127, 1, 0.655
> > > 2048, 0, 128, 0, 127, 1, 0.69
> > > 2048, 3, 128, 0, 127, 1, 0.689
> > > 2048, 0, 256, 0, 127, 1, 0.718
> > > 2048, 4, 256, 0, 127, 1, 0.718
> > > 2048, 0, 512, 0, 127, 1, 0.758
> > > 2048, 5, 512, 0, 127, 1, 0.754
> > > 2048, 0, 1024, 0, 127, 1, 1.029
> > > 2048, 6, 1024, 0, 127, 1, 1.032
> > > 2048, 0, 2048, 0, 127, 1, 0.826
> > > 2048, 7, 2048, 0, 127, 1, 0.834
> > > 2048, 0, 4096, 0, 127, 1, 0.825
> > > 2048, 8, 4096, 0, 127, 1, 0.83
> > > 256, 1, 64, 0, 127, 1, 0.657
> > > 256, 15, 64, 0, 127, 1, 0.657
> > > 256, 2, 64, 0, 127, 1, 0.657
> > > 256, 30, 64, 0, 127, 1, 0.523
> > > 256, 3, 64, 0, 127, 1, 0.657
> > > 256, 45, 64, 0, 127, 1, 0.654
> > > 256, 4, 64, 0, 127, 1, 0.657
> > > 256, 60, 64, 0, 127, 1, 0.526
> > > 256, 5, 64, 0, 127, 1, 0.658
> > > 256, 75, 64, 0, 127, 1, 0.658
> > > 256, 6, 64, 0, 127, 1, 0.655
> > > 256, 90, 64, 0, 127, 1, 0.523
> > > 256, 7, 64, 0, 127, 1, 0.655
> > > 256, 105, 64, 0, 127, 1, 0.654
> > > 1, 0, 0, 0, 127, 1, 0.98
> > > 2, 0, 1, 0, 127, 1, 0.978
> > > 3, 0, 2, 0, 127, 1, 0.975
> > > 4, 0, 3, 0, 127, 1, 0.976
> > > 5, 0, 4, 0, 127, 1, 0.977
> > > 6, 0, 5, 0, 127, 1, 0.981
> > > 7, 0, 6, 0, 127, 1, 0.982
> > > 8, 0, 7, 0, 127, 1, 0.98
> > > 9, 0, 8, 0, 127, 1, 0.978
> > > 10, 0, 9, 0, 127, 1, 0.981
> > > 11, 0, 10, 0, 127, 1, 0.984
> > > 12, 0, 11, 0, 127, 1, 0.982
> > > 13, 0, 12, 0, 127, 1, 0.98
> > > 14, 0, 13, 0, 127, 1, 0.978
> > > 15, 0, 14, 0, 127, 1, 0.979
> > > 16, 0, 15, 0, 127, 1, 0.986
> > > 17, 0, 16, 0, 127, 1, 0.529
> > > 18, 0, 17, 0, 127, 1, 0.566
> > > 19, 0, 18, 0, 127, 1, 0.575
> > > 20, 0, 19, 0, 127, 1, 0.573
> > > 21, 0, 20, 0, 127, 1, 0.579
> > > 22, 0, 21, 0, 127, 1, 0.595
> > > 23, 0, 22, 0, 127, 1, 0.585
> > > 24, 0, 23, 0, 127, 1, 0.586
> > > 25, 0, 24, 0, 127, 1, 0.587
> > > 26, 0, 25, 0, 127, 1, 0.592
> > > 27, 0, 26, 0, 127, 1, 0.595
> > > 28, 0, 27, 0, 127, 1, 0.592
> > > 29, 0, 28, 0, 127, 1, 0.6
> > > 30, 0, 29, 0, 127, 1, 0.598
> > > 31, 0, 30, 0, 127, 1, 0.595
> > > 32, 0, 31, 0, 127, 1, 0.592
> > > 2048, 0, 32, 23, 127, 1, 0.827
> > > 2048, 1, 32, 23, 127, 1, 0.826
> > > 2048, 0, 64, 23, 127, 1, 0.824
> > > 2048, 2, 64, 23, 127, 1, 0.825
> > > 2048, 0, 128, 23, 127, 1, 0.829
> > > 2048, 3, 128, 23, 127, 1, 0.824
> > > 2048, 0, 256, 23, 127, 1, 0.832
> > > 2048, 4, 256, 23, 127, 1, 0.825
> > > 2048, 0, 512, 23, 127, 1, 0.831
> > > 2048, 5, 512, 23, 127, 1, 0.837
> > > 2048, 0, 1024, 23, 127, 1, 0.721
> > > 2048, 6, 1024, 23, 127, 1, 0.757
> > > 2048, 0, 2048, 23, 127, 1, 0.825
> > > 2048, 7, 2048, 23, 127, 1, 0.824
> > > 2048, 0, 4096, 23, 127, 1, 0.828
> > > 2048, 8, 4096, 23, 127, 1, 0.823
> > > 256, 1, 64, 23, 127, 1, 0.665
> > > 256, 15, 64, 23, 127, 1, 0.661
> > > 256, 2, 64, 23, 127, 1, 0.674
> > > 256, 30, 64, 23, 127, 1, 0.605
> > > 256, 3, 64, 23, 127, 1, 0.668
> > > 256, 45, 64, 23, 127, 1, 0.661
> > > 256, 4, 64, 23, 127, 1, 0.657
> > > 256, 60, 64, 23, 127, 1, 0.594
> > > 256, 5, 64, 23, 127, 1, 0.654
> > > 256, 75, 64, 23, 127, 1, 0.673
> > > 256, 6, 64, 23, 127, 1, 0.688
> > > 256, 90, 64, 23, 127, 1, 0.6
> > > 256, 7, 64, 23, 127, 1, 0.66
> > > 256, 105, 64, 23, 127, 1, 0.654
> > > 1, 0, 0, 23, 127, 1, 0.981
> > > 2, 0, 1, 23, 127, 1, 0.976
> > > 3, 0, 2, 23, 127, 1, 0.983
> > > 4, 0, 3, 23, 127, 1, 0.984
> > > 5, 0, 4, 23, 127, 1, 0.973
> > > 6, 0, 5, 23, 127, 1, 0.987
> > > 7, 0, 6, 23, 127, 1, 0.977
> > > 8, 0, 7, 23, 127, 1, 0.979
> > > 9, 0, 8, 23, 127, 1, 0.981
> > > 10, 0, 9, 23, 127, 1, 0.98
> > > 11, 0, 10, 23, 127, 1, 0.983
> > > 12, 0, 11, 23, 127, 1, 0.98
> > > 13, 0, 12, 23, 127, 1, 0.98
> > > 14, 0, 13, 23, 127, 1, 0.977
> > > 15, 0, 14, 23, 127, 1, 0.982
> > > 16, 0, 15, 23, 127, 1, 0.581
> > > 17, 0, 16, 23, 127, 1, 0.551
> > > 18, 0, 17, 23, 127, 1, 0.555
> > > 19, 0, 18, 23, 127, 1, 0.586
> > > 20, 0, 19, 23, 127, 1, 0.585
> > > 21, 0, 20, 23, 127, 1, 0.582
> > > 22, 0, 21, 23, 127, 1, 0.571
> > > 23, 0, 22, 23, 127, 1, 0.576
> > > 24, 0, 23, 23, 127, 1, 0.581
> > > 25, 0, 24, 23, 127, 1, 0.589
> > > 26, 0, 25, 23, 127, 1, 0.593
> > > 27, 0, 26, 23, 127, 1, 0.595
> > > 28, 0, 27, 23, 127, 1, 0.583
> > > 29, 0, 28, 23, 127, 1, 0.595
> > > 30, 0, 29, 23, 127, 1, 0.58
> > > 31, 0, 30, 23, 127, 1, 0.594
> > > 32, 0, 31, 23, 127, 1, 0.665
> > > 2048, 0, 32, 23, 127, 2, 0.825
> > > 2048, 1, 32, 23, 127, 2, 0.818
> > > 2048, 0, 64, 23, 127, 2, 0.829
> > > 2048, 2, 64, 23, 127, 2, 0.828
> > > 2048, 0, 128, 23, 127, 2, 0.823
> > > 2048, 3, 128, 23, 127, 2, 0.825
> > > 2048, 0, 256, 23, 127, 2, 0.819
> > > 2048, 4, 256, 23, 127, 2, 0.828
> > > 2048, 0, 512, 23, 127, 2, 0.824
> > > 2048, 5, 512, 23, 127, 2, 0.827
> > > 2048, 0, 1024, 23, 127, 2, 0.813
> > > 2048, 6, 1024, 23, 127, 2, 0.834
> > > 2048, 0, 2048, 23, 127, 2, 0.927
> > > 2048, 7, 2048, 23, 127, 2, 0.923
> > > 2048, 0, 4096, 23, 127, 2, 0.818
> > > 2048, 8, 4096, 23, 127, 2, 0.82
> > > 256, 1, 64, 23, 127, 2, 0.693
> > > 256, 15, 64, 23, 127, 2, 0.686
> > > 256, 2, 64, 23, 127, 2, 0.69
> > > 256, 30, 64, 23, 127, 2, 0.611
> > > 256, 3, 64, 23, 127, 2, 0.692
> > > 256, 45, 64, 23, 127, 2, 0.685
> > > 256, 4, 64, 23, 127, 2, 0.688
> > > 256, 60, 64, 23, 127, 2, 0.6
> > > 256, 5, 64, 23, 127, 2, 0.69
> > > 256, 75, 64, 23, 127, 2, 0.689
> > > 256, 6, 64, 23, 127, 2, 0.688
> > > 256, 90, 64, 23, 127, 2, 0.611
> > > 256, 7, 64, 23, 127, 2, 0.69
> > > 256, 105, 64, 23, 127, 2, 0.686
> > > 1, 0, 0, 23, 127, 2, 0.982
> > > 2, 0, 1, 23, 127, 2, 0.987
> > > 3, 0, 2, 23, 127, 2, 0.978
> > > 4, 0, 3, 23, 127, 2, 0.977
> > > 5, 0, 4, 23, 127, 2, 0.979
> > > 6, 0, 5, 23, 127, 2, 0.985
> > > 7, 0, 6, 23, 127, 2, 0.975
> > > 8, 0, 7, 23, 127, 2, 0.981
> > > 9, 0, 8, 23, 127, 2, 0.984
> > > 10, 0, 9, 23, 127, 2, 0.983
> > > 11, 0, 10, 23, 127, 2, 0.982
> > > 12, 0, 11, 23, 127, 2, 0.976
> > > 13, 0, 12, 23, 127, 2, 0.985
> > > 14, 0, 13, 23, 127, 2, 0.984
> > > 15, 0, 14, 23, 127, 2, 0.98
> > > 16, 0, 15, 23, 127, 2, 0.583
> > > 17, 0, 16, 23, 127, 2, 0.552
> > > 18, 0, 17, 23, 127, 2, 0.564
> > > 19, 0, 18, 23, 127, 2, 0.585
> > > 20, 0, 19, 23, 127, 2, 0.578
> > > 21, 0, 20, 23, 127, 2, 0.578
> > > 22, 0, 21, 23, 127, 2, 0.571
> > > 23, 0, 22, 23, 127, 2, 0.587
> > > 24, 0, 23, 23, 127, 2, 0.589
> > > 25, 0, 24, 23, 127, 2, 0.593
> > > 26, 0, 25, 23, 127, 2, 0.589
> > > 27, 0, 26, 23, 127, 2, 0.588
> > > 28, 0, 27, 23, 127, 2, 0.593
> > > 29, 0, 28, 23, 127, 2, 0.579
> > > 30, 0, 29, 23, 127, 2, 0.572
> > > 31, 0, 30, 23, 127, 2, 0.582
> > > 32, 0, 31, 23, 127, 2, 0.659
> > > 2048, 0, 32, 23, 127, 4, 0.822
> > > 2048, 1, 32, 23, 127, 4, 0.818
> > > 2048, 0, 64, 23, 127, 4, 0.826
> > > 2048, 2, 64, 23, 127, 4, 0.824
> > > 2048, 0, 128, 23, 127, 4, 0.833
> > > 2048, 3, 128, 23, 127, 4, 0.831
> > > 2048, 0, 256, 23, 127, 4, 0.826
> > > 2048, 4, 256, 23, 127, 4, 0.831
> > > 2048, 0, 512, 23, 127, 4, 0.834
> > > 2048, 5, 512, 23, 127, 4, 0.83
> > > 2048, 0, 1024, 23, 127, 4, 0.836
> > > 2048, 6, 1024, 23, 127, 4, 0.844
> > > 2048, 0, 2048, 23, 127, 4, 0.696
> > > 2048, 7, 2048, 23, 127, 4, 0.704
> > > 2048, 0, 4096, 23, 127, 4, 0.936
> > > 2048, 8, 4096, 23, 127, 4, 0.925
> > > 256, 1, 64, 23, 127, 4, 0.694
> > > 256, 15, 64, 23, 127, 4, 0.69
> > > 256, 2, 64, 23, 127, 4, 0.687
> > > 256, 30, 64, 23, 127, 4, 0.612
> > > 256, 3, 64, 23, 127, 4, 0.685
> > > 256, 45, 64, 23, 127, 4, 0.685
> > > 256, 4, 64, 23, 127, 4, 0.684
> > > 256, 60, 64, 23, 127, 4, 0.606
> > > 256, 5, 64, 23, 127, 4, 0.69
> > > 256, 75, 64, 23, 127, 4, 0.688
> > > 256, 6, 64, 23, 127, 4, 0.69
> > > 256, 90, 64, 23, 127, 4, 0.615
> > > 256, 7, 64, 23, 127, 4, 0.691
> > > 256, 105, 64, 23, 127, 4, 0.688
> > > 1, 0, 0, 23, 127, 4, 0.982
> > > 2, 0, 1, 23, 127, 4, 0.983
> > > 3, 0, 2, 23, 127, 4, 0.981
> > > 4, 0, 3, 23, 127, 4, 0.984
> > > 5, 0, 4, 23, 127, 4, 0.963
> > > 6, 0, 5, 23, 127, 4, 0.978
> > > 7, 0, 6, 23, 127, 4, 0.985
> > > 8, 0, 7, 23, 127, 4, 0.986
> > > 9, 0, 8, 23, 127, 4, 0.978
> > > 10, 0, 9, 23, 127, 4, 0.985
> > > 11, 0, 10, 23, 127, 4, 0.986
> > > 12, 0, 11, 23, 127, 4, 0.983
> > > 13, 0, 12, 23, 127, 4, 0.986
> > > 14, 0, 13, 23, 127, 4, 0.98
> > > 15, 0, 14, 23, 127, 4, 0.979
> > > 16, 0, 15, 23, 127, 4, 0.582
> > > 17, 0, 16, 23, 127, 4, 0.542
> > > 18, 0, 17, 23, 127, 4, 0.564
> > > 19, 0, 18, 23, 127, 4, 0.571
> > > 20, 0, 19, 23, 127, 4, 0.582
> > > 21, 0, 20, 23, 127, 4, 0.573
> > > 22, 0, 21, 23, 127, 4, 0.575
> > > 23, 0, 22, 23, 127, 4, 0.578
> > > 24, 0, 23, 23, 127, 4, 0.58
> > > 25, 0, 24, 23, 127, 4, 0.592
> > > 26, 0, 25, 23, 127, 4, 0.588
> > > 27, 0, 26, 23, 127, 4, 0.574
> > > 28, 0, 27, 23, 127, 4, 0.589
> > > 29, 0, 28, 23, 127, 4, 0.56
> > > 30, 0, 29, 23, 127, 4, 0.587
> > > 31, 0, 30, 23, 127, 4, 0.584
> > > 32, 0, 31, 23, 127, 4, 0.664
> > > 2048, 0, 32, 23, 127, 8, 0.826
> > > 2048, 1, 32, 23, 127, 8, 0.821
> > > 2048, 0, 64, 23, 127, 8, 0.828
> > > 2048, 2, 64, 23, 127, 8, 0.827
> > > 2048, 0, 128, 23, 127, 8, 0.833
> > > 2048, 3, 128, 23, 127, 8, 0.83
> > > 2048, 0, 256, 23, 127, 8, 0.855
> > > 2048, 4, 256, 23, 127, 8, 0.849
> > > 2048, 0, 512, 23, 127, 8, 0.849
> > > 2048, 5, 512, 23, 127, 8, 0.851
> > > 2048, 0, 1024, 23, 127, 8, 0.856
> > > 2048, 6, 1024, 23, 127, 8, 0.862
> > > 2048, 0, 2048, 23, 127, 8, 0.709
> > > 2048, 7, 2048, 23, 127, 8, 0.712
> > > 2048, 0, 4096, 23, 127, 8, 0.702
> > > 2048, 8, 4096, 23, 127, 8, 0.701
> > > 256, 1, 64, 23, 127, 8, 0.689
> > > 256, 15, 64, 23, 127, 8, 0.688
> > > 256, 2, 64, 23, 127, 8, 0.691
> > > 256, 30, 64, 23, 127, 8, 0.612
> > > 256, 3, 64, 23, 127, 8, 0.688
> > > 256, 45, 64, 23, 127, 8, 0.686
> > > 256, 4, 64, 23, 127, 8, 0.694
> > > 256, 60, 64, 23, 127, 8, 0.609
> > > 256, 5, 64, 23, 127, 8, 0.69
> > > 256, 75, 64, 23, 127, 8, 0.69
> > > 256, 6, 64, 23, 127, 8, 0.691
> > > 256, 90, 64, 23, 127, 8, 0.612
> > > 256, 7, 64, 23, 127, 8, 0.689
> > > 256, 105, 64, 23, 127, 8, 0.688
> > > 1, 0, 0, 23, 127, 8, 0.98
> > > 2, 0, 1, 23, 127, 8, 0.978
> > > 3, 0, 2, 23, 127, 8, 0.98
> > > 4, 0, 3, 23, 127, 8, 0.978
> > > 5, 0, 4, 23, 127, 8, 0.977
> > > 6, 0, 5, 23, 127, 8, 0.984
> > > 7, 0, 6, 23, 127, 8, 0.982
> > > 8, 0, 7, 23, 127, 8, 0.983
> > > 9, 0, 8, 23, 127, 8, 0.987
> > > 10, 0, 9, 23, 127, 8, 0.979
> > > 11, 0, 10, 23, 127, 8, 0.985
> > > 12, 0, 11, 23, 127, 8, 0.981
> > > 13, 0, 12, 23, 127, 8, 0.98
> > > 14, 0, 13, 23, 127, 8, 0.982
> > > 15, 0, 14, 23, 127, 8, 0.981
> > > 16, 0, 15, 23, 127, 8, 0.579
> > > 17, 0, 16, 23, 127, 8, 0.531
> > > 18, 0, 17, 23, 127, 8, 0.577
> > > 19, 0, 18, 23, 127, 8, 0.588
> > > 20, 0, 19, 23, 127, 8, 0.571
> > > 21, 0, 20, 23, 127, 8, 0.576
> > > 22, 0, 21, 23, 127, 8, 0.59
> > > 23, 0, 22, 23, 127, 8, 0.574
> > > 24, 0, 23, 23, 127, 8, 0.583
> > > 25, 0, 24, 23, 127, 8, 0.581
> > > 26, 0, 25, 23, 127, 8, 0.592
> > > 27, 0, 26, 23, 127, 8, 0.586
> > > 28, 0, 27, 23, 127, 8, 0.588
> > > 29, 0, 28, 23, 127, 8, 0.578
> > > 30, 0, 29, 23, 127, 8, 0.573
> > > 31, 0, 30, 23, 127, 8, 0.588
> > > 32, 0, 31, 23, 127, 8, 0.664
> > > 2048, 0, 32, 23, 127, 16, 0.825
> > > 2048, 1, 32, 23, 127, 16, 0.823
> > > 2048, 0, 64, 23, 127, 16, 0.831
> > > 2048, 2, 64, 23, 127, 16, 0.822
> > > 2048, 0, 128, 23, 127, 16, 0.831
> > > 2048, 3, 128, 23, 127, 16, 0.831
> > > 2048, 0, 256, 23, 127, 16, 0.849
> > > 2048, 4, 256, 23, 127, 16, 0.85
> > > 2048, 0, 512, 23, 127, 16, 0.751
> > > 2048, 5, 512, 23, 127, 16, 0.75
> > > 2048, 0, 1024, 23, 127, 16, 0.913
> > > 2048, 6, 1024, 23, 127, 16, 0.895
> > > 2048, 0, 2048, 23, 127, 16, 0.736
> > > 2048, 7, 2048, 23, 127, 16, 0.741
> > > 2048, 0, 4096, 23, 127, 16, 0.712
> > > 2048, 8, 4096, 23, 127, 16, 0.711
> > > 256, 1, 64, 23, 127, 16, 0.758
> > > 256, 15, 64, 23, 127, 16, 0.692
> > > 256, 2, 64, 23, 127, 16, 0.692
> > > 256, 30, 64, 23, 127, 16, 0.613
> > > 256, 3, 64, 23, 127, 16, 0.69
> > > 256, 45, 64, 23, 127, 16, 0.687
> > > 256, 4, 64, 23, 127, 16, 0.69
> > > 256, 60, 64, 23, 127, 16, 0.604
> > > 256, 5, 64, 23, 127, 16, 0.687
> > > 256, 75, 64, 23, 127, 16, 0.687
> > > 256, 6, 64, 23, 127, 16, 0.69
> > > 256, 90, 64, 23, 127, 16, 0.61
> > > 256, 7, 64, 23, 127, 16, 0.69
> > > 256, 105, 64, 23, 127, 16, 0.685
> > > 1, 0, 0, 23, 127, 16, 0.981
> > > 2, 0, 1, 23, 127, 16, 0.985
> > > 3, 0, 2, 23, 127, 16, 0.985
> > > 4, 0, 3, 23, 127, 16, 0.981
> > > 5, 0, 4, 23, 127, 16, 0.979
> > > 6, 0, 5, 23, 127, 16, 0.986
> > > 7, 0, 6, 23, 127, 16, 0.986
> > > 8, 0, 7, 23, 127, 16, 0.982
> > > 9, 0, 8, 23, 127, 16, 0.982
> > > 10, 0, 9, 23, 127, 16, 0.98
> > > 11, 0, 10, 23, 127, 16, 0.983
> > > 12, 0, 11, 23, 127, 16, 0.982
> > > 13, 0, 12, 23, 127, 16, 0.982
> > > 14, 0, 13, 23, 127, 16, 0.982
> > > 15, 0, 14, 23, 127, 16, 0.982
> > > 16, 0, 15, 23, 127, 16, 0.582
> > > 17, 0, 16, 23, 127, 16, 0.542
> > > 18, 0, 17, 23, 127, 16, 0.554
> > > 19, 0, 18, 23, 127, 16, 0.562
> > > 20, 0, 19, 23, 127, 16, 0.587
> > > 21, 0, 20, 23, 127, 16, 0.584
> > > 22, 0, 21, 23, 127, 16, 0.587
> > > 23, 0, 22, 23, 127, 16, 0.594
> > > 24, 0, 23, 23, 127, 16, 0.581
> > > 25, 0, 24, 23, 127, 16, 0.577
> > > 26, 0, 25, 23, 127, 16, 0.588
> > > 27, 0, 26, 23, 127, 16, 0.589
> > > 28, 0, 27, 23, 127, 16, 0.596
> > > 29, 0, 28, 23, 127, 16, 0.591
> > > 30, 0, 29, 23, 127, 16, 0.585
> > > 31, 0, 30, 23, 127, 16, 0.59
> > > 32, 0, 31, 23, 127, 16, 0.669
> > >
> > > sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
> > > sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
> > > sysdeps/x86_64/strrchr.S | 505 +++++++++++++++---------
> > > sysdeps/x86_64/wcsrchr.S | 268 +------------
> > > 4 files changed, 334 insertions(+), 444 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > > index db1b44c23c..866396e947 100644
> > > --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > > +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > > @@ -17,7 +17,7 @@
> > > <https://www.gnu.org/licenses/>. */
> > >
> > > #if IS_IN (libc)
> > > -# define strrchr __strrchr_sse2
> > > +# define STRRCHR __strrchr_sse2
> > >
> > > # undef weak_alias
> > > # define weak_alias(strrchr, rindex)
> > > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > > index 78d1ca6553..69d2f3cdb1 100644
> > > --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > > +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > > @@ -17,7 +17,6 @@
> > > <https://www.gnu.org/licenses/>. */
> > >
> > > #if IS_IN (libc)
> > > -# define wcsrchr __wcsrchr_sse2
> > > +# define STRRCHR __wcsrchr_sse2
> > > #endif
> > > -
> > > #include "../wcsrchr.S"
> > > diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> > > index 50d886713e..94449ad806 100644
> > > --- a/sysdeps/x86_64/strrchr.S
> > > +++ b/sysdeps/x86_64/strrchr.S
> > > @@ -19,210 +19,355 @@
> > >
> > > #include <sysdep.h>
> > >
> > > +#ifndef STRRCHR
> > > +# define STRRCHR strrchr
> > > +#endif
> > > +
> > > +#ifdef USE_AS_WCSRCHR
> > > +# define PCMPEQ pcmpeqd
> > > +# define CHAR_SIZE 4
> > > +# define PMINU pminud
> > > +#else
> > > +# define PCMPEQ pcmpeqb
> > > +# define CHAR_SIZE 1
> > > +# define PMINU pminub
> > > +#endif
> > > +
> > > +#define PAGE_SIZE 4096
> > > +#define VEC_SIZE 16
> > > +
> > > .text
> > > -ENTRY (strrchr)
> > > - movd %esi, %xmm1
> > > +ENTRY(STRRCHR)
> > > + movd %esi, %xmm0
> > > movq %rdi, %rax
> > > - andl $4095, %eax
> > > - punpcklbw %xmm1, %xmm1
> > > - cmpq $4032, %rax
> > > - punpcklwd %xmm1, %xmm1
> > > - pshufd $0, %xmm1, %xmm1
> > > + andl $(PAGE_SIZE - 1), %eax
> > > +#ifndef USE_AS_WCSRCHR
> > > + punpcklbw %xmm0, %xmm0
> > > + punpcklwd %xmm0, %xmm0
> > > +#endif
> > > + pshufd $0, %xmm0, %xmm0
> > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > > ja L(cross_page)
> > > - movdqu (%rdi), %xmm0
> > > +
> > > +L(cross_page_continue):
> > > + movups (%rdi), %xmm1
> > > pxor %xmm2, %xmm2
> > > - movdqa %xmm0, %xmm3
> > > - pcmpeqb %xmm1, %xmm0
> > > - pcmpeqb %xmm2, %xmm3
> > > - pmovmskb %xmm0, %ecx
> > > - pmovmskb %xmm3, %edx
> > > - testq %rdx, %rdx
> > > - je L(next_48_bytes)
> > > - leaq -1(%rdx), %rax
> > > - xorq %rdx, %rax
> > > - andq %rcx, %rax
> > > - je L(exit)
> > > - bsrq %rax, %rax
> > > + PCMPEQ %xmm1, %xmm2
> > > + pmovmskb %xmm2, %ecx
> > > + testl %ecx, %ecx
> > > + jz L(aligned_more)
> > > +
> > > + PCMPEQ %xmm0, %xmm1
> > > + pmovmskb %xmm1, %eax
> > > + leal -1(%rcx), %edx
> > > + xorl %edx, %ecx
> > > + andl %ecx, %eax
> > > + jz L(ret0)
> > > + bsrl %eax, %eax
> > > addq %rdi, %rax
> > > + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> > > + search CHAR is zero we are correct. Either way `andq
> > > + -CHAR_SIZE, %rax` gets the correct result. */
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > +L(ret0):
> > > ret
> > >
> > > + /* Returns for first vec x1/x2 have hard coded backward search
> > > + path for earlier matches. */
> > > .p2align 4
> > > -L(next_48_bytes):
> > > - movdqu 16(%rdi), %xmm4
> > > - movdqa %xmm4, %xmm5
> > > - movdqu 32(%rdi), %xmm3
> > > - pcmpeqb %xmm1, %xmm4
> > > - pcmpeqb %xmm2, %xmm5
> > > - movdqu 48(%rdi), %xmm0
> > > - pmovmskb %xmm5, %edx
> > > - movdqa %xmm3, %xmm5
> > > - pcmpeqb %xmm1, %xmm3
> > > - pcmpeqb %xmm2, %xmm5
> > > - pcmpeqb %xmm0, %xmm2
> > > - salq $16, %rdx
> > > - pmovmskb %xmm3, %r8d
> > > - pmovmskb %xmm5, %eax
> > > - pmovmskb %xmm2, %esi
> > > - salq $32, %r8
> > > - salq $32, %rax
> > > - pcmpeqb %xmm1, %xmm0
> > > - orq %rdx, %rax
> > > - movq %rsi, %rdx
> > > - pmovmskb %xmm4, %esi
> > > - salq $48, %rdx
> > > - salq $16, %rsi
> > > - orq %r8, %rsi
> > > - orq %rcx, %rsi
> > > - pmovmskb %xmm0, %ecx
> > > - salq $48, %rcx
> > > - orq %rcx, %rsi
> > > - orq %rdx, %rax
> > > - je L(loop_header2)
> > > - leaq -1(%rax), %rcx
> > > - xorq %rax, %rcx
> > > - andq %rcx, %rsi
> > > - je L(exit)
> > > - bsrq %rsi, %rsi
> > > - leaq (%rdi,%rsi), %rax
> > > +L(first_vec_x0_test):
> > > + PCMPEQ %xmm0, %xmm1
> > > + pmovmskb %xmm1, %eax
> > > + testl %eax, %eax
> > > + jz L(ret0)
> > > + bsrl %eax, %eax
> > > + addq %r8, %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > ret
> > >
> > > .p2align 4
> > > -L(loop_header2):
> > > - testq %rsi, %rsi
> > > - movq %rdi, %rcx
> > > - je L(no_c_found)
> > > -L(loop_header):
> > > - addq $64, %rdi
> > > - pxor %xmm7, %xmm7
> > > - andq $-64, %rdi
> > > - jmp L(loop_entry)
> > > +L(first_vec_x1):
> > > + PCMPEQ %xmm0, %xmm2
> > > + pmovmskb %xmm2, %eax
> > > + leal -1(%rcx), %edx
> > > + xorl %edx, %ecx
> > > + andl %ecx, %eax
> > > + jz L(first_vec_x0_test)
> > > + bsrl %eax, %eax
> > > + leaq (VEC_SIZE)(%rdi, %rax), %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > + ret
> > >
> > > .p2align 4
> > > -L(loop64):
> > > - testq %rdx, %rdx
> > > - cmovne %rdx, %rsi
> > > - cmovne %rdi, %rcx
> > > - addq $64, %rdi
> > > -L(loop_entry):
> > > - movdqa 32(%rdi), %xmm3
> > > - pxor %xmm6, %xmm6
> > > - movdqa 48(%rdi), %xmm2
> > > - movdqa %xmm3, %xmm0
> > > - movdqa 16(%rdi), %xmm4
> > > - pminub %xmm2, %xmm0
> > > - movdqa (%rdi), %xmm5
> > > - pminub %xmm4, %xmm0
> > > - pminub %xmm5, %xmm0
> > > - pcmpeqb %xmm7, %xmm0
> > > - pmovmskb %xmm0, %eax
> > > - movdqa %xmm5, %xmm0
> > > - pcmpeqb %xmm1, %xmm0
> > > - pmovmskb %xmm0, %r9d
> > > - movdqa %xmm4, %xmm0
> > > - pcmpeqb %xmm1, %xmm0
> > > - pmovmskb %xmm0, %edx
> > > - movdqa %xmm3, %xmm0
> > > - pcmpeqb %xmm1, %xmm0
> > > - salq $16, %rdx
> > > - pmovmskb %xmm0, %r10d
> > > - movdqa %xmm2, %xmm0
> > > - pcmpeqb %xmm1, %xmm0
> > > - salq $32, %r10
> > > - orq %r10, %rdx
> > > - pmovmskb %xmm0, %r8d
> > > - orq %r9, %rdx
> > > - salq $48, %r8
> > > - orq %r8, %rdx
> > > +L(first_vec_x1_test):
> > > + PCMPEQ %xmm0, %xmm2
> > > + pmovmskb %xmm2, %eax
> > > testl %eax, %eax
> > > - je L(loop64)
> > > - pcmpeqb %xmm6, %xmm4
> > > - pcmpeqb %xmm6, %xmm3
> > > - pcmpeqb %xmm6, %xmm5
> > > - pmovmskb %xmm4, %eax
> > > - pmovmskb %xmm3, %r10d
> > > - pcmpeqb %xmm6, %xmm2
> > > - pmovmskb %xmm5, %r9d
> > > - salq $32, %r10
> > > - salq $16, %rax
> > > - pmovmskb %xmm2, %r8d
> > > - orq %r10, %rax
> > > - orq %r9, %rax
> > > - salq $48, %r8
> > > - orq %r8, %rax
> > > - leaq -1(%rax), %r8
> > > - xorq %rax, %r8
> > > - andq %r8, %rdx
> > > - cmovne %rdi, %rcx
> > > - cmovne %rdx, %rsi
> > > - bsrq %rsi, %rsi
> > > - leaq (%rcx,%rsi), %rax
> > > + jz L(first_vec_x0_test)
> > > + bsrl %eax, %eax
> > > + leaq (VEC_SIZE)(%rdi, %rax), %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > + ret
> > > +
> > > + .p2align 4
> > > +L(first_vec_x2):
> > > + PCMPEQ %xmm0, %xmm3
> > > + pmovmskb %xmm3, %eax
> > > + leal -1(%rcx), %edx
> > > + xorl %edx, %ecx
> > > + andl %ecx, %eax
> > > + jz L(first_vec_x1_test)
> > > + bsrl %eax, %eax
> > > + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > + ret
> > > +
> > > + .p2align 4
> > > +L(aligned_more):
> > > + /* Save original pointer if match was in VEC 0. */
> > > + movq %rdi, %r8
> > > + andq $-VEC_SIZE, %rdi
> > > +
> > > + movaps VEC_SIZE(%rdi), %xmm2
> > > + pxor %xmm3, %xmm3
> > > + PCMPEQ %xmm2, %xmm3
> > > + pmovmskb %xmm3, %ecx
> > > + testl %ecx, %ecx
> > > + jnz L(first_vec_x1)
> > > +
> > > + movaps (VEC_SIZE * 2)(%rdi), %xmm3
> > > + pxor %xmm4, %xmm4
> > > + PCMPEQ %xmm3, %xmm4
> > > + pmovmskb %xmm4, %ecx
> > > + testl %ecx, %ecx
> > > + jnz L(first_vec_x2)
> > > +
> > > + addq $VEC_SIZE, %rdi
> > > + /* Save pointer again before realigning. */
> > > + movq %rdi, %rsi
> > > + andq $-(VEC_SIZE * 2), %rdi
> > > + .p2align 4
> > > +L(first_loop):
> > > + /* Do 2x VEC at a time. */
> > > + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> > > + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> > > +	/* If SSE2, there is no pminud.  */
> > > +#ifdef NO_PMINU
> >
> > Do we really need SSE4.1 wcsrchr? I think we should focus on AVX2 and
> > above.
>
> It seems like freebie performance that can make a difference in the loop
> cases. (see the SSE4.1 commit for numbers).
But these numbers are on Tiger Lake. I think we should continue to
improve SSE2
version and optimize AVX2/AVX512. I don't think we should increase code sizes
for SSE4.
> Imo there is little harm but if you feel strongly I'll drop. (In V2 will
> change the .text section for SSE4_1).
>
> What do you think?
> >
> > > + movaps %xmm5, %xmm6
> > > + pxor %xmm8, %xmm8
> > > +
> > > + PCMPEQ %xmm8, %xmm5
> > > + PCMPEQ %xmm4, %xmm8
> > > + por %xmm5, %xmm8
> > > +#else
> > > + movaps %xmm5, %xmm6
> > > + PMINU %xmm4, %xmm5
> > > +#endif
> > > +
> > > + movaps %xmm4, %xmm9
> > > + PCMPEQ %xmm0, %xmm4
> > > + PCMPEQ %xmm0, %xmm6
> > > + movaps %xmm6, %xmm7
> > > + por %xmm4, %xmm6
> > > +#ifndef NO_PMINU
> > > + pxor %xmm8, %xmm8
> > > + PCMPEQ %xmm5, %xmm8
> > > +#endif
> > > + pmovmskb %xmm8, %ecx
> > > + pmovmskb %xmm6, %eax
> > > +
> > > + addq $(VEC_SIZE * 2), %rdi
> > > + /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> > > + macro-fuse with `jz`. */
> > > + addl %ecx, %eax
> > > + jz L(first_loop)
> > > +
> > > + /* Check if there is zero match. */
> > > + testl %ecx, %ecx
> > > + jz L(second_loop_match)
> > > +
> > > + /* Check if there was a match in last iteration. */
> > > + subl %ecx, %eax
> > > + jnz L(new_match)
> > > +
> > > +L(first_loop_old_match):
> > > + PCMPEQ %xmm0, %xmm2
> > > + PCMPEQ %xmm0, %xmm3
> > > + pmovmskb %xmm2, %ecx
> > > + pmovmskb %xmm3, %eax
> > > + addl %eax, %ecx
> > > + jz L(first_vec_x0_test)
> > > + /* NB: We could move this shift to before the branch and save a
> > > + bit of code size / performance on the fall through. The
> > > + branch leads to the null case which generally seems hotter
> > > + than char in first 3x VEC. */
> > > + sall $16, %eax
> > > + orl %ecx, %eax
> > > +
> > > + bsrl %eax, %eax
> > > + addq %rsi, %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > + ret
> > > +
> > > + .p2align 4
> > > +L(new_match):
> > > + pxor %xmm6, %xmm6
> > > + PCMPEQ %xmm9, %xmm6
> > > + pmovmskb %xmm6, %eax
> > > + sall $16, %ecx
> > > + orl %eax, %ecx
> > > +
> > > +	/* We can't reuse either of the old comparisons since we mask
> > > +	   off zeros after the first zero (instead of using the full
> > > +	   comparison), so we can't guarantee no interference between
> > > +	   a match after the end of the string and a valid match.  */
> > > + pmovmskb %xmm4, %eax
> > > + pmovmskb %xmm7, %edx
> > > + sall $16, %edx
> > > + orl %edx, %eax
> > > +
> > > + leal -1(%ecx), %edx
> > > + xorl %edx, %ecx
> > > + andl %ecx, %eax
> > > + jz L(first_loop_old_match)
> > > + bsrl %eax, %eax
> > > + addq %rdi, %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > ret
> > >
> > > + /* Save minimum state for getting most recent match. We can
> > > + throw out all previous work. */
> > > .p2align 4
> > > -L(no_c_found):
> > > - movl $1, %esi
> > > - xorl %ecx, %ecx
> > > - jmp L(loop_header)
> > > +L(second_loop_match):
> > > + movq %rdi, %rsi
> > > + movaps %xmm4, %xmm2
> > > + movaps %xmm7, %xmm3
> > >
> > > .p2align 4
> > > -L(exit):
> > > - xorl %eax, %eax
> > > +L(second_loop):
> > > + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> > > + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> > > +#ifdef NO_PMINU
> > > + movaps %xmm5, %xmm6
> > > + pxor %xmm8, %xmm8
> > > +
> > > + PCMPEQ %xmm8, %xmm5
> > > + PCMPEQ %xmm4, %xmm8
> > > + por %xmm5, %xmm8
> > > +#else
> > > + movaps %xmm5, %xmm6
> > > + PMINU %xmm4, %xmm5
> > > +#endif
> > > +
> > > + movaps %xmm4, %xmm9
> > > + PCMPEQ %xmm0, %xmm4
> > > + PCMPEQ %xmm0, %xmm6
> > > + movaps %xmm6, %xmm7
> > > + por %xmm4, %xmm6
> > > +#ifndef NO_PMINU
> > > + pxor %xmm8, %xmm8
> > > + PCMPEQ %xmm5, %xmm8
> > > +#endif
> > > +
> > > + pmovmskb %xmm8, %ecx
> > > + pmovmskb %xmm6, %eax
> > > +
> > > + addq $(VEC_SIZE * 2), %rdi
> > > +	/* Either null term or new occurrence of CHAR.  */
> > > + addl %ecx, %eax
> > > + jz L(second_loop)
> > > +
> > > +	/* No null term so it must be a new occurrence of CHAR.  */
> > > + testl %ecx, %ecx
> > > + jz L(second_loop_match)
> > > +
> > > +
> > > + subl %ecx, %eax
> > > + jnz L(second_loop_new_match)
> > > +
> > > +L(second_loop_old_match):
> > > + pmovmskb %xmm2, %ecx
> > > + pmovmskb %xmm3, %eax
> > > + sall $16, %eax
> > > + orl %ecx, %eax
> > > + bsrl %eax, %eax
> > > + addq %rsi, %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > ret
> > >
> > > .p2align 4
> > > +L(second_loop_new_match):
> > > + pxor %xmm6, %xmm6
> > > + PCMPEQ %xmm9, %xmm6
> > > + pmovmskb %xmm6, %eax
> > > + sall $16, %ecx
> > > + orl %eax, %ecx
> > > +
> > > +	/* We can't reuse either of the old comparisons since we mask
> > > +	   off zeros after the first zero (instead of using the full
> > > +	   comparison), so we can't guarantee no interference between
> > > +	   a match after the end of the string and a valid match.  */
> > > + pmovmskb %xmm4, %eax
> > > + pmovmskb %xmm7, %edx
> > > + sall $16, %edx
> > > + orl %edx, %eax
> > > +
> > > + leal -1(%ecx), %edx
> > > + xorl %edx, %ecx
> > > + andl %ecx, %eax
> > > + jz L(second_loop_old_match)
> > > + bsrl %eax, %eax
> > > + addq %rdi, %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > + ret
> > > +
> > > + .p2align 4,, 4
> > > L(cross_page):
> > > - movq %rdi, %rax
> > > - pxor %xmm0, %xmm0
> > > - andq $-64, %rax
> > > - movdqu (%rax), %xmm5
> > > - movdqa %xmm5, %xmm6
> > > - movdqu 16(%rax), %xmm4
> > > - pcmpeqb %xmm1, %xmm5
> > > - pcmpeqb %xmm0, %xmm6
> > > - movdqu 32(%rax), %xmm3
> > > - pmovmskb %xmm6, %esi
> > > - movdqa %xmm4, %xmm6
> > > - movdqu 48(%rax), %xmm2
> > > - pcmpeqb %xmm1, %xmm4
> > > - pcmpeqb %xmm0, %xmm6
> > > - pmovmskb %xmm6, %edx
> > > - movdqa %xmm3, %xmm6
> > > - pcmpeqb %xmm1, %xmm3
> > > - pcmpeqb %xmm0, %xmm6
> > > - pcmpeqb %xmm2, %xmm0
> > > - salq $16, %rdx
> > > - pmovmskb %xmm3, %r9d
> > > - pmovmskb %xmm6, %r8d
> > > - pmovmskb %xmm0, %ecx
> > > - salq $32, %r9
> > > - salq $32, %r8
> > > - pcmpeqb %xmm1, %xmm2
> > > - orq %r8, %rdx
> > > - salq $48, %rcx
> > > - pmovmskb %xmm5, %r8d
> > > - orq %rsi, %rdx
> > > - pmovmskb %xmm4, %esi
> > > - orq %rcx, %rdx
> > > - pmovmskb %xmm2, %ecx
> > > - salq $16, %rsi
> > > - salq $48, %rcx
> > > - orq %r9, %rsi
> > > - orq %r8, %rsi
> > > - orq %rcx, %rsi
> > > + movq %rdi, %rsi
> > > + andq $-VEC_SIZE, %rsi
> > > + movaps (%rsi), %xmm1
> > > + pxor %xmm2, %xmm2
> > > + PCMPEQ %xmm1, %xmm2
> > > + pmovmskb %xmm2, %edx
> > > movl %edi, %ecx
> > > - subl %eax, %ecx
> > > - shrq %cl, %rdx
> > > - shrq %cl, %rsi
> > > - testq %rdx, %rdx
> > > - je L(loop_header2)
> > > - leaq -1(%rdx), %rax
> > > - xorq %rdx, %rax
> > > - andq %rax, %rsi
> > > - je L(exit)
> > > - bsrq %rsi, %rax
> > > + andl $(VEC_SIZE - 1), %ecx
> > > + sarl %cl, %edx
> > > + jz L(cross_page_continue)
> > > + PCMPEQ %xmm0, %xmm1
> > > + pmovmskb %xmm1, %eax
> > > + sarl %cl, %eax
> > > + leal -1(%rdx), %ecx
> > > + xorl %edx, %ecx
> > > + andl %ecx, %eax
> > > + jz L(ret1)
> > > + bsrl %eax, %eax
> > > addq %rdi, %rax
> > > +#ifdef USE_AS_WCSRCHR
> > > + andq $-CHAR_SIZE, %rax
> > > +#endif
> > > +L(ret1):
> > > ret
> > > -END (strrchr)
> > > +END(STRRCHR)
> > >
> > > -weak_alias (strrchr, rindex)
> > > -libc_hidden_builtin_def (strrchr)
> > > +#ifndef USE_AS_WCSRCHR
> > > + weak_alias (STRRCHR, rindex)
> > > + libc_hidden_builtin_def (STRRCHR)
> > > +#endif
> > > diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> > > index 61552954de..2b80efc5ef 100644
> > > --- a/sysdeps/x86_64/wcsrchr.S
> > > +++ b/sysdeps/x86_64/wcsrchr.S
> > > @@ -1,4 +1,4 @@
> > > -/* wcsrchr with SSSE3
> > > +/* wcsrchr optimized with SSE2.
> > > Copyright (C) 2011-2022 Free Software Foundation, Inc.
> > > This file is part of the GNU C Library.
> > >
> > > @@ -16,266 +16,12 @@
> > > License along with the GNU C Library; if not, see
> > > <https://www.gnu.org/licenses/>. */
> > >
> > > -#include <sysdep.h>
> > >
> > > - .text
> > > -ENTRY (wcsrchr)
> > > +#define USE_AS_WCSRCHR 1
> > > +#define NO_PMINU 1
> > >
> > > - movd %rsi, %xmm1
> > > - mov %rdi, %rcx
> > > - punpckldq %xmm1, %xmm1
> > > - pxor %xmm2, %xmm2
> > > - punpckldq %xmm1, %xmm1
> > > - and $63, %rcx
> > > - cmp $48, %rcx
> > > - ja L(crosscache)
> > > +#ifndef STRRCHR
> > > +# define STRRCHR wcsrchr
> > > +#endif
> > >
> > > - movdqu (%rdi), %xmm0
> > > - pcmpeqd %xmm0, %xmm2
> > > - pcmpeqd %xmm1, %xmm0
> > > - pmovmskb %xmm2, %rcx
> > > - pmovmskb %xmm0, %rax
> > > - add $16, %rdi
> > > -
> > > - test %rax, %rax
> > > - jnz L(unaligned_match1)
> > > -
> > > - test %rcx, %rcx
> > > - jnz L(return_null)
> > > -
> > > - and $-16, %rdi
> > > - xor %r8, %r8
> > > - jmp L(loop)
> > > -
> > > - .p2align 4
> > > -L(unaligned_match1):
> > > - test %rcx, %rcx
> > > - jnz L(prolog_find_zero_1)
> > > -
> > > - mov %rax, %r8
> > > - mov %rdi, %rsi
> > > - and $-16, %rdi
> > > - jmp L(loop)
> > > -
> > > - .p2align 4
> > > -L(crosscache):
> > > - and $15, %rcx
> > > - and $-16, %rdi
> > > - pxor %xmm3, %xmm3
> > > - movdqa (%rdi), %xmm0
> > > - pcmpeqd %xmm0, %xmm3
> > > - pcmpeqd %xmm1, %xmm0
> > > - pmovmskb %xmm3, %rdx
> > > - pmovmskb %xmm0, %rax
> > > - shr %cl, %rdx
> > > - shr %cl, %rax
> > > - add $16, %rdi
> > > -
> > > - test %rax, %rax
> > > - jnz L(unaligned_match)
> > > -
> > > - test %rdx, %rdx
> > > - jnz L(return_null)
> > > -
> > > - xor %r8, %r8
> > > - jmp L(loop)
> > > -
> > > - .p2align 4
> > > -L(unaligned_match):
> > > - test %rdx, %rdx
> > > - jnz L(prolog_find_zero)
> > > -
> > > - mov %rax, %r8
> > > - lea (%rdi, %rcx), %rsi
> > > -
> > > -/* Loop start on aligned string. */
> > > - .p2align 4
> > > -L(loop):
> > > - movdqa (%rdi), %xmm0
> > > - pcmpeqd %xmm0, %xmm2
> > > - add $16, %rdi
> > > - pcmpeqd %xmm1, %xmm0
> > > - pmovmskb %xmm2, %rcx
> > > - pmovmskb %xmm0, %rax
> > > - or %rax, %rcx
> > > - jnz L(matches)
> > > -
> > > - movdqa (%rdi), %xmm3
> > > - pcmpeqd %xmm3, %xmm2
> > > - add $16, %rdi
> > > - pcmpeqd %xmm1, %xmm3
> > > - pmovmskb %xmm2, %rcx
> > > - pmovmskb %xmm3, %rax
> > > - or %rax, %rcx
> > > - jnz L(matches)
> > > -
> > > - movdqa (%rdi), %xmm4
> > > - pcmpeqd %xmm4, %xmm2
> > > - add $16, %rdi
> > > - pcmpeqd %xmm1, %xmm4
> > > - pmovmskb %xmm2, %rcx
> > > - pmovmskb %xmm4, %rax
> > > - or %rax, %rcx
> > > - jnz L(matches)
> > > -
> > > - movdqa (%rdi), %xmm5
> > > - pcmpeqd %xmm5, %xmm2
> > > - add $16, %rdi
> > > - pcmpeqd %xmm1, %xmm5
> > > - pmovmskb %xmm2, %rcx
> > > - pmovmskb %xmm5, %rax
> > > - or %rax, %rcx
> > > - jz L(loop)
> > > -
> > > - .p2align 4
> > > -L(matches):
> > > - test %rax, %rax
> > > - jnz L(match)
> > > -L(return_value):
> > > - test %r8, %r8
> > > - jz L(return_null)
> > > - mov %r8, %rax
> > > - mov %rsi, %rdi
> > > -
> > > - test $15 << 4, %ah
> > > - jnz L(match_fourth_wchar)
> > > - test %ah, %ah
> > > - jnz L(match_third_wchar)
> > > - test $15 << 4, %al
> > > - jnz L(match_second_wchar)
> > > - lea -16(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(match):
> > > - pmovmskb %xmm2, %rcx
> > > - test %rcx, %rcx
> > > - jnz L(find_zero)
> > > - mov %rax, %r8
> > > - mov %rdi, %rsi
> > > - jmp L(loop)
> > > -
> > > - .p2align 4
> > > -L(find_zero):
> > > - test $15, %cl
> > > - jnz L(find_zero_in_first_wchar)
> > > - test %cl, %cl
> > > - jnz L(find_zero_in_second_wchar)
> > > - test $15, %ch
> > > - jnz L(find_zero_in_third_wchar)
> > > -
> > > - and $1 << 13 - 1, %rax
> > > - jz L(return_value)
> > > -
> > > - test $15 << 4, %ah
> > > - jnz L(match_fourth_wchar)
> > > - test %ah, %ah
> > > - jnz L(match_third_wchar)
> > > - test $15 << 4, %al
> > > - jnz L(match_second_wchar)
> > > - lea -16(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(find_zero_in_first_wchar):
> > > - test $1, %rax
> > > - jz L(return_value)
> > > - lea -16(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(find_zero_in_second_wchar):
> > > - and $1 << 5 - 1, %rax
> > > - jz L(return_value)
> > > -
> > > - test $15 << 4, %al
> > > - jnz L(match_second_wchar)
> > > - lea -16(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(find_zero_in_third_wchar):
> > > - and $1 << 9 - 1, %rax
> > > - jz L(return_value)
> > > -
> > > - test %ah, %ah
> > > - jnz L(match_third_wchar)
> > > - test $15 << 4, %al
> > > - jnz L(match_second_wchar)
> > > - lea -16(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(prolog_find_zero):
> > > - add %rcx, %rdi
> > > - mov %rdx, %rcx
> > > -L(prolog_find_zero_1):
> > > - test $15, %cl
> > > - jnz L(prolog_find_zero_in_first_wchar)
> > > - test %cl, %cl
> > > - jnz L(prolog_find_zero_in_second_wchar)
> > > - test $15, %ch
> > > - jnz L(prolog_find_zero_in_third_wchar)
> > > -
> > > - and $1 << 13 - 1, %rax
> > > - jz L(return_null)
> > > -
> > > - test $15 << 4, %ah
> > > - jnz L(match_fourth_wchar)
> > > - test %ah, %ah
> > > - jnz L(match_third_wchar)
> > > - test $15 << 4, %al
> > > - jnz L(match_second_wchar)
> > > - lea -16(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(prolog_find_zero_in_first_wchar):
> > > - test $1, %rax
> > > - jz L(return_null)
> > > - lea -16(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(prolog_find_zero_in_second_wchar):
> > > - and $1 << 5 - 1, %rax
> > > - jz L(return_null)
> > > -
> > > - test $15 << 4, %al
> > > - jnz L(match_second_wchar)
> > > - lea -16(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(prolog_find_zero_in_third_wchar):
> > > - and $1 << 9 - 1, %rax
> > > - jz L(return_null)
> > > -
> > > - test %ah, %ah
> > > - jnz L(match_third_wchar)
> > > - test $15 << 4, %al
> > > - jnz L(match_second_wchar)
> > > - lea -16(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(match_second_wchar):
> > > - lea -12(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(match_third_wchar):
> > > - lea -8(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(match_fourth_wchar):
> > > - lea -4(%rdi), %rax
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(return_null):
> > > - xor %rax, %rax
> > > - ret
> > > -
> > > -END (wcsrchr)
> > > +#include "../strrchr.S"
> > > --
> > > 2.25.1
> > >
> >
> >
> > --
> > H.J.
On Thu, Apr 21, 2022 at 4:49 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Thu, Apr 21, 2022 at 1:57 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Thu, Apr 21, 2022 at 3:27 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Wed, Apr 20, 2022 at 8:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > The new code unrolls the main loop slightly without adding too much
> > > > overhead and minimizes the comparisons for the search CHAR.
> > > >
> > > > Geometric Mean of all benchmarks New / Old: 0.741
> > > > See email for all results.
> > > >
> > > > Full xcheck passes on x86_64 with and without multiarch enabled.
> > > > ---
> > > > Results For: strrchr
> > > >
> > > > Geometric Mean of N=30 runs.
> > > >
> > > > Geometric Mean of all benchmarks New / Old: 0.741
> > > > Benchmarks performance on Tigerlake:
> > > > https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
> > > >
> > > > len, align, pos, seek, max_char, freq, New Time / Old Time
> > > > 2048, 0, 32, 0, 127, 1, 0.647
> > > > 2048, 1, 32, 0, 127, 1, 0.621
> > > > 2048, 0, 64, 0, 127, 1, 0.661
> > > > 2048, 2, 64, 0, 127, 1, 0.655
> > > > 2048, 0, 128, 0, 127, 1, 0.69
> > > > 2048, 3, 128, 0, 127, 1, 0.689
> > > > 2048, 0, 256, 0, 127, 1, 0.718
> > > > 2048, 4, 256, 0, 127, 1, 0.718
> > > > 2048, 0, 512, 0, 127, 1, 0.758
> > > > 2048, 5, 512, 0, 127, 1, 0.754
> > > > 2048, 0, 1024, 0, 127, 1, 1.029
> > > > 2048, 6, 1024, 0, 127, 1, 1.032
> > > > 2048, 0, 2048, 0, 127, 1, 0.826
> > > > 2048, 7, 2048, 0, 127, 1, 0.834
> > > > 2048, 0, 4096, 0, 127, 1, 0.825
> > > > 2048, 8, 4096, 0, 127, 1, 0.83
> > > > 256, 1, 64, 0, 127, 1, 0.657
> > > > 256, 15, 64, 0, 127, 1, 0.657
> > > > 256, 2, 64, 0, 127, 1, 0.657
> > > > 256, 30, 64, 0, 127, 1, 0.523
> > > > 256, 3, 64, 0, 127, 1, 0.657
> > > > 256, 45, 64, 0, 127, 1, 0.654
> > > > 256, 4, 64, 0, 127, 1, 0.657
> > > > 256, 60, 64, 0, 127, 1, 0.526
> > > > 256, 5, 64, 0, 127, 1, 0.658
> > > > 256, 75, 64, 0, 127, 1, 0.658
> > > > 256, 6, 64, 0, 127, 1, 0.655
> > > > 256, 90, 64, 0, 127, 1, 0.523
> > > > 256, 7, 64, 0, 127, 1, 0.655
> > > > 256, 105, 64, 0, 127, 1, 0.654
> > > > 1, 0, 0, 0, 127, 1, 0.98
> > > > 2, 0, 1, 0, 127, 1, 0.978
> > > > 3, 0, 2, 0, 127, 1, 0.975
> > > > 4, 0, 3, 0, 127, 1, 0.976
> > > > 5, 0, 4, 0, 127, 1, 0.977
> > > > 6, 0, 5, 0, 127, 1, 0.981
> > > > 7, 0, 6, 0, 127, 1, 0.982
> > > > 8, 0, 7, 0, 127, 1, 0.98
> > > > 9, 0, 8, 0, 127, 1, 0.978
> > > > 10, 0, 9, 0, 127, 1, 0.981
> > > > 11, 0, 10, 0, 127, 1, 0.984
> > > > 12, 0, 11, 0, 127, 1, 0.982
> > > > 13, 0, 12, 0, 127, 1, 0.98
> > > > 14, 0, 13, 0, 127, 1, 0.978
> > > > 15, 0, 14, 0, 127, 1, 0.979
> > > > 16, 0, 15, 0, 127, 1, 0.986
> > > > 17, 0, 16, 0, 127, 1, 0.529
> > > > 18, 0, 17, 0, 127, 1, 0.566
> > > > 19, 0, 18, 0, 127, 1, 0.575
> > > > 20, 0, 19, 0, 127, 1, 0.573
> > > > 21, 0, 20, 0, 127, 1, 0.579
> > > > 22, 0, 21, 0, 127, 1, 0.595
> > > > 23, 0, 22, 0, 127, 1, 0.585
> > > > 24, 0, 23, 0, 127, 1, 0.586
> > > > 25, 0, 24, 0, 127, 1, 0.587
> > > > 26, 0, 25, 0, 127, 1, 0.592
> > > > 27, 0, 26, 0, 127, 1, 0.595
> > > > 28, 0, 27, 0, 127, 1, 0.592
> > > > 29, 0, 28, 0, 127, 1, 0.6
> > > > 30, 0, 29, 0, 127, 1, 0.598
> > > > 31, 0, 30, 0, 127, 1, 0.595
> > > > 32, 0, 31, 0, 127, 1, 0.592
> > > > 2048, 0, 32, 23, 127, 1, 0.827
> > > > 2048, 1, 32, 23, 127, 1, 0.826
> > > > 2048, 0, 64, 23, 127, 1, 0.824
> > > > 2048, 2, 64, 23, 127, 1, 0.825
> > > > 2048, 0, 128, 23, 127, 1, 0.829
> > > > 2048, 3, 128, 23, 127, 1, 0.824
> > > > 2048, 0, 256, 23, 127, 1, 0.832
> > > > 2048, 4, 256, 23, 127, 1, 0.825
> > > > 2048, 0, 512, 23, 127, 1, 0.831
> > > > 2048, 5, 512, 23, 127, 1, 0.837
> > > > 2048, 0, 1024, 23, 127, 1, 0.721
> > > > 2048, 6, 1024, 23, 127, 1, 0.757
> > > > 2048, 0, 2048, 23, 127, 1, 0.825
> > > > 2048, 7, 2048, 23, 127, 1, 0.824
> > > > 2048, 0, 4096, 23, 127, 1, 0.828
> > > > 2048, 8, 4096, 23, 127, 1, 0.823
> > > > 256, 1, 64, 23, 127, 1, 0.665
> > > > 256, 15, 64, 23, 127, 1, 0.661
> > > > 256, 2, 64, 23, 127, 1, 0.674
> > > > 256, 30, 64, 23, 127, 1, 0.605
> > > > 256, 3, 64, 23, 127, 1, 0.668
> > > > 256, 45, 64, 23, 127, 1, 0.661
> > > > 256, 4, 64, 23, 127, 1, 0.657
> > > > 256, 60, 64, 23, 127, 1, 0.594
> > > > 256, 5, 64, 23, 127, 1, 0.654
> > > > 256, 75, 64, 23, 127, 1, 0.673
> > > > 256, 6, 64, 23, 127, 1, 0.688
> > > > 256, 90, 64, 23, 127, 1, 0.6
> > > > 256, 7, 64, 23, 127, 1, 0.66
> > > > 256, 105, 64, 23, 127, 1, 0.654
> > > > 1, 0, 0, 23, 127, 1, 0.981
> > > > 2, 0, 1, 23, 127, 1, 0.976
> > > > 3, 0, 2, 23, 127, 1, 0.983
> > > > 4, 0, 3, 23, 127, 1, 0.984
> > > > 5, 0, 4, 23, 127, 1, 0.973
> > > > 6, 0, 5, 23, 127, 1, 0.987
> > > > 7, 0, 6, 23, 127, 1, 0.977
> > > > 8, 0, 7, 23, 127, 1, 0.979
> > > > 9, 0, 8, 23, 127, 1, 0.981
> > > > 10, 0, 9, 23, 127, 1, 0.98
> > > > 11, 0, 10, 23, 127, 1, 0.983
> > > > 12, 0, 11, 23, 127, 1, 0.98
> > > > 13, 0, 12, 23, 127, 1, 0.98
> > > > 14, 0, 13, 23, 127, 1, 0.977
> > > > 15, 0, 14, 23, 127, 1, 0.982
> > > > 16, 0, 15, 23, 127, 1, 0.581
> > > > 17, 0, 16, 23, 127, 1, 0.551
> > > > 18, 0, 17, 23, 127, 1, 0.555
> > > > 19, 0, 18, 23, 127, 1, 0.586
> > > > 20, 0, 19, 23, 127, 1, 0.585
> > > > 21, 0, 20, 23, 127, 1, 0.582
> > > > 22, 0, 21, 23, 127, 1, 0.571
> > > > 23, 0, 22, 23, 127, 1, 0.576
> > > > 24, 0, 23, 23, 127, 1, 0.581
> > > > 25, 0, 24, 23, 127, 1, 0.589
> > > > 26, 0, 25, 23, 127, 1, 0.593
> > > > 27, 0, 26, 23, 127, 1, 0.595
> > > > 28, 0, 27, 23, 127, 1, 0.583
> > > > 29, 0, 28, 23, 127, 1, 0.595
> > > > 30, 0, 29, 23, 127, 1, 0.58
> > > > 31, 0, 30, 23, 127, 1, 0.594
> > > > 32, 0, 31, 23, 127, 1, 0.665
> > > > 2048, 0, 32, 23, 127, 2, 0.825
> > > > 2048, 1, 32, 23, 127, 2, 0.818
> > > > 2048, 0, 64, 23, 127, 2, 0.829
> > > > 2048, 2, 64, 23, 127, 2, 0.828
> > > > 2048, 0, 128, 23, 127, 2, 0.823
> > > > 2048, 3, 128, 23, 127, 2, 0.825
> > > > 2048, 0, 256, 23, 127, 2, 0.819
> > > > 2048, 4, 256, 23, 127, 2, 0.828
> > > > 2048, 0, 512, 23, 127, 2, 0.824
> > > > 2048, 5, 512, 23, 127, 2, 0.827
> > > > 2048, 0, 1024, 23, 127, 2, 0.813
> > > > 2048, 6, 1024, 23, 127, 2, 0.834
> > > > 2048, 0, 2048, 23, 127, 2, 0.927
> > > > 2048, 7, 2048, 23, 127, 2, 0.923
> > > > 2048, 0, 4096, 23, 127, 2, 0.818
> > > > 2048, 8, 4096, 23, 127, 2, 0.82
> > > > 256, 1, 64, 23, 127, 2, 0.693
> > > > 256, 15, 64, 23, 127, 2, 0.686
> > > > 256, 2, 64, 23, 127, 2, 0.69
> > > > 256, 30, 64, 23, 127, 2, 0.611
> > > > 256, 3, 64, 23, 127, 2, 0.692
> > > > 256, 45, 64, 23, 127, 2, 0.685
> > > > 256, 4, 64, 23, 127, 2, 0.688
> > > > 256, 60, 64, 23, 127, 2, 0.6
> > > > 256, 5, 64, 23, 127, 2, 0.69
> > > > 256, 75, 64, 23, 127, 2, 0.689
> > > > 256, 6, 64, 23, 127, 2, 0.688
> > > > 256, 90, 64, 23, 127, 2, 0.611
> > > > 256, 7, 64, 23, 127, 2, 0.69
> > > > 256, 105, 64, 23, 127, 2, 0.686
> > > > 1, 0, 0, 23, 127, 2, 0.982
> > > > 2, 0, 1, 23, 127, 2, 0.987
> > > > 3, 0, 2, 23, 127, 2, 0.978
> > > > 4, 0, 3, 23, 127, 2, 0.977
> > > > 5, 0, 4, 23, 127, 2, 0.979
> > > > 6, 0, 5, 23, 127, 2, 0.985
> > > > 7, 0, 6, 23, 127, 2, 0.975
> > > > 8, 0, 7, 23, 127, 2, 0.981
> > > > 9, 0, 8, 23, 127, 2, 0.984
> > > > 10, 0, 9, 23, 127, 2, 0.983
> > > > 11, 0, 10, 23, 127, 2, 0.982
> > > > 12, 0, 11, 23, 127, 2, 0.976
> > > > 13, 0, 12, 23, 127, 2, 0.985
> > > > 14, 0, 13, 23, 127, 2, 0.984
> > > > 15, 0, 14, 23, 127, 2, 0.98
> > > > 16, 0, 15, 23, 127, 2, 0.583
> > > > 17, 0, 16, 23, 127, 2, 0.552
> > > > 18, 0, 17, 23, 127, 2, 0.564
> > > > 19, 0, 18, 23, 127, 2, 0.585
> > > > 20, 0, 19, 23, 127, 2, 0.578
> > > > 21, 0, 20, 23, 127, 2, 0.578
> > > > 22, 0, 21, 23, 127, 2, 0.571
> > > > 23, 0, 22, 23, 127, 2, 0.587
> > > > 24, 0, 23, 23, 127, 2, 0.589
> > > > 25, 0, 24, 23, 127, 2, 0.593
> > > > 26, 0, 25, 23, 127, 2, 0.589
> > > > 27, 0, 26, 23, 127, 2, 0.588
> > > > 28, 0, 27, 23, 127, 2, 0.593
> > > > 29, 0, 28, 23, 127, 2, 0.579
> > > > 30, 0, 29, 23, 127, 2, 0.572
> > > > 31, 0, 30, 23, 127, 2, 0.582
> > > > 32, 0, 31, 23, 127, 2, 0.659
> > > > 2048, 0, 32, 23, 127, 4, 0.822
> > > > 2048, 1, 32, 23, 127, 4, 0.818
> > > > 2048, 0, 64, 23, 127, 4, 0.826
> > > > 2048, 2, 64, 23, 127, 4, 0.824
> > > > 2048, 0, 128, 23, 127, 4, 0.833
> > > > 2048, 3, 128, 23, 127, 4, 0.831
> > > > 2048, 0, 256, 23, 127, 4, 0.826
> > > > 2048, 4, 256, 23, 127, 4, 0.831
> > > > 2048, 0, 512, 23, 127, 4, 0.834
> > > > 2048, 5, 512, 23, 127, 4, 0.83
> > > > 2048, 0, 1024, 23, 127, 4, 0.836
> > > > 2048, 6, 1024, 23, 127, 4, 0.844
> > > > 2048, 0, 2048, 23, 127, 4, 0.696
> > > > 2048, 7, 2048, 23, 127, 4, 0.704
> > > > 2048, 0, 4096, 23, 127, 4, 0.936
> > > > 2048, 8, 4096, 23, 127, 4, 0.925
> > > > 256, 1, 64, 23, 127, 4, 0.694
> > > > 256, 15, 64, 23, 127, 4, 0.69
> > > > 256, 2, 64, 23, 127, 4, 0.687
> > > > 256, 30, 64, 23, 127, 4, 0.612
> > > > 256, 3, 64, 23, 127, 4, 0.685
> > > > 256, 45, 64, 23, 127, 4, 0.685
> > > > 256, 4, 64, 23, 127, 4, 0.684
> > > > 256, 60, 64, 23, 127, 4, 0.606
> > > > 256, 5, 64, 23, 127, 4, 0.69
> > > > 256, 75, 64, 23, 127, 4, 0.688
> > > > 256, 6, 64, 23, 127, 4, 0.69
> > > > 256, 90, 64, 23, 127, 4, 0.615
> > > > 256, 7, 64, 23, 127, 4, 0.691
> > > > 256, 105, 64, 23, 127, 4, 0.688
> > > > 1, 0, 0, 23, 127, 4, 0.982
> > > > 2, 0, 1, 23, 127, 4, 0.983
> > > > 3, 0, 2, 23, 127, 4, 0.981
> > > > 4, 0, 3, 23, 127, 4, 0.984
> > > > 5, 0, 4, 23, 127, 4, 0.963
> > > > 6, 0, 5, 23, 127, 4, 0.978
> > > > 7, 0, 6, 23, 127, 4, 0.985
> > > > 8, 0, 7, 23, 127, 4, 0.986
> > > > 9, 0, 8, 23, 127, 4, 0.978
> > > > 10, 0, 9, 23, 127, 4, 0.985
> > > > 11, 0, 10, 23, 127, 4, 0.986
> > > > 12, 0, 11, 23, 127, 4, 0.983
> > > > 13, 0, 12, 23, 127, 4, 0.986
> > > > 14, 0, 13, 23, 127, 4, 0.98
> > > > 15, 0, 14, 23, 127, 4, 0.979
> > > > 16, 0, 15, 23, 127, 4, 0.582
> > > > 17, 0, 16, 23, 127, 4, 0.542
> > > > 18, 0, 17, 23, 127, 4, 0.564
> > > > 19, 0, 18, 23, 127, 4, 0.571
> > > > 20, 0, 19, 23, 127, 4, 0.582
> > > > 21, 0, 20, 23, 127, 4, 0.573
> > > > 22, 0, 21, 23, 127, 4, 0.575
> > > > 23, 0, 22, 23, 127, 4, 0.578
> > > > 24, 0, 23, 23, 127, 4, 0.58
> > > > 25, 0, 24, 23, 127, 4, 0.592
> > > > 26, 0, 25, 23, 127, 4, 0.588
> > > > 27, 0, 26, 23, 127, 4, 0.574
> > > > 28, 0, 27, 23, 127, 4, 0.589
> > > > 29, 0, 28, 23, 127, 4, 0.56
> > > > 30, 0, 29, 23, 127, 4, 0.587
> > > > 31, 0, 30, 23, 127, 4, 0.584
> > > > 32, 0, 31, 23, 127, 4, 0.664
> > > > 2048, 0, 32, 23, 127, 8, 0.826
> > > > 2048, 1, 32, 23, 127, 8, 0.821
> > > > 2048, 0, 64, 23, 127, 8, 0.828
> > > > 2048, 2, 64, 23, 127, 8, 0.827
> > > > 2048, 0, 128, 23, 127, 8, 0.833
> > > > 2048, 3, 128, 23, 127, 8, 0.83
> > > > 2048, 0, 256, 23, 127, 8, 0.855
> > > > 2048, 4, 256, 23, 127, 8, 0.849
> > > > 2048, 0, 512, 23, 127, 8, 0.849
> > > > 2048, 5, 512, 23, 127, 8, 0.851
> > > > 2048, 0, 1024, 23, 127, 8, 0.856
> > > > 2048, 6, 1024, 23, 127, 8, 0.862
> > > > 2048, 0, 2048, 23, 127, 8, 0.709
> > > > 2048, 7, 2048, 23, 127, 8, 0.712
> > > > 2048, 0, 4096, 23, 127, 8, 0.702
> > > > 2048, 8, 4096, 23, 127, 8, 0.701
> > > > 256, 1, 64, 23, 127, 8, 0.689
> > > > 256, 15, 64, 23, 127, 8, 0.688
> > > > 256, 2, 64, 23, 127, 8, 0.691
> > > > 256, 30, 64, 23, 127, 8, 0.612
> > > > 256, 3, 64, 23, 127, 8, 0.688
> > > > 256, 45, 64, 23, 127, 8, 0.686
> > > > 256, 4, 64, 23, 127, 8, 0.694
> > > > 256, 60, 64, 23, 127, 8, 0.609
> > > > 256, 5, 64, 23, 127, 8, 0.69
> > > > 256, 75, 64, 23, 127, 8, 0.69
> > > > 256, 6, 64, 23, 127, 8, 0.691
> > > > 256, 90, 64, 23, 127, 8, 0.612
> > > > 256, 7, 64, 23, 127, 8, 0.689
> > > > 256, 105, 64, 23, 127, 8, 0.688
> > > > 1, 0, 0, 23, 127, 8, 0.98
> > > > 2, 0, 1, 23, 127, 8, 0.978
> > > > 3, 0, 2, 23, 127, 8, 0.98
> > > > 4, 0, 3, 23, 127, 8, 0.978
> > > > 5, 0, 4, 23, 127, 8, 0.977
> > > > 6, 0, 5, 23, 127, 8, 0.984
> > > > 7, 0, 6, 23, 127, 8, 0.982
> > > > 8, 0, 7, 23, 127, 8, 0.983
> > > > 9, 0, 8, 23, 127, 8, 0.987
> > > > 10, 0, 9, 23, 127, 8, 0.979
> > > > 11, 0, 10, 23, 127, 8, 0.985
> > > > 12, 0, 11, 23, 127, 8, 0.981
> > > > 13, 0, 12, 23, 127, 8, 0.98
> > > > 14, 0, 13, 23, 127, 8, 0.982
> > > > 15, 0, 14, 23, 127, 8, 0.981
> > > > 16, 0, 15, 23, 127, 8, 0.579
> > > > 17, 0, 16, 23, 127, 8, 0.531
> > > > 18, 0, 17, 23, 127, 8, 0.577
> > > > 19, 0, 18, 23, 127, 8, 0.588
> > > > 20, 0, 19, 23, 127, 8, 0.571
> > > > 21, 0, 20, 23, 127, 8, 0.576
> > > > 22, 0, 21, 23, 127, 8, 0.59
> > > > 23, 0, 22, 23, 127, 8, 0.574
> > > > 24, 0, 23, 23, 127, 8, 0.583
> > > > 25, 0, 24, 23, 127, 8, 0.581
> > > > 26, 0, 25, 23, 127, 8, 0.592
> > > > 27, 0, 26, 23, 127, 8, 0.586
> > > > 28, 0, 27, 23, 127, 8, 0.588
> > > > 29, 0, 28, 23, 127, 8, 0.578
> > > > 30, 0, 29, 23, 127, 8, 0.573
> > > > 31, 0, 30, 23, 127, 8, 0.588
> > > > 32, 0, 31, 23, 127, 8, 0.664
> > > > 2048, 0, 32, 23, 127, 16, 0.825
> > > > 2048, 1, 32, 23, 127, 16, 0.823
> > > > 2048, 0, 64, 23, 127, 16, 0.831
> > > > 2048, 2, 64, 23, 127, 16, 0.822
> > > > 2048, 0, 128, 23, 127, 16, 0.831
> > > > 2048, 3, 128, 23, 127, 16, 0.831
> > > > 2048, 0, 256, 23, 127, 16, 0.849
> > > > 2048, 4, 256, 23, 127, 16, 0.85
> > > > 2048, 0, 512, 23, 127, 16, 0.751
> > > > 2048, 5, 512, 23, 127, 16, 0.75
> > > > 2048, 0, 1024, 23, 127, 16, 0.913
> > > > 2048, 6, 1024, 23, 127, 16, 0.895
> > > > 2048, 0, 2048, 23, 127, 16, 0.736
> > > > 2048, 7, 2048, 23, 127, 16, 0.741
> > > > 2048, 0, 4096, 23, 127, 16, 0.712
> > > > 2048, 8, 4096, 23, 127, 16, 0.711
> > > > 256, 1, 64, 23, 127, 16, 0.758
> > > > 256, 15, 64, 23, 127, 16, 0.692
> > > > 256, 2, 64, 23, 127, 16, 0.692
> > > > 256, 30, 64, 23, 127, 16, 0.613
> > > > 256, 3, 64, 23, 127, 16, 0.69
> > > > 256, 45, 64, 23, 127, 16, 0.687
> > > > 256, 4, 64, 23, 127, 16, 0.69
> > > > 256, 60, 64, 23, 127, 16, 0.604
> > > > 256, 5, 64, 23, 127, 16, 0.687
> > > > 256, 75, 64, 23, 127, 16, 0.687
> > > > 256, 6, 64, 23, 127, 16, 0.69
> > > > 256, 90, 64, 23, 127, 16, 0.61
> > > > 256, 7, 64, 23, 127, 16, 0.69
> > > > 256, 105, 64, 23, 127, 16, 0.685
> > > > 1, 0, 0, 23, 127, 16, 0.981
> > > > 2, 0, 1, 23, 127, 16, 0.985
> > > > 3, 0, 2, 23, 127, 16, 0.985
> > > > 4, 0, 3, 23, 127, 16, 0.981
> > > > 5, 0, 4, 23, 127, 16, 0.979
> > > > 6, 0, 5, 23, 127, 16, 0.986
> > > > 7, 0, 6, 23, 127, 16, 0.986
> > > > 8, 0, 7, 23, 127, 16, 0.982
> > > > 9, 0, 8, 23, 127, 16, 0.982
> > > > 10, 0, 9, 23, 127, 16, 0.98
> > > > 11, 0, 10, 23, 127, 16, 0.983
> > > > 12, 0, 11, 23, 127, 16, 0.982
> > > > 13, 0, 12, 23, 127, 16, 0.982
> > > > 14, 0, 13, 23, 127, 16, 0.982
> > > > 15, 0, 14, 23, 127, 16, 0.982
> > > > 16, 0, 15, 23, 127, 16, 0.582
> > > > 17, 0, 16, 23, 127, 16, 0.542
> > > > 18, 0, 17, 23, 127, 16, 0.554
> > > > 19, 0, 18, 23, 127, 16, 0.562
> > > > 20, 0, 19, 23, 127, 16, 0.587
> > > > 21, 0, 20, 23, 127, 16, 0.584
> > > > 22, 0, 21, 23, 127, 16, 0.587
> > > > 23, 0, 22, 23, 127, 16, 0.594
> > > > 24, 0, 23, 23, 127, 16, 0.581
> > > > 25, 0, 24, 23, 127, 16, 0.577
> > > > 26, 0, 25, 23, 127, 16, 0.588
> > > > 27, 0, 26, 23, 127, 16, 0.589
> > > > 28, 0, 27, 23, 127, 16, 0.596
> > > > 29, 0, 28, 23, 127, 16, 0.591
> > > > 30, 0, 29, 23, 127, 16, 0.585
> > > > 31, 0, 30, 23, 127, 16, 0.59
> > > > 32, 0, 31, 23, 127, 16, 0.669
> > > >
> > > > sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
> > > > sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
> > > > sysdeps/x86_64/strrchr.S | 505 +++++++++++++++---------
> > > > sysdeps/x86_64/wcsrchr.S | 268 +------------
> > > > 4 files changed, 334 insertions(+), 444 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > > > index db1b44c23c..866396e947 100644
> > > > --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > > > +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> > > > @@ -17,7 +17,7 @@
> > > > <https://www.gnu.org/licenses/>. */
> > > >
> > > > #if IS_IN (libc)
> > > > -# define strrchr __strrchr_sse2
> > > > +# define STRRCHR __strrchr_sse2
> > > >
> > > > # undef weak_alias
> > > > # define weak_alias(strrchr, rindex)
> > > > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > > > index 78d1ca6553..69d2f3cdb1 100644
> > > > --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > > > +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> > > > @@ -17,7 +17,6 @@
> > > > <https://www.gnu.org/licenses/>. */
> > > >
> > > > #if IS_IN (libc)
> > > > -# define wcsrchr __wcsrchr_sse2
> > > > +# define STRRCHR __wcsrchr_sse2
> > > > #endif
> > > > -
> > > > #include "../wcsrchr.S"
> > > > diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> > > > index 50d886713e..94449ad806 100644
> > > > --- a/sysdeps/x86_64/strrchr.S
> > > > +++ b/sysdeps/x86_64/strrchr.S
> > > > @@ -19,210 +19,355 @@
> > > >
> > > > #include <sysdep.h>
> > > >
> > > > +#ifndef STRRCHR
> > > > +# define STRRCHR strrchr
> > > > +#endif
> > > > +
> > > > +#ifdef USE_AS_WCSRCHR
> > > > +# define PCMPEQ pcmpeqd
> > > > +# define CHAR_SIZE 4
> > > > +# define PMINU pminud
> > > > +#else
> > > > +# define PCMPEQ pcmpeqb
> > > > +# define CHAR_SIZE 1
> > > > +# define PMINU pminub
> > > > +#endif
> > > > +
> > > > +#define PAGE_SIZE 4096
> > > > +#define VEC_SIZE 16
> > > > +
> > > > .text
> > > > -ENTRY (strrchr)
> > > > - movd %esi, %xmm1
> > > > +ENTRY(STRRCHR)
> > > > + movd %esi, %xmm0
> > > > movq %rdi, %rax
> > > > - andl $4095, %eax
> > > > - punpcklbw %xmm1, %xmm1
> > > > - cmpq $4032, %rax
> > > > - punpcklwd %xmm1, %xmm1
> > > > - pshufd $0, %xmm1, %xmm1
> > > > + andl $(PAGE_SIZE - 1), %eax
> > > > +#ifndef USE_AS_WCSRCHR
> > > > + punpcklbw %xmm0, %xmm0
> > > > + punpcklwd %xmm0, %xmm0
> > > > +#endif
> > > > + pshufd $0, %xmm0, %xmm0
> > > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > > > ja L(cross_page)
> > > > - movdqu (%rdi), %xmm0
> > > > +
> > > > +L(cross_page_continue):
> > > > + movups (%rdi), %xmm1
> > > > pxor %xmm2, %xmm2
> > > > - movdqa %xmm0, %xmm3
> > > > - pcmpeqb %xmm1, %xmm0
> > > > - pcmpeqb %xmm2, %xmm3
> > > > - pmovmskb %xmm0, %ecx
> > > > - pmovmskb %xmm3, %edx
> > > > - testq %rdx, %rdx
> > > > - je L(next_48_bytes)
> > > > - leaq -1(%rdx), %rax
> > > > - xorq %rdx, %rax
> > > > - andq %rcx, %rax
> > > > - je L(exit)
> > > > - bsrq %rax, %rax
> > > > + PCMPEQ %xmm1, %xmm2
> > > > + pmovmskb %xmm2, %ecx
> > > > + testl %ecx, %ecx
> > > > + jz L(aligned_more)
> > > > +
> > > > + PCMPEQ %xmm0, %xmm1
> > > > + pmovmskb %xmm1, %eax
> > > > + leal -1(%rcx), %edx
> > > > + xorl %edx, %ecx
> > > > + andl %ecx, %eax
> > > > + jz L(ret0)
> > > > + bsrl %eax, %eax
> > > > addq %rdi, %rax
> > > > + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> > > > + search CHAR is zero we are correct. Either way `andq
> > > > + -CHAR_SIZE, %rax` gets the correct result. */
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > +L(ret0):
> > > > ret
> > > >
> > > > + /* Returns for first vec x1/x2 have hard coded backward search
> > > > + path for earlier matches. */
> > > > .p2align 4
> > > > -L(next_48_bytes):
> > > > - movdqu 16(%rdi), %xmm4
> > > > - movdqa %xmm4, %xmm5
> > > > - movdqu 32(%rdi), %xmm3
> > > > - pcmpeqb %xmm1, %xmm4
> > > > - pcmpeqb %xmm2, %xmm5
> > > > - movdqu 48(%rdi), %xmm0
> > > > - pmovmskb %xmm5, %edx
> > > > - movdqa %xmm3, %xmm5
> > > > - pcmpeqb %xmm1, %xmm3
> > > > - pcmpeqb %xmm2, %xmm5
> > > > - pcmpeqb %xmm0, %xmm2
> > > > - salq $16, %rdx
> > > > - pmovmskb %xmm3, %r8d
> > > > - pmovmskb %xmm5, %eax
> > > > - pmovmskb %xmm2, %esi
> > > > - salq $32, %r8
> > > > - salq $32, %rax
> > > > - pcmpeqb %xmm1, %xmm0
> > > > - orq %rdx, %rax
> > > > - movq %rsi, %rdx
> > > > - pmovmskb %xmm4, %esi
> > > > - salq $48, %rdx
> > > > - salq $16, %rsi
> > > > - orq %r8, %rsi
> > > > - orq %rcx, %rsi
> > > > - pmovmskb %xmm0, %ecx
> > > > - salq $48, %rcx
> > > > - orq %rcx, %rsi
> > > > - orq %rdx, %rax
> > > > - je L(loop_header2)
> > > > - leaq -1(%rax), %rcx
> > > > - xorq %rax, %rcx
> > > > - andq %rcx, %rsi
> > > > - je L(exit)
> > > > - bsrq %rsi, %rsi
> > > > - leaq (%rdi,%rsi), %rax
> > > > +L(first_vec_x0_test):
> > > > + PCMPEQ %xmm0, %xmm1
> > > > + pmovmskb %xmm1, %eax
> > > > + testl %eax, %eax
> > > > + jz L(ret0)
> > > > + bsrl %eax, %eax
> > > > + addq %r8, %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > ret
> > > >
> > > > .p2align 4
> > > > -L(loop_header2):
> > > > - testq %rsi, %rsi
> > > > - movq %rdi, %rcx
> > > > - je L(no_c_found)
> > > > -L(loop_header):
> > > > - addq $64, %rdi
> > > > - pxor %xmm7, %xmm7
> > > > - andq $-64, %rdi
> > > > - jmp L(loop_entry)
> > > > +L(first_vec_x1):
> > > > + PCMPEQ %xmm0, %xmm2
> > > > + pmovmskb %xmm2, %eax
> > > > + leal -1(%rcx), %edx
> > > > + xorl %edx, %ecx
> > > > + andl %ecx, %eax
> > > > + jz L(first_vec_x0_test)
> > > > + bsrl %eax, %eax
> > > > + leaq (VEC_SIZE)(%rdi, %rax), %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > + ret
> > > >
> > > > .p2align 4
> > > > -L(loop64):
> > > > - testq %rdx, %rdx
> > > > - cmovne %rdx, %rsi
> > > > - cmovne %rdi, %rcx
> > > > - addq $64, %rdi
> > > > -L(loop_entry):
> > > > - movdqa 32(%rdi), %xmm3
> > > > - pxor %xmm6, %xmm6
> > > > - movdqa 48(%rdi), %xmm2
> > > > - movdqa %xmm3, %xmm0
> > > > - movdqa 16(%rdi), %xmm4
> > > > - pminub %xmm2, %xmm0
> > > > - movdqa (%rdi), %xmm5
> > > > - pminub %xmm4, %xmm0
> > > > - pminub %xmm5, %xmm0
> > > > - pcmpeqb %xmm7, %xmm0
> > > > - pmovmskb %xmm0, %eax
> > > > - movdqa %xmm5, %xmm0
> > > > - pcmpeqb %xmm1, %xmm0
> > > > - pmovmskb %xmm0, %r9d
> > > > - movdqa %xmm4, %xmm0
> > > > - pcmpeqb %xmm1, %xmm0
> > > > - pmovmskb %xmm0, %edx
> > > > - movdqa %xmm3, %xmm0
> > > > - pcmpeqb %xmm1, %xmm0
> > > > - salq $16, %rdx
> > > > - pmovmskb %xmm0, %r10d
> > > > - movdqa %xmm2, %xmm0
> > > > - pcmpeqb %xmm1, %xmm0
> > > > - salq $32, %r10
> > > > - orq %r10, %rdx
> > > > - pmovmskb %xmm0, %r8d
> > > > - orq %r9, %rdx
> > > > - salq $48, %r8
> > > > - orq %r8, %rdx
> > > > +L(first_vec_x1_test):
> > > > + PCMPEQ %xmm0, %xmm2
> > > > + pmovmskb %xmm2, %eax
> > > > testl %eax, %eax
> > > > - je L(loop64)
> > > > - pcmpeqb %xmm6, %xmm4
> > > > - pcmpeqb %xmm6, %xmm3
> > > > - pcmpeqb %xmm6, %xmm5
> > > > - pmovmskb %xmm4, %eax
> > > > - pmovmskb %xmm3, %r10d
> > > > - pcmpeqb %xmm6, %xmm2
> > > > - pmovmskb %xmm5, %r9d
> > > > - salq $32, %r10
> > > > - salq $16, %rax
> > > > - pmovmskb %xmm2, %r8d
> > > > - orq %r10, %rax
> > > > - orq %r9, %rax
> > > > - salq $48, %r8
> > > > - orq %r8, %rax
> > > > - leaq -1(%rax), %r8
> > > > - xorq %rax, %r8
> > > > - andq %r8, %rdx
> > > > - cmovne %rdi, %rcx
> > > > - cmovne %rdx, %rsi
> > > > - bsrq %rsi, %rsi
> > > > - leaq (%rcx,%rsi), %rax
> > > > + jz L(first_vec_x0_test)
> > > > + bsrl %eax, %eax
> > > > + leaq (VEC_SIZE)(%rdi, %rax), %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > + ret
> > > > +
> > > > + .p2align 4
> > > > +L(first_vec_x2):
> > > > + PCMPEQ %xmm0, %xmm3
> > > > + pmovmskb %xmm3, %eax
> > > > + leal -1(%rcx), %edx
> > > > + xorl %edx, %ecx
> > > > + andl %ecx, %eax
> > > > + jz L(first_vec_x1_test)
> > > > + bsrl %eax, %eax
> > > > + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > + ret
> > > > +
> > > > + .p2align 4
> > > > +L(aligned_more):
> > > > + /* Save original pointer if match was in VEC 0. */
> > > > + movq %rdi, %r8
> > > > + andq $-VEC_SIZE, %rdi
> > > > +
> > > > + movaps VEC_SIZE(%rdi), %xmm2
> > > > + pxor %xmm3, %xmm3
> > > > + PCMPEQ %xmm2, %xmm3
> > > > + pmovmskb %xmm3, %ecx
> > > > + testl %ecx, %ecx
> > > > + jnz L(first_vec_x1)
> > > > +
> > > > + movaps (VEC_SIZE * 2)(%rdi), %xmm3
> > > > + pxor %xmm4, %xmm4
> > > > + PCMPEQ %xmm3, %xmm4
> > > > + pmovmskb %xmm4, %ecx
> > > > + testl %ecx, %ecx
> > > > + jnz L(first_vec_x2)
> > > > +
> > > > + addq $VEC_SIZE, %rdi
> > > > + /* Save pointer again before realigning. */
> > > > + movq %rdi, %rsi
> > > > + andq $-(VEC_SIZE * 2), %rdi
> > > > + .p2align 4
> > > > +L(first_loop):
> > > > + /* Do 2x VEC at a time. */
> > > > + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> > > > + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> > > > + /* If SSE2 no pminud. */
> > > > +#ifdef NO_PMINU
> > >
> > > Do we really need SSE4.1 wcsrchr? I think we should focus on AVX2 and
> > > above.
> >
> > It seems like freebie performance that can make a difference in the loop
> > cases. (see the SSE4.1 commit for numbers).
>
> But these numbers are on Tiger Lake. I think we should continue to
> improve SSE2
> version and optimize AVX2/AVX512. I don't think we should increase code sizes
> for SSE4.
Fair enough. Removed SSE4 version but added comment suggesting it as an
optimization if the need arises.
>
> > Imo there is little harm but if you feel strongly I'll drop. (In V2 will
> > change the .text section for SSE4_1).
> >
> > What do you think?
> > >
> > > > + movaps %xmm5, %xmm6
> > > > + pxor %xmm8, %xmm8
> > > > +
> > > > + PCMPEQ %xmm8, %xmm5
> > > > + PCMPEQ %xmm4, %xmm8
> > > > + por %xmm5, %xmm8
> > > > +#else
> > > > + movaps %xmm5, %xmm6
> > > > + PMINU %xmm4, %xmm5
> > > > +#endif
> > > > +
> > > > + movaps %xmm4, %xmm9
> > > > + PCMPEQ %xmm0, %xmm4
> > > > + PCMPEQ %xmm0, %xmm6
> > > > + movaps %xmm6, %xmm7
> > > > + por %xmm4, %xmm6
> > > > +#ifndef NO_PMINU
> > > > + pxor %xmm8, %xmm8
> > > > + PCMPEQ %xmm5, %xmm8
> > > > +#endif
> > > > + pmovmskb %xmm8, %ecx
> > > > + pmovmskb %xmm6, %eax
> > > > +
> > > > + addq $(VEC_SIZE * 2), %rdi
> > > > + /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> > > > + macro-fuse with `jz`. */
> > > > + addl %ecx, %eax
> > > > + jz L(first_loop)
> > > > +
> > > > + /* Check if there is zero match. */
> > > > + testl %ecx, %ecx
> > > > + jz L(second_loop_match)
> > > > +
> > > > + /* Check if there was a match in last iteration. */
> > > > + subl %ecx, %eax
> > > > + jnz L(new_match)
> > > > +
> > > > +L(first_loop_old_match):
> > > > + PCMPEQ %xmm0, %xmm2
> > > > + PCMPEQ %xmm0, %xmm3
> > > > + pmovmskb %xmm2, %ecx
> > > > + pmovmskb %xmm3, %eax
> > > > + addl %eax, %ecx
> > > > + jz L(first_vec_x0_test)
> > > > + /* NB: We could move this shift to before the branch and save a
> > > > + bit of code size / performance on the fall through. The
> > > > + branch leads to the null case which generally seems hotter
> > > > + than char in first 3x VEC. */
> > > > + sall $16, %eax
> > > > + orl %ecx, %eax
> > > > +
> > > > + bsrl %eax, %eax
> > > > + addq %rsi, %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > + ret
> > > > +
> > > > + .p2align 4
> > > > +L(new_match):
> > > > + pxor %xmm6, %xmm6
> > > > + PCMPEQ %xmm9, %xmm6
> > > > + pmovmskb %xmm6, %eax
> > > > + sall $16, %ecx
> > > > + orl %eax, %ecx
> > > > +
> > > > +	/* We can't reuse either of the old comparisons because, since
> > > > +	   we mask off zeros after the first zero (instead of using the
> > > > +	   full comparison), we can't guarantee no interference between
> > > > +	   a match after the end of the string and a valid match.  */
> > > > + pmovmskb %xmm4, %eax
> > > > + pmovmskb %xmm7, %edx
> > > > + sall $16, %edx
> > > > + orl %edx, %eax
> > > > +
> > > > + leal -1(%ecx), %edx
> > > > + xorl %edx, %ecx
> > > > + andl %ecx, %eax
> > > > + jz L(first_loop_old_match)
> > > > + bsrl %eax, %eax
> > > > + addq %rdi, %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > ret
> > > >
> > > > + /* Save minimum state for getting most recent match. We can
> > > > + throw out all previous work. */
> > > > .p2align 4
> > > > -L(no_c_found):
> > > > - movl $1, %esi
> > > > - xorl %ecx, %ecx
> > > > - jmp L(loop_header)
> > > > +L(second_loop_match):
> > > > + movq %rdi, %rsi
> > > > + movaps %xmm4, %xmm2
> > > > + movaps %xmm7, %xmm3
> > > >
> > > > .p2align 4
> > > > -L(exit):
> > > > - xorl %eax, %eax
> > > > +L(second_loop):
> > > > + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> > > > + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> > > > +#ifdef NO_PMINU
> > > > + movaps %xmm5, %xmm6
> > > > + pxor %xmm8, %xmm8
> > > > +
> > > > + PCMPEQ %xmm8, %xmm5
> > > > + PCMPEQ %xmm4, %xmm8
> > > > + por %xmm5, %xmm8
> > > > +#else
> > > > + movaps %xmm5, %xmm6
> > > > + PMINU %xmm4, %xmm5
> > > > +#endif
> > > > +
> > > > + movaps %xmm4, %xmm9
> > > > + PCMPEQ %xmm0, %xmm4
> > > > + PCMPEQ %xmm0, %xmm6
> > > > + movaps %xmm6, %xmm7
> > > > + por %xmm4, %xmm6
> > > > +#ifndef NO_PMINU
> > > > + pxor %xmm8, %xmm8
> > > > + PCMPEQ %xmm5, %xmm8
> > > > +#endif
> > > > +
> > > > + pmovmskb %xmm8, %ecx
> > > > + pmovmskb %xmm6, %eax
> > > > +
> > > > + addq $(VEC_SIZE * 2), %rdi
> > > > +	/* Either null term or new occurrence of CHAR.  */
> > > > + addl %ecx, %eax
> > > > + jz L(second_loop)
> > > > +
> > > > +	/* No null term so must be a new occurrence of CHAR.  */
> > > > + testl %ecx, %ecx
> > > > + jz L(second_loop_match)
> > > > +
> > > > +
> > > > + subl %ecx, %eax
> > > > + jnz L(second_loop_new_match)
> > > > +
> > > > +L(second_loop_old_match):
> > > > + pmovmskb %xmm2, %ecx
> > > > + pmovmskb %xmm3, %eax
> > > > + sall $16, %eax
> > > > + orl %ecx, %eax
> > > > + bsrl %eax, %eax
> > > > + addq %rsi, %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > ret
> > > >
> > > > .p2align 4
> > > > +L(second_loop_new_match):
> > > > + pxor %xmm6, %xmm6
> > > > + PCMPEQ %xmm9, %xmm6
> > > > + pmovmskb %xmm6, %eax
> > > > + sall $16, %ecx
> > > > + orl %eax, %ecx
> > > > +
> > > > +	/* We can't reuse either of the old comparisons because, since
> > > > +	   we mask off zeros after the first zero (instead of using the
> > > > +	   full comparison), we can't guarantee no interference between
> > > > +	   a match after the end of the string and a valid match.  */
> > > > + pmovmskb %xmm4, %eax
> > > > + pmovmskb %xmm7, %edx
> > > > + sall $16, %edx
> > > > + orl %edx, %eax
> > > > +
> > > > + leal -1(%ecx), %edx
> > > > + xorl %edx, %ecx
> > > > + andl %ecx, %eax
> > > > + jz L(second_loop_old_match)
> > > > + bsrl %eax, %eax
> > > > + addq %rdi, %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > + ret
> > > > +
> > > > + .p2align 4,, 4
> > > > L(cross_page):
> > > > - movq %rdi, %rax
> > > > - pxor %xmm0, %xmm0
> > > > - andq $-64, %rax
> > > > - movdqu (%rax), %xmm5
> > > > - movdqa %xmm5, %xmm6
> > > > - movdqu 16(%rax), %xmm4
> > > > - pcmpeqb %xmm1, %xmm5
> > > > - pcmpeqb %xmm0, %xmm6
> > > > - movdqu 32(%rax), %xmm3
> > > > - pmovmskb %xmm6, %esi
> > > > - movdqa %xmm4, %xmm6
> > > > - movdqu 48(%rax), %xmm2
> > > > - pcmpeqb %xmm1, %xmm4
> > > > - pcmpeqb %xmm0, %xmm6
> > > > - pmovmskb %xmm6, %edx
> > > > - movdqa %xmm3, %xmm6
> > > > - pcmpeqb %xmm1, %xmm3
> > > > - pcmpeqb %xmm0, %xmm6
> > > > - pcmpeqb %xmm2, %xmm0
> > > > - salq $16, %rdx
> > > > - pmovmskb %xmm3, %r9d
> > > > - pmovmskb %xmm6, %r8d
> > > > - pmovmskb %xmm0, %ecx
> > > > - salq $32, %r9
> > > > - salq $32, %r8
> > > > - pcmpeqb %xmm1, %xmm2
> > > > - orq %r8, %rdx
> > > > - salq $48, %rcx
> > > > - pmovmskb %xmm5, %r8d
> > > > - orq %rsi, %rdx
> > > > - pmovmskb %xmm4, %esi
> > > > - orq %rcx, %rdx
> > > > - pmovmskb %xmm2, %ecx
> > > > - salq $16, %rsi
> > > > - salq $48, %rcx
> > > > - orq %r9, %rsi
> > > > - orq %r8, %rsi
> > > > - orq %rcx, %rsi
> > > > + movq %rdi, %rsi
> > > > + andq $-VEC_SIZE, %rsi
> > > > + movaps (%rsi), %xmm1
> > > > + pxor %xmm2, %xmm2
> > > > + PCMPEQ %xmm1, %xmm2
> > > > + pmovmskb %xmm2, %edx
> > > > movl %edi, %ecx
> > > > - subl %eax, %ecx
> > > > - shrq %cl, %rdx
> > > > - shrq %cl, %rsi
> > > > - testq %rdx, %rdx
> > > > - je L(loop_header2)
> > > > - leaq -1(%rdx), %rax
> > > > - xorq %rdx, %rax
> > > > - andq %rax, %rsi
> > > > - je L(exit)
> > > > - bsrq %rsi, %rax
> > > > + andl $(VEC_SIZE - 1), %ecx
> > > > + sarl %cl, %edx
> > > > + jz L(cross_page_continue)
> > > > + PCMPEQ %xmm0, %xmm1
> > > > + pmovmskb %xmm1, %eax
> > > > + sarl %cl, %eax
> > > > + leal -1(%rdx), %ecx
> > > > + xorl %edx, %ecx
> > > > + andl %ecx, %eax
> > > > + jz L(ret1)
> > > > + bsrl %eax, %eax
> > > > addq %rdi, %rax
> > > > +#ifdef USE_AS_WCSRCHR
> > > > + andq $-CHAR_SIZE, %rax
> > > > +#endif
> > > > +L(ret1):
> > > > ret
> > > > -END (strrchr)
> > > > +END(STRRCHR)
> > > >
> > > > -weak_alias (strrchr, rindex)
> > > > -libc_hidden_builtin_def (strrchr)
> > > > +#ifndef USE_AS_WCSRCHR
> > > > + weak_alias (STRRCHR, rindex)
> > > > + libc_hidden_builtin_def (STRRCHR)
> > > > +#endif
> > > > diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> > > > index 61552954de..2b80efc5ef 100644
> > > > --- a/sysdeps/x86_64/wcsrchr.S
> > > > +++ b/sysdeps/x86_64/wcsrchr.S
> > > > @@ -1,4 +1,4 @@
> > > > -/* wcsrchr with SSSE3
> > > > +/* wcsrchr optimized with SSE2.
> > > > Copyright (C) 2011-2022 Free Software Foundation, Inc.
> > > > This file is part of the GNU C Library.
> > > >
> > > > @@ -16,266 +16,12 @@
> > > > License along with the GNU C Library; if not, see
> > > > <https://www.gnu.org/licenses/>. */
> > > >
> > > > -#include <sysdep.h>
> > > >
> > > > - .text
> > > > -ENTRY (wcsrchr)
> > > > +#define USE_AS_WCSRCHR 1
> > > > +#define NO_PMINU 1
> > > >
> > > > - movd %rsi, %xmm1
> > > > - mov %rdi, %rcx
> > > > - punpckldq %xmm1, %xmm1
> > > > - pxor %xmm2, %xmm2
> > > > - punpckldq %xmm1, %xmm1
> > > > - and $63, %rcx
> > > > - cmp $48, %rcx
> > > > - ja L(crosscache)
> > > > +#ifndef STRRCHR
> > > > +# define STRRCHR wcsrchr
> > > > +#endif
> > > >
> > > > - movdqu (%rdi), %xmm0
> > > > - pcmpeqd %xmm0, %xmm2
> > > > - pcmpeqd %xmm1, %xmm0
> > > > - pmovmskb %xmm2, %rcx
> > > > - pmovmskb %xmm0, %rax
> > > > - add $16, %rdi
> > > > -
> > > > - test %rax, %rax
> > > > - jnz L(unaligned_match1)
> > > > -
> > > > - test %rcx, %rcx
> > > > - jnz L(return_null)
> > > > -
> > > > - and $-16, %rdi
> > > > - xor %r8, %r8
> > > > - jmp L(loop)
> > > > -
> > > > - .p2align 4
> > > > -L(unaligned_match1):
> > > > - test %rcx, %rcx
> > > > - jnz L(prolog_find_zero_1)
> > > > -
> > > > - mov %rax, %r8
> > > > - mov %rdi, %rsi
> > > > - and $-16, %rdi
> > > > - jmp L(loop)
> > > > -
> > > > - .p2align 4
> > > > -L(crosscache):
> > > > - and $15, %rcx
> > > > - and $-16, %rdi
> > > > - pxor %xmm3, %xmm3
> > > > - movdqa (%rdi), %xmm0
> > > > - pcmpeqd %xmm0, %xmm3
> > > > - pcmpeqd %xmm1, %xmm0
> > > > - pmovmskb %xmm3, %rdx
> > > > - pmovmskb %xmm0, %rax
> > > > - shr %cl, %rdx
> > > > - shr %cl, %rax
> > > > - add $16, %rdi
> > > > -
> > > > - test %rax, %rax
> > > > - jnz L(unaligned_match)
> > > > -
> > > > - test %rdx, %rdx
> > > > - jnz L(return_null)
> > > > -
> > > > - xor %r8, %r8
> > > > - jmp L(loop)
> > > > -
> > > > - .p2align 4
> > > > -L(unaligned_match):
> > > > - test %rdx, %rdx
> > > > - jnz L(prolog_find_zero)
> > > > -
> > > > - mov %rax, %r8
> > > > - lea (%rdi, %rcx), %rsi
> > > > -
> > > > -/* Loop start on aligned string. */
> > > > - .p2align 4
> > > > -L(loop):
> > > > - movdqa (%rdi), %xmm0
> > > > - pcmpeqd %xmm0, %xmm2
> > > > - add $16, %rdi
> > > > - pcmpeqd %xmm1, %xmm0
> > > > - pmovmskb %xmm2, %rcx
> > > > - pmovmskb %xmm0, %rax
> > > > - or %rax, %rcx
> > > > - jnz L(matches)
> > > > -
> > > > - movdqa (%rdi), %xmm3
> > > > - pcmpeqd %xmm3, %xmm2
> > > > - add $16, %rdi
> > > > - pcmpeqd %xmm1, %xmm3
> > > > - pmovmskb %xmm2, %rcx
> > > > - pmovmskb %xmm3, %rax
> > > > - or %rax, %rcx
> > > > - jnz L(matches)
> > > > -
> > > > - movdqa (%rdi), %xmm4
> > > > - pcmpeqd %xmm4, %xmm2
> > > > - add $16, %rdi
> > > > - pcmpeqd %xmm1, %xmm4
> > > > - pmovmskb %xmm2, %rcx
> > > > - pmovmskb %xmm4, %rax
> > > > - or %rax, %rcx
> > > > - jnz L(matches)
> > > > -
> > > > - movdqa (%rdi), %xmm5
> > > > - pcmpeqd %xmm5, %xmm2
> > > > - add $16, %rdi
> > > > - pcmpeqd %xmm1, %xmm5
> > > > - pmovmskb %xmm2, %rcx
> > > > - pmovmskb %xmm5, %rax
> > > > - or %rax, %rcx
> > > > - jz L(loop)
> > > > -
> > > > - .p2align 4
> > > > -L(matches):
> > > > - test %rax, %rax
> > > > - jnz L(match)
> > > > -L(return_value):
> > > > - test %r8, %r8
> > > > - jz L(return_null)
> > > > - mov %r8, %rax
> > > > - mov %rsi, %rdi
> > > > -
> > > > - test $15 << 4, %ah
> > > > - jnz L(match_fourth_wchar)
> > > > - test %ah, %ah
> > > > - jnz L(match_third_wchar)
> > > > - test $15 << 4, %al
> > > > - jnz L(match_second_wchar)
> > > > - lea -16(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(match):
> > > > - pmovmskb %xmm2, %rcx
> > > > - test %rcx, %rcx
> > > > - jnz L(find_zero)
> > > > - mov %rax, %r8
> > > > - mov %rdi, %rsi
> > > > - jmp L(loop)
> > > > -
> > > > - .p2align 4
> > > > -L(find_zero):
> > > > - test $15, %cl
> > > > - jnz L(find_zero_in_first_wchar)
> > > > - test %cl, %cl
> > > > - jnz L(find_zero_in_second_wchar)
> > > > - test $15, %ch
> > > > - jnz L(find_zero_in_third_wchar)
> > > > -
> > > > - and $1 << 13 - 1, %rax
> > > > - jz L(return_value)
> > > > -
> > > > - test $15 << 4, %ah
> > > > - jnz L(match_fourth_wchar)
> > > > - test %ah, %ah
> > > > - jnz L(match_third_wchar)
> > > > - test $15 << 4, %al
> > > > - jnz L(match_second_wchar)
> > > > - lea -16(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(find_zero_in_first_wchar):
> > > > - test $1, %rax
> > > > - jz L(return_value)
> > > > - lea -16(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(find_zero_in_second_wchar):
> > > > - and $1 << 5 - 1, %rax
> > > > - jz L(return_value)
> > > > -
> > > > - test $15 << 4, %al
> > > > - jnz L(match_second_wchar)
> > > > - lea -16(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(find_zero_in_third_wchar):
> > > > - and $1 << 9 - 1, %rax
> > > > - jz L(return_value)
> > > > -
> > > > - test %ah, %ah
> > > > - jnz L(match_third_wchar)
> > > > - test $15 << 4, %al
> > > > - jnz L(match_second_wchar)
> > > > - lea -16(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(prolog_find_zero):
> > > > - add %rcx, %rdi
> > > > - mov %rdx, %rcx
> > > > -L(prolog_find_zero_1):
> > > > - test $15, %cl
> > > > - jnz L(prolog_find_zero_in_first_wchar)
> > > > - test %cl, %cl
> > > > - jnz L(prolog_find_zero_in_second_wchar)
> > > > - test $15, %ch
> > > > - jnz L(prolog_find_zero_in_third_wchar)
> > > > -
> > > > - and $1 << 13 - 1, %rax
> > > > - jz L(return_null)
> > > > -
> > > > - test $15 << 4, %ah
> > > > - jnz L(match_fourth_wchar)
> > > > - test %ah, %ah
> > > > - jnz L(match_third_wchar)
> > > > - test $15 << 4, %al
> > > > - jnz L(match_second_wchar)
> > > > - lea -16(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(prolog_find_zero_in_first_wchar):
> > > > - test $1, %rax
> > > > - jz L(return_null)
> > > > - lea -16(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(prolog_find_zero_in_second_wchar):
> > > > - and $1 << 5 - 1, %rax
> > > > - jz L(return_null)
> > > > -
> > > > - test $15 << 4, %al
> > > > - jnz L(match_second_wchar)
> > > > - lea -16(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(prolog_find_zero_in_third_wchar):
> > > > - and $1 << 9 - 1, %rax
> > > > - jz L(return_null)
> > > > -
> > > > - test %ah, %ah
> > > > - jnz L(match_third_wchar)
> > > > - test $15 << 4, %al
> > > > - jnz L(match_second_wchar)
> > > > - lea -16(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(match_second_wchar):
> > > > - lea -12(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(match_third_wchar):
> > > > - lea -8(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(match_fourth_wchar):
> > > > - lea -4(%rdi), %rax
> > > > - ret
> > > > -
> > > > - .p2align 4
> > > > -L(return_null):
> > > > - xor %rax, %rax
> > > > - ret
> > > > -
> > > > -END (wcsrchr)
> > > > +#include "../strrchr.S"
> > > > --
> > > > 2.25.1
> > > >
> > >
> > >
> > > --
> > > H.J.
>
>
>
> --
> H.J.
@@ -17,7 +17,7 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define strrchr __strrchr_sse2
+# define STRRCHR __strrchr_sse2
# undef weak_alias
# define weak_alias(strrchr, rindex)
@@ -17,7 +17,6 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define wcsrchr __wcsrchr_sse2
+# define STRRCHR __wcsrchr_sse2
#endif
-
#include "../wcsrchr.S"
@@ -19,210 +19,355 @@
#include <sysdep.h>
+#ifndef STRRCHR
+# define STRRCHR strrchr
+#endif
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ pcmpeqd
+# define CHAR_SIZE 4
+# define PMINU pminud
+#else
+# define PCMPEQ pcmpeqb
+# define CHAR_SIZE 1
+# define PMINU pminub
+#endif
+
+#define PAGE_SIZE 4096
+#define VEC_SIZE 16
+
.text
-ENTRY (strrchr)
- movd %esi, %xmm1
+ENTRY(STRRCHR)
+ movd %esi, %xmm0
movq %rdi, %rax
- andl $4095, %eax
- punpcklbw %xmm1, %xmm1
- cmpq $4032, %rax
- punpcklwd %xmm1, %xmm1
- pshufd $0, %xmm1, %xmm1
+ andl $(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+ punpcklbw %xmm0, %xmm0
+ punpcklwd %xmm0, %xmm0
+#endif
+ pshufd $0, %xmm0, %xmm0
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
ja L(cross_page)
- movdqu (%rdi), %xmm0
+
+L(cross_page_continue):
+ movups (%rdi), %xmm1
pxor %xmm2, %xmm2
- movdqa %xmm0, %xmm3
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %ecx
- pmovmskb %xmm3, %edx
- testq %rdx, %rdx
- je L(next_48_bytes)
- leaq -1(%rdx), %rax
- xorq %rdx, %rax
- andq %rcx, %rax
- je L(exit)
- bsrq %rax, %rax
+ PCMPEQ %xmm1, %xmm2
+ pmovmskb %xmm2, %ecx
+ testl %ecx, %ecx
+ jz L(aligned_more)
+
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
addq %rdi, %rax
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+ search CHAR is zero we are correct. Either way `andq
+ -CHAR_SIZE, %rax` gets the correct result. */
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+L(ret0):
ret
+ /* Returns for first vec x1/x2 have hard coded backward search
+ path for earlier matches. */
.p2align 4
-L(next_48_bytes):
- movdqu 16(%rdi), %xmm4
- movdqa %xmm4, %xmm5
- movdqu 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm2, %xmm5
- movdqu 48(%rdi), %xmm0
- pmovmskb %xmm5, %edx
- movdqa %xmm3, %xmm5
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm2, %xmm5
- pcmpeqb %xmm0, %xmm2
- salq $16, %rdx
- pmovmskb %xmm3, %r8d
- pmovmskb %xmm5, %eax
- pmovmskb %xmm2, %esi
- salq $32, %r8
- salq $32, %rax
- pcmpeqb %xmm1, %xmm0
- orq %rdx, %rax
- movq %rsi, %rdx
- pmovmskb %xmm4, %esi
- salq $48, %rdx
- salq $16, %rsi
- orq %r8, %rsi
- orq %rcx, %rsi
- pmovmskb %xmm0, %ecx
- salq $48, %rcx
- orq %rcx, %rsi
- orq %rdx, %rax
- je L(loop_header2)
- leaq -1(%rax), %rcx
- xorq %rax, %rcx
- andq %rcx, %rsi
- je L(exit)
- bsrq %rsi, %rsi
- leaq (%rdi,%rsi), %rax
+L(first_vec_x0_test):
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ testl %eax, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
+ addq %r8, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
.p2align 4
-L(loop_header2):
- testq %rsi, %rsi
- movq %rdi, %rcx
- je L(no_c_found)
-L(loop_header):
- addq $64, %rdi
- pxor %xmm7, %xmm7
- andq $-64, %rdi
- jmp L(loop_entry)
+L(first_vec_x1):
+ PCMPEQ %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x0_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
.p2align 4
-L(loop64):
- testq %rdx, %rdx
- cmovne %rdx, %rsi
- cmovne %rdi, %rcx
- addq $64, %rdi
-L(loop_entry):
- movdqa 32(%rdi), %xmm3
- pxor %xmm6, %xmm6
- movdqa 48(%rdi), %xmm2
- movdqa %xmm3, %xmm0
- movdqa 16(%rdi), %xmm4
- pminub %xmm2, %xmm0
- movdqa (%rdi), %xmm5
- pminub %xmm4, %xmm0
- pminub %xmm5, %xmm0
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %eax
- movdqa %xmm5, %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %r9d
- movdqa %xmm4, %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %edx
- movdqa %xmm3, %xmm0
- pcmpeqb %xmm1, %xmm0
- salq $16, %rdx
- pmovmskb %xmm0, %r10d
- movdqa %xmm2, %xmm0
- pcmpeqb %xmm1, %xmm0
- salq $32, %r10
- orq %r10, %rdx
- pmovmskb %xmm0, %r8d
- orq %r9, %rdx
- salq $48, %r8
- orq %r8, %rdx
+L(first_vec_x1_test):
+ PCMPEQ %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
testl %eax, %eax
- je L(loop64)
- pcmpeqb %xmm6, %xmm4
- pcmpeqb %xmm6, %xmm3
- pcmpeqb %xmm6, %xmm5
- pmovmskb %xmm4, %eax
- pmovmskb %xmm3, %r10d
- pcmpeqb %xmm6, %xmm2
- pmovmskb %xmm5, %r9d
- salq $32, %r10
- salq $16, %rax
- pmovmskb %xmm2, %r8d
- orq %r10, %rax
- orq %r9, %rax
- salq $48, %r8
- orq %r8, %rax
- leaq -1(%rax), %r8
- xorq %rax, %r8
- andq %r8, %rdx
- cmovne %rdi, %rcx
- cmovne %rdx, %rsi
- bsrq %rsi, %rsi
- leaq (%rcx,%rsi), %rax
+ jz L(first_vec_x0_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ PCMPEQ %xmm0, %xmm3
+ pmovmskb %xmm3, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(aligned_more):
+ /* Save original pointer if match was in VEC 0. */
+ movq %rdi, %r8
+ andq $-VEC_SIZE, %rdi
+
+ movaps VEC_SIZE(%rdi), %xmm2
+ pxor %xmm3, %xmm3
+ PCMPEQ %xmm2, %xmm3
+ pmovmskb %xmm3, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x1)
+
+ movaps (VEC_SIZE * 2)(%rdi), %xmm3
+ pxor %xmm4, %xmm4
+ PCMPEQ %xmm3, %xmm4
+ pmovmskb %xmm4, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
+
+ addq $VEC_SIZE, %rdi
+ /* Save pointer again before realigning. */
+ movq %rdi, %rsi
+ andq $-(VEC_SIZE * 2), %rdi
+ .p2align 4
+L(first_loop):
+ /* Do 2x VEC at a time. */
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
+	/* Plain SSE2 has no pminud, so detect zeros with direct compares.  */
+#ifdef NO_PMINU
+ movaps %xmm5, %xmm6
+ pxor %xmm8, %xmm8
+
+ PCMPEQ %xmm8, %xmm5
+ PCMPEQ %xmm4, %xmm8
+ por %xmm5, %xmm8
+#else
+ movaps %xmm5, %xmm6
+ PMINU %xmm4, %xmm5
+#endif
+
+ movaps %xmm4, %xmm9
+ PCMPEQ %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm6
+ movaps %xmm6, %xmm7
+ por %xmm4, %xmm6
+#ifndef NO_PMINU
+ pxor %xmm8, %xmm8
+ PCMPEQ %xmm5, %xmm8
+#endif
+ pmovmskb %xmm8, %ecx
+ pmovmskb %xmm6, %eax
+
+ addq $(VEC_SIZE * 2), %rdi
+ /* Use `addl` 1) so we can undo it with `subl` and 2) it can
+ macro-fuse with `jz`. */
+ addl %ecx, %eax
+ jz L(first_loop)
+
+ /* Check if there is zero match. */
+ testl %ecx, %ecx
+ jz L(second_loop_match)
+
+ /* Check if there was a match in last iteration. */
+ subl %ecx, %eax
+ jnz L(new_match)
+
+L(first_loop_old_match):
+ PCMPEQ %xmm0, %xmm2
+ PCMPEQ %xmm0, %xmm3
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ addl %eax, %ecx
+ jz L(first_vec_x0_test)
+ /* NB: We could move this shift to before the branch and save a
+ bit of code size / performance on the fall through. The
+ branch leads to the null case which generally seems hotter
+ than char in first 3x VEC. */
+ sall $16, %eax
+ orl %ecx, %eax
+
+ bsrl %eax, %eax
+ addq %rsi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(new_match):
+ pxor %xmm6, %xmm6
+ PCMPEQ %xmm9, %xmm6
+ pmovmskb %xmm6, %eax
+ sall $16, %ecx
+ orl %eax, %ecx
+
+	/* We can't reuse either of the old comparisons since we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison), so we can't guarantee no interference between a
+	   match after the end of the string and a valid match.  */
+ pmovmskb %xmm4, %eax
+ pmovmskb %xmm7, %edx
+ sall $16, %edx
+ orl %edx, %eax
+
+ leal -1(%ecx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_loop_old_match)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
+ /* Save minimum state for getting most recent match. We can
+ throw out all previous work. */
.p2align 4
-L(no_c_found):
- movl $1, %esi
- xorl %ecx, %ecx
- jmp L(loop_header)
+L(second_loop_match):
+ movq %rdi, %rsi
+ movaps %xmm4, %xmm2
+ movaps %xmm7, %xmm3
.p2align 4
-L(exit):
- xorl %eax, %eax
+L(second_loop):
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
+#ifdef NO_PMINU
+ movaps %xmm5, %xmm6
+ pxor %xmm8, %xmm8
+
+ PCMPEQ %xmm8, %xmm5
+ PCMPEQ %xmm4, %xmm8
+ por %xmm5, %xmm8
+#else
+ movaps %xmm5, %xmm6
+ PMINU %xmm4, %xmm5
+#endif
+
+ movaps %xmm4, %xmm9
+ PCMPEQ %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm6
+ movaps %xmm6, %xmm7
+ por %xmm4, %xmm6
+#ifndef NO_PMINU
+ pxor %xmm8, %xmm8
+ PCMPEQ %xmm5, %xmm8
+#endif
+
+ pmovmskb %xmm8, %ecx
+ pmovmskb %xmm6, %eax
+
+ addq $(VEC_SIZE * 2), %rdi
+	/* Either null term or new occurrence of CHAR.  */
+ addl %ecx, %eax
+ jz L(second_loop)
+
+	/* No null term so it must be a new occurrence of CHAR.  */
+ testl %ecx, %ecx
+ jz L(second_loop_match)
+
+
+ subl %ecx, %eax
+ jnz L(second_loop_new_match)
+
+L(second_loop_old_match):
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ sall $16, %eax
+ orl %ecx, %eax
+ bsrl %eax, %eax
+ addq %rsi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
.p2align 4
+L(second_loop_new_match):
+ pxor %xmm6, %xmm6
+ PCMPEQ %xmm9, %xmm6
+ pmovmskb %xmm6, %eax
+ sall $16, %ecx
+ orl %eax, %ecx
+
+	/* We can't reuse either of the old comparisons since we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison), so we can't guarantee no interference between a
+	   match after the end of the string and a valid match.  */
+ pmovmskb %xmm4, %eax
+ pmovmskb %xmm7, %edx
+ sall $16, %edx
+ orl %edx, %eax
+
+ leal -1(%ecx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(second_loop_old_match)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4,, 4
L(cross_page):
- movq %rdi, %rax
- pxor %xmm0, %xmm0
- andq $-64, %rax
- movdqu (%rax), %xmm5
- movdqa %xmm5, %xmm6
- movdqu 16(%rax), %xmm4
- pcmpeqb %xmm1, %xmm5
- pcmpeqb %xmm0, %xmm6
- movdqu 32(%rax), %xmm3
- pmovmskb %xmm6, %esi
- movdqa %xmm4, %xmm6
- movdqu 48(%rax), %xmm2
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm0, %xmm6
- pmovmskb %xmm6, %edx
- movdqa %xmm3, %xmm6
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm0, %xmm6
- pcmpeqb %xmm2, %xmm0
- salq $16, %rdx
- pmovmskb %xmm3, %r9d
- pmovmskb %xmm6, %r8d
- pmovmskb %xmm0, %ecx
- salq $32, %r9
- salq $32, %r8
- pcmpeqb %xmm1, %xmm2
- orq %r8, %rdx
- salq $48, %rcx
- pmovmskb %xmm5, %r8d
- orq %rsi, %rdx
- pmovmskb %xmm4, %esi
- orq %rcx, %rdx
- pmovmskb %xmm2, %ecx
- salq $16, %rsi
- salq $48, %rcx
- orq %r9, %rsi
- orq %r8, %rsi
- orq %rcx, %rsi
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rsi
+ movaps (%rsi), %xmm1
+ pxor %xmm2, %xmm2
+ PCMPEQ %xmm1, %xmm2
+ pmovmskb %xmm2, %edx
movl %edi, %ecx
- subl %eax, %ecx
- shrq %cl, %rdx
- shrq %cl, %rsi
- testq %rdx, %rdx
- je L(loop_header2)
- leaq -1(%rdx), %rax
- xorq %rdx, %rax
- andq %rax, %rsi
- je L(exit)
- bsrq %rsi, %rax
+ andl $(VEC_SIZE - 1), %ecx
+ sarl %cl, %edx
+ jz L(cross_page_continue)
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ sarl %cl, %eax
+ leal -1(%rdx), %ecx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(ret1)
+ bsrl %eax, %eax
addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+L(ret1):
ret
-END (strrchr)
+END(STRRCHR)
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
+#ifndef USE_AS_WCSRCHR
+ weak_alias (STRRCHR, rindex)
+ libc_hidden_builtin_def (STRRCHR)
+#endif
@@ -1,4 +1,4 @@
-/* wcsrchr with SSSE3
+/* wcsrchr optimized with SSE2.
Copyright (C) 2011-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,266 +16,12 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <sysdep.h>
- .text
-ENTRY (wcsrchr)
+#define USE_AS_WCSRCHR 1
+#define NO_PMINU 1
- movd %rsi, %xmm1
- mov %rdi, %rcx
- punpckldq %xmm1, %xmm1
- pxor %xmm2, %xmm2
- punpckldq %xmm1, %xmm1
- and $63, %rcx
- cmp $48, %rcx
- ja L(crosscache)
+#ifndef STRRCHR
+# define STRRCHR wcsrchr
+#endif
- movdqu (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match1)
-
- test %rcx, %rcx
- jnz L(return_null)
-
- and $-16, %rdi
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match1):
- test %rcx, %rcx
- jnz L(prolog_find_zero_1)
-
- mov %rax, %r8
- mov %rdi, %rsi
- and $-16, %rdi
- jmp L(loop)
-
- .p2align 4
-L(crosscache):
- and $15, %rcx
- and $-16, %rdi
- pxor %xmm3, %xmm3
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm3
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm3, %rdx
- pmovmskb %xmm0, %rax
- shr %cl, %rdx
- shr %cl, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match)
-
- test %rdx, %rdx
- jnz L(return_null)
-
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match):
- test %rdx, %rdx
- jnz L(prolog_find_zero)
-
- mov %rax, %r8
- lea (%rdi, %rcx), %rsi
-
-/* Loop start on aligned string. */
- .p2align 4
-L(loop):
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm3
- pcmpeqd %xmm3, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm3
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm3, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm4
- pcmpeqd %xmm4, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm4
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm4, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm5
- pcmpeqd %xmm5, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm5
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm5, %rax
- or %rax, %rcx
- jz L(loop)
-
- .p2align 4
-L(matches):
- test %rax, %rax
- jnz L(match)
-L(return_value):
- test %r8, %r8
- jz L(return_null)
- mov %r8, %rax
- mov %rsi, %rdi
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(match):
- pmovmskb %xmm2, %rcx
- test %rcx, %rcx
- jnz L(find_zero)
- mov %rax, %r8
- mov %rdi, %rsi
- jmp L(loop)
-
- .p2align 4
-L(find_zero):
- test $15, %cl
- jnz L(find_zero_in_first_wchar)
- test %cl, %cl
- jnz L(find_zero_in_second_wchar)
- test $15, %ch
- jnz L(find_zero_in_third_wchar)
-
- and $1 << 13 - 1, %rax
- jz L(return_value)
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_first_wchar):
- test $1, %rax
- jz L(return_value)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_second_wchar):
- and $1 << 5 - 1, %rax
- jz L(return_value)
-
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_third_wchar):
- and $1 << 9 - 1, %rax
- jz L(return_value)
-
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero):
- add %rcx, %rdi
- mov %rdx, %rcx
-L(prolog_find_zero_1):
- test $15, %cl
- jnz L(prolog_find_zero_in_first_wchar)
- test %cl, %cl
- jnz L(prolog_find_zero_in_second_wchar)
- test $15, %ch
- jnz L(prolog_find_zero_in_third_wchar)
-
- and $1 << 13 - 1, %rax
- jz L(return_null)
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_first_wchar):
- test $1, %rax
- jz L(return_null)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_second_wchar):
- and $1 << 5 - 1, %rax
- jz L(return_null)
-
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_third_wchar):
- and $1 << 9 - 1, %rax
- jz L(return_null)
-
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(match_second_wchar):
- lea -12(%rdi), %rax
- ret
-
- .p2align 4
-L(match_third_wchar):
- lea -8(%rdi), %rax
- ret
-
- .p2align 4
-L(match_fourth_wchar):
- lea -4(%rdi), %rax
- ret
-
- .p2align 4
-L(return_null):
- xor %rax, %rax
- ret
-
-END (wcsrchr)
+#include "../strrchr.S"