[v1,07/23] x86: Optimize strcspn and strpbrk in strcspn-c.c
Checks
Context |
Check |
Description |
dj/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
Commit Message
Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
_mm_cmpistri. Also change offset to unsigned to avoid unnecessary
sign extensions.
geometric_mean(N=20) of all benchmarks that don't fall back on
sse2/strlen; New / Original: .928
All string/memory tests pass.
---
Geometric Mean N=20 runs; All functions page aligned
len, align1, align2, pos, New Time / Old Time
0, 0, 0, 512, 1.207
1, 0, 0, 512, 1.039
1, 1, 0, 512, 0.997
1, 0, 1, 512, 0.981
1, 1, 1, 512, 0.977
2, 0, 0, 512, 1.02
2, 2, 0, 512, 0.979
2, 0, 2, 512, 0.902
2, 2, 2, 512, 0.958
3, 0, 0, 512, 0.978
3, 3, 0, 512, 0.988
3, 0, 3, 512, 0.979
3, 3, 3, 512, 0.955
4, 0, 0, 512, 0.969
4, 4, 0, 512, 0.991
4, 0, 4, 512, 0.94
4, 4, 4, 512, 0.958
5, 0, 0, 512, 0.963
5, 5, 0, 512, 1.004
5, 0, 5, 512, 0.948
5, 5, 5, 512, 0.971
6, 0, 0, 512, 0.933
6, 6, 0, 512, 1.007
6, 0, 6, 512, 0.921
6, 6, 6, 512, 0.969
7, 0, 0, 512, 0.928
7, 7, 0, 512, 0.976
7, 0, 7, 512, 0.932
7, 7, 7, 512, 0.995
8, 0, 0, 512, 0.931
8, 0, 8, 512, 0.766
9, 0, 0, 512, 0.965
9, 1, 0, 512, 0.999
9, 0, 9, 512, 0.765
9, 1, 9, 512, 0.97
10, 0, 0, 512, 0.976
10, 2, 0, 512, 0.991
10, 0, 10, 512, 0.768
10, 2, 10, 512, 0.926
11, 0, 0, 512, 0.958
11, 3, 0, 512, 1.006
11, 0, 11, 512, 0.768
11, 3, 11, 512, 0.908
12, 0, 0, 512, 0.945
12, 4, 0, 512, 0.896
12, 0, 12, 512, 0.764
12, 4, 12, 512, 0.785
13, 0, 0, 512, 0.957
13, 5, 0, 512, 1.019
13, 0, 13, 512, 0.76
13, 5, 13, 512, 0.785
14, 0, 0, 512, 0.918
14, 6, 0, 512, 1.004
14, 0, 14, 512, 0.78
14, 6, 14, 512, 0.711
15, 0, 0, 512, 0.855
15, 7, 0, 512, 0.985
15, 0, 15, 512, 0.779
15, 7, 15, 512, 0.772
16, 0, 0, 512, 0.987
16, 0, 16, 512, 0.99
17, 0, 0, 512, 0.996
17, 1, 0, 512, 0.979
17, 0, 17, 512, 1.001
17, 1, 17, 512, 1.03
18, 0, 0, 512, 0.976
18, 2, 0, 512, 0.989
18, 0, 18, 512, 0.976
18, 2, 18, 512, 0.992
19, 0, 0, 512, 0.991
19, 3, 0, 512, 0.988
19, 0, 19, 512, 1.009
19, 3, 19, 512, 1.018
20, 0, 0, 512, 0.999
20, 4, 0, 512, 1.005
20, 0, 20, 512, 0.993
20, 4, 20, 512, 0.983
21, 0, 0, 512, 0.982
21, 5, 0, 512, 0.988
21, 0, 21, 512, 0.978
21, 5, 21, 512, 0.984
22, 0, 0, 512, 0.988
22, 6, 0, 512, 0.979
22, 0, 22, 512, 0.984
22, 6, 22, 512, 0.983
23, 0, 0, 512, 0.996
23, 7, 0, 512, 0.998
23, 0, 23, 512, 0.979
23, 7, 23, 512, 0.987
24, 0, 0, 512, 0.99
24, 0, 24, 512, 0.979
25, 0, 0, 512, 0.985
25, 1, 0, 512, 0.988
25, 0, 25, 512, 0.99
25, 1, 25, 512, 0.986
26, 0, 0, 512, 1.005
26, 2, 0, 512, 0.995
26, 0, 26, 512, 0.992
26, 2, 26, 512, 0.983
27, 0, 0, 512, 0.986
27, 3, 0, 512, 0.978
27, 0, 27, 512, 0.986
27, 3, 27, 512, 0.973
28, 0, 0, 512, 0.995
28, 4, 0, 512, 0.993
28, 0, 28, 512, 0.983
28, 4, 28, 512, 1.005
29, 0, 0, 512, 0.983
29, 5, 0, 512, 0.982
29, 0, 29, 512, 0.984
29, 5, 29, 512, 1.005
30, 0, 0, 512, 0.978
30, 6, 0, 512, 0.985
30, 0, 30, 512, 0.994
30, 6, 30, 512, 0.993
31, 0, 0, 512, 0.984
31, 7, 0, 512, 0.983
31, 0, 31, 512, 1.0
31, 7, 31, 512, 1.031
4, 0, 0, 32, 0.916
4, 1, 0, 32, 0.952
4, 0, 1, 32, 0.927
4, 1, 1, 32, 0.969
4, 0, 0, 64, 0.961
4, 2, 0, 64, 0.955
4, 0, 2, 64, 0.975
4, 2, 2, 64, 0.972
4, 0, 0, 128, 0.971
4, 3, 0, 128, 0.982
4, 0, 3, 128, 0.945
4, 3, 3, 128, 0.971
4, 0, 0, 256, 1.004
4, 4, 0, 256, 0.966
4, 0, 4, 256, 0.961
4, 4, 4, 256, 0.971
4, 5, 0, 512, 0.929
4, 0, 5, 512, 0.969
4, 5, 5, 512, 0.985
4, 0, 0, 1024, 1.003
4, 6, 0, 1024, 1.009
4, 0, 6, 1024, 1.005
4, 6, 6, 1024, 0.999
4, 0, 0, 2048, 0.917
4, 7, 0, 2048, 1.015
4, 0, 7, 2048, 1.011
4, 7, 7, 2048, 0.907
10, 1, 0, 64, 0.964
10, 1, 1, 64, 0.966
10, 2, 0, 64, 0.953
10, 2, 2, 64, 0.972
10, 3, 0, 64, 0.962
10, 3, 3, 64, 0.969
10, 4, 0, 64, 0.957
10, 4, 4, 64, 0.969
10, 5, 0, 64, 0.961
10, 5, 5, 64, 0.965
10, 6, 0, 64, 0.949
10, 6, 6, 64, 0.9
10, 7, 0, 64, 0.957
10, 7, 7, 64, 0.897
6, 0, 0, 0, 0.991
6, 0, 0, 1, 1.011
6, 0, 1, 1, 0.939
6, 0, 0, 2, 1.016
6, 0, 2, 2, 0.94
6, 0, 0, 3, 1.019
6, 0, 3, 3, 0.941
6, 0, 0, 4, 1.056
6, 0, 4, 4, 0.884
6, 0, 0, 5, 0.977
6, 0, 5, 5, 0.934
6, 0, 0, 6, 0.954
6, 0, 6, 6, 0.93
6, 0, 0, 7, 0.963
6, 0, 7, 7, 0.916
6, 0, 0, 8, 0.963
6, 0, 8, 8, 0.945
6, 0, 0, 9, 1.028
6, 0, 9, 9, 0.942
6, 0, 0, 10, 0.955
6, 0, 10, 10, 0.831
6, 0, 0, 11, 0.948
6, 0, 11, 11, 0.82
6, 0, 0, 12, 1.033
6, 0, 12, 12, 0.873
6, 0, 0, 13, 0.983
6, 0, 13, 13, 0.852
6, 0, 0, 14, 0.984
6, 0, 14, 14, 0.853
6, 0, 0, 15, 0.984
6, 0, 15, 15, 0.882
6, 0, 0, 16, 0.971
6, 0, 16, 16, 0.958
6, 0, 0, 17, 0.938
6, 0, 17, 17, 0.947
6, 0, 0, 18, 0.96
6, 0, 18, 18, 0.938
6, 0, 0, 19, 0.903
6, 0, 19, 19, 0.943
6, 0, 0, 20, 0.947
6, 0, 20, 20, 0.951
6, 0, 0, 21, 0.948
6, 0, 21, 21, 0.96
6, 0, 0, 22, 0.926
6, 0, 22, 22, 0.951
6, 0, 0, 23, 0.923
6, 0, 23, 23, 0.959
6, 0, 0, 24, 0.918
6, 0, 24, 24, 0.952
6, 0, 0, 25, 0.97
6, 0, 25, 25, 0.952
6, 0, 0, 26, 0.871
6, 0, 26, 26, 0.869
6, 0, 0, 27, 0.935
6, 0, 27, 27, 0.836
6, 0, 0, 28, 0.936
6, 0, 28, 28, 0.857
6, 0, 0, 29, 0.876
6, 0, 29, 29, 0.859
6, 0, 0, 30, 0.934
6, 0, 30, 30, 0.857
6, 0, 0, 31, 0.962
6, 0, 31, 31, 0.86
6, 0, 0, 32, 0.912
6, 0, 32, 32, 0.94
6, 0, 0, 33, 0.903
6, 0, 33, 33, 0.968
6, 0, 0, 34, 0.913
6, 0, 34, 34, 0.896
6, 0, 0, 35, 0.904
6, 0, 35, 35, 0.913
6, 0, 0, 36, 0.905
6, 0, 36, 36, 0.907
6, 0, 0, 37, 0.899
6, 0, 37, 37, 0.9
6, 0, 0, 38, 0.912
6, 0, 38, 38, 0.919
6, 0, 0, 39, 0.925
6, 0, 39, 39, 0.927
6, 0, 0, 40, 0.923
6, 0, 40, 40, 0.972
6, 0, 0, 41, 0.92
6, 0, 41, 41, 0.966
6, 0, 0, 42, 0.915
6, 0, 42, 42, 0.834
6, 0, 0, 43, 0.92
6, 0, 43, 43, 0.856
6, 0, 0, 44, 0.908
6, 0, 44, 44, 0.858
6, 0, 0, 45, 0.932
6, 0, 45, 45, 0.847
6, 0, 0, 46, 0.927
6, 0, 46, 46, 0.859
6, 0, 0, 47, 0.902
6, 0, 47, 47, 0.855
6, 0, 0, 48, 0.949
6, 0, 48, 48, 0.934
6, 0, 0, 49, 0.907
6, 0, 49, 49, 0.943
6, 0, 0, 50, 0.934
6, 0, 50, 50, 0.943
6, 0, 0, 51, 0.933
6, 0, 51, 51, 0.939
6, 0, 0, 52, 0.944
6, 0, 52, 52, 0.944
6, 0, 0, 53, 0.939
6, 0, 53, 53, 0.938
6, 0, 0, 54, 0.9
6, 0, 54, 54, 0.923
6, 0, 0, 55, 0.9
6, 0, 55, 55, 0.927
6, 0, 0, 56, 0.9
6, 0, 56, 56, 0.917
6, 0, 0, 57, 0.9
6, 0, 57, 57, 0.916
6, 0, 0, 58, 0.914
6, 0, 58, 58, 0.784
6, 0, 0, 59, 0.863
6, 0, 59, 59, 0.846
6, 0, 0, 60, 0.88
6, 0, 60, 60, 0.827
6, 0, 0, 61, 0.896
6, 0, 61, 61, 0.847
6, 0, 0, 62, 0.894
6, 0, 62, 62, 0.865
6, 0, 0, 63, 0.934
6, 0, 63, 63, 0.866
sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++---------------
1 file changed, 37 insertions(+), 46 deletions(-)
Comments
On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
> _mm_cmpistri. Also change offset to unsigned to avoid unnecessary
> sign extensions.
>
> geometric_mean(N=20) of all benchmarks that don't fall back on
> sse2/strlen; New / Original: .928
>
> All string/memory tests pass.
> ---
> Geometric Mean N=20 runs; All functions page aligned
> len, align1, align2, pos, New Time / Old Time
> 0, 0, 0, 512, 1.207
> 1, 0, 0, 512, 1.039
> 1, 1, 0, 512, 0.997
> 1, 0, 1, 512, 0.981
> 1, 1, 1, 512, 0.977
> 2, 0, 0, 512, 1.02
> 2, 2, 0, 512, 0.979
> 2, 0, 2, 512, 0.902
> 2, 2, 2, 512, 0.958
> 3, 0, 0, 512, 0.978
> 3, 3, 0, 512, 0.988
> 3, 0, 3, 512, 0.979
> 3, 3, 3, 512, 0.955
> 4, 0, 0, 512, 0.969
> 4, 4, 0, 512, 0.991
> 4, 0, 4, 512, 0.94
> 4, 4, 4, 512, 0.958
> 5, 0, 0, 512, 0.963
> 5, 5, 0, 512, 1.004
> 5, 0, 5, 512, 0.948
> 5, 5, 5, 512, 0.971
> 6, 0, 0, 512, 0.933
> 6, 6, 0, 512, 1.007
> 6, 0, 6, 512, 0.921
> 6, 6, 6, 512, 0.969
> 7, 0, 0, 512, 0.928
> 7, 7, 0, 512, 0.976
> 7, 0, 7, 512, 0.932
> 7, 7, 7, 512, 0.995
> 8, 0, 0, 512, 0.931
> 8, 0, 8, 512, 0.766
> 9, 0, 0, 512, 0.965
> 9, 1, 0, 512, 0.999
> 9, 0, 9, 512, 0.765
> 9, 1, 9, 512, 0.97
> 10, 0, 0, 512, 0.976
> 10, 2, 0, 512, 0.991
> 10, 0, 10, 512, 0.768
> 10, 2, 10, 512, 0.926
> 11, 0, 0, 512, 0.958
> 11, 3, 0, 512, 1.006
> 11, 0, 11, 512, 0.768
> 11, 3, 11, 512, 0.908
> 12, 0, 0, 512, 0.945
> 12, 4, 0, 512, 0.896
> 12, 0, 12, 512, 0.764
> 12, 4, 12, 512, 0.785
> 13, 0, 0, 512, 0.957
> 13, 5, 0, 512, 1.019
> 13, 0, 13, 512, 0.76
> 13, 5, 13, 512, 0.785
> 14, 0, 0, 512, 0.918
> 14, 6, 0, 512, 1.004
> 14, 0, 14, 512, 0.78
> 14, 6, 14, 512, 0.711
> 15, 0, 0, 512, 0.855
> 15, 7, 0, 512, 0.985
> 15, 0, 15, 512, 0.779
> 15, 7, 15, 512, 0.772
> 16, 0, 0, 512, 0.987
> 16, 0, 16, 512, 0.99
> 17, 0, 0, 512, 0.996
> 17, 1, 0, 512, 0.979
> 17, 0, 17, 512, 1.001
> 17, 1, 17, 512, 1.03
> 18, 0, 0, 512, 0.976
> 18, 2, 0, 512, 0.989
> 18, 0, 18, 512, 0.976
> 18, 2, 18, 512, 0.992
> 19, 0, 0, 512, 0.991
> 19, 3, 0, 512, 0.988
> 19, 0, 19, 512, 1.009
> 19, 3, 19, 512, 1.018
> 20, 0, 0, 512, 0.999
> 20, 4, 0, 512, 1.005
> 20, 0, 20, 512, 0.993
> 20, 4, 20, 512, 0.983
> 21, 0, 0, 512, 0.982
> 21, 5, 0, 512, 0.988
> 21, 0, 21, 512, 0.978
> 21, 5, 21, 512, 0.984
> 22, 0, 0, 512, 0.988
> 22, 6, 0, 512, 0.979
> 22, 0, 22, 512, 0.984
> 22, 6, 22, 512, 0.983
> 23, 0, 0, 512, 0.996
> 23, 7, 0, 512, 0.998
> 23, 0, 23, 512, 0.979
> 23, 7, 23, 512, 0.987
> 24, 0, 0, 512, 0.99
> 24, 0, 24, 512, 0.979
> 25, 0, 0, 512, 0.985
> 25, 1, 0, 512, 0.988
> 25, 0, 25, 512, 0.99
> 25, 1, 25, 512, 0.986
> 26, 0, 0, 512, 1.005
> 26, 2, 0, 512, 0.995
> 26, 0, 26, 512, 0.992
> 26, 2, 26, 512, 0.983
> 27, 0, 0, 512, 0.986
> 27, 3, 0, 512, 0.978
> 27, 0, 27, 512, 0.986
> 27, 3, 27, 512, 0.973
> 28, 0, 0, 512, 0.995
> 28, 4, 0, 512, 0.993
> 28, 0, 28, 512, 0.983
> 28, 4, 28, 512, 1.005
> 29, 0, 0, 512, 0.983
> 29, 5, 0, 512, 0.982
> 29, 0, 29, 512, 0.984
> 29, 5, 29, 512, 1.005
> 30, 0, 0, 512, 0.978
> 30, 6, 0, 512, 0.985
> 30, 0, 30, 512, 0.994
> 30, 6, 30, 512, 0.993
> 31, 0, 0, 512, 0.984
> 31, 7, 0, 512, 0.983
> 31, 0, 31, 512, 1.0
> 31, 7, 31, 512, 1.031
> 4, 0, 0, 32, 0.916
> 4, 1, 0, 32, 0.952
> 4, 0, 1, 32, 0.927
> 4, 1, 1, 32, 0.969
> 4, 0, 0, 64, 0.961
> 4, 2, 0, 64, 0.955
> 4, 0, 2, 64, 0.975
> 4, 2, 2, 64, 0.972
> 4, 0, 0, 128, 0.971
> 4, 3, 0, 128, 0.982
> 4, 0, 3, 128, 0.945
> 4, 3, 3, 128, 0.971
> 4, 0, 0, 256, 1.004
> 4, 4, 0, 256, 0.966
> 4, 0, 4, 256, 0.961
> 4, 4, 4, 256, 0.971
> 4, 5, 0, 512, 0.929
> 4, 0, 5, 512, 0.969
> 4, 5, 5, 512, 0.985
> 4, 0, 0, 1024, 1.003
> 4, 6, 0, 1024, 1.009
> 4, 0, 6, 1024, 1.005
> 4, 6, 6, 1024, 0.999
> 4, 0, 0, 2048, 0.917
> 4, 7, 0, 2048, 1.015
> 4, 0, 7, 2048, 1.011
> 4, 7, 7, 2048, 0.907
> 10, 1, 0, 64, 0.964
> 10, 1, 1, 64, 0.966
> 10, 2, 0, 64, 0.953
> 10, 2, 2, 64, 0.972
> 10, 3, 0, 64, 0.962
> 10, 3, 3, 64, 0.969
> 10, 4, 0, 64, 0.957
> 10, 4, 4, 64, 0.969
> 10, 5, 0, 64, 0.961
> 10, 5, 5, 64, 0.965
> 10, 6, 0, 64, 0.949
> 10, 6, 6, 64, 0.9
> 10, 7, 0, 64, 0.957
> 10, 7, 7, 64, 0.897
> 6, 0, 0, 0, 0.991
> 6, 0, 0, 1, 1.011
> 6, 0, 1, 1, 0.939
> 6, 0, 0, 2, 1.016
> 6, 0, 2, 2, 0.94
> 6, 0, 0, 3, 1.019
> 6, 0, 3, 3, 0.941
> 6, 0, 0, 4, 1.056
> 6, 0, 4, 4, 0.884
> 6, 0, 0, 5, 0.977
> 6, 0, 5, 5, 0.934
> 6, 0, 0, 6, 0.954
> 6, 0, 6, 6, 0.93
> 6, 0, 0, 7, 0.963
> 6, 0, 7, 7, 0.916
> 6, 0, 0, 8, 0.963
> 6, 0, 8, 8, 0.945
> 6, 0, 0, 9, 1.028
> 6, 0, 9, 9, 0.942
> 6, 0, 0, 10, 0.955
> 6, 0, 10, 10, 0.831
> 6, 0, 0, 11, 0.948
> 6, 0, 11, 11, 0.82
> 6, 0, 0, 12, 1.033
> 6, 0, 12, 12, 0.873
> 6, 0, 0, 13, 0.983
> 6, 0, 13, 13, 0.852
> 6, 0, 0, 14, 0.984
> 6, 0, 14, 14, 0.853
> 6, 0, 0, 15, 0.984
> 6, 0, 15, 15, 0.882
> 6, 0, 0, 16, 0.971
> 6, 0, 16, 16, 0.958
> 6, 0, 0, 17, 0.938
> 6, 0, 17, 17, 0.947
> 6, 0, 0, 18, 0.96
> 6, 0, 18, 18, 0.938
> 6, 0, 0, 19, 0.903
> 6, 0, 19, 19, 0.943
> 6, 0, 0, 20, 0.947
> 6, 0, 20, 20, 0.951
> 6, 0, 0, 21, 0.948
> 6, 0, 21, 21, 0.96
> 6, 0, 0, 22, 0.926
> 6, 0, 22, 22, 0.951
> 6, 0, 0, 23, 0.923
> 6, 0, 23, 23, 0.959
> 6, 0, 0, 24, 0.918
> 6, 0, 24, 24, 0.952
> 6, 0, 0, 25, 0.97
> 6, 0, 25, 25, 0.952
> 6, 0, 0, 26, 0.871
> 6, 0, 26, 26, 0.869
> 6, 0, 0, 27, 0.935
> 6, 0, 27, 27, 0.836
> 6, 0, 0, 28, 0.936
> 6, 0, 28, 28, 0.857
> 6, 0, 0, 29, 0.876
> 6, 0, 29, 29, 0.859
> 6, 0, 0, 30, 0.934
> 6, 0, 30, 30, 0.857
> 6, 0, 0, 31, 0.962
> 6, 0, 31, 31, 0.86
> 6, 0, 0, 32, 0.912
> 6, 0, 32, 32, 0.94
> 6, 0, 0, 33, 0.903
> 6, 0, 33, 33, 0.968
> 6, 0, 0, 34, 0.913
> 6, 0, 34, 34, 0.896
> 6, 0, 0, 35, 0.904
> 6, 0, 35, 35, 0.913
> 6, 0, 0, 36, 0.905
> 6, 0, 36, 36, 0.907
> 6, 0, 0, 37, 0.899
> 6, 0, 37, 37, 0.9
> 6, 0, 0, 38, 0.912
> 6, 0, 38, 38, 0.919
> 6, 0, 0, 39, 0.925
> 6, 0, 39, 39, 0.927
> 6, 0, 0, 40, 0.923
> 6, 0, 40, 40, 0.972
> 6, 0, 0, 41, 0.92
> 6, 0, 41, 41, 0.966
> 6, 0, 0, 42, 0.915
> 6, 0, 42, 42, 0.834
> 6, 0, 0, 43, 0.92
> 6, 0, 43, 43, 0.856
> 6, 0, 0, 44, 0.908
> 6, 0, 44, 44, 0.858
> 6, 0, 0, 45, 0.932
> 6, 0, 45, 45, 0.847
> 6, 0, 0, 46, 0.927
> 6, 0, 46, 46, 0.859
> 6, 0, 0, 47, 0.902
> 6, 0, 47, 47, 0.855
> 6, 0, 0, 48, 0.949
> 6, 0, 48, 48, 0.934
> 6, 0, 0, 49, 0.907
> 6, 0, 49, 49, 0.943
> 6, 0, 0, 50, 0.934
> 6, 0, 50, 50, 0.943
> 6, 0, 0, 51, 0.933
> 6, 0, 51, 51, 0.939
> 6, 0, 0, 52, 0.944
> 6, 0, 52, 52, 0.944
> 6, 0, 0, 53, 0.939
> 6, 0, 53, 53, 0.938
> 6, 0, 0, 54, 0.9
> 6, 0, 54, 54, 0.923
> 6, 0, 0, 55, 0.9
> 6, 0, 55, 55, 0.927
> 6, 0, 0, 56, 0.9
> 6, 0, 56, 56, 0.917
> 6, 0, 0, 57, 0.9
> 6, 0, 57, 57, 0.916
> 6, 0, 0, 58, 0.914
> 6, 0, 58, 58, 0.784
> 6, 0, 0, 59, 0.863
> 6, 0, 59, 59, 0.846
> 6, 0, 0, 60, 0.88
> 6, 0, 60, 60, 0.827
> 6, 0, 0, 61, 0.896
> 6, 0, 61, 61, 0.847
> 6, 0, 0, 62, 0.894
> 6, 0, 62, 62, 0.865
> 6, 0, 0, 63, 0.934
> 6, 0, 63, 63, 0.866
>
> sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++---------------
> 1 file changed, 37 insertions(+), 46 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
> index 013aebf797..c312fab8b1 100644
> --- a/sysdeps/x86_64/multiarch/strcspn-c.c
> +++ b/sysdeps/x86_64/multiarch/strcspn-c.c
> @@ -84,83 +84,74 @@ STRCSPN_SSE42 (const char *s, const char *a)
> RETURN (NULL, strlen (s));
>
> const char *aligned;
> - __m128i mask;
> - int offset = (int) ((size_t) a & 15);
> + __m128i mask, maskz, zero;
> + unsigned int maskz_bits;
> + unsigned int offset = (unsigned int) ((size_t) a & 15);
> + zero = _mm_set1_epi8 (0);
> if (offset != 0)
> {
> /* Load masks. */
> aligned = (const char *) ((size_t) a & -16L);
> __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> -
> - mask = __m128i_shift_right (mask0, offset);
> + maskz = _mm_cmpeq_epi8 (mask0, zero);
>
> /* Find where the NULL terminator is. */
> - int length = _mm_cmpistri (mask, mask, 0x3a);
> - if (length == 16 - offset)
> - {
> - /* There is no NULL terminator. */
> - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
> - int index = _mm_cmpistri (mask1, mask1, 0x3a);
> - length += index;
> -
> - /* Don't use SSE4.2 if the length of A > 16. */
> - if (length > 16)
> - return STRCSPN_SSE2 (s, a);
> -
> - if (index != 0)
> - {
> - /* Combine mask0 and mask1. We could play games with
> - palignr, but frankly this data should be in L1 now
> - so do the merge via an unaligned load. */
> - mask = _mm_loadu_si128 ((__m128i *) a);
> - }
> - }
> + maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> + if (maskz_bits != 0)
> + {
> + mask = __m128i_shift_right (mask0, offset);
> + offset = (unsigned int) ((size_t) s & 15);
> + if (offset)
> + goto start_unaligned;
> +
> + aligned = s;
> + goto start_loop;
> + }
> }
> - else
> - {
> - /* A is aligned. */
> - mask = _mm_load_si128 ((__m128i *) a);
>
> - /* Find where the NULL terminator is. */
> - int length = _mm_cmpistri (mask, mask, 0x3a);
> - if (length == 16)
> - {
> - /* There is no NULL terminator. Don't use SSE4.2 if the length
> - of A > 16. */
> - if (a[16] != 0)
> - return STRCSPN_SSE2 (s, a);
> - }
> + /* A is aligned. */
> + mask = _mm_loadu_si128 ((__m128i *) a);
> + /* Find where the NULL terminator is. */
> + maskz = _mm_cmpeq_epi8 (mask, zero);
> + maskz_bits = _mm_movemask_epi8 (maskz);
> + if (maskz_bits == 0)
> + {
> + /* There is no NULL terminator. Don't use SSE4.2 if the length
> + of A > 16. */
> + if (a[16] != 0)
> + return STRCSPN_SSE2 (s, a);
> }
>
> - offset = (int) ((size_t) s & 15);
> + aligned = s;
> + offset = (unsigned int) ((size_t) s & 15);
> if (offset != 0)
> {
> + start_unaligned:
> /* Check partial string. */
> aligned = (const char *) ((size_t) s & -16L);
> __m128i value = _mm_load_si128 ((__m128i *) aligned);
>
> value = __m128i_shift_right (value, offset);
>
> - int length = _mm_cmpistri (mask, value, 0x2);
> + unsigned int length = _mm_cmpistri (mask, value, 0x2);
> /* No need to check ZFlag since ZFlag is always 1. */
> - int cflag = _mm_cmpistrc (mask, value, 0x2);
> + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> if (cflag)
> RETURN ((char *) (s + length), length);
> /* Find where the NULL terminator is. */
> - int index = _mm_cmpistri (value, value, 0x3a);
> + unsigned int index = _mm_cmpistri (value, value, 0x3a);
> if (index < 16 - offset)
> RETURN (NULL, index);
> aligned += 16;
> }
> - else
> - aligned = s;
>
> +start_loop:
> while (1)
> {
> __m128i value = _mm_load_si128 ((__m128i *) aligned);
> - int index = _mm_cmpistri (mask, value, 0x2);
> - int cflag = _mm_cmpistrc (mask, value, 0x2);
> - int zflag = _mm_cmpistrz (mask, value, 0x2);
> + unsigned int index = _mm_cmpistri (mask, value, 0x2);
> + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> + unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
> if (cflag)
> RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
> if (zflag)
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
On Thu, Mar 24, 2022 at 11:57 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
> > _mm_cmpistri. Also change offset to unsigned to avoid unnecessary
> > sign extensions.
> >
> > geometric_mean(N=20) of all benchmarks that don't fall back on
> > sse2/strlen; New / Original: .928
> >
> > All string/memory tests pass.
> > ---
> > Geometric Mean N=20 runs; All functions page aligned
> > len, align1, align2, pos, New Time / Old Time
> > 0, 0, 0, 512, 1.207
> > 1, 0, 0, 512, 1.039
> > 1, 1, 0, 512, 0.997
> > 1, 0, 1, 512, 0.981
> > 1, 1, 1, 512, 0.977
> > 2, 0, 0, 512, 1.02
> > 2, 2, 0, 512, 0.979
> > 2, 0, 2, 512, 0.902
> > 2, 2, 2, 512, 0.958
> > 3, 0, 0, 512, 0.978
> > 3, 3, 0, 512, 0.988
> > 3, 0, 3, 512, 0.979
> > 3, 3, 3, 512, 0.955
> > 4, 0, 0, 512, 0.969
> > 4, 4, 0, 512, 0.991
> > 4, 0, 4, 512, 0.94
> > 4, 4, 4, 512, 0.958
> > 5, 0, 0, 512, 0.963
> > 5, 5, 0, 512, 1.004
> > 5, 0, 5, 512, 0.948
> > 5, 5, 5, 512, 0.971
> > 6, 0, 0, 512, 0.933
> > 6, 6, 0, 512, 1.007
> > 6, 0, 6, 512, 0.921
> > 6, 6, 6, 512, 0.969
> > 7, 0, 0, 512, 0.928
> > 7, 7, 0, 512, 0.976
> > 7, 0, 7, 512, 0.932
> > 7, 7, 7, 512, 0.995
> > 8, 0, 0, 512, 0.931
> > 8, 0, 8, 512, 0.766
> > 9, 0, 0, 512, 0.965
> > 9, 1, 0, 512, 0.999
> > 9, 0, 9, 512, 0.765
> > 9, 1, 9, 512, 0.97
> > 10, 0, 0, 512, 0.976
> > 10, 2, 0, 512, 0.991
> > 10, 0, 10, 512, 0.768
> > 10, 2, 10, 512, 0.926
> > 11, 0, 0, 512, 0.958
> > 11, 3, 0, 512, 1.006
> > 11, 0, 11, 512, 0.768
> > 11, 3, 11, 512, 0.908
> > 12, 0, 0, 512, 0.945
> > 12, 4, 0, 512, 0.896
> > 12, 0, 12, 512, 0.764
> > 12, 4, 12, 512, 0.785
> > 13, 0, 0, 512, 0.957
> > 13, 5, 0, 512, 1.019
> > 13, 0, 13, 512, 0.76
> > 13, 5, 13, 512, 0.785
> > 14, 0, 0, 512, 0.918
> > 14, 6, 0, 512, 1.004
> > 14, 0, 14, 512, 0.78
> > 14, 6, 14, 512, 0.711
> > 15, 0, 0, 512, 0.855
> > 15, 7, 0, 512, 0.985
> > 15, 0, 15, 512, 0.779
> > 15, 7, 15, 512, 0.772
> > 16, 0, 0, 512, 0.987
> > 16, 0, 16, 512, 0.99
> > 17, 0, 0, 512, 0.996
> > 17, 1, 0, 512, 0.979
> > 17, 0, 17, 512, 1.001
> > 17, 1, 17, 512, 1.03
> > 18, 0, 0, 512, 0.976
> > 18, 2, 0, 512, 0.989
> > 18, 0, 18, 512, 0.976
> > 18, 2, 18, 512, 0.992
> > 19, 0, 0, 512, 0.991
> > 19, 3, 0, 512, 0.988
> > 19, 0, 19, 512, 1.009
> > 19, 3, 19, 512, 1.018
> > 20, 0, 0, 512, 0.999
> > 20, 4, 0, 512, 1.005
> > 20, 0, 20, 512, 0.993
> > 20, 4, 20, 512, 0.983
> > 21, 0, 0, 512, 0.982
> > 21, 5, 0, 512, 0.988
> > 21, 0, 21, 512, 0.978
> > 21, 5, 21, 512, 0.984
> > 22, 0, 0, 512, 0.988
> > 22, 6, 0, 512, 0.979
> > 22, 0, 22, 512, 0.984
> > 22, 6, 22, 512, 0.983
> > 23, 0, 0, 512, 0.996
> > 23, 7, 0, 512, 0.998
> > 23, 0, 23, 512, 0.979
> > 23, 7, 23, 512, 0.987
> > 24, 0, 0, 512, 0.99
> > 24, 0, 24, 512, 0.979
> > 25, 0, 0, 512, 0.985
> > 25, 1, 0, 512, 0.988
> > 25, 0, 25, 512, 0.99
> > 25, 1, 25, 512, 0.986
> > 26, 0, 0, 512, 1.005
> > 26, 2, 0, 512, 0.995
> > 26, 0, 26, 512, 0.992
> > 26, 2, 26, 512, 0.983
> > 27, 0, 0, 512, 0.986
> > 27, 3, 0, 512, 0.978
> > 27, 0, 27, 512, 0.986
> > 27, 3, 27, 512, 0.973
> > 28, 0, 0, 512, 0.995
> > 28, 4, 0, 512, 0.993
> > 28, 0, 28, 512, 0.983
> > 28, 4, 28, 512, 1.005
> > 29, 0, 0, 512, 0.983
> > 29, 5, 0, 512, 0.982
> > 29, 0, 29, 512, 0.984
> > 29, 5, 29, 512, 1.005
> > 30, 0, 0, 512, 0.978
> > 30, 6, 0, 512, 0.985
> > 30, 0, 30, 512, 0.994
> > 30, 6, 30, 512, 0.993
> > 31, 0, 0, 512, 0.984
> > 31, 7, 0, 512, 0.983
> > 31, 0, 31, 512, 1.0
> > 31, 7, 31, 512, 1.031
> > 4, 0, 0, 32, 0.916
> > 4, 1, 0, 32, 0.952
> > 4, 0, 1, 32, 0.927
> > 4, 1, 1, 32, 0.969
> > 4, 0, 0, 64, 0.961
> > 4, 2, 0, 64, 0.955
> > 4, 0, 2, 64, 0.975
> > 4, 2, 2, 64, 0.972
> > 4, 0, 0, 128, 0.971
> > 4, 3, 0, 128, 0.982
> > 4, 0, 3, 128, 0.945
> > 4, 3, 3, 128, 0.971
> > 4, 0, 0, 256, 1.004
> > 4, 4, 0, 256, 0.966
> > 4, 0, 4, 256, 0.961
> > 4, 4, 4, 256, 0.971
> > 4, 5, 0, 512, 0.929
> > 4, 0, 5, 512, 0.969
> > 4, 5, 5, 512, 0.985
> > 4, 0, 0, 1024, 1.003
> > 4, 6, 0, 1024, 1.009
> > 4, 0, 6, 1024, 1.005
> > 4, 6, 6, 1024, 0.999
> > 4, 0, 0, 2048, 0.917
> > 4, 7, 0, 2048, 1.015
> > 4, 0, 7, 2048, 1.011
> > 4, 7, 7, 2048, 0.907
> > 10, 1, 0, 64, 0.964
> > 10, 1, 1, 64, 0.966
> > 10, 2, 0, 64, 0.953
> > 10, 2, 2, 64, 0.972
> > 10, 3, 0, 64, 0.962
> > 10, 3, 3, 64, 0.969
> > 10, 4, 0, 64, 0.957
> > 10, 4, 4, 64, 0.969
> > 10, 5, 0, 64, 0.961
> > 10, 5, 5, 64, 0.965
> > 10, 6, 0, 64, 0.949
> > 10, 6, 6, 64, 0.9
> > 10, 7, 0, 64, 0.957
> > 10, 7, 7, 64, 0.897
> > 6, 0, 0, 0, 0.991
> > 6, 0, 0, 1, 1.011
> > 6, 0, 1, 1, 0.939
> > 6, 0, 0, 2, 1.016
> > 6, 0, 2, 2, 0.94
> > 6, 0, 0, 3, 1.019
> > 6, 0, 3, 3, 0.941
> > 6, 0, 0, 4, 1.056
> > 6, 0, 4, 4, 0.884
> > 6, 0, 0, 5, 0.977
> > 6, 0, 5, 5, 0.934
> > 6, 0, 0, 6, 0.954
> > 6, 0, 6, 6, 0.93
> > 6, 0, 0, 7, 0.963
> > 6, 0, 7, 7, 0.916
> > 6, 0, 0, 8, 0.963
> > 6, 0, 8, 8, 0.945
> > 6, 0, 0, 9, 1.028
> > 6, 0, 9, 9, 0.942
> > 6, 0, 0, 10, 0.955
> > 6, 0, 10, 10, 0.831
> > 6, 0, 0, 11, 0.948
> > 6, 0, 11, 11, 0.82
> > 6, 0, 0, 12, 1.033
> > 6, 0, 12, 12, 0.873
> > 6, 0, 0, 13, 0.983
> > 6, 0, 13, 13, 0.852
> > 6, 0, 0, 14, 0.984
> > 6, 0, 14, 14, 0.853
> > 6, 0, 0, 15, 0.984
> > 6, 0, 15, 15, 0.882
> > 6, 0, 0, 16, 0.971
> > 6, 0, 16, 16, 0.958
> > 6, 0, 0, 17, 0.938
> > 6, 0, 17, 17, 0.947
> > 6, 0, 0, 18, 0.96
> > 6, 0, 18, 18, 0.938
> > 6, 0, 0, 19, 0.903
> > 6, 0, 19, 19, 0.943
> > 6, 0, 0, 20, 0.947
> > 6, 0, 20, 20, 0.951
> > 6, 0, 0, 21, 0.948
> > 6, 0, 21, 21, 0.96
> > 6, 0, 0, 22, 0.926
> > 6, 0, 22, 22, 0.951
> > 6, 0, 0, 23, 0.923
> > 6, 0, 23, 23, 0.959
> > 6, 0, 0, 24, 0.918
> > 6, 0, 24, 24, 0.952
> > 6, 0, 0, 25, 0.97
> > 6, 0, 25, 25, 0.952
> > 6, 0, 0, 26, 0.871
> > 6, 0, 26, 26, 0.869
> > 6, 0, 0, 27, 0.935
> > 6, 0, 27, 27, 0.836
> > 6, 0, 0, 28, 0.936
> > 6, 0, 28, 28, 0.857
> > 6, 0, 0, 29, 0.876
> > 6, 0, 29, 29, 0.859
> > 6, 0, 0, 30, 0.934
> > 6, 0, 30, 30, 0.857
> > 6, 0, 0, 31, 0.962
> > 6, 0, 31, 31, 0.86
> > 6, 0, 0, 32, 0.912
> > 6, 0, 32, 32, 0.94
> > 6, 0, 0, 33, 0.903
> > 6, 0, 33, 33, 0.968
> > 6, 0, 0, 34, 0.913
> > 6, 0, 34, 34, 0.896
> > 6, 0, 0, 35, 0.904
> > 6, 0, 35, 35, 0.913
> > 6, 0, 0, 36, 0.905
> > 6, 0, 36, 36, 0.907
> > 6, 0, 0, 37, 0.899
> > 6, 0, 37, 37, 0.9
> > 6, 0, 0, 38, 0.912
> > 6, 0, 38, 38, 0.919
> > 6, 0, 0, 39, 0.925
> > 6, 0, 39, 39, 0.927
> > 6, 0, 0, 40, 0.923
> > 6, 0, 40, 40, 0.972
> > 6, 0, 0, 41, 0.92
> > 6, 0, 41, 41, 0.966
> > 6, 0, 0, 42, 0.915
> > 6, 0, 42, 42, 0.834
> > 6, 0, 0, 43, 0.92
> > 6, 0, 43, 43, 0.856
> > 6, 0, 0, 44, 0.908
> > 6, 0, 44, 44, 0.858
> > 6, 0, 0, 45, 0.932
> > 6, 0, 45, 45, 0.847
> > 6, 0, 0, 46, 0.927
> > 6, 0, 46, 46, 0.859
> > 6, 0, 0, 47, 0.902
> > 6, 0, 47, 47, 0.855
> > 6, 0, 0, 48, 0.949
> > 6, 0, 48, 48, 0.934
> > 6, 0, 0, 49, 0.907
> > 6, 0, 49, 49, 0.943
> > 6, 0, 0, 50, 0.934
> > 6, 0, 50, 50, 0.943
> > 6, 0, 0, 51, 0.933
> > 6, 0, 51, 51, 0.939
> > 6, 0, 0, 52, 0.944
> > 6, 0, 52, 52, 0.944
> > 6, 0, 0, 53, 0.939
> > 6, 0, 53, 53, 0.938
> > 6, 0, 0, 54, 0.9
> > 6, 0, 54, 54, 0.923
> > 6, 0, 0, 55, 0.9
> > 6, 0, 55, 55, 0.927
> > 6, 0, 0, 56, 0.9
> > 6, 0, 56, 56, 0.917
> > 6, 0, 0, 57, 0.9
> > 6, 0, 57, 57, 0.916
> > 6, 0, 0, 58, 0.914
> > 6, 0, 58, 58, 0.784
> > 6, 0, 0, 59, 0.863
> > 6, 0, 59, 59, 0.846
> > 6, 0, 0, 60, 0.88
> > 6, 0, 60, 60, 0.827
> > 6, 0, 0, 61, 0.896
> > 6, 0, 61, 61, 0.847
> > 6, 0, 0, 62, 0.894
> > 6, 0, 62, 62, 0.865
> > 6, 0, 0, 63, 0.934
> > 6, 0, 63, 63, 0.866
> >
> > sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++---------------
> > 1 file changed, 37 insertions(+), 46 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
> > index 013aebf797..c312fab8b1 100644
> > --- a/sysdeps/x86_64/multiarch/strcspn-c.c
> > +++ b/sysdeps/x86_64/multiarch/strcspn-c.c
> > @@ -84,83 +84,74 @@ STRCSPN_SSE42 (const char *s, const char *a)
> > RETURN (NULL, strlen (s));
> >
> > const char *aligned;
> > - __m128i mask;
> > - int offset = (int) ((size_t) a & 15);
> > + __m128i mask, maskz, zero;
> > + unsigned int maskz_bits;
> > + unsigned int offset = (unsigned int) ((size_t) a & 15);
> > + zero = _mm_set1_epi8 (0);
> > if (offset != 0)
> > {
> > /* Load masks. */
> > aligned = (const char *) ((size_t) a & -16L);
> > __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> > -
> > - mask = __m128i_shift_right (mask0, offset);
> > + maskz = _mm_cmpeq_epi8 (mask0, zero);
> >
> > /* Find where the NULL terminator is. */
> > - int length = _mm_cmpistri (mask, mask, 0x3a);
> > - if (length == 16 - offset)
> > - {
> > - /* There is no NULL terminator. */
> > - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
> > - int index = _mm_cmpistri (mask1, mask1, 0x3a);
> > - length += index;
> > -
> > - /* Don't use SSE4.2 if the length of A > 16. */
> > - if (length > 16)
> > - return STRCSPN_SSE2 (s, a);
> > -
> > - if (index != 0)
> > - {
> > - /* Combine mask0 and mask1. We could play games with
> > - palignr, but frankly this data should be in L1 now
> > - so do the merge via an unaligned load. */
> > - mask = _mm_loadu_si128 ((__m128i *) a);
> > - }
> > - }
> > + maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > + if (maskz_bits != 0)
> > + {
> > + mask = __m128i_shift_right (mask0, offset);
> > + offset = (unsigned int) ((size_t) s & 15);
> > + if (offset)
> > + goto start_unaligned;
> > +
> > + aligned = s;
> > + goto start_loop;
> > + }
> > }
> > - else
> > - {
> > - /* A is aligned. */
> > - mask = _mm_load_si128 ((__m128i *) a);
> >
> > - /* Find where the NULL terminator is. */
> > - int length = _mm_cmpistri (mask, mask, 0x3a);
> > - if (length == 16)
> > - {
> > - /* There is no NULL terminator. Don't use SSE4.2 if the length
> > - of A > 16. */
> > - if (a[16] != 0)
> > - return STRCSPN_SSE2 (s, a);
> > - }
> > + /* A is aligned. */
> > + mask = _mm_loadu_si128 ((__m128i *) a);
> > + /* Find where the NULL terminator is. */
> > + maskz = _mm_cmpeq_epi8 (mask, zero);
> > + maskz_bits = _mm_movemask_epi8 (maskz);
> > + if (maskz_bits == 0)
> > + {
> > + /* There is no NULL terminator. Don't use SSE4.2 if the length
> > + of A > 16. */
> > + if (a[16] != 0)
> > + return STRCSPN_SSE2 (s, a);
> > }
> >
> > - offset = (int) ((size_t) s & 15);
> > + aligned = s;
> > + offset = (unsigned int) ((size_t) s & 15);
> > if (offset != 0)
> > {
> > + start_unaligned:
> > /* Check partial string. */
> > aligned = (const char *) ((size_t) s & -16L);
> > __m128i value = _mm_load_si128 ((__m128i *) aligned);
> >
> > value = __m128i_shift_right (value, offset);
> >
> > - int length = _mm_cmpistri (mask, value, 0x2);
> > + unsigned int length = _mm_cmpistri (mask, value, 0x2);
> > /* No need to check ZFlag since ZFlag is always 1. */
> > - int cflag = _mm_cmpistrc (mask, value, 0x2);
> > + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> > if (cflag)
> > RETURN ((char *) (s + length), length);
> > /* Find where the NULL terminator is. */
> > - int index = _mm_cmpistri (value, value, 0x3a);
> > + unsigned int index = _mm_cmpistri (value, value, 0x3a);
> > if (index < 16 - offset)
> > RETURN (NULL, index);
> > aligned += 16;
> > }
> > - else
> > - aligned = s;
> >
> > +start_loop:
> > while (1)
> > {
> > __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > - int index = _mm_cmpistri (mask, value, 0x2);
> > - int cflag = _mm_cmpistrc (mask, value, 0x2);
> > - int zflag = _mm_cmpistrz (mask, value, 0x2);
> > + unsigned int index = _mm_cmpistri (mask, value, 0x2);
> > + unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> > + unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
> > if (cflag)
> > RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
> > if (zflag)
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
@@ -84,83 +84,74 @@ STRCSPN_SSE42 (const char *s, const char *a)
RETURN (NULL, strlen (s));
const char *aligned;
- __m128i mask;
- int offset = (int) ((size_t) a & 15);
+ __m128i mask, maskz, zero;
+ unsigned int maskz_bits;
+ unsigned int offset = (unsigned int) ((size_t) a & 15);
+ zero = _mm_set1_epi8 (0);
if (offset != 0)
{
/* Load masks. */
aligned = (const char *) ((size_t) a & -16L);
__m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
- mask = __m128i_shift_right (mask0, offset);
+ maskz = _mm_cmpeq_epi8 (mask0, zero);
/* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16 - offset)
- {
- /* There is no NULL terminator. */
- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
- int index = _mm_cmpistri (mask1, mask1, 0x3a);
- length += index;
-
- /* Don't use SSE4.2 if the length of A > 16. */
- if (length > 16)
- return STRCSPN_SSE2 (s, a);
-
- if (index != 0)
- {
- /* Combine mask0 and mask1. We could play games with
- palignr, but frankly this data should be in L1 now
- so do the merge via an unaligned load. */
- mask = _mm_loadu_si128 ((__m128i *) a);
- }
- }
+ maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+ if (maskz_bits != 0)
+ {
+ mask = __m128i_shift_right (mask0, offset);
+ offset = (unsigned int) ((size_t) s & 15);
+ if (offset)
+ goto start_unaligned;
+
+ aligned = s;
+ goto start_loop;
+ }
}
- else
- {
- /* A is aligned. */
- mask = _mm_load_si128 ((__m128i *) a);
- /* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16)
- {
- /* There is no NULL terminator. Don't use SSE4.2 if the length
- of A > 16. */
- if (a[16] != 0)
- return STRCSPN_SSE2 (s, a);
- }
+ /* A is aligned. */
+ mask = _mm_loadu_si128 ((__m128i *) a);
+ /* Find where the NULL terminator is. */
+ maskz = _mm_cmpeq_epi8 (mask, zero);
+ maskz_bits = _mm_movemask_epi8 (maskz);
+ if (maskz_bits == 0)
+ {
+ /* There is no NULL terminator. Don't use SSE4.2 if the length
+ of A > 16. */
+ if (a[16] != 0)
+ return STRCSPN_SSE2 (s, a);
}
- offset = (int) ((size_t) s & 15);
+ aligned = s;
+ offset = (unsigned int) ((size_t) s & 15);
if (offset != 0)
{
+ start_unaligned:
/* Check partial string. */
aligned = (const char *) ((size_t) s & -16L);
__m128i value = _mm_load_si128 ((__m128i *) aligned);
value = __m128i_shift_right (value, offset);
- int length = _mm_cmpistri (mask, value, 0x2);
+ unsigned int length = _mm_cmpistri (mask, value, 0x2);
/* No need to check ZFlag since ZFlag is always 1. */
- int cflag = _mm_cmpistrc (mask, value, 0x2);
+ unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
if (cflag)
RETURN ((char *) (s + length), length);
/* Find where the NULL terminator is. */
- int index = _mm_cmpistri (value, value, 0x3a);
+ unsigned int index = _mm_cmpistri (value, value, 0x3a);
if (index < 16 - offset)
RETURN (NULL, index);
aligned += 16;
}
- else
- aligned = s;
+start_loop:
while (1)
{
__m128i value = _mm_load_si128 ((__m128i *) aligned);
- int index = _mm_cmpistri (mask, value, 0x2);
- int cflag = _mm_cmpistrc (mask, value, 0x2);
- int zflag = _mm_cmpistrz (mask, value, 0x2);
+ unsigned int index = _mm_cmpistri (mask, value, 0x2);
+ unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
+ unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
if (cflag)
RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
if (zflag)