[v1,08/23] x86: Optimize strspn in strspn-c.c
Checks
Context |
Check |
Description |
dj/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
Commit Message
Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
_mm_cmpistri. Also change offset to unsigned to avoid unnecessary
sign extensions.
geometric_mean(N=20) of all benchmarks that don't fall back on
sse2; New / Original: .901
All string/memory tests pass.
---
Geometric Mean N=20 runs; All functions page aligned
len, align1, align2, pos, New Time / Old Time
1, 0, 0, 512, 0.768
1, 1, 0, 512, 0.666
1, 0, 1, 512, 1.193
1, 1, 1, 512, 0.872
2, 0, 0, 512, 0.698
2, 2, 0, 512, 0.687
2, 0, 2, 512, 1.393
2, 2, 2, 512, 0.944
3, 0, 0, 512, 0.691
3, 3, 0, 512, 0.676
3, 0, 3, 512, 1.388
3, 3, 3, 512, 0.948
4, 0, 0, 512, 0.74
4, 4, 0, 512, 0.678
4, 0, 4, 512, 1.421
4, 4, 4, 512, 0.943
5, 0, 0, 512, 0.691
5, 5, 0, 512, 0.675
5, 0, 5, 512, 1.348
5, 5, 5, 512, 0.952
6, 0, 0, 512, 0.685
6, 6, 0, 512, 0.67
6, 0, 6, 512, 1.333
6, 6, 6, 512, 0.95
7, 0, 0, 512, 0.688
7, 7, 0, 512, 0.675
7, 0, 7, 512, 1.344
7, 7, 7, 512, 0.919
8, 0, 0, 512, 0.716
8, 0, 8, 512, 0.935
9, 0, 0, 512, 0.716
9, 1, 0, 512, 0.712
9, 0, 9, 512, 0.956
9, 1, 9, 512, 0.992
10, 0, 0, 512, 0.699
10, 2, 0, 512, 0.68
10, 0, 10, 512, 0.952
10, 2, 10, 512, 0.932
11, 0, 0, 512, 0.705
11, 3, 0, 512, 0.685
11, 0, 11, 512, 0.956
11, 3, 11, 512, 0.927
12, 0, 0, 512, 0.695
12, 4, 0, 512, 0.675
12, 0, 12, 512, 0.948
12, 4, 12, 512, 0.928
13, 0, 0, 512, 0.7
13, 5, 0, 512, 0.678
13, 0, 13, 512, 0.944
13, 5, 13, 512, 0.931
14, 0, 0, 512, 0.703
14, 6, 0, 512, 0.678
14, 0, 14, 512, 0.949
14, 6, 14, 512, 0.93
15, 0, 0, 512, 0.694
15, 7, 0, 512, 0.678
15, 0, 15, 512, 0.953
15, 7, 15, 512, 0.924
16, 0, 0, 512, 1.021
16, 0, 16, 512, 1.067
17, 0, 0, 512, 0.991
17, 1, 0, 512, 0.984
17, 0, 17, 512, 0.979
17, 1, 17, 512, 0.993
18, 0, 0, 512, 0.992
18, 2, 0, 512, 1.008
18, 0, 18, 512, 1.016
18, 2, 18, 512, 0.993
19, 0, 0, 512, 0.984
19, 3, 0, 512, 0.985
19, 0, 19, 512, 1.007
19, 3, 19, 512, 1.006
20, 0, 0, 512, 0.969
20, 4, 0, 512, 0.968
20, 0, 20, 512, 0.975
20, 4, 20, 512, 0.975
21, 0, 0, 512, 0.992
21, 5, 0, 512, 0.992
21, 0, 21, 512, 0.98
21, 5, 21, 512, 0.97
22, 0, 0, 512, 0.989
22, 6, 0, 512, 0.987
22, 0, 22, 512, 0.99
22, 6, 22, 512, 0.985
23, 0, 0, 512, 0.989
23, 7, 0, 512, 0.98
23, 0, 23, 512, 1.0
23, 7, 23, 512, 0.993
24, 0, 0, 512, 0.99
24, 0, 24, 512, 0.998
25, 0, 0, 512, 1.01
25, 1, 0, 512, 1.0
25, 0, 25, 512, 0.97
25, 1, 25, 512, 0.967
26, 0, 0, 512, 1.009
26, 2, 0, 512, 0.986
26, 0, 26, 512, 0.997
26, 2, 26, 512, 0.993
27, 0, 0, 512, 0.984
27, 3, 0, 512, 0.997
27, 0, 27, 512, 0.989
27, 3, 27, 512, 0.976
28, 0, 0, 512, 0.991
28, 4, 0, 512, 1.003
28, 0, 28, 512, 0.986
28, 4, 28, 512, 0.989
29, 0, 0, 512, 0.986
29, 5, 0, 512, 0.985
29, 0, 29, 512, 0.984
29, 5, 29, 512, 0.977
30, 0, 0, 512, 0.991
30, 6, 0, 512, 0.987
30, 0, 30, 512, 0.979
30, 6, 30, 512, 0.974
31, 0, 0, 512, 0.995
31, 7, 0, 512, 0.995
31, 0, 31, 512, 0.994
31, 7, 31, 512, 0.984
4, 0, 0, 32, 0.861
4, 1, 0, 32, 0.864
4, 0, 1, 32, 0.962
4, 1, 1, 32, 0.967
4, 0, 0, 64, 0.884
4, 2, 0, 64, 0.818
4, 0, 2, 64, 0.889
4, 2, 2, 64, 0.918
4, 0, 0, 128, 0.942
4, 3, 0, 128, 0.884
4, 0, 3, 128, 0.931
4, 3, 3, 128, 0.883
4, 0, 0, 256, 0.964
4, 4, 0, 256, 0.922
4, 0, 4, 256, 0.956
4, 4, 4, 256, 0.93
4, 5, 0, 512, 0.833
4, 0, 5, 512, 1.027
4, 5, 5, 512, 0.929
4, 0, 0, 1024, 0.998
4, 6, 0, 1024, 0.986
4, 0, 6, 1024, 0.984
4, 6, 6, 1024, 0.977
4, 0, 0, 2048, 0.991
4, 7, 0, 2048, 0.987
4, 0, 7, 2048, 0.996
4, 7, 7, 2048, 0.98
10, 1, 0, 64, 0.826
10, 1, 1, 64, 0.907
10, 2, 0, 64, 0.829
10, 2, 2, 64, 0.91
10, 3, 0, 64, 0.83
10, 3, 3, 64, 0.915
10, 4, 0, 64, 0.83
10, 4, 4, 64, 0.911
10, 5, 0, 64, 0.828
10, 5, 5, 64, 0.905
10, 6, 0, 64, 0.828
10, 6, 6, 64, 0.812
10, 7, 0, 64, 0.83
10, 7, 7, 64, 0.819
6, 0, 0, 0, 1.261
6, 0, 0, 1, 1.252
6, 0, 1, 1, 0.845
6, 0, 0, 2, 1.27
6, 0, 2, 2, 0.85
6, 0, 0, 3, 1.269
6, 0, 3, 3, 0.845
6, 0, 0, 4, 1.287
6, 0, 4, 4, 0.852
6, 0, 0, 5, 1.278
6, 0, 5, 5, 0.851
6, 0, 0, 6, 1.269
6, 0, 6, 6, 0.841
6, 0, 0, 7, 1.268
6, 0, 7, 7, 0.851
6, 0, 0, 8, 1.291
6, 0, 8, 8, 0.837
6, 0, 0, 9, 1.283
6, 0, 9, 9, 0.831
6, 0, 0, 10, 1.252
6, 0, 10, 10, 0.997
6, 0, 0, 11, 1.295
6, 0, 11, 11, 1.046
6, 0, 0, 12, 1.296
6, 0, 12, 12, 1.038
6, 0, 0, 13, 1.287
6, 0, 13, 13, 1.082
6, 0, 0, 14, 1.284
6, 0, 14, 14, 1.001
6, 0, 0, 15, 1.286
6, 0, 15, 15, 1.002
6, 0, 0, 16, 0.894
6, 0, 16, 16, 0.874
6, 0, 0, 17, 0.892
6, 0, 17, 17, 0.974
6, 0, 0, 18, 0.907
6, 0, 18, 18, 0.993
6, 0, 0, 19, 0.909
6, 0, 19, 19, 0.99
6, 0, 0, 20, 0.894
6, 0, 20, 20, 0.978
6, 0, 0, 21, 0.89
6, 0, 21, 21, 0.958
6, 0, 0, 22, 0.893
6, 0, 22, 22, 0.99
6, 0, 0, 23, 0.899
6, 0, 23, 23, 0.986
6, 0, 0, 24, 0.893
6, 0, 24, 24, 0.989
6, 0, 0, 25, 0.889
6, 0, 25, 25, 0.982
6, 0, 0, 26, 0.889
6, 0, 26, 26, 0.852
6, 0, 0, 27, 0.89
6, 0, 27, 27, 0.832
6, 0, 0, 28, 0.89
6, 0, 28, 28, 0.831
6, 0, 0, 29, 0.89
6, 0, 29, 29, 0.838
6, 0, 0, 30, 0.907
6, 0, 30, 30, 0.833
6, 0, 0, 31, 0.888
6, 0, 31, 31, 0.837
6, 0, 0, 32, 0.853
6, 0, 32, 32, 0.828
6, 0, 0, 33, 0.857
6, 0, 33, 33, 0.947
6, 0, 0, 34, 0.847
6, 0, 34, 34, 0.954
6, 0, 0, 35, 0.841
6, 0, 35, 35, 0.94
6, 0, 0, 36, 0.854
6, 0, 36, 36, 0.958
6, 0, 0, 37, 0.856
6, 0, 37, 37, 0.957
6, 0, 0, 38, 0.839
6, 0, 38, 38, 0.962
6, 0, 0, 39, 0.866
6, 0, 39, 39, 0.945
6, 0, 0, 40, 0.845
6, 0, 40, 40, 0.961
6, 0, 0, 41, 0.858
6, 0, 41, 41, 0.961
6, 0, 0, 42, 0.862
6, 0, 42, 42, 0.825
6, 0, 0, 43, 0.864
6, 0, 43, 43, 0.82
6, 0, 0, 44, 0.843
6, 0, 44, 44, 0.81
6, 0, 0, 45, 0.859
6, 0, 45, 45, 0.816
6, 0, 0, 46, 0.866
6, 0, 46, 46, 0.81
6, 0, 0, 47, 0.858
6, 0, 47, 47, 0.807
6, 0, 0, 48, 0.87
6, 0, 48, 48, 0.87
6, 0, 0, 49, 0.871
6, 0, 49, 49, 0.874
6, 0, 0, 50, 0.87
6, 0, 50, 50, 0.881
6, 0, 0, 51, 0.868
6, 0, 51, 51, 0.875
6, 0, 0, 52, 0.873
6, 0, 52, 52, 0.871
6, 0, 0, 53, 0.866
6, 0, 53, 53, 0.882
6, 0, 0, 54, 0.863
6, 0, 54, 54, 0.876
6, 0, 0, 55, 0.851
6, 0, 55, 55, 0.871
6, 0, 0, 56, 0.867
6, 0, 56, 56, 0.888
6, 0, 0, 57, 0.862
6, 0, 57, 57, 0.899
6, 0, 0, 58, 0.873
6, 0, 58, 58, 0.798
6, 0, 0, 59, 0.881
6, 0, 59, 59, 0.785
6, 0, 0, 60, 0.867
6, 0, 60, 60, 0.797
6, 0, 0, 61, 0.872
6, 0, 61, 61, 0.791
6, 0, 0, 62, 0.859
6, 0, 62, 62, 0.79
6, 0, 0, 63, 0.87
6, 0, 63, 63, 0.796
sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++----------------
1 file changed, 39 insertions(+), 47 deletions(-)
Comments
On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
> _mm_cmpistri. Also change offset to unsigned to avoid unnecessary
> sign extensions.
>
> geometric_mean(N=20) of all benchmarks that don't fall back on
> sse2; New / Original: .901
>
> All string/memory tests pass.
> ---
> Geometric Mean N=20 runs; All functions page aligned
> len, align1, align2, pos, New Time / Old Time
> 1, 0, 0, 512, 0.768
> 1, 1, 0, 512, 0.666
> 1, 0, 1, 512, 1.193
> 1, 1, 1, 512, 0.872
> 2, 0, 0, 512, 0.698
> 2, 2, 0, 512, 0.687
> 2, 0, 2, 512, 1.393
> 2, 2, 2, 512, 0.944
> 3, 0, 0, 512, 0.691
> 3, 3, 0, 512, 0.676
> 3, 0, 3, 512, 1.388
> 3, 3, 3, 512, 0.948
> 4, 0, 0, 512, 0.74
> 4, 4, 0, 512, 0.678
> 4, 0, 4, 512, 1.421
> 4, 4, 4, 512, 0.943
> 5, 0, 0, 512, 0.691
> 5, 5, 0, 512, 0.675
> 5, 0, 5, 512, 1.348
> 5, 5, 5, 512, 0.952
> 6, 0, 0, 512, 0.685
> 6, 6, 0, 512, 0.67
> 6, 0, 6, 512, 1.333
> 6, 6, 6, 512, 0.95
> 7, 0, 0, 512, 0.688
> 7, 7, 0, 512, 0.675
> 7, 0, 7, 512, 1.344
> 7, 7, 7, 512, 0.919
> 8, 0, 0, 512, 0.716
> 8, 0, 8, 512, 0.935
> 9, 0, 0, 512, 0.716
> 9, 1, 0, 512, 0.712
> 9, 0, 9, 512, 0.956
> 9, 1, 9, 512, 0.992
> 10, 0, 0, 512, 0.699
> 10, 2, 0, 512, 0.68
> 10, 0, 10, 512, 0.952
> 10, 2, 10, 512, 0.932
> 11, 0, 0, 512, 0.705
> 11, 3, 0, 512, 0.685
> 11, 0, 11, 512, 0.956
> 11, 3, 11, 512, 0.927
> 12, 0, 0, 512, 0.695
> 12, 4, 0, 512, 0.675
> 12, 0, 12, 512, 0.948
> 12, 4, 12, 512, 0.928
> 13, 0, 0, 512, 0.7
> 13, 5, 0, 512, 0.678
> 13, 0, 13, 512, 0.944
> 13, 5, 13, 512, 0.931
> 14, 0, 0, 512, 0.703
> 14, 6, 0, 512, 0.678
> 14, 0, 14, 512, 0.949
> 14, 6, 14, 512, 0.93
> 15, 0, 0, 512, 0.694
> 15, 7, 0, 512, 0.678
> 15, 0, 15, 512, 0.953
> 15, 7, 15, 512, 0.924
> 16, 0, 0, 512, 1.021
> 16, 0, 16, 512, 1.067
> 17, 0, 0, 512, 0.991
> 17, 1, 0, 512, 0.984
> 17, 0, 17, 512, 0.979
> 17, 1, 17, 512, 0.993
> 18, 0, 0, 512, 0.992
> 18, 2, 0, 512, 1.008
> 18, 0, 18, 512, 1.016
> 18, 2, 18, 512, 0.993
> 19, 0, 0, 512, 0.984
> 19, 3, 0, 512, 0.985
> 19, 0, 19, 512, 1.007
> 19, 3, 19, 512, 1.006
> 20, 0, 0, 512, 0.969
> 20, 4, 0, 512, 0.968
> 20, 0, 20, 512, 0.975
> 20, 4, 20, 512, 0.975
> 21, 0, 0, 512, 0.992
> 21, 5, 0, 512, 0.992
> 21, 0, 21, 512, 0.98
> 21, 5, 21, 512, 0.97
> 22, 0, 0, 512, 0.989
> 22, 6, 0, 512, 0.987
> 22, 0, 22, 512, 0.99
> 22, 6, 22, 512, 0.985
> 23, 0, 0, 512, 0.989
> 23, 7, 0, 512, 0.98
> 23, 0, 23, 512, 1.0
> 23, 7, 23, 512, 0.993
> 24, 0, 0, 512, 0.99
> 24, 0, 24, 512, 0.998
> 25, 0, 0, 512, 1.01
> 25, 1, 0, 512, 1.0
> 25, 0, 25, 512, 0.97
> 25, 1, 25, 512, 0.967
> 26, 0, 0, 512, 1.009
> 26, 2, 0, 512, 0.986
> 26, 0, 26, 512, 0.997
> 26, 2, 26, 512, 0.993
> 27, 0, 0, 512, 0.984
> 27, 3, 0, 512, 0.997
> 27, 0, 27, 512, 0.989
> 27, 3, 27, 512, 0.976
> 28, 0, 0, 512, 0.991
> 28, 4, 0, 512, 1.003
> 28, 0, 28, 512, 0.986
> 28, 4, 28, 512, 0.989
> 29, 0, 0, 512, 0.986
> 29, 5, 0, 512, 0.985
> 29, 0, 29, 512, 0.984
> 29, 5, 29, 512, 0.977
> 30, 0, 0, 512, 0.991
> 30, 6, 0, 512, 0.987
> 30, 0, 30, 512, 0.979
> 30, 6, 30, 512, 0.974
> 31, 0, 0, 512, 0.995
> 31, 7, 0, 512, 0.995
> 31, 0, 31, 512, 0.994
> 31, 7, 31, 512, 0.984
> 4, 0, 0, 32, 0.861
> 4, 1, 0, 32, 0.864
> 4, 0, 1, 32, 0.962
> 4, 1, 1, 32, 0.967
> 4, 0, 0, 64, 0.884
> 4, 2, 0, 64, 0.818
> 4, 0, 2, 64, 0.889
> 4, 2, 2, 64, 0.918
> 4, 0, 0, 128, 0.942
> 4, 3, 0, 128, 0.884
> 4, 0, 3, 128, 0.931
> 4, 3, 3, 128, 0.883
> 4, 0, 0, 256, 0.964
> 4, 4, 0, 256, 0.922
> 4, 0, 4, 256, 0.956
> 4, 4, 4, 256, 0.93
> 4, 5, 0, 512, 0.833
> 4, 0, 5, 512, 1.027
> 4, 5, 5, 512, 0.929
> 4, 0, 0, 1024, 0.998
> 4, 6, 0, 1024, 0.986
> 4, 0, 6, 1024, 0.984
> 4, 6, 6, 1024, 0.977
> 4, 0, 0, 2048, 0.991
> 4, 7, 0, 2048, 0.987
> 4, 0, 7, 2048, 0.996
> 4, 7, 7, 2048, 0.98
> 10, 1, 0, 64, 0.826
> 10, 1, 1, 64, 0.907
> 10, 2, 0, 64, 0.829
> 10, 2, 2, 64, 0.91
> 10, 3, 0, 64, 0.83
> 10, 3, 3, 64, 0.915
> 10, 4, 0, 64, 0.83
> 10, 4, 4, 64, 0.911
> 10, 5, 0, 64, 0.828
> 10, 5, 5, 64, 0.905
> 10, 6, 0, 64, 0.828
> 10, 6, 6, 64, 0.812
> 10, 7, 0, 64, 0.83
> 10, 7, 7, 64, 0.819
> 6, 0, 0, 0, 1.261
> 6, 0, 0, 1, 1.252
> 6, 0, 1, 1, 0.845
> 6, 0, 0, 2, 1.27
> 6, 0, 2, 2, 0.85
> 6, 0, 0, 3, 1.269
> 6, 0, 3, 3, 0.845
> 6, 0, 0, 4, 1.287
> 6, 0, 4, 4, 0.852
> 6, 0, 0, 5, 1.278
> 6, 0, 5, 5, 0.851
> 6, 0, 0, 6, 1.269
> 6, 0, 6, 6, 0.841
> 6, 0, 0, 7, 1.268
> 6, 0, 7, 7, 0.851
> 6, 0, 0, 8, 1.291
> 6, 0, 8, 8, 0.837
> 6, 0, 0, 9, 1.283
> 6, 0, 9, 9, 0.831
> 6, 0, 0, 10, 1.252
> 6, 0, 10, 10, 0.997
> 6, 0, 0, 11, 1.295
> 6, 0, 11, 11, 1.046
> 6, 0, 0, 12, 1.296
> 6, 0, 12, 12, 1.038
> 6, 0, 0, 13, 1.287
> 6, 0, 13, 13, 1.082
> 6, 0, 0, 14, 1.284
> 6, 0, 14, 14, 1.001
> 6, 0, 0, 15, 1.286
> 6, 0, 15, 15, 1.002
> 6, 0, 0, 16, 0.894
> 6, 0, 16, 16, 0.874
> 6, 0, 0, 17, 0.892
> 6, 0, 17, 17, 0.974
> 6, 0, 0, 18, 0.907
> 6, 0, 18, 18, 0.993
> 6, 0, 0, 19, 0.909
> 6, 0, 19, 19, 0.99
> 6, 0, 0, 20, 0.894
> 6, 0, 20, 20, 0.978
> 6, 0, 0, 21, 0.89
> 6, 0, 21, 21, 0.958
> 6, 0, 0, 22, 0.893
> 6, 0, 22, 22, 0.99
> 6, 0, 0, 23, 0.899
> 6, 0, 23, 23, 0.986
> 6, 0, 0, 24, 0.893
> 6, 0, 24, 24, 0.989
> 6, 0, 0, 25, 0.889
> 6, 0, 25, 25, 0.982
> 6, 0, 0, 26, 0.889
> 6, 0, 26, 26, 0.852
> 6, 0, 0, 27, 0.89
> 6, 0, 27, 27, 0.832
> 6, 0, 0, 28, 0.89
> 6, 0, 28, 28, 0.831
> 6, 0, 0, 29, 0.89
> 6, 0, 29, 29, 0.838
> 6, 0, 0, 30, 0.907
> 6, 0, 30, 30, 0.833
> 6, 0, 0, 31, 0.888
> 6, 0, 31, 31, 0.837
> 6, 0, 0, 32, 0.853
> 6, 0, 32, 32, 0.828
> 6, 0, 0, 33, 0.857
> 6, 0, 33, 33, 0.947
> 6, 0, 0, 34, 0.847
> 6, 0, 34, 34, 0.954
> 6, 0, 0, 35, 0.841
> 6, 0, 35, 35, 0.94
> 6, 0, 0, 36, 0.854
> 6, 0, 36, 36, 0.958
> 6, 0, 0, 37, 0.856
> 6, 0, 37, 37, 0.957
> 6, 0, 0, 38, 0.839
> 6, 0, 38, 38, 0.962
> 6, 0, 0, 39, 0.866
> 6, 0, 39, 39, 0.945
> 6, 0, 0, 40, 0.845
> 6, 0, 40, 40, 0.961
> 6, 0, 0, 41, 0.858
> 6, 0, 41, 41, 0.961
> 6, 0, 0, 42, 0.862
> 6, 0, 42, 42, 0.825
> 6, 0, 0, 43, 0.864
> 6, 0, 43, 43, 0.82
> 6, 0, 0, 44, 0.843
> 6, 0, 44, 44, 0.81
> 6, 0, 0, 45, 0.859
> 6, 0, 45, 45, 0.816
> 6, 0, 0, 46, 0.866
> 6, 0, 46, 46, 0.81
> 6, 0, 0, 47, 0.858
> 6, 0, 47, 47, 0.807
> 6, 0, 0, 48, 0.87
> 6, 0, 48, 48, 0.87
> 6, 0, 0, 49, 0.871
> 6, 0, 49, 49, 0.874
> 6, 0, 0, 50, 0.87
> 6, 0, 50, 50, 0.881
> 6, 0, 0, 51, 0.868
> 6, 0, 51, 51, 0.875
> 6, 0, 0, 52, 0.873
> 6, 0, 52, 52, 0.871
> 6, 0, 0, 53, 0.866
> 6, 0, 53, 53, 0.882
> 6, 0, 0, 54, 0.863
> 6, 0, 54, 54, 0.876
> 6, 0, 0, 55, 0.851
> 6, 0, 55, 55, 0.871
> 6, 0, 0, 56, 0.867
> 6, 0, 56, 56, 0.888
> 6, 0, 0, 57, 0.862
> 6, 0, 57, 57, 0.899
> 6, 0, 0, 58, 0.873
> 6, 0, 58, 58, 0.798
> 6, 0, 0, 59, 0.881
> 6, 0, 59, 59, 0.785
> 6, 0, 0, 60, 0.867
> 6, 0, 60, 60, 0.797
> 6, 0, 0, 61, 0.872
> 6, 0, 61, 61, 0.791
> 6, 0, 0, 62, 0.859
> 6, 0, 62, 62, 0.79
> 6, 0, 0, 63, 0.87
> 6, 0, 63, 63, 0.796
>
> sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++----------------
> 1 file changed, 39 insertions(+), 47 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
> index 8fb3aba64d..6124033ceb 100644
> --- a/sysdeps/x86_64/multiarch/strspn-c.c
> +++ b/sysdeps/x86_64/multiarch/strspn-c.c
> @@ -62,81 +62,73 @@ __strspn_sse42 (const char *s, const char *a)
> return 0;
>
> const char *aligned;
> - __m128i mask;
> - int offset = (int) ((size_t) a & 15);
> + __m128i mask, maskz, zero;
> + unsigned int maskz_bits;
> + unsigned int offset = (int) ((size_t) a & 15);
> + zero = _mm_set1_epi8 (0);
> if (offset != 0)
> {
> /* Load masks. */
> aligned = (const char *) ((size_t) a & -16L);
> __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> -
> - mask = __m128i_shift_right (mask0, offset);
> + maskz = _mm_cmpeq_epi8 (mask0, zero);
>
> /* Find where the NULL terminator is. */
> - int length = _mm_cmpistri (mask, mask, 0x3a);
> - if (length == 16 - offset)
> - {
> - /* There is no NULL terminator. */
> - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
> - int index = _mm_cmpistri (mask1, mask1, 0x3a);
> - length += index;
> -
> - /* Don't use SSE4.2 if the length of A > 16. */
> - if (length > 16)
> - return __strspn_sse2 (s, a);
> -
> - if (index != 0)
> - {
> - /* Combine mask0 and mask1. We could play games with
> - palignr, but frankly this data should be in L1 now
> - so do the merge via an unaligned load. */
> - mask = _mm_loadu_si128 ((__m128i *) a);
> - }
> - }
> + maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> + if (maskz_bits != 0)
> + {
> + mask = __m128i_shift_right (mask0, offset);
> + offset = (unsigned int) ((size_t) s & 15);
> + if (offset)
> + goto start_unaligned;
> +
> + aligned = s;
> + goto start_loop;
> + }
> }
> - else
> - {
> - /* A is aligned. */
> - mask = _mm_load_si128 ((__m128i *) a);
>
> - /* Find where the NULL terminator is. */
> - int length = _mm_cmpistri (mask, mask, 0x3a);
> - if (length == 16)
> - {
> - /* There is no NULL terminator. Don't use SSE4.2 if the length
> - of A > 16. */
> - if (a[16] != 0)
> - return __strspn_sse2 (s, a);
> - }
> + /* A is aligned. */
> + mask = _mm_loadu_si128 ((__m128i *) a);
> +
> + /* Find where the NULL terminator is. */
> + maskz = _mm_cmpeq_epi8 (mask, zero);
> + maskz_bits = _mm_movemask_epi8 (maskz);
> + if (maskz_bits == 0)
> + {
> + /* There is no NULL terminator. Don't use SSE4.2 if the length
> + of A > 16. */
> + if (a[16] != 0)
> + return __strspn_sse2 (s, a);
> }
> + aligned = s;
> + offset = (unsigned int) ((size_t) s & 15);
>
> - offset = (int) ((size_t) s & 15);
> if (offset != 0)
> {
> + start_unaligned:
> /* Check partial string. */
> aligned = (const char *) ((size_t) s & -16L);
> __m128i value = _mm_load_si128 ((__m128i *) aligned);
> + __m128i adj_value = __m128i_shift_right (value, offset);
>
> - value = __m128i_shift_right (value, offset);
> -
> - int length = _mm_cmpistri (mask, value, 0x12);
> + unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
> /* No need to check CFlag since it is always 1. */
> if (length < 16 - offset)
> return length;
> /* Find where the NULL terminator is. */
> - int index = _mm_cmpistri (value, value, 0x3a);
> - if (index < 16 - offset)
> + maskz = _mm_cmpeq_epi8 (value, zero);
> + maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> + if (maskz_bits != 0)
> return length;
> aligned += 16;
> }
> - else
> - aligned = s;
>
> +start_loop:
> while (1)
> {
> __m128i value = _mm_load_si128 ((__m128i *) aligned);
> - int index = _mm_cmpistri (mask, value, 0x12);
> - int cflag = _mm_cmpistrc (mask, value, 0x12);
> + unsigned int index = _mm_cmpistri (mask, value, 0x12);
> + unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
> if (cflag)
> return (size_t) (aligned + index - s);
> aligned += 16;
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
On Thu, Mar 24, 2022 at 11:58 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
> > _mm_cmpistri. Also change offset to unsigned to avoid unnecessary
> > sign extensions.
> >
> > geometric_mean(N=20) of all benchmarks that don't fall back on
> > sse2; New / Original: .901
> >
> > All string/memory tests pass.
> > ---
> > Geometric Mean N=20 runs; All functions page aligned
> > len, align1, align2, pos, New Time / Old Time
> > 1, 0, 0, 512, 0.768
> > 1, 1, 0, 512, 0.666
> > 1, 0, 1, 512, 1.193
> > 1, 1, 1, 512, 0.872
> > 2, 0, 0, 512, 0.698
> > 2, 2, 0, 512, 0.687
> > 2, 0, 2, 512, 1.393
> > 2, 2, 2, 512, 0.944
> > 3, 0, 0, 512, 0.691
> > 3, 3, 0, 512, 0.676
> > 3, 0, 3, 512, 1.388
> > 3, 3, 3, 512, 0.948
> > 4, 0, 0, 512, 0.74
> > 4, 4, 0, 512, 0.678
> > 4, 0, 4, 512, 1.421
> > 4, 4, 4, 512, 0.943
> > 5, 0, 0, 512, 0.691
> > 5, 5, 0, 512, 0.675
> > 5, 0, 5, 512, 1.348
> > 5, 5, 5, 512, 0.952
> > 6, 0, 0, 512, 0.685
> > 6, 6, 0, 512, 0.67
> > 6, 0, 6, 512, 1.333
> > 6, 6, 6, 512, 0.95
> > 7, 0, 0, 512, 0.688
> > 7, 7, 0, 512, 0.675
> > 7, 0, 7, 512, 1.344
> > 7, 7, 7, 512, 0.919
> > 8, 0, 0, 512, 0.716
> > 8, 0, 8, 512, 0.935
> > 9, 0, 0, 512, 0.716
> > 9, 1, 0, 512, 0.712
> > 9, 0, 9, 512, 0.956
> > 9, 1, 9, 512, 0.992
> > 10, 0, 0, 512, 0.699
> > 10, 2, 0, 512, 0.68
> > 10, 0, 10, 512, 0.952
> > 10, 2, 10, 512, 0.932
> > 11, 0, 0, 512, 0.705
> > 11, 3, 0, 512, 0.685
> > 11, 0, 11, 512, 0.956
> > 11, 3, 11, 512, 0.927
> > 12, 0, 0, 512, 0.695
> > 12, 4, 0, 512, 0.675
> > 12, 0, 12, 512, 0.948
> > 12, 4, 12, 512, 0.928
> > 13, 0, 0, 512, 0.7
> > 13, 5, 0, 512, 0.678
> > 13, 0, 13, 512, 0.944
> > 13, 5, 13, 512, 0.931
> > 14, 0, 0, 512, 0.703
> > 14, 6, 0, 512, 0.678
> > 14, 0, 14, 512, 0.949
> > 14, 6, 14, 512, 0.93
> > 15, 0, 0, 512, 0.694
> > 15, 7, 0, 512, 0.678
> > 15, 0, 15, 512, 0.953
> > 15, 7, 15, 512, 0.924
> > 16, 0, 0, 512, 1.021
> > 16, 0, 16, 512, 1.067
> > 17, 0, 0, 512, 0.991
> > 17, 1, 0, 512, 0.984
> > 17, 0, 17, 512, 0.979
> > 17, 1, 17, 512, 0.993
> > 18, 0, 0, 512, 0.992
> > 18, 2, 0, 512, 1.008
> > 18, 0, 18, 512, 1.016
> > 18, 2, 18, 512, 0.993
> > 19, 0, 0, 512, 0.984
> > 19, 3, 0, 512, 0.985
> > 19, 0, 19, 512, 1.007
> > 19, 3, 19, 512, 1.006
> > 20, 0, 0, 512, 0.969
> > 20, 4, 0, 512, 0.968
> > 20, 0, 20, 512, 0.975
> > 20, 4, 20, 512, 0.975
> > 21, 0, 0, 512, 0.992
> > 21, 5, 0, 512, 0.992
> > 21, 0, 21, 512, 0.98
> > 21, 5, 21, 512, 0.97
> > 22, 0, 0, 512, 0.989
> > 22, 6, 0, 512, 0.987
> > 22, 0, 22, 512, 0.99
> > 22, 6, 22, 512, 0.985
> > 23, 0, 0, 512, 0.989
> > 23, 7, 0, 512, 0.98
> > 23, 0, 23, 512, 1.0
> > 23, 7, 23, 512, 0.993
> > 24, 0, 0, 512, 0.99
> > 24, 0, 24, 512, 0.998
> > 25, 0, 0, 512, 1.01
> > 25, 1, 0, 512, 1.0
> > 25, 0, 25, 512, 0.97
> > 25, 1, 25, 512, 0.967
> > 26, 0, 0, 512, 1.009
> > 26, 2, 0, 512, 0.986
> > 26, 0, 26, 512, 0.997
> > 26, 2, 26, 512, 0.993
> > 27, 0, 0, 512, 0.984
> > 27, 3, 0, 512, 0.997
> > 27, 0, 27, 512, 0.989
> > 27, 3, 27, 512, 0.976
> > 28, 0, 0, 512, 0.991
> > 28, 4, 0, 512, 1.003
> > 28, 0, 28, 512, 0.986
> > 28, 4, 28, 512, 0.989
> > 29, 0, 0, 512, 0.986
> > 29, 5, 0, 512, 0.985
> > 29, 0, 29, 512, 0.984
> > 29, 5, 29, 512, 0.977
> > 30, 0, 0, 512, 0.991
> > 30, 6, 0, 512, 0.987
> > 30, 0, 30, 512, 0.979
> > 30, 6, 30, 512, 0.974
> > 31, 0, 0, 512, 0.995
> > 31, 7, 0, 512, 0.995
> > 31, 0, 31, 512, 0.994
> > 31, 7, 31, 512, 0.984
> > 4, 0, 0, 32, 0.861
> > 4, 1, 0, 32, 0.864
> > 4, 0, 1, 32, 0.962
> > 4, 1, 1, 32, 0.967
> > 4, 0, 0, 64, 0.884
> > 4, 2, 0, 64, 0.818
> > 4, 0, 2, 64, 0.889
> > 4, 2, 2, 64, 0.918
> > 4, 0, 0, 128, 0.942
> > 4, 3, 0, 128, 0.884
> > 4, 0, 3, 128, 0.931
> > 4, 3, 3, 128, 0.883
> > 4, 0, 0, 256, 0.964
> > 4, 4, 0, 256, 0.922
> > 4, 0, 4, 256, 0.956
> > 4, 4, 4, 256, 0.93
> > 4, 5, 0, 512, 0.833
> > 4, 0, 5, 512, 1.027
> > 4, 5, 5, 512, 0.929
> > 4, 0, 0, 1024, 0.998
> > 4, 6, 0, 1024, 0.986
> > 4, 0, 6, 1024, 0.984
> > 4, 6, 6, 1024, 0.977
> > 4, 0, 0, 2048, 0.991
> > 4, 7, 0, 2048, 0.987
> > 4, 0, 7, 2048, 0.996
> > 4, 7, 7, 2048, 0.98
> > 10, 1, 0, 64, 0.826
> > 10, 1, 1, 64, 0.907
> > 10, 2, 0, 64, 0.829
> > 10, 2, 2, 64, 0.91
> > 10, 3, 0, 64, 0.83
> > 10, 3, 3, 64, 0.915
> > 10, 4, 0, 64, 0.83
> > 10, 4, 4, 64, 0.911
> > 10, 5, 0, 64, 0.828
> > 10, 5, 5, 64, 0.905
> > 10, 6, 0, 64, 0.828
> > 10, 6, 6, 64, 0.812
> > 10, 7, 0, 64, 0.83
> > 10, 7, 7, 64, 0.819
> > 6, 0, 0, 0, 1.261
> > 6, 0, 0, 1, 1.252
> > 6, 0, 1, 1, 0.845
> > 6, 0, 0, 2, 1.27
> > 6, 0, 2, 2, 0.85
> > 6, 0, 0, 3, 1.269
> > 6, 0, 3, 3, 0.845
> > 6, 0, 0, 4, 1.287
> > 6, 0, 4, 4, 0.852
> > 6, 0, 0, 5, 1.278
> > 6, 0, 5, 5, 0.851
> > 6, 0, 0, 6, 1.269
> > 6, 0, 6, 6, 0.841
> > 6, 0, 0, 7, 1.268
> > 6, 0, 7, 7, 0.851
> > 6, 0, 0, 8, 1.291
> > 6, 0, 8, 8, 0.837
> > 6, 0, 0, 9, 1.283
> > 6, 0, 9, 9, 0.831
> > 6, 0, 0, 10, 1.252
> > 6, 0, 10, 10, 0.997
> > 6, 0, 0, 11, 1.295
> > 6, 0, 11, 11, 1.046
> > 6, 0, 0, 12, 1.296
> > 6, 0, 12, 12, 1.038
> > 6, 0, 0, 13, 1.287
> > 6, 0, 13, 13, 1.082
> > 6, 0, 0, 14, 1.284
> > 6, 0, 14, 14, 1.001
> > 6, 0, 0, 15, 1.286
> > 6, 0, 15, 15, 1.002
> > 6, 0, 0, 16, 0.894
> > 6, 0, 16, 16, 0.874
> > 6, 0, 0, 17, 0.892
> > 6, 0, 17, 17, 0.974
> > 6, 0, 0, 18, 0.907
> > 6, 0, 18, 18, 0.993
> > 6, 0, 0, 19, 0.909
> > 6, 0, 19, 19, 0.99
> > 6, 0, 0, 20, 0.894
> > 6, 0, 20, 20, 0.978
> > 6, 0, 0, 21, 0.89
> > 6, 0, 21, 21, 0.958
> > 6, 0, 0, 22, 0.893
> > 6, 0, 22, 22, 0.99
> > 6, 0, 0, 23, 0.899
> > 6, 0, 23, 23, 0.986
> > 6, 0, 0, 24, 0.893
> > 6, 0, 24, 24, 0.989
> > 6, 0, 0, 25, 0.889
> > 6, 0, 25, 25, 0.982
> > 6, 0, 0, 26, 0.889
> > 6, 0, 26, 26, 0.852
> > 6, 0, 0, 27, 0.89
> > 6, 0, 27, 27, 0.832
> > 6, 0, 0, 28, 0.89
> > 6, 0, 28, 28, 0.831
> > 6, 0, 0, 29, 0.89
> > 6, 0, 29, 29, 0.838
> > 6, 0, 0, 30, 0.907
> > 6, 0, 30, 30, 0.833
> > 6, 0, 0, 31, 0.888
> > 6, 0, 31, 31, 0.837
> > 6, 0, 0, 32, 0.853
> > 6, 0, 32, 32, 0.828
> > 6, 0, 0, 33, 0.857
> > 6, 0, 33, 33, 0.947
> > 6, 0, 0, 34, 0.847
> > 6, 0, 34, 34, 0.954
> > 6, 0, 0, 35, 0.841
> > 6, 0, 35, 35, 0.94
> > 6, 0, 0, 36, 0.854
> > 6, 0, 36, 36, 0.958
> > 6, 0, 0, 37, 0.856
> > 6, 0, 37, 37, 0.957
> > 6, 0, 0, 38, 0.839
> > 6, 0, 38, 38, 0.962
> > 6, 0, 0, 39, 0.866
> > 6, 0, 39, 39, 0.945
> > 6, 0, 0, 40, 0.845
> > 6, 0, 40, 40, 0.961
> > 6, 0, 0, 41, 0.858
> > 6, 0, 41, 41, 0.961
> > 6, 0, 0, 42, 0.862
> > 6, 0, 42, 42, 0.825
> > 6, 0, 0, 43, 0.864
> > 6, 0, 43, 43, 0.82
> > 6, 0, 0, 44, 0.843
> > 6, 0, 44, 44, 0.81
> > 6, 0, 0, 45, 0.859
> > 6, 0, 45, 45, 0.816
> > 6, 0, 0, 46, 0.866
> > 6, 0, 46, 46, 0.81
> > 6, 0, 0, 47, 0.858
> > 6, 0, 47, 47, 0.807
> > 6, 0, 0, 48, 0.87
> > 6, 0, 48, 48, 0.87
> > 6, 0, 0, 49, 0.871
> > 6, 0, 49, 49, 0.874
> > 6, 0, 0, 50, 0.87
> > 6, 0, 50, 50, 0.881
> > 6, 0, 0, 51, 0.868
> > 6, 0, 51, 51, 0.875
> > 6, 0, 0, 52, 0.873
> > 6, 0, 52, 52, 0.871
> > 6, 0, 0, 53, 0.866
> > 6, 0, 53, 53, 0.882
> > 6, 0, 0, 54, 0.863
> > 6, 0, 54, 54, 0.876
> > 6, 0, 0, 55, 0.851
> > 6, 0, 55, 55, 0.871
> > 6, 0, 0, 56, 0.867
> > 6, 0, 56, 56, 0.888
> > 6, 0, 0, 57, 0.862
> > 6, 0, 57, 57, 0.899
> > 6, 0, 0, 58, 0.873
> > 6, 0, 58, 58, 0.798
> > 6, 0, 0, 59, 0.881
> > 6, 0, 59, 59, 0.785
> > 6, 0, 0, 60, 0.867
> > 6, 0, 60, 60, 0.797
> > 6, 0, 0, 61, 0.872
> > 6, 0, 61, 61, 0.791
> > 6, 0, 0, 62, 0.859
> > 6, 0, 62, 62, 0.79
> > 6, 0, 0, 63, 0.87
> > 6, 0, 63, 63, 0.796
> >
> > sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++----------------
> > 1 file changed, 39 insertions(+), 47 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
> > index 8fb3aba64d..6124033ceb 100644
> > --- a/sysdeps/x86_64/multiarch/strspn-c.c
> > +++ b/sysdeps/x86_64/multiarch/strspn-c.c
> > @@ -62,81 +62,73 @@ __strspn_sse42 (const char *s, const char *a)
> > return 0;
> >
> > const char *aligned;
> > - __m128i mask;
> > - int offset = (int) ((size_t) a & 15);
> > + __m128i mask, maskz, zero;
> > + unsigned int maskz_bits;
> > + unsigned int offset = (int) ((size_t) a & 15);
> > + zero = _mm_set1_epi8 (0);
> > if (offset != 0)
> > {
> > /* Load masks. */
> > aligned = (const char *) ((size_t) a & -16L);
> > __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> > -
> > - mask = __m128i_shift_right (mask0, offset);
> > + maskz = _mm_cmpeq_epi8 (mask0, zero);
> >
> > /* Find where the NULL terminator is. */
> > - int length = _mm_cmpistri (mask, mask, 0x3a);
> > - if (length == 16 - offset)
> > - {
> > - /* There is no NULL terminator. */
> > - __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
> > - int index = _mm_cmpistri (mask1, mask1, 0x3a);
> > - length += index;
> > -
> > - /* Don't use SSE4.2 if the length of A > 16. */
> > - if (length > 16)
> > - return __strspn_sse2 (s, a);
> > -
> > - if (index != 0)
> > - {
> > - /* Combine mask0 and mask1. We could play games with
> > - palignr, but frankly this data should be in L1 now
> > - so do the merge via an unaligned load. */
> > - mask = _mm_loadu_si128 ((__m128i *) a);
> > - }
> > - }
> > + maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > + if (maskz_bits != 0)
> > + {
> > + mask = __m128i_shift_right (mask0, offset);
> > + offset = (unsigned int) ((size_t) s & 15);
> > + if (offset)
> > + goto start_unaligned;
> > +
> > + aligned = s;
> > + goto start_loop;
> > + }
> > }
> > - else
> > - {
> > - /* A is aligned. */
> > - mask = _mm_load_si128 ((__m128i *) a);
> >
> > - /* Find where the NULL terminator is. */
> > - int length = _mm_cmpistri (mask, mask, 0x3a);
> > - if (length == 16)
> > - {
> > - /* There is no NULL terminator. Don't use SSE4.2 if the length
> > - of A > 16. */
> > - if (a[16] != 0)
> > - return __strspn_sse2 (s, a);
> > - }
> > + /* A is aligned. */
> > + mask = _mm_loadu_si128 ((__m128i *) a);
> > +
> > + /* Find where the NULL terminator is. */
> > + maskz = _mm_cmpeq_epi8 (mask, zero);
> > + maskz_bits = _mm_movemask_epi8 (maskz);
> > + if (maskz_bits == 0)
> > + {
> > + /* There is no NULL terminator. Don't use SSE4.2 if the length
> > + of A > 16. */
> > + if (a[16] != 0)
> > + return __strspn_sse2 (s, a);
> > }
> > + aligned = s;
> > + offset = (unsigned int) ((size_t) s & 15);
> >
> > - offset = (int) ((size_t) s & 15);
> > if (offset != 0)
> > {
> > + start_unaligned:
> > /* Check partial string. */
> > aligned = (const char *) ((size_t) s & -16L);
> > __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > + __m128i adj_value = __m128i_shift_right (value, offset);
> >
> > - value = __m128i_shift_right (value, offset);
> > -
> > - int length = _mm_cmpistri (mask, value, 0x12);
> > + unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
> > /* No need to check CFlag since it is always 1. */
> > if (length < 16 - offset)
> > return length;
> > /* Find where the NULL terminator is. */
> > - int index = _mm_cmpistri (value, value, 0x3a);
> > - if (index < 16 - offset)
> > + maskz = _mm_cmpeq_epi8 (value, zero);
> > + maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > + if (maskz_bits != 0)
> > return length;
> > aligned += 16;
> > }
> > - else
> > - aligned = s;
> >
> > +start_loop:
> > while (1)
> > {
> > __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > - int index = _mm_cmpistri (mask, value, 0x12);
> > - int cflag = _mm_cmpistrc (mask, value, 0x12);
> > + unsigned int index = _mm_cmpistri (mask, value, 0x12);
> > + unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
> > if (cflag)
> > return (size_t) (aligned + index - s);
> > aligned += 16;
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
@@ -62,81 +62,73 @@ __strspn_sse42 (const char *s, const char *a)
return 0;
const char *aligned;
- __m128i mask;
- int offset = (int) ((size_t) a & 15);
+ __m128i mask, maskz, zero;
+ unsigned int maskz_bits;
+ unsigned int offset = (int) ((size_t) a & 15);
+ zero = _mm_set1_epi8 (0);
if (offset != 0)
{
/* Load masks. */
aligned = (const char *) ((size_t) a & -16L);
__m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
- mask = __m128i_shift_right (mask0, offset);
+ maskz = _mm_cmpeq_epi8 (mask0, zero);
/* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16 - offset)
- {
- /* There is no NULL terminator. */
- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
- int index = _mm_cmpistri (mask1, mask1, 0x3a);
- length += index;
-
- /* Don't use SSE4.2 if the length of A > 16. */
- if (length > 16)
- return __strspn_sse2 (s, a);
-
- if (index != 0)
- {
- /* Combine mask0 and mask1. We could play games with
- palignr, but frankly this data should be in L1 now
- so do the merge via an unaligned load. */
- mask = _mm_loadu_si128 ((__m128i *) a);
- }
- }
+ maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+ if (maskz_bits != 0)
+ {
+ mask = __m128i_shift_right (mask0, offset);
+ offset = (unsigned int) ((size_t) s & 15);
+ if (offset)
+ goto start_unaligned;
+
+ aligned = s;
+ goto start_loop;
+ }
}
- else
- {
- /* A is aligned. */
- mask = _mm_load_si128 ((__m128i *) a);
- /* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16)
- {
- /* There is no NULL terminator. Don't use SSE4.2 if the length
- of A > 16. */
- if (a[16] != 0)
- return __strspn_sse2 (s, a);
- }
+ /* A is aligned. */
+ mask = _mm_loadu_si128 ((__m128i *) a);
+
+ /* Find where the NULL terminator is. */
+ maskz = _mm_cmpeq_epi8 (mask, zero);
+ maskz_bits = _mm_movemask_epi8 (maskz);
+ if (maskz_bits == 0)
+ {
+ /* There is no NULL terminator. Don't use SSE4.2 if the length
+ of A > 16. */
+ if (a[16] != 0)
+ return __strspn_sse2 (s, a);
}
+ aligned = s;
+ offset = (unsigned int) ((size_t) s & 15);
- offset = (int) ((size_t) s & 15);
if (offset != 0)
{
+ start_unaligned:
/* Check partial string. */
aligned = (const char *) ((size_t) s & -16L);
__m128i value = _mm_load_si128 ((__m128i *) aligned);
+ __m128i adj_value = __m128i_shift_right (value, offset);
- value = __m128i_shift_right (value, offset);
-
- int length = _mm_cmpistri (mask, value, 0x12);
+ unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
/* No need to check CFlag since it is always 1. */
if (length < 16 - offset)
return length;
/* Find where the NULL terminator is. */
- int index = _mm_cmpistri (value, value, 0x3a);
- if (index < 16 - offset)
+ maskz = _mm_cmpeq_epi8 (value, zero);
+ maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+ if (maskz_bits != 0)
return length;
aligned += 16;
}
- else
- aligned = s;
+start_loop:
while (1)
{
__m128i value = _mm_load_si128 ((__m128i *) aligned);
- int index = _mm_cmpistri (mask, value, 0x12);
- int cflag = _mm_cmpistrc (mask, value, 0x12);
+ unsigned int index = _mm_cmpistri (mask, value, 0x12);
+ unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
if (cflag)
return (size_t) (aligned + index - s);
aligned += 16;