[v1,04/23] x86: Code cleanup in strchr-evex and comment justifying branch
Checks
Context | Check | Description
dj/TryBot-apply_patch | success | Patch applied to master at the time it was sent
Commit Message
Small code cleanup for size: -81 bytes.
Add comment justifying using a branch to do NULL/non-null return.
All string/memory tests pass and no regressions in benchtests.
geometric_mean(N=20) of all benchmarks New / Original: .985
---
Geometric Mean N=20 runs; All functions page aligned
length, alignment, pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time
2048, 0, 32, 0, 23, 127, 0.878
2048, 1, 32, 0, 23, 127, 0.88
2048, 0, 64, 0, 23, 127, 0.997
2048, 2, 64, 0, 23, 127, 1.001
2048, 0, 128, 0, 23, 127, 0.973
2048, 3, 128, 0, 23, 127, 0.971
2048, 0, 256, 0, 23, 127, 0.976
2048, 4, 256, 0, 23, 127, 0.973
2048, 0, 512, 0, 23, 127, 1.001
2048, 5, 512, 0, 23, 127, 1.004
2048, 0, 1024, 0, 23, 127, 1.005
2048, 6, 1024, 0, 23, 127, 1.007
2048, 0, 2048, 0, 23, 127, 1.035
2048, 7, 2048, 0, 23, 127, 1.03
4096, 0, 32, 0, 23, 127, 0.889
4096, 1, 32, 0, 23, 127, 0.891
4096, 0, 64, 0, 23, 127, 1.012
4096, 2, 64, 0, 23, 127, 1.017
4096, 0, 128, 0, 23, 127, 0.975
4096, 3, 128, 0, 23, 127, 0.974
4096, 0, 256, 0, 23, 127, 0.974
4096, 4, 256, 0, 23, 127, 0.972
4096, 0, 512, 0, 23, 127, 1.002
4096, 5, 512, 0, 23, 127, 1.016
4096, 0, 1024, 0, 23, 127, 1.009
4096, 6, 1024, 0, 23, 127, 1.008
4096, 0, 2048, 0, 23, 127, 1.003
4096, 7, 2048, 0, 23, 127, 1.004
256, 1, 64, 0, 23, 127, 0.993
256, 2, 64, 0, 23, 127, 0.999
256, 3, 64, 0, 23, 127, 0.992
256, 4, 64, 0, 23, 127, 0.99
256, 5, 64, 0, 23, 127, 0.99
256, 6, 64, 0, 23, 127, 0.994
256, 7, 64, 0, 23, 127, 0.991
512, 0, 256, 0, 23, 127, 0.971
512, 16, 256, 0, 23, 127, 0.971
512, 32, 256, 0, 23, 127, 1.005
512, 48, 256, 0, 23, 127, 0.998
512, 64, 256, 0, 23, 127, 1.001
512, 80, 256, 0, 23, 127, 1.002
512, 96, 256, 0, 23, 127, 1.005
512, 112, 256, 0, 23, 127, 1.012
1, 0, 0, 0, 23, 127, 1.024
2, 0, 1, 0, 23, 127, 0.991
3, 0, 2, 0, 23, 127, 0.997
4, 0, 3, 0, 23, 127, 0.984
5, 0, 4, 0, 23, 127, 0.993
6, 0, 5, 0, 23, 127, 0.985
7, 0, 6, 0, 23, 127, 0.979
8, 0, 7, 0, 23, 127, 0.975
9, 0, 8, 0, 23, 127, 0.965
10, 0, 9, 0, 23, 127, 0.957
11, 0, 10, 0, 23, 127, 0.979
12, 0, 11, 0, 23, 127, 0.987
13, 0, 12, 0, 23, 127, 1.023
14, 0, 13, 0, 23, 127, 0.997
15, 0, 14, 0, 23, 127, 0.983
16, 0, 15, 0, 23, 127, 0.987
17, 0, 16, 0, 23, 127, 0.993
18, 0, 17, 0, 23, 127, 0.985
19, 0, 18, 0, 23, 127, 0.999
20, 0, 19, 0, 23, 127, 0.998
21, 0, 20, 0, 23, 127, 0.983
22, 0, 21, 0, 23, 127, 0.983
23, 0, 22, 0, 23, 127, 1.002
24, 0, 23, 0, 23, 127, 1.0
25, 0, 24, 0, 23, 127, 1.002
26, 0, 25, 0, 23, 127, 0.984
27, 0, 26, 0, 23, 127, 0.994
28, 0, 27, 0, 23, 127, 0.995
29, 0, 28, 0, 23, 127, 1.017
30, 0, 29, 0, 23, 127, 1.009
31, 0, 30, 0, 23, 127, 1.001
32, 0, 31, 0, 23, 127, 1.021
2048, 0, 32, 0, 0, 127, 0.899
2048, 1, 32, 0, 0, 127, 0.93
2048, 0, 64, 0, 0, 127, 1.009
2048, 2, 64, 0, 0, 127, 1.023
2048, 0, 128, 0, 0, 127, 0.973
2048, 3, 128, 0, 0, 127, 0.975
2048, 0, 256, 0, 0, 127, 0.974
2048, 4, 256, 0, 0, 127, 0.97
2048, 0, 512, 0, 0, 127, 0.999
2048, 5, 512, 0, 0, 127, 1.004
2048, 0, 1024, 0, 0, 127, 1.008
2048, 6, 1024, 0, 0, 127, 1.008
2048, 0, 2048, 0, 0, 127, 0.996
2048, 7, 2048, 0, 0, 127, 1.002
4096, 0, 32, 0, 0, 127, 0.872
4096, 1, 32, 0, 0, 127, 0.881
4096, 0, 64, 0, 0, 127, 1.006
4096, 2, 64, 0, 0, 127, 1.005
4096, 0, 128, 0, 0, 127, 0.973
4096, 3, 128, 0, 0, 127, 0.974
4096, 0, 256, 0, 0, 127, 0.969
4096, 4, 256, 0, 0, 127, 0.971
4096, 0, 512, 0, 0, 127, 1.0
4096, 5, 512, 0, 0, 127, 1.005
4096, 0, 1024, 0, 0, 127, 1.007
4096, 6, 1024, 0, 0, 127, 1.009
4096, 0, 2048, 0, 0, 127, 1.005
4096, 7, 2048, 0, 0, 127, 1.007
256, 1, 64, 0, 0, 127, 0.994
256, 2, 64, 0, 0, 127, 1.008
256, 3, 64, 0, 0, 127, 1.019
256, 4, 64, 0, 0, 127, 0.991
256, 5, 64, 0, 0, 127, 0.992
256, 6, 64, 0, 0, 127, 0.991
256, 7, 64, 0, 0, 127, 0.988
512, 0, 256, 0, 0, 127, 0.971
512, 16, 256, 0, 0, 127, 0.967
512, 32, 256, 0, 0, 127, 1.005
512, 48, 256, 0, 0, 127, 1.001
512, 64, 256, 0, 0, 127, 1.009
512, 80, 256, 0, 0, 127, 1.008
512, 96, 256, 0, 0, 127, 1.009
512, 112, 256, 0, 0, 127, 1.016
1, 0, 0, 0, 0, 127, 1.038
2, 0, 1, 0, 0, 127, 1.009
3, 0, 2, 0, 0, 127, 0.992
4, 0, 3, 0, 0, 127, 1.004
5, 0, 4, 0, 0, 127, 0.966
6, 0, 5, 0, 0, 127, 0.968
7, 0, 6, 0, 0, 127, 1.004
8, 0, 7, 0, 0, 127, 0.99
9, 0, 8, 0, 0, 127, 0.958
10, 0, 9, 0, 0, 127, 0.96
11, 0, 10, 0, 0, 127, 0.948
12, 0, 11, 0, 0, 127, 0.984
13, 0, 12, 0, 0, 127, 0.967
14, 0, 13, 0, 0, 127, 0.993
15, 0, 14, 0, 0, 127, 0.991
16, 0, 15, 0, 0, 127, 1.0
17, 0, 16, 0, 0, 127, 0.982
18, 0, 17, 0, 0, 127, 0.977
19, 0, 18, 0, 0, 127, 0.987
20, 0, 19, 0, 0, 127, 0.978
21, 0, 20, 0, 0, 127, 1.0
22, 0, 21, 0, 0, 127, 0.99
23, 0, 22, 0, 0, 127, 0.988
24, 0, 23, 0, 0, 127, 0.997
25, 0, 24, 0, 0, 127, 1.003
26, 0, 25, 0, 0, 127, 1.004
27, 0, 26, 0, 0, 127, 0.982
28, 0, 27, 0, 0, 127, 0.972
29, 0, 28, 0, 0, 127, 0.978
30, 0, 29, 0, 0, 127, 0.992
31, 0, 30, 0, 0, 127, 0.986
32, 0, 31, 0, 0, 127, 1.0
16, 0, 15, 1, 1, 0, 0.997
16, 0, 15, 1, 0, 0, 1.001
16, 0, 15, 1, 1, 0.1, 0.984
16, 0, 15, 1, 0, 0.1, 0.999
16, 0, 15, 1, 1, 0.25, 0.929
16, 0, 15, 1, 0, 0.25, 1.001
16, 0, 15, 1, 1, 0.33, 0.892
16, 0, 15, 1, 0, 0.33, 0.996
16, 0, 15, 1, 1, 0.5, 0.897
16, 0, 15, 1, 0, 0.5, 1.009
16, 0, 15, 1, 1, 0.66, 0.882
16, 0, 15, 1, 0, 0.66, 0.967
16, 0, 15, 1, 1, 0.75, 0.919
16, 0, 15, 1, 0, 0.75, 1.027
16, 0, 15, 1, 1, 0.9, 0.949
16, 0, 15, 1, 0, 0.9, 1.021
16, 0, 15, 1, 1, 1, 0.998
16, 0, 15, 1, 0, 1, 0.999
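
The 0.985 summary in the commit message is the geometric mean of the New Time / Old Time column above. A minimal C sketch of that computation, assuming the ratios have been gathered into an array (the values here are a small illustrative subset of the table, not the full 20-run data set); link with -lm:

#include <math.h>
#include <stdio.h>

/* Geometric mean of New Time / Old Time ratios, as used for the
   summary figure in the commit message.  The values below are an
   illustrative subset of the table above, not the full data set.  */
int
main (void)
{
  double ratios[] = { 0.878, 0.997, 1.001, 0.973, 1.035 };
  size_t n = sizeof (ratios) / sizeof (ratios[0]);
  double log_sum = 0.0;

  for (size_t i = 0; i < n; i++)
    log_sum += log (ratios[i]);
  /* exp of the mean of the logs == nth root of the product.  */
  printf ("geometric mean: %.3f\n", exp (log_sum / n));
  return 0;
}
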
sysdeps/x86_64/multiarch/strchr-evex.S | 146 ++++++++++++++-----------
1 file changed, 80 insertions(+), 66 deletions(-)
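
A note on the USE_AS_STRCHRNUL guards visible throughout the patch (shown in full after the Comments section): the NULL-return branch only exists for strchr/wcschr, because strchrnul never returns NULL; presumably the strchrnul variant is built from this same file with USE_AS_STRCHRNUL defined. A hedged C illustration of the two contracts (strchrnul is a GNU extension, hence _GNU_SOURCE):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

/* strchr returns NULL when the character is absent; strchrnul instead
   returns a pointer to the terminating null byte, so it has no
   NULL-return branch to justify.  */
int
main (void)
{
  const char *s = "abc";
  printf ("strchr:    %p\n", (void *) strchr (s, 'z'));    /* prints (nil) */
  printf ("strchrnul: %p\n", (void *) strchrnul (s, 'z')); /* address of s[3] */
  return 0;
}
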
Comments
On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Small code cleanup for size: -81 bytes.
>
> Add comment justifying using a branch to do NULL/non-null return.
>
> All string/memory tests pass and no regressions in benchtests.
>
> geometric_mean(N=20) of all benchmarks New / Original: .985
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
On Thu, Mar 24, 2022 at 11:55 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
index f62cd9d144..ec739fb8f9 100644
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -30,6 +30,7 @@
# ifdef USE_AS_WCSCHR
# define VPBROADCAST vpbroadcastd
# define VPCMP vpcmpd
+# define VPTESTN vptestnmd
# define VPMINU vpminud
# define CHAR_REG esi
# define SHIFT_REG ecx
@@ -37,6 +38,7 @@
# else
# define VPBROADCAST vpbroadcastb
# define VPCMP vpcmpb
+# define VPTESTN vptestnmb
# define VPMINU vpminub
# define CHAR_REG sil
# define SHIFT_REG edx
@@ -61,13 +63,11 @@
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section .text.evex,"ax",@progbits
-ENTRY (STRCHR)
+ENTRY_P2ALIGN (STRCHR, 5)
/* Broadcast CHAR to YMM0. */
VPBROADCAST %esi, %YMM0
movl %edi, %eax
andl $(PAGE_SIZE - 1), %eax
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
-
/* Check if we cross page boundary with one vector load.
Otherwise it is safe to use an unaligned load. */
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
@@ -81,49 +81,35 @@ ENTRY (STRCHR)
vpxorq %YMM1, %YMM0, %YMM2
VPMINU %YMM2, %YMM1, %YMM2
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM2, %k0
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %eax
testl %eax, %eax
jz L(aligned_more)
tzcntl %eax, %eax
+# ifndef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG
+ /* NB: Use a branch instead of cmovcc here. The expectation is
+ that with strchr the user will branch based on input being
+ null. Since this branch will be 100% predictive of the user
+ branch, a branch miss here should save what otherwise would
+ be a branch miss in the user code. Otherwise using a branch 1)
+ saves code size and 2) is faster in highly predictable
+ environments. */
+ jne L(zero)
+# endif
# ifdef USE_AS_WCSCHR
/* NB: Multiply wchar_t count by 4 to get the number of bytes.
*/
leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
addq %rdi, %rax
-# endif
-# ifndef USE_AS_STRCHRNUL
- /* Found CHAR or the null byte. */
- cmp (%rax), %CHAR_REG
- jne L(zero)
# endif
ret
- /* .p2align 5 helps keep performance more consistent if ENTRY()
- alignment % 32 was either 16 or 0. As well this makes the
- alignment % 32 of the loop_4x_vec fixed which makes tuning it
- easier. */
- .p2align 5
-L(first_vec_x3):
- tzcntl %eax, %eax
-# ifndef USE_AS_STRCHRNUL
- /* Found CHAR or the null byte. */
- cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
- jne L(zero)
-# endif
- /* NB: Multiply sizeof char type (1 or 4) to get the number of
- bytes. */
- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
- ret
-# ifndef USE_AS_STRCHRNUL
-L(zero):
- xorl %eax, %eax
- ret
-# endif
- .p2align 4
+ .p2align 4,, 10
L(first_vec_x4):
# ifndef USE_AS_STRCHRNUL
/* Check to see if first match was CHAR (k0) or null (k1). */
@@ -144,9 +130,18 @@ L(first_vec_x4):
leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
ret
+# ifndef USE_AS_STRCHRNUL
+L(zero):
+ xorl %eax, %eax
+ ret
+# endif
+
+
.p2align 4
L(first_vec_x1):
- tzcntl %eax, %eax
+ /* Use bsf here to save 1 byte, keeping the block in 1x
+ fetch block. eax guaranteed non-zero. */
+ bsfl %eax, %eax
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
@@ -158,7 +153,7 @@ L(first_vec_x1):
leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
ret
- .p2align 4
+ .p2align 4,, 10
L(first_vec_x2):
# ifndef USE_AS_STRCHRNUL
/* Check to see if first match was CHAR (k0) or null (k1). */
@@ -179,6 +174,21 @@ L(first_vec_x2):
leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
ret
+ .p2align 4,, 10
+L(first_vec_x3):
+ /* Use bsf here to save 1 byte, keeping the block in 1x
+ fetch block. eax guaranteed non-zero. */
+ bsfl %eax, %eax
+# ifndef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
+ cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+ jne L(zero)
+# endif
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
+ bytes. */
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
.p2align 4
L(aligned_more):
/* Align data to VEC_SIZE. */
@@ -195,7 +205,7 @@ L(cross_page_continue):
vpxorq %YMM1, %YMM0, %YMM2
VPMINU %YMM2, %YMM1, %YMM2
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM2, %k0
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x1)
@@ -206,7 +216,7 @@ L(cross_page_continue):
/* Each bit in K0 represents a CHAR in YMM1. */
VPCMP $0, %YMM1, %YMM0, %k0
/* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMM1, %YMMZERO, %k1
+ VPTESTN %YMM1, %YMM1, %k1
kortestd %k0, %k1
jnz L(first_vec_x2)
@@ -215,7 +225,7 @@ L(cross_page_continue):
vpxorq %YMM1, %YMM0, %YMM2
VPMINU %YMM2, %YMM1, %YMM2
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM2, %k0
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x3)
@@ -224,7 +234,7 @@ L(cross_page_continue):
/* Each bit in K0 represents a CHAR in YMM1. */
VPCMP $0, %YMM1, %YMM0, %k0
/* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMM1, %YMMZERO, %k1
+ VPTESTN %YMM1, %YMM1, %k1
kortestd %k0, %k1
jnz L(first_vec_x4)
@@ -265,33 +275,33 @@ L(loop_4x_vec):
VPMINU %YMM3, %YMM4, %YMM4
VPMINU %YMM2, %YMM4, %YMM4{%k4}{z}
- VPCMP $0, %YMMZERO, %YMM4, %k1
+ VPTESTN %YMM4, %YMM4, %k1
kmovd %k1, %ecx
subq $-(VEC_SIZE * 4), %rdi
testl %ecx, %ecx
jz L(loop_4x_vec)
- VPCMP $0, %YMMZERO, %YMM1, %k0
+ VPTESTN %YMM1, %YMM1, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(last_vec_x1)
- VPCMP $0, %YMMZERO, %YMM2, %k0
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %eax
testl %eax, %eax
jnz L(last_vec_x2)
- VPCMP $0, %YMMZERO, %YMM3, %k0
+ VPTESTN %YMM3, %YMM3, %k0
kmovd %k0, %eax
/* Combine YMM3 matches (eax) with YMM4 matches (ecx). */
# ifdef USE_AS_WCSCHR
sall $8, %ecx
orl %ecx, %eax
- tzcntl %eax, %eax
+ bsfl %eax, %eax
# else
salq $32, %rcx
orq %rcx, %rax
- tzcntq %rax, %rax
+ bsfq %rax, %rax
# endif
# ifndef USE_AS_STRCHRNUL
/* Check if match was CHAR or null. */
@@ -303,28 +313,28 @@ L(loop_4x_vec):
leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
ret
-# ifndef USE_AS_STRCHRNUL
-L(zero_end):
- xorl %eax, %eax
- ret
+ .p2align 4,, 8
+L(last_vec_x1):
+ bsfl %eax, %eax
+# ifdef USE_AS_WCSCHR
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes.
+ */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rdi, %rax
# endif
- .p2align 4
-L(last_vec_x1):
- tzcntl %eax, %eax
# ifndef USE_AS_STRCHRNUL
/* Check if match was null. */
- cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG
+ cmp (%rax), %CHAR_REG
jne L(zero_end)
# endif
- /* NB: Multiply sizeof char type (1 or 4) to get the number of
- bytes. */
- leaq (%rdi, %rax, CHAR_SIZE), %rax
+
ret
- .p2align 4
+ .p2align 4,, 8
L(last_vec_x2):
- tzcntl %eax, %eax
+ bsfl %eax, %eax
# ifndef USE_AS_STRCHRNUL
/* Check if match was null. */
cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
@@ -336,7 +346,7 @@ L(last_vec_x2):
ret
/* Cold case for crossing page with first load. */
- .p2align 4
+ .p2align 4,, 8
L(cross_page_boundary):
movq %rdi, %rdx
/* Align rdi. */
@@ -346,9 +356,9 @@ L(cross_page_boundary):
vpxorq %YMM1, %YMM0, %YMM2
VPMINU %YMM2, %YMM1, %YMM2
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM2, %k0
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %eax
- /* Remove the leading bits. */
+ /* Remove the leading bits. */
# ifdef USE_AS_WCSCHR
movl %edx, %SHIFT_REG
/* NB: Divide shift count by 4 since each bit in K1 represent 4
@@ -360,20 +370,24 @@ L(cross_page_boundary):
/* If eax is zero continue. */
testl %eax, %eax
jz L(cross_page_continue)
- tzcntl %eax, %eax
-# ifndef USE_AS_STRCHRNUL
- /* Check to see if match was CHAR or null. */
- cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG
- jne L(zero_end)
-# endif
+ bsfl %eax, %eax
+
# ifdef USE_AS_WCSCHR
/* NB: Multiply wchar_t count by 4 to get the number of
bytes. */
leaq (%rdx, %rax, CHAR_SIZE), %rax
# else
addq %rdx, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+ /* Check to see if match was CHAR or null. */
+ cmp (%rax), %CHAR_REG
+ je L(cross_page_ret)
+L(zero_end):
+ xorl %eax, %eax
+L(cross_page_ret):
# endif
ret
END (STRCHR)
-# endif
+#endif
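
For context on the cmovcc-vs-branch comment added in the first hunk: it assumes the caller itself branches on the NULL/non-NULL result, so the jne inside strchr ends up correlated with that user-level branch. A hypothetical caller illustrating the assumed pattern (not code from the patch or from glibc):

#include <stdio.h>
#include <string.h>

/* Hypothetical strchr caller.  The new comment in the patch expects
   user code of this shape: it branches on the NULL/non-NULL result,
   so the jne selecting the NULL return inside strchr should be
   predicted together with this branch.  */
static void
show_match (const char *s, int c)
{
  const char *p = strchr (s, c);

  if (p == NULL)   /* The user-level branch the comment refers to.  */
    puts ("not found");
  else
    printf ("found at offset %td\n", p - s);
}

int
main (void)
{
  show_match ("hello, world", 'w');
  show_match ("hello, world", 'z');
  return 0;
}
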