[v1,03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch
Checks
Context | Check | Description
dj/TryBot-apply_patch | success | Patch applied to master at the time it was sent
Commit Message
Small code cleanup for size: -53 bytes.
Add comment justifying using a branch to do NULL/non-null return.
All string/memory tests pass and no regressions in benchtests.
geometric_mean(N=20) of all benchmarks Original / New: 1.00
---
Geometric Mean of N=20 runs; all functions page aligned
length, alignment, pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time
2048, 0, 32, 0, 23, 127, 1.033
2048, 1, 32, 0, 23, 127, 1.006
2048, 0, 64, 0, 23, 127, 1.02
2048, 2, 64, 0, 23, 127, 0.992
2048, 0, 128, 0, 23, 127, 0.996
2048, 3, 128, 0, 23, 127, 0.966
2048, 0, 256, 0, 23, 127, 0.996
2048, 4, 256, 0, 23, 127, 0.998
2048, 0, 512, 0, 23, 127, 0.991
2048, 5, 512, 0, 23, 127, 0.991
2048, 0, 1024, 0, 23, 127, 0.993
2048, 6, 1024, 0, 23, 127, 0.992
2048, 0, 2048, 0, 23, 127, 0.992
2048, 7, 2048, 0, 23, 127, 0.976
4096, 0, 32, 0, 23, 127, 0.983
4096, 1, 32, 0, 23, 127, 0.994
4096, 0, 64, 0, 23, 127, 0.968
4096, 2, 64, 0, 23, 127, 1.018
4096, 0, 128, 0, 23, 127, 0.99
4096, 3, 128, 0, 23, 127, 1.001
4096, 0, 256, 0, 23, 127, 1.0
4096, 4, 256, 0, 23, 127, 1.001
4096, 0, 512, 0, 23, 127, 0.989
4096, 5, 512, 0, 23, 127, 0.988
4096, 0, 1024, 0, 23, 127, 0.994
4096, 6, 1024, 0, 23, 127, 0.993
4096, 0, 2048, 0, 23, 127, 0.987
4096, 7, 2048, 0, 23, 127, 0.996
256, 1, 64, 0, 23, 127, 1.004
256, 2, 64, 0, 23, 127, 1.004
256, 3, 64, 0, 23, 127, 0.992
256, 4, 64, 0, 23, 127, 1.001
256, 5, 64, 0, 23, 127, 1.001
256, 6, 64, 0, 23, 127, 0.998
256, 7, 64, 0, 23, 127, 0.994
512, 0, 256, 0, 23, 127, 0.999
512, 16, 256, 0, 23, 127, 1.002
512, 32, 256, 0, 23, 127, 0.994
512, 48, 256, 0, 23, 127, 0.991
512, 64, 256, 0, 23, 127, 0.994
512, 80, 256, 0, 23, 127, 0.994
512, 96, 256, 0, 23, 127, 0.996
512, 112, 256, 0, 23, 127, 0.999
1, 0, 0, 0, 23, 127, 0.978
2, 0, 1, 0, 23, 127, 0.981
3, 0, 2, 0, 23, 127, 0.993
4, 0, 3, 0, 23, 127, 1.004
5, 0, 4, 0, 23, 127, 1.002
6, 0, 5, 0, 23, 127, 0.991
7, 0, 6, 0, 23, 127, 0.99
8, 0, 7, 0, 23, 127, 1.012
9, 0, 8, 0, 23, 127, 0.994
10, 0, 9, 0, 23, 127, 1.003
11, 0, 10, 0, 23, 127, 0.999
12, 0, 11, 0, 23, 127, 1.007
13, 0, 12, 0, 23, 127, 1.0
14, 0, 13, 0, 23, 127, 0.997
15, 0, 14, 0, 23, 127, 0.996
16, 0, 15, 0, 23, 127, 0.993
17, 0, 16, 0, 23, 127, 1.002
18, 0, 17, 0, 23, 127, 0.997
19, 0, 18, 0, 23, 127, 0.998
20, 0, 19, 0, 23, 127, 0.994
21, 0, 20, 0, 23, 127, 0.99
22, 0, 21, 0, 23, 127, 0.992
23, 0, 22, 0, 23, 127, 0.996
24, 0, 23, 0, 23, 127, 0.991
25, 0, 24, 0, 23, 127, 0.997
26, 0, 25, 0, 23, 127, 1.011
27, 0, 26, 0, 23, 127, 1.013
28, 0, 27, 0, 23, 127, 0.996
29, 0, 28, 0, 23, 127, 0.993
30, 0, 29, 0, 23, 127, 1.009
31, 0, 30, 0, 23, 127, 1.009
32, 0, 31, 0, 23, 127, 1.008
2048, 0, 32, 0, 0, 127, 1.0
2048, 1, 32, 0, 0, 127, 1.01
2048, 0, 64, 0, 0, 127, 0.997
2048, 2, 64, 0, 0, 127, 1.002
2048, 0, 128, 0, 0, 127, 0.986
2048, 3, 128, 0, 0, 127, 0.997
2048, 0, 256, 0, 0, 127, 1.002
2048, 4, 256, 0, 0, 127, 0.999
2048, 0, 512, 0, 0, 127, 0.991
2048, 5, 512, 0, 0, 127, 0.984
2048, 0, 1024, 0, 0, 127, 0.994
2048, 6, 1024, 0, 0, 127, 0.993
2048, 0, 2048, 0, 0, 127, 0.951
2048, 7, 2048, 0, 0, 127, 0.989
4096, 0, 32, 0, 0, 127, 0.993
4096, 1, 32, 0, 0, 127, 0.997
4096, 0, 64, 0, 0, 127, 1.004
4096, 2, 64, 0, 0, 127, 1.016
4096, 0, 128, 0, 0, 127, 0.973
4096, 3, 128, 0, 0, 127, 1.001
4096, 0, 256, 0, 0, 127, 0.999
4096, 4, 256, 0, 0, 127, 0.998
4096, 0, 512, 0, 0, 127, 0.99
4096, 5, 512, 0, 0, 127, 0.985
4096, 0, 1024, 0, 0, 127, 0.993
4096, 6, 1024, 0, 0, 127, 0.997
4096, 0, 2048, 0, 0, 127, 0.995
4096, 7, 2048, 0, 0, 127, 0.996
256, 1, 64, 0, 0, 127, 1.01
256, 2, 64, 0, 0, 127, 1.024
256, 3, 64, 0, 0, 127, 1.03
256, 4, 64, 0, 0, 127, 1.004
256, 5, 64, 0, 0, 127, 0.998
256, 6, 64, 0, 0, 127, 0.998
256, 7, 64, 0, 0, 127, 0.997
512, 0, 256, 0, 0, 127, 0.996
512, 16, 256, 0, 0, 127, 0.995
512, 32, 256, 0, 0, 127, 0.996
512, 48, 256, 0, 0, 127, 0.992
512, 64, 256, 0, 0, 127, 0.999
512, 80, 256, 0, 0, 127, 1.002
512, 96, 256, 0, 0, 127, 0.999
512, 112, 256, 0, 0, 127, 0.998
1, 0, 0, 0, 0, 127, 1.016
2, 0, 1, 0, 0, 127, 0.998
3, 0, 2, 0, 0, 127, 1.02
4, 0, 3, 0, 0, 127, 1.004
5, 0, 4, 0, 0, 127, 1.021
6, 0, 5, 0, 0, 127, 1.014
7, 0, 6, 0, 0, 127, 1.007
8, 0, 7, 0, 0, 127, 1.016
9, 0, 8, 0, 0, 127, 1.003
10, 0, 9, 0, 0, 127, 1.004
11, 0, 10, 0, 0, 127, 0.995
12, 0, 11, 0, 0, 127, 1.009
13, 0, 12, 0, 0, 127, 1.005
14, 0, 13, 0, 0, 127, 0.987
15, 0, 14, 0, 0, 127, 0.998
16, 0, 15, 0, 0, 127, 1.004
17, 0, 16, 0, 0, 127, 1.01
18, 0, 17, 0, 0, 127, 1.01
19, 0, 18, 0, 0, 127, 1.006
20, 0, 19, 0, 0, 127, 1.012
21, 0, 20, 0, 0, 127, 0.999
22, 0, 21, 0, 0, 127, 1.004
23, 0, 22, 0, 0, 127, 0.988
24, 0, 23, 0, 0, 127, 0.993
25, 0, 24, 0, 0, 127, 1.004
26, 0, 25, 0, 0, 127, 0.99
27, 0, 26, 0, 0, 127, 1.016
28, 0, 27, 0, 0, 127, 0.987
29, 0, 28, 0, 0, 127, 0.989
30, 0, 29, 0, 0, 127, 0.998
31, 0, 30, 0, 0, 127, 1.005
32, 0, 31, 0, 0, 127, 0.993

16, 0, 15, 1, 1, 0, 1.002
16, 0, 15, 1, 0, 0, 1.0
16, 0, 15, 1, 1, 0.1, 1.034
16, 0, 15, 1, 0, 0.1, 1.03
16, 0, 15, 1, 1, 0.25, 0.993
16, 0, 15, 1, 0, 0.25, 1.081
16, 0, 15, 1, 1, 0.33, 0.959
16, 0, 15, 1, 0, 0.33, 1.142
16, 0, 15, 1, 1, 0.5, 0.929
16, 0, 15, 1, 0, 0.5, 1.072
16, 0, 15, 1, 1, 0.66, 0.984
16, 0, 15, 1, 0, 0.66, 1.069
16, 0, 15, 1, 1, 0.75, 0.969
16, 0, 15, 1, 0, 0.75, 1.059
16, 0, 15, 1, 1, 0.9, 0.98
16, 0, 15, 1, 0, 0.9, 0.994
16, 0, 15, 1, 1, 1, 0.993
16, 0, 15, 1, 0, 1, 0.996

sysdeps/x86_64/multiarch/strchr-avx2.S | 204 +++++++++++++------------
1 file changed, 107 insertions(+), 97 deletions(-)
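
As background for the branch-vs-cmovcc justification this patch adds, here is a minimal, hypothetical C sketch (not part of the patch; field_after_colon is an invented caller) of the usage pattern the new comment assumes: nearly every strchr call site immediately branches on whether the character was found, so the jne inside strchr takes the same direction as the caller's own NULL test on every call, while a cmov would add a data dependency to every call without removing the caller-side branch.

#include <stdio.h>
#include <string.h>

/* Hypothetical caller, for illustration only.  The result of strchr is
   immediately tested against NULL.  If strchr resolves found/not-found
   with a branch, that branch and the caller's `if (p == NULL)' are
   perfectly correlated, so a misprediction is paid at most once per
   call; a cmovcc inside strchr would instead add latency on every call
   and still leave this branch here.  */
static const char *
field_after_colon (const char *line)
{
  const char *p = strchr (line, ':');
  if (p == NULL)
    return line;
  return p + 1;
}

int
main (void)
{
  printf ("%s\n", field_after_colon ("key:value"));
  printf ("%s\n", field_after_colon ("no delimiter"));
  return 0;
}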
Comments
On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Small code cleanup for size: -53 bytes.
>
> Add comment justifying using a branch to do NULL/non-null return.
Do you have followup patches to improve its performance? We are
backporting all x86-64 improvements to Intel release branches:
https://gitlab.com/x86-glibc/glibc/-/wikis/home
Patches without performance improvements are undesirable.
On Thu, Mar 24, 2022 at 1:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Small code cleanup for size: -53 bytes.
> >
> > Add comment justifying using a branch to do NULL/non-null return.
>
>
> Do you have followup patches to improve its performance? We are
> backporting all x86-64 improvements to Intel release branches:
>
> https://gitlab.com/x86-glibc/glibc/-/wikis/home
>
> Patches without performance improvements are undesirable.
No further changes planned at the moment; the code size savings
seem worth it for master though. Also in favor of adding the comment,
as I think it's non-intuitive.
On Thu, Mar 24, 2022 at 12:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Mar 24, 2022 at 1:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > Small code cleanup for size: -53 bytes.
> > >
> > > Add comment justifying using a branch to do NULL/non-null return.
> >
> >
> > Do you have followup patches to improve its performance? We are
> > backporting all x86-64 improvements to Intel release branches:
> >
> > https://gitlab.com/x86-glibc/glibc/-/wikis/home
> >
> > Patches without performance improvements are undesirable.
>
> No further changes planned at the moment, code size saves
> seem worth it for master though. Also in favor of adding the comment
> as I think its non-intuitive.
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
On Thu, Mar 24, 2022 at 12:37 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Thu, Mar 24, 2022 at 12:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Thu, Mar 24, 2022 at 1:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > Small code cleanup for size: -53 bytes.
> > > >
> > > > Add comment justifying using a branch to do NULL/non-null return.
> > >
> > >
> > > Do you have followup patches to improve its performance? We are
> > > backporting all x86-64 improvements to Intel release branches:
> > >
> > > https://gitlab.com/x86-glibc/glibc/-/wikis/home
> > >
> > > Patches without performance improvements are undesirable.
> >
> > No further changes planned at the moment, code size saves
> > seem worth it for master though. Also in favor of adding the comment
> > as I think its non-intuitive.
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index 086cabf76a..1a916cc951 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -48,13 +48,13 @@
# define PAGE_SIZE 4096
.section SECTION(.text),"ax",@progbits
-ENTRY (STRCHR)
+ENTRY_P2ALIGN (STRCHR, 5)
/* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
movl %edi, %eax
andl $(PAGE_SIZE - 1), %eax
VPBROADCAST %xmm0, %ymm0
- vpxor %xmm9, %xmm9, %xmm9
+ vpxor %xmm1, %xmm1, %xmm1
/* Check if we cross page boundary with one vector load. */
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
@@ -62,37 +62,29 @@ ENTRY (STRCHR)
/* Check the first VEC_SIZE bytes. Search for both CHAR and the
null byte. */
- vmovdqu (%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
+ vmovdqu (%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm3
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpor %ymm3, %ymm2, %ymm3
+ vpmovmskb %ymm3, %eax
testl %eax, %eax
jz L(aligned_more)
tzcntl %eax, %eax
# ifndef USE_AS_STRCHRNUL
- /* Found CHAR or the null byte. */
- cmp (%rdi, %rax), %CHAR_REG
- jne L(zero)
-# endif
- addq %rdi, %rax
- VZEROUPPER_RETURN
-
- /* .p2align 5 helps keep performance more consistent if ENTRY()
- alignment % 32 was either 16 or 0. As well this makes the
- alignment % 32 of the loop_4x_vec fixed which makes tuning it
- easier. */
- .p2align 5
-L(first_vec_x4):
- tzcntl %eax, %eax
- addq $(VEC_SIZE * 3 + 1), %rdi
-# ifndef USE_AS_STRCHRNUL
- /* Found CHAR or the null byte. */
+ /* Found CHAR or the null byte. */
cmp (%rdi, %rax), %CHAR_REG
+ /* NB: Use a branch instead of cmovcc here. The expectation is
+ that with strchr the user will branch based on input being
+ null. Since this branch will be 100% predictive of the user
+ branch a branch miss here should save what otherwise would
+ be branch miss in the user code. Otherwise using a branch 1)
+ saves code size and 2) is faster in highly predictable
+ environments. */
jne L(zero)
# endif
addq %rdi, %rax
- VZEROUPPER_RETURN
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
# ifndef USE_AS_STRCHRNUL
L(zero):
@@ -103,7 +95,8 @@ L(zero):
.p2align 4
L(first_vec_x1):
- tzcntl %eax, %eax
+ /* Use bsf to save code size. */
+ bsfl %eax, %eax
incq %rdi
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
@@ -113,9 +106,10 @@ L(first_vec_x1):
addq %rdi, %rax
VZEROUPPER_RETURN
- .p2align 4
+ .p2align 4,, 10
L(first_vec_x2):
- tzcntl %eax, %eax
+ /* Use bsf to save code size. */
+ bsfl %eax, %eax
addq $(VEC_SIZE + 1), %rdi
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
@@ -125,9 +119,10 @@ L(first_vec_x2):
addq %rdi, %rax
VZEROUPPER_RETURN
- .p2align 4
+ .p2align 4,, 8
L(first_vec_x3):
- tzcntl %eax, %eax
+ /* Use bsf to save code size. */
+ bsfl %eax, %eax
addq $(VEC_SIZE * 2 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
@@ -137,6 +132,21 @@ L(first_vec_x3):
addq %rdi, %rax
VZEROUPPER_RETURN
+ .p2align 4,, 10
+L(first_vec_x4):
+ /* Use bsf to save code size. */
+ bsfl %eax, %eax
+ addq $(VEC_SIZE * 3 + 1), %rdi
+# ifndef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax), %CHAR_REG
+ jne L(zero)
+# endif
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
+
+
+
.p2align 4
L(aligned_more):
/* Align data to VEC_SIZE - 1. This is the same number of
@@ -146,90 +156,92 @@ L(aligned_more):
L(cross_page_continue):
/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- vmovdqa 1(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
+ vmovdqa 1(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm3
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpor %ymm3, %ymm2, %ymm3
+ vpmovmskb %ymm3, %eax
testl %eax, %eax
jnz L(first_vec_x1)
- vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
+ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm3
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpor %ymm3, %ymm2, %ymm3
+ vpmovmskb %ymm3, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm3
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpor %ymm3, %ymm2, %ymm3
+ vpmovmskb %ymm3, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
+ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm3
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpor %ymm3, %ymm2, %ymm3
+ vpmovmskb %ymm3, %eax
testl %eax, %eax
jnz L(first_vec_x4)
- /* Align data to VEC_SIZE * 4 - 1. */
- addq $(VEC_SIZE * 4 + 1), %rdi
- andq $-(VEC_SIZE * 4), %rdi
+ /* Align data to VEC_SIZE * 4 - 1. */
+ incq %rdi
+ orq $(VEC_SIZE * 4 - 1), %rdi
.p2align 4
L(loop_4x_vec):
/* Compare 4 * VEC at a time forward. */
- vmovdqa (%rdi), %ymm5
- vmovdqa (VEC_SIZE)(%rdi), %ymm6
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
+ vmovdqa 1(%rdi), %ymm6
+ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm7
/* Leaves only CHARS matching esi as 0. */
- vpxor %ymm5, %ymm0, %ymm1
vpxor %ymm6, %ymm0, %ymm2
vpxor %ymm7, %ymm0, %ymm3
- vpxor %ymm8, %ymm0, %ymm4
- VPMINU %ymm1, %ymm5, %ymm1
VPMINU %ymm2, %ymm6, %ymm2
VPMINU %ymm3, %ymm7, %ymm3
- VPMINU %ymm4, %ymm8, %ymm4
- VPMINU %ymm1, %ymm2, %ymm5
- VPMINU %ymm3, %ymm4, %ymm6
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm6
+ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm7
+
+ vpxor %ymm6, %ymm0, %ymm4
+ vpxor %ymm7, %ymm0, %ymm5
+
+ VPMINU %ymm4, %ymm6, %ymm4
+ VPMINU %ymm5, %ymm7, %ymm5
- VPMINU %ymm5, %ymm6, %ymm6
+ VPMINU %ymm2, %ymm3, %ymm6
+ VPMINU %ymm4, %ymm5, %ymm7
- VPCMPEQ %ymm6, %ymm9, %ymm6
- vpmovmskb %ymm6, %ecx
+ VPMINU %ymm6, %ymm7, %ymm7
+
+ VPCMPEQ %ymm7, %ymm1, %ymm7
+ vpmovmskb %ymm7, %ecx
subq $-(VEC_SIZE * 4), %rdi
testl %ecx, %ecx
jz L(loop_4x_vec)
-
- VPCMPEQ %ymm1, %ymm9, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
jnz L(last_vec_x0)
- VPCMPEQ %ymm5, %ymm9, %ymm2
- vpmovmskb %ymm2, %eax
+ VPCMPEQ %ymm3, %ymm1, %ymm3
+ vpmovmskb %ymm3, %eax
testl %eax, %eax
jnz L(last_vec_x1)
- VPCMPEQ %ymm3, %ymm9, %ymm3
- vpmovmskb %ymm3, %eax
+ VPCMPEQ %ymm4, %ymm1, %ymm4
+ vpmovmskb %ymm4, %eax
/* rcx has combined result from all 4 VEC. It will only be used
if the first 3 other VEC all did not contain a match. */
salq $32, %rcx
orq %rcx, %rax
tzcntq %rax, %rax
- subq $(VEC_SIZE * 2), %rdi
+ subq $(VEC_SIZE * 2 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
cmp (%rdi, %rax), %CHAR_REG
@@ -239,10 +251,11 @@ L(loop_4x_vec):
VZEROUPPER_RETURN
- .p2align 4
+ .p2align 4,, 10
L(last_vec_x0):
- tzcntl %eax, %eax
- addq $-(VEC_SIZE * 4), %rdi
+ /* Use bsf to save code size. */
+ bsfl %eax, %eax
+ addq $-(VEC_SIZE * 4 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
cmp (%rdi, %rax), %CHAR_REG
@@ -251,16 +264,11 @@ L(last_vec_x0):
addq %rdi, %rax
VZEROUPPER_RETURN
-# ifndef USE_AS_STRCHRNUL
-L(zero_end):
- xorl %eax, %eax
- VZEROUPPER_RETURN
-# endif
- .p2align 4
+ .p2align 4,, 10
L(last_vec_x1):
tzcntl %eax, %eax
- subq $(VEC_SIZE * 3), %rdi
+ subq $(VEC_SIZE * 3 - 1), %rdi
# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
cmp (%rdi, %rax), %CHAR_REG
@@ -269,18 +277,23 @@ L(last_vec_x1):
addq %rdi, %rax
VZEROUPPER_RETURN
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+ xorl %eax, %eax
+ VZEROUPPER_RETURN
+# endif
/* Cold case for crossing page with first load. */
- .p2align 4
+ .p2align 4,, 8
L(cross_page_boundary):
movq %rdi, %rdx
/* Align rdi to VEC_SIZE - 1. */
orq $(VEC_SIZE - 1), %rdi
- vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
+ vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm3
+ VPCMPEQ %ymm2, %ymm1, %ymm2
+ vpor %ymm3, %ymm2, %ymm3
+ vpmovmskb %ymm3, %eax
/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
so no need to manually mod edx. */
sarxl %edx, %eax, %eax
@@ -291,13 +304,10 @@ L(cross_page_boundary):
xorl %ecx, %ecx
/* Found CHAR or the null byte. */
cmp (%rdx, %rax), %CHAR_REG
- leaq (%rdx, %rax), %rax
- cmovne %rcx, %rax
-# else
- addq %rdx, %rax
+ jne L(zero_end)
# endif
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN
+ addq %rdx, %rax
+ VZEROUPPER_RETURN
END (STRCHR)
-# endif
+#endif
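
For readers puzzling over the new alignment sequence (incq %rdi; orq $(VEC_SIZE * 4 - 1), %rdi) replacing the old addq/andq pair, here is a small C sketch of the arithmetic, not taken from the patch (old_align and new_align are illustrative names): with the loop now loading from offset 1(%rdi), both forms make the loop load from the same VEC_SIZE * 4 aligned address; the new pair likely encodes shorter mainly because the old addq immediate (VEC_SIZE * 4 + 1 = 129) does not fit in a signed byte, while incq needs no immediate at all.

#include <assert.h>
#include <stdint.h>

#define VEC_SIZE 32

/* Old sequence: addq $(VEC_SIZE * 4 + 1), %rdi; andq $-(VEC_SIZE * 4), %rdi.
   The loop then loaded from offset 0.  */
static uintptr_t
old_align (uintptr_t p)
{
  return (p + VEC_SIZE * 4 + 1) & ~(uintptr_t) (VEC_SIZE * 4 - 1);
}

/* New sequence: incq %rdi; orq $(VEC_SIZE * 4 - 1), %rdi.
   The loop then loads from offset 1, so the pointer is kept one byte
   below the aligned address.  */
static uintptr_t
new_align (uintptr_t p)
{
  return (p + 1) | (uintptr_t) (VEC_SIZE * 4 - 1);
}

int
main (void)
{
  for (uintptr_t p = 0; p < 4096; p++)
    /* Both variants end up loading from the same aligned address.  */
    assert (new_align (p) + 1 == old_align (p));
  return 0;
}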