[v1,1/2] x86: Optimize strchr-avx2.S

Message ID 20210421213951.404588-1-goldstein.w.n@gmail.com
State Superseded

Commit Message

Noah Goldstein April 21, 2021, 9:39 p.m. UTC
No bug. This commit optimizes strchr-avx2.S. The optimizations are all
small things such as saving an ALU in the alignment process, saving a
few instructions in the loop return, saving some bytes in the main
loop, and increasing the ILP in the return cases. test-strchr,
test-strchrnul, test-wcschr, and test-wcschrnul are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
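To make the ILP/ALU points concrete, here is a simplified before/after
of the return path, taken from the diff below. Previously the final
address was computed first, so the cmp had to wait on the addq, and the
cmovne needed a %rdx that was zeroed at entry. Now the compare and the
addq are independent and the pre-zeroed register is gone:

	/* Before: cmp depends on the addq result; %rdx is zeroed at
	   entry just for the cmovne.  */
	tzcntl	%eax, %eax
	addq	%rdi, %rax
	cmp	(%rax), %CHAR_REG
	cmovne	%rdx, %rax

	/* After: cmp uses (%rdi, %rax) addressing, so the load/compare
	   and the final addq can issue in parallel; the rare no-match
	   case branches to a shared L(zero) that returns 0.  */
	tzcntl	%eax, %eax
	cmp	(%rdi, %rax), %CHAR_REG
	jne	L(zero)
	addq	%rdi, %rax

On the byte savings: the loop now advances with
subq $-(VEC_SIZE * 4), %rdi instead of addq $(VEC_SIZE * 4), %rdi.
With VEC_SIZE = 32, -128 fits in a sign-extended imm8 while +128 needs
an imm32, so the sub encoding is 3 bytes shorter.
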
Tests were run on the following CPUs:
    
Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html

Icelake: https://ark.intel.com/content/www/us/en/ark/products/196597/intel-core-i7-1065g7-processor-8m-cache-up-to-3-90-ghz.html

Skylake: https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html
    
All times are the geometric mean of N=20. The unit of time is
seconds.
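Here the geometric mean is geo_mean = (t_1 * t_2 * ... * t_20)^(1/20),
which damps the effect of occasional outlier runs compared to an
arithmetic mean.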

"Cur" refers to the current implementation
"New" refers to this patches implementation

For strchr-evex the numbers are a near-universal improvement. The only
exception seems to be the [32, 64] size range, which is marginally
slower on Tigerlake and about even on Icelake (a smaller loss than the
gain in the [0, 31] range). Overall, though, I think the numbers show a
sizable improvement, particularly once the 4x loop is hit.

Results For Tigerlake strchr-evex
size, algn, Cur T , New T , Win , Dif 
32  , 0   , 4.89  , 5.23  , Cur , 0.34
32  , 1   , 4.67  , 5.09  , Cur , 0.42
64  , 0   , 5.59  , 5.46  , New , 0.13
64  , 2   , 5.52  , 5.43  , New , 0.09
128 , 0   , 8.04  , 7.44  , New , 0.6 
128 , 3   , 8.0   , 7.45  , New , 0.55
256 , 0   , 14.7  , 12.94 , New , 1.76
256 , 4   , 14.78 , 13.03 , New , 1.75
512 , 0   , 20.37 , 19.05 , New , 1.32
512 , 5   , 20.34 , 18.98 , New , 1.36
1024, 0   , 31.62 , 28.24 , New , 3.38
1024, 6   , 31.55 , 28.2  , New , 3.35
2048, 0   , 53.22 , 47.12 , New , 6.1 
2048, 7   , 53.15 , 47.0  , New , 6.15
64  , 1   , 5.45  , 5.41  , New , 0.04
64  , 3   , 5.46  , 5.39  , New , 0.07
64  , 4   , 5.48  , 5.39  , New , 0.09
64  , 5   , 5.54  , 5.39  , New , 0.15
64  , 6   , 5.47  , 5.41  , New , 0.06
64  , 7   , 5.46  , 5.39  , New , 0.07
256 , 16  , 14.58 , 12.92 , New , 1.66
256 , 32  , 15.36 , 13.54 , New , 1.82
256 , 48  , 15.49 , 13.71 , New , 1.78
256 , 64  , 16.53 , 14.78 , New , 1.75
256 , 80  , 16.57 , 14.82 , New , 1.75
256 , 96  , 13.26 , 11.99 , New , 1.27
256 , 112 , 13.36 , 12.07 , New , 1.29
0   , 0   , 3.75  , 3.09  , New , 0.66
1   , 0   , 3.75  , 3.09  , New , 0.66
2   , 0   , 3.74  , 3.09  , New , 0.65
3   , 0   , 3.74  , 3.09  , New , 0.65
4   , 0   , 3.74  , 3.09  , New , 0.65
5   , 0   , 3.74  , 3.1   , New , 0.64
6   , 0   , 3.74  , 3.1   , New , 0.64
7   , 0   , 3.74  , 3.09  , New , 0.65
8   , 0   , 3.74  , 3.09  , New , 0.65
9   , 0   , 3.74  , 3.1   , New , 0.64
10  , 0   , 3.75  , 3.09  , New , 0.66
11  , 0   , 3.75  , 3.1   , New , 0.65
12  , 0   , 3.74  , 3.1   , New , 0.64
13  , 0   , 3.77  , 3.1   , New , 0.67
14  , 0   , 3.78  , 3.1   , New , 0.68
15  , 0   , 3.82  , 3.1   , New , 0.72
16  , 0   , 3.76  , 3.1   , New , 0.66
17  , 0   , 3.8   , 3.1   , New , 0.7 
18  , 0   , 3.77  , 3.1   , New , 0.67
19  , 0   , 3.81  , 3.1   , New , 0.71
20  , 0   , 3.77  , 3.13  , New , 0.64
21  , 0   , 3.8   , 3.11  , New , 0.69
22  , 0   , 3.82  , 3.11  , New , 0.71
23  , 0   , 3.77  , 3.11  , New , 0.66
24  , 0   , 3.77  , 3.11  , New , 0.66
25  , 0   , 3.76  , 3.11  , New , 0.65
26  , 0   , 3.76  , 3.11  , New , 0.65
27  , 0   , 3.76  , 3.11  , New , 0.65
28  , 0   , 3.77  , 3.11  , New , 0.66
29  , 0   , 3.76  , 3.11  , New , 0.65
30  , 0   , 3.76  , 3.11  , New , 0.65
31  , 0   , 3.76  , 3.11  , New , 0.65

Results For Icelake strchr-evex
size, algn, Cur T , New T , Win , Dif 
32  , 0   , 3.57  , 3.77  , Cur , 0.2 
32  , 1   , 3.36  , 3.34  , New , 0.02
64  , 0   , 3.77  , 3.64  , New , 0.13
64  , 2   , 3.73  , 3.58  , New , 0.15
128 , 0   , 5.22  , 4.92  , New , 0.3 
128 , 3   , 5.16  , 4.94  , New , 0.22
256 , 0   , 9.83  , 8.8   , New , 1.03
256 , 4   , 9.89  , 8.77  , New , 1.12
512 , 0   , 13.47 , 12.77 , New , 0.7 
512 , 5   , 13.58 , 12.74 , New , 0.84
1024, 0   , 20.33 , 18.46 , New , 1.87
1024, 6   , 20.28 , 18.39 , New , 1.89
2048, 0   , 35.45 , 31.59 , New , 3.86
2048, 7   , 35.44 , 31.66 , New , 3.78
64  , 1   , 3.76  , 3.62  , New , 0.14
64  , 3   , 3.7   , 3.6   , New , 0.1 
64  , 4   , 3.71  , 3.62  , New , 0.09
64  , 5   , 3.74  , 3.61  , New , 0.13
64  , 6   , 3.74  , 3.61  , New , 0.13
64  , 7   , 3.72  , 3.62  , New , 0.1 
256 , 16  , 9.81  , 8.77  , New , 1.04
256 , 32  , 10.25 , 9.24  , New , 1.01
256 , 48  , 10.48 , 9.39  , New , 1.09
256 , 64  , 11.09 , 10.11 , New , 0.98
256 , 80  , 11.09 , 10.09 , New , 1.0 
256 , 96  , 8.88  , 8.09  , New , 0.79
256 , 112 , 8.84  , 8.16  , New , 0.68
0   , 0   , 2.31  , 2.08  , New , 0.23
1   , 0   , 2.36  , 2.09  , New , 0.27
2   , 0   , 2.39  , 2.12  , New , 0.27
3   , 0   , 2.4   , 2.14  , New , 0.26
4   , 0   , 2.42  , 2.15  , New , 0.27
5   , 0   , 2.4   , 2.15  , New , 0.25
6   , 0   , 2.38  , 2.15  , New , 0.23
7   , 0   , 2.36  , 2.15  , New , 0.21
8   , 0   , 2.41  , 2.16  , New , 0.25
9   , 0   , 2.37  , 2.14  , New , 0.23
10  , 0   , 2.36  , 2.16  , New , 0.2 
11  , 0   , 2.36  , 2.17  , New , 0.19
12  , 0   , 2.35  , 2.15  , New , 0.2 
13  , 0   , 2.37  , 2.16  , New , 0.21
14  , 0   , 2.37  , 2.16  , New , 0.21
15  , 0   , 2.39  , 2.15  , New , 0.24
16  , 0   , 2.36  , 2.14  , New , 0.22
17  , 0   , 2.35  , 2.14  , New , 0.21
18  , 0   , 2.36  , 2.14  , New , 0.22
19  , 0   , 2.37  , 2.14  , New , 0.23
20  , 0   , 2.37  , 2.16  , New , 0.21
21  , 0   , 2.38  , 2.16  , New , 0.22
22  , 0   , 2.38  , 2.14  , New , 0.24
23  , 0   , 2.33  , 2.11  , New , 0.22
24  , 0   , 2.3   , 2.07  , New , 0.23
25  , 0   , 2.27  , 2.06  , New , 0.21
26  , 0   , 2.26  , 2.06  , New , 0.2 
27  , 0   , 2.28  , 2.1   , New , 0.18
28  , 0   , 2.34  , 2.13  , New , 0.21
29  , 0   , 2.34  , 2.09  , New , 0.25
30  , 0   , 2.29  , 2.09  , New , 0.2 
31  , 0   , 2.31  , 2.08  , New , 0.23

For strchr-avx2 the results are a lot closer, as the optimizations were
smaller, but the trend is still an improvement, especially on Skylake
(the only one of the benchmark CPUs on which this implementation will
actually be used).

Results For Skylake strchr-avx2
size, algn, Cur T , New T , Win , Dif 
32  , 0   , 6.04  , 5.02  , New , 1.02
32  , 1   , 6.19  , 4.94  , New , 1.25
64  , 0   , 6.68  , 5.92  , New , 0.76
64  , 2   , 6.59  , 5.95  , New , 0.64
128 , 0   , 7.66  , 7.42  , New , 0.24
128 , 3   , 7.66  , 7.4   , New , 0.26
256 , 0   , 14.68 , 12.93 , New , 1.75
256 , 4   , 14.74 , 12.88 , New , 1.86
512 , 0   , 20.81 , 17.47 , New , 3.34
512 , 5   , 20.73 , 17.44 , New , 3.29
1024, 0   , 33.16 , 27.06 , New , 6.1 
1024, 6   , 33.15 , 27.09 , New , 6.06
2048, 0   , 59.06 , 56.15 , New , 2.91
2048, 7   , 59.0  , 53.92 , New , 5.08
64  , 1   , 6.56  , 5.86  , New , 0.7 
64  , 3   , 6.55  , 5.99  , New , 0.56
64  , 4   , 6.61  , 5.96  , New , 0.65
64  , 5   , 6.52  , 5.94  , New , 0.58
64  , 6   , 6.62  , 5.95  , New , 0.67
64  , 7   , 6.61  , 6.11  , New , 0.5 
256 , 16  , 14.64 , 12.85 , New , 1.79
256 , 32  , 15.2  , 12.97 , New , 2.23
256 , 48  , 15.13 , 13.33 , New , 1.8 
256 , 64  , 16.18 , 13.46 , New , 2.72
256 , 80  , 16.26 , 13.49 , New , 2.77
256 , 96  , 13.13 , 11.43 , New , 1.7 
256 , 112 , 13.12 , 11.4  , New , 1.72
0   , 0   , 5.36  , 4.25  , New , 1.11
1   , 0   , 5.28  , 4.24  , New , 1.04
2   , 0   , 5.27  , 4.2   , New , 1.07
3   , 0   , 5.27  , 4.23  , New , 1.04
4   , 0   , 5.36  , 4.3   , New , 1.06
5   , 0   , 5.35  , 4.29  , New , 1.06
6   , 0   , 5.38  , 4.35  , New , 1.03
7   , 0   , 5.39  , 4.28  , New , 1.11
8   , 0   , 5.5   , 4.45  , New , 1.05
9   , 0   , 5.47  , 4.43  , New , 1.04
10  , 0   , 5.5   , 4.4   , New , 1.1 
11  , 0   , 5.51  , 4.44  , New , 1.07
12  , 0   , 5.49  , 4.44  , New , 1.05
13  , 0   , 5.49  , 4.46  , New , 1.03
14  , 0   , 5.49  , 4.46  , New , 1.03
15  , 0   , 5.51  , 4.43  , New , 1.08
16  , 0   , 5.52  , 4.48  , New , 1.04
17  , 0   , 5.57  , 4.47  , New , 1.1 
18  , 0   , 5.56  , 4.52  , New , 1.04
19  , 0   , 5.54  , 4.46  , New , 1.08
20  , 0   , 5.53  , 4.48  , New , 1.05
21  , 0   , 5.54  , 4.48  , New , 1.06
22  , 0   , 5.57  , 4.45  , New , 1.12
23  , 0   , 5.57  , 4.48  , New , 1.09
24  , 0   , 5.53  , 4.43  , New , 1.1 
25  , 0   , 5.49  , 4.42  , New , 1.07
26  , 0   , 5.5   , 4.44  , New , 1.06
27  , 0   , 5.48  , 4.44  , New , 1.04
28  , 0   , 5.48  , 4.43  , New , 1.05
29  , 0   , 5.54  , 4.41  , New , 1.13
30  , 0   , 5.49  , 4.4   , New , 1.09
31  , 0   , 5.46  , 4.4   , New , 1.06

Results For Tigerlake strchr-avx2
size, algn, Cur T , New T , Win , Dif 
32  , 0   , 5.88  , 5.47  , New , 0.41
32  , 1   , 5.73  , 5.46  , New , 0.27
64  , 0   , 6.32  , 6.1   , New , 0.22
64  , 2   , 6.17  , 6.11  , New , 0.06
128 , 0   , 7.93  , 7.68  , New , 0.25
128 , 3   , 7.93  , 7.73  , New , 0.2 
256 , 0   , 14.87 , 14.5  , New , 0.37
256 , 4   , 14.96 , 14.59 , New , 0.37
512 , 0   , 21.25 , 20.18 , New , 1.07
512 , 5   , 21.25 , 20.11 , New , 1.14
1024, 0   , 33.17 , 31.26 , New , 1.91
1024, 6   , 33.14 , 31.13 , New , 2.01
2048, 0   , 53.39 , 52.51 , New , 0.88
2048, 7   , 53.3  , 52.34 , New , 0.96
64  , 1   , 6.11  , 6.09  , New , 0.02
64  , 3   , 6.04  , 6.01  , New , 0.03
64  , 4   , 6.04  , 6.03  , New , 0.01
64  , 5   , 6.13  , 6.05  , New , 0.08
64  , 6   , 6.09  , 6.06  , New , 0.03
64  , 7   , 6.04  , 6.03  , New , 0.01
256 , 16  , 14.77 , 14.39 , New , 0.38
256 , 32  , 15.58 , 15.27 , New , 0.31
256 , 48  , 15.88 , 15.32 , New , 0.56
256 , 64  , 16.85 , 16.01 , New , 0.84
256 , 80  , 16.83 , 16.03 , New , 0.8 
256 , 96  , 13.5  , 13.14 , New , 0.36
256 , 112 , 13.71 , 13.24 , New , 0.47
0   , 0   , 3.78  , 3.76  , New , 0.02
1   , 0   , 3.79  , 3.76  , New , 0.03
2   , 0   , 3.82  , 3.77  , New , 0.05
3   , 0   , 3.78  , 3.76  , New , 0.02
4   , 0   , 3.75  , 3.75  , Eq  , 0.0
5   , 0   , 3.77  , 3.74  , New , 0.03
6   , 0   , 3.78  , 3.76  , New , 0.02
7   , 0   , 3.91  , 3.85  , New , 0.06
8   , 0   , 3.76  , 3.77  , Cur , 0.01
9   , 0   , 3.75  , 3.75  , Eq  , 0.0
10  , 0   , 3.76  , 3.76  , Eq  , 0.0
11  , 0   , 3.77  , 3.75  , New , 0.02
12  , 0   , 3.79  , 3.77  , New , 0.02
13  , 0   , 3.86  , 3.86  , Eq  , 0.0
14  , 0   , 4.2   , 4.2   , Eq  , 0.0
15  , 0   , 4.17  , 4.07  , New , 0.1 
16  , 0   , 4.1   , 4.1   , Eq  , 0.0
17  , 0   , 4.12  , 4.09  , New , 0.03
18  , 0   , 4.12  , 4.12  , Eq  , 0.0
19  , 0   , 4.18  , 4.09  , New , 0.09
20  , 0   , 4.14  , 4.09  , New , 0.05
21  , 0   , 4.15  , 4.11  , New , 0.04
22  , 0   , 4.23  , 4.13  , New , 0.1 
23  , 0   , 4.18  , 4.16  , New , 0.02
24  , 0   , 4.13  , 4.21  , Cur , 0.08
25  , 0   , 4.17  , 4.15  , New , 0.02
26  , 0   , 4.17  , 4.16  , New , 0.01
27  , 0   , 4.18  , 4.16  , New , 0.02
28  , 0   , 4.17  , 4.15  , New , 0.02
29  , 0   , 4.2   , 4.13  , New , 0.07
30  , 0   , 4.16  , 4.12  , New , 0.04
31  , 0   , 4.15  , 4.15  , Eq  , 0.0

Results For Icelake strchr-avx2
size, algn, Cur T , New T , Win , Dif 
32  , 0   , 3.73  , 3.72  , New , 0.01
32  , 1   , 3.46  , 3.44  , New , 0.02
64  , 0   , 3.96  , 3.87  , New , 0.09
64  , 2   , 3.92  , 3.87  , New , 0.05
128 , 0   , 5.15  , 4.9   , New , 0.25
128 , 3   , 5.12  , 4.87  , New , 0.25
256 , 0   , 9.79  , 9.45  , New , 0.34
256 , 4   , 9.76  , 9.52  , New , 0.24
512 , 0   , 13.93 , 12.89 , New , 1.04
512 , 5   , 13.84 , 13.02 , New , 0.82
1024, 0   , 21.41 , 19.92 , New , 1.49
1024, 6   , 21.69 , 20.12 , New , 1.57
2048, 0   , 35.12 , 33.83 , New , 1.29
2048, 7   , 35.13 , 33.99 , New , 1.14
64  , 1   , 3.96  , 3.9   , New , 0.06
64  , 3   , 3.88  , 3.86  , New , 0.02
64  , 4   , 3.87  , 3.83  , New , 0.04
64  , 5   , 3.9   , 3.85  , New , 0.05
64  , 6   , 3.9   , 3.89  , New , 0.01
64  , 7   , 3.9   , 3.84  , New , 0.06
256 , 16  , 9.76  , 9.4   , New , 0.36
256 , 32  , 10.36 , 9.97  , New , 0.39
256 , 48  , 10.5  , 10.02 , New , 0.48
256 , 64  , 11.13 , 10.55 , New , 0.58
256 , 80  , 11.14 , 10.56 , New , 0.58
256 , 96  , 8.98  , 8.57  , New , 0.41
256 , 112 , 9.1   , 8.66  , New , 0.44
0   , 0   , 2.52  , 2.49  , New , 0.03
1   , 0   , 2.56  , 2.53  , New , 0.03
2   , 0   , 2.6   , 2.54  , New , 0.06
3   , 0   , 2.63  , 2.58  , New , 0.05
4   , 0   , 2.63  , 2.6   , New , 0.03
5   , 0   , 2.65  , 2.62  , New , 0.03
6   , 0   , 2.75  , 2.73  , New , 0.02
7   , 0   , 2.73  , 2.76  , Cur , 0.03
8   , 0   , 2.61  , 2.6   , New , 0.01
9   , 0   , 2.73  , 2.74  , Cur , 0.01
10  , 0   , 2.72  , 2.71  , New , 0.01
11  , 0   , 2.74  , 2.72  , New , 0.02
12  , 0   , 2.73  , 2.74  , Cur , 0.01
13  , 0   , 2.73  , 2.75  , Cur , 0.02
14  , 0   , 2.74  , 2.72  , New , 0.02
15  , 0   , 2.74  , 2.72  , New , 0.02
16  , 0   , 2.75  , 2.74  , New , 0.01
17  , 0   , 2.73  , 2.74  , Cur , 0.01
18  , 0   , 2.72  , 2.73  , Cur , 0.01
19  , 0   , 2.74  , 2.72  , New , 0.02
20  , 0   , 2.75  , 2.71  , New , 0.04
21  , 0   , 2.74  , 2.74  , Eq  , 0.0
22  , 0   , 2.73  , 2.74  , Cur , 0.01
23  , 0   , 2.7   , 2.72  , Cur , 0.02
24  , 0   , 2.68  , 2.68  , Eq  , 0.0
25  , 0   , 2.65  , 2.63  , New , 0.02
26  , 0   , 2.64  , 2.62  , New , 0.02
27  , 0   , 2.71  , 2.68  , New , 0.03
28  , 0   , 2.72  , 2.68  , New , 0.04
29  , 0   , 2.68  , 2.74  , Cur , 0.06
30  , 0   , 2.65  , 2.65  , Eq  , 0.0
31  , 0   , 2.7   , 2.68  , New , 0.02

 sysdeps/x86_64/multiarch/strchr-avx2.S | 294 +++++++++++++++----------
 1 file changed, 173 insertions(+), 121 deletions(-)

Patch

diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index 25bec38b5d..220165d2ba 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -49,132 +49,144 @@ 
 
 	.section SECTION(.text),"ax",@progbits
 ENTRY (STRCHR)
-	movl	%edi, %ecx
-# ifndef USE_AS_STRCHRNUL
-	xorl	%edx, %edx
-# endif
-
 	/* Broadcast CHAR to YMM0.	*/
 	vmovd	%esi, %xmm0
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	VPBROADCAST	%xmm0, %ymm0
 	vpxor	%xmm9, %xmm9, %xmm9
-	VPBROADCAST %xmm0, %ymm0
 
 	/* Check if we cross page boundary with one vector load.  */
-	andl	$(PAGE_SIZE - 1), %ecx
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
-	ja  L(cross_page_boundary)
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
 	   null byte.  */
 	vmovdqu	(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
 	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
-	jz	L(more_vecs)
+	jz	L(aligned_more)
 	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-	addq	%rdi, %rax
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
-
-	.p2align 4
-L(more_vecs):
-	/* Align data for aligned loads in the loop.  */
-	andq	$-VEC_SIZE, %rdi
-L(aligned_more):
-
-	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.	*/
-	vmovdqa	VEC_SIZE(%rdi), %ymm8
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	vmovdqa	VEC_SIZE(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jz	L(prep_loop_4x)
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
+	/* .p2align 5 helps keep performance more consistent if ENTRY()
+	   alignment % 32 was either 16 or 0. As well this makes the
+	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
+	   easier.  */
+	.p2align 5
+L(first_vec_x4):
 	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
+	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(first_vec_x0):
-	tzcntl	%eax, %eax
-	/* Found CHAR or the null byte.	 */
-	addq	%rdi, %rax
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
-# endif
+L(zero):
+	xorl	%eax, %eax
 	VZEROUPPER_RETURN
+# endif
+
 
 	.p2align 4
 L(first_vec_x1):
 	tzcntl	%eax, %eax
-	leaq	VEC_SIZE(%rdi, %rax), %rax
+	incq	%rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
+	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
 	.p2align 4
 L(first_vec_x2):
 	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE + 1), %rdi
+# ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 2 + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
 # endif
+	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
-L(prep_loop_4x):
-	/* Align data to 4 * VEC_SIZE.	*/
-	andq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4
+L(aligned_more):
+	/* Align data to VEC_SIZE - 1. This is the same number of
+	   instructions as using andq -VEC_SIZE but saves 4 bytes of code on
+	   x4 check.  */
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time since
+	   data is only aligned to VEC_SIZE.  */
+	vmovdqa	1(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
 
+	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
+
+	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+	/* Align data to VEC_SIZE * 4 - 1.	*/
+	addq	$(VEC_SIZE * 4 + 1), %rdi
+	andq	$-(VEC_SIZE * 4), %rdi
 	.p2align 4
 L(loop_4x_vec):
 	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
-	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
-	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
-	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
+	vmovdqa	(%rdi), %ymm5
+	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
+	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
 
 	/* Leaves only CHARS matching esi as 0.	 */
 	vpxor	%ymm5, %ymm0, %ymm1
@@ -190,62 +202,102 @@  L(loop_4x_vec):
 	VPMINU	%ymm1, %ymm2, %ymm5
 	VPMINU	%ymm3, %ymm4, %ymm6
 
-	VPMINU	%ymm5, %ymm6, %ymm5
+	VPMINU	%ymm5, %ymm6, %ymm6
 
-	VPCMPEQ %ymm5, %ymm9, %ymm5
-	vpmovmskb %ymm5, %eax
+	VPCMPEQ	%ymm6, %ymm9, %ymm6
+	vpmovmskb	%ymm6, %ecx
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
 
-	addq	$(VEC_SIZE * 4), %rdi
-	testl	%eax, %eax
-	jz  L(loop_4x_vec)
 
-	VPCMPEQ %ymm1, %ymm9, %ymm1
-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	%ymm1, %ymm9, %ymm1
+	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	jnz	L(last_vec_x0)
+
 
-	VPCMPEQ %ymm2, %ymm9, %ymm2
-	vpmovmskb %ymm2, %eax
+	VPCMPEQ	%ymm5, %ymm9, %ymm2
+	vpmovmskb	%ymm2, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	jnz	L(last_vec_x1)
+
+	VPCMPEQ	%ymm3, %ymm9, %ymm3
+	vpmovmskb	%ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used if
+	   the first 3 other VEC all did not contain a match.  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	subq	$(VEC_SIZE * 2), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
 
-	VPCMPEQ %ymm3, %ymm9, %ymm3
-	VPCMPEQ %ymm4, %ymm9, %ymm4
-	vpmovmskb %ymm3, %ecx
-	vpmovmskb %ymm4, %eax
-	salq	$32, %rax
-	orq %rcx, %rax
-	tzcntq  %rax, %rax
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+	.p2align 4
+L(last_vec_x0):
+	tzcntl	%eax, %eax
+	addq	$-(VEC_SIZE * 4), %rdi
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
 # endif
+	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+# endif
+
+	.p2align 4
+L(last_vec_x1):
+	tzcntl	%eax, %eax
+	subq	$(VEC_SIZE * 3), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+
 	/* Cold case for crossing page with first load.	 */
 	.p2align 4
 L(cross_page_boundary):
-	andq	$-VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-
-	vmovdqa	(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
+	movq	%rdi, %rdx
+	/* Align rdi to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
 	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	/* Remove the leading bits.	 */
-	sarxl	%ecx, %eax, %eax
+	vpmovmskb	%ymm1, %eax
+	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+	   so no need to manually mod edx.  */
+	sarxl	%edx, %eax, %eax
 	testl	%eax, %eax
-	jz	L(aligned_more)
+	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
-	addq	%rcx, %rdi
-	addq	%rdi, %rax
 # ifndef USE_AS_STRCHRNUL
-	cmp (%rax), %CHAR_REG
-	cmovne	%rdx, %rax
+	xorl	%ecx, %ecx
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdx, %rax), %CHAR_REG
+	leaq	(%rdx, %rax), %rax
+	cmovne	%rcx, %rax
+# else
+	addq	%rdx, %rax
 # endif
-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 END (STRCHR)
 # endif