[v1,03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch

Message ID 20220323215734.3927131-3-goldstein.w.n@gmail.com
State Accepted, archived
Series [v1,01/23] benchtests: Use json-lib in bench-strchr.c

Checks

Context                Check    Description
dj/TryBot-apply_patch  success  Patch applied to master at the time it was sent

Commit Message

Noah Goldstein March 23, 2022, 9:57 p.m. UTC
  Small code cleanup for size: -53 bytes.

Add a comment justifying the use of a branch for the NULL/non-NULL return.
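
The reasoning is spelled out in the comment added by the patch: essentially every strchr caller immediately branches on whether the result is NULL, so the NULL/non-NULL branch inside the implementation resolves the same way as the caller's branch, and a miss there stands in for what would otherwise be a miss in user code. A minimal caller sketch in C (hypothetical, for illustration only):

    #include <stdio.h>
    #include <string.h>

    /* Hypothetical caller: the result of strchr is almost always tested
       against NULL right away, so the branch inside the implementation
       predicts the same way as this one.  */
    static void
    print_value (const char *line)
    {
      char *sep = strchr (line, '=');
      if (sep == NULL)            /* The caller's own NULL check.  */
        puts ("no '=' found");
      else
        printf ("value: %s\n", sep + 1);
    }

    int
    main (void)
    {
      print_value ("key=value");
      print_value ("no separator here");
      return 0;
    }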

All string/memory tests pass and no regressions in benchtests.

geometric_mean(N=20) of all benchmarks Original / New: 1.00
---
Geometric Mean of N=20 runs; all functions page aligned
length, alignment,  pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time
  2048,         0,   32,    0,               23,                127,               1.033
  2048,         1,   32,    0,               23,                127,               1.006
  2048,         0,   64,    0,               23,                127,                1.02
  2048,         2,   64,    0,               23,                127,               0.992
  2048,         0,  128,    0,               23,                127,               0.996
  2048,         3,  128,    0,               23,                127,               0.966
  2048,         0,  256,    0,               23,                127,               0.996
  2048,         4,  256,    0,               23,                127,               0.998
  2048,         0,  512,    0,               23,                127,               0.991
  2048,         5,  512,    0,               23,                127,               0.991
  2048,         0, 1024,    0,               23,                127,               0.993
  2048,         6, 1024,    0,               23,                127,               0.992
  2048,         0, 2048,    0,               23,                127,               0.992
  2048,         7, 2048,    0,               23,                127,               0.976
  4096,         0,   32,    0,               23,                127,               0.983
  4096,         1,   32,    0,               23,                127,               0.994
  4096,         0,   64,    0,               23,                127,               0.968
  4096,         2,   64,    0,               23,                127,               1.018
  4096,         0,  128,    0,               23,                127,                0.99
  4096,         3,  128,    0,               23,                127,               1.001
  4096,         0,  256,    0,               23,                127,                 1.0
  4096,         4,  256,    0,               23,                127,               1.001
  4096,         0,  512,    0,               23,                127,               0.989
  4096,         5,  512,    0,               23,                127,               0.988
  4096,         0, 1024,    0,               23,                127,               0.994
  4096,         6, 1024,    0,               23,                127,               0.993
  4096,         0, 2048,    0,               23,                127,               0.987
  4096,         7, 2048,    0,               23,                127,               0.996
   256,         1,   64,    0,               23,                127,               1.004
   256,         2,   64,    0,               23,                127,               1.004
   256,         3,   64,    0,               23,                127,               0.992
   256,         4,   64,    0,               23,                127,               1.001
   256,         5,   64,    0,               23,                127,               1.001
   256,         6,   64,    0,               23,                127,               0.998
   256,         7,   64,    0,               23,                127,               0.994
   512,         0,  256,    0,               23,                127,               0.999
   512,        16,  256,    0,               23,                127,               1.002
   512,        32,  256,    0,               23,                127,               0.994
   512,        48,  256,    0,               23,                127,               0.991
   512,        64,  256,    0,               23,                127,               0.994
   512,        80,  256,    0,               23,                127,               0.994
   512,        96,  256,    0,               23,                127,               0.996
   512,       112,  256,    0,               23,                127,               0.999
     1,         0,    0,    0,               23,                127,               0.978
     2,         0,    1,    0,               23,                127,               0.981
     3,         0,    2,    0,               23,                127,               0.993
     4,         0,    3,    0,               23,                127,               1.004
     5,         0,    4,    0,               23,                127,               1.002
     6,         0,    5,    0,               23,                127,               0.991
     7,         0,    6,    0,               23,                127,                0.99
     8,         0,    7,    0,               23,                127,               1.012
     9,         0,    8,    0,               23,                127,               0.994
    10,         0,    9,    0,               23,                127,               1.003
    11,         0,   10,    0,               23,                127,               0.999
    12,         0,   11,    0,               23,                127,               1.007
    13,         0,   12,    0,               23,                127,                 1.0
    14,         0,   13,    0,               23,                127,               0.997
    15,         0,   14,    0,               23,                127,               0.996
    16,         0,   15,    0,               23,                127,               0.993
    17,         0,   16,    0,               23,                127,               1.002
    18,         0,   17,    0,               23,                127,               0.997
    19,         0,   18,    0,               23,                127,               0.998
    20,         0,   19,    0,               23,                127,               0.994
    21,         0,   20,    0,               23,                127,                0.99
    22,         0,   21,    0,               23,                127,               0.992
    23,         0,   22,    0,               23,                127,               0.996
    24,         0,   23,    0,               23,                127,               0.991
    25,         0,   24,    0,               23,                127,               0.997
    26,         0,   25,    0,               23,                127,               1.011
    27,         0,   26,    0,               23,                127,               1.013
    28,         0,   27,    0,               23,                127,               0.996
    29,         0,   28,    0,               23,                127,               0.993
    30,         0,   29,    0,               23,                127,               1.009
    31,         0,   30,    0,               23,                127,               1.009
    32,         0,   31,    0,               23,                127,               1.008
  2048,         0,   32,    0,                0,                127,                 1.0
  2048,         1,   32,    0,                0,                127,                1.01
  2048,         0,   64,    0,                0,                127,               0.997
  2048,         2,   64,    0,                0,                127,               1.002
  2048,         0,  128,    0,                0,                127,               0.986
  2048,         3,  128,    0,                0,                127,               0.997
  2048,         0,  256,    0,                0,                127,               1.002
  2048,         4,  256,    0,                0,                127,               0.999
  2048,         0,  512,    0,                0,                127,               0.991
  2048,         5,  512,    0,                0,                127,               0.984
  2048,         0, 1024,    0,                0,                127,               0.994
  2048,         6, 1024,    0,                0,                127,               0.993
  2048,         0, 2048,    0,                0,                127,               0.951
  2048,         7, 2048,    0,                0,                127,               0.989
  4096,         0,   32,    0,                0,                127,               0.993
  4096,         1,   32,    0,                0,                127,               0.997
  4096,         0,   64,    0,                0,                127,               1.004
  4096,         2,   64,    0,                0,                127,               1.016
  4096,         0,  128,    0,                0,                127,               0.973
  4096,         3,  128,    0,                0,                127,               1.001
  4096,         0,  256,    0,                0,                127,               0.999
  4096,         4,  256,    0,                0,                127,               0.998
  4096,         0,  512,    0,                0,                127,                0.99
  4096,         5,  512,    0,                0,                127,               0.985
  4096,         0, 1024,    0,                0,                127,               0.993
  4096,         6, 1024,    0,                0,                127,               0.997
  4096,         0, 2048,    0,                0,                127,               0.995
  4096,         7, 2048,    0,                0,                127,               0.996
   256,         1,   64,    0,                0,                127,                1.01
   256,         2,   64,    0,                0,                127,               1.024
   256,         3,   64,    0,                0,                127,                1.03
   256,         4,   64,    0,                0,                127,               1.004
   256,         5,   64,    0,                0,                127,               0.998
   256,         6,   64,    0,                0,                127,               0.998
   256,         7,   64,    0,                0,                127,               0.997
   512,         0,  256,    0,                0,                127,               0.996
   512,        16,  256,    0,                0,                127,               0.995
   512,        32,  256,    0,                0,                127,               0.996
   512,        48,  256,    0,                0,                127,               0.992
   512,        64,  256,    0,                0,                127,               0.999
   512,        80,  256,    0,                0,                127,               1.002
   512,        96,  256,    0,                0,                127,               0.999
   512,       112,  256,    0,                0,                127,               0.998
     1,         0,    0,    0,                0,                127,               1.016
     2,         0,    1,    0,                0,                127,               0.998
     3,         0,    2,    0,                0,                127,                1.02
     4,         0,    3,    0,                0,                127,               1.004
     5,         0,    4,    0,                0,                127,               1.021
     6,         0,    5,    0,                0,                127,               1.014
     7,         0,    6,    0,                0,                127,               1.007
     8,         0,    7,    0,                0,                127,               1.016
     9,         0,    8,    0,                0,                127,               1.003
    10,         0,    9,    0,                0,                127,               1.004
    11,         0,   10,    0,                0,                127,               0.995
    12,         0,   11,    0,                0,                127,               1.009
    13,         0,   12,    0,                0,                127,               1.005
    14,         0,   13,    0,                0,                127,               0.987
    15,         0,   14,    0,                0,                127,               0.998
    16,         0,   15,    0,                0,                127,               1.004
    17,         0,   16,    0,                0,                127,                1.01
    18,         0,   17,    0,                0,                127,                1.01
    19,         0,   18,    0,                0,                127,               1.006
    20,         0,   19,    0,                0,                127,               1.012
    21,         0,   20,    0,                0,                127,               0.999
    22,         0,   21,    0,                0,                127,               1.004
    23,         0,   22,    0,                0,                127,               0.988
    24,         0,   23,    0,                0,                127,               0.993
    25,         0,   24,    0,                0,                127,               1.004
    26,         0,   25,    0,                0,                127,                0.99
    27,         0,   26,    0,                0,                127,               1.016
    28,         0,   27,    0,                0,                127,               0.987
    29,         0,   28,    0,                0,                127,               0.989
    30,         0,   29,    0,                0,                127,               0.998
    31,         0,   30,    0,                0,                127,               1.005
    32,         0,   31,    0,                0,                127,               0.993

    16,         0,   15,    1,                1,                  0,               1.002
    16,         0,   15,    1,                0,                  0,                 1.0
    16,         0,   15,    1,                1,                0.1,               1.034
    16,         0,   15,    1,                0,                0.1,                1.03
    16,         0,   15,    1,                1,               0.25,               0.993
    16,         0,   15,    1,                0,               0.25,               1.081
    16,         0,   15,    1,                1,               0.33,               0.959
    16,         0,   15,    1,                0,               0.33,               1.142
    16,         0,   15,    1,                1,                0.5,               0.929
    16,         0,   15,    1,                0,                0.5,               1.072
    16,         0,   15,    1,                1,               0.66,               0.984
    16,         0,   15,    1,                0,               0.66,               1.069
    16,         0,   15,    1,                1,               0.75,               0.969
    16,         0,   15,    1,                0,               0.75,               1.059
    16,         0,   15,    1,                1,                0.9,                0.98
    16,         0,   15,    1,                0,                0.9,               0.994
    16,         0,   15,    1,                1,                  1,               0.993
    16,         0,   15,    1,                0,                  1,               0.996
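
For reference, the summary ratio of 1.00 quoted above is presumably the geometric mean of the per-row timing ratios in the last column. A minimal sketch of that computation (hypothetical helper, not taken from the benchtests code; link with -lm):

    #include <math.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Geometric mean of timing ratios: exp of the mean of the logs.  */
    static double
    geometric_mean (const double *ratios, size_t n)
    {
      double log_sum = 0.0;
      for (size_t i = 0; i < n; i++)
        log_sum += log (ratios[i]);
      return exp (log_sum / n);
    }

    int
    main (void)
    {
      /* First few New Time / Old Time entries from the table above.  */
      const double sample[] = { 1.033, 1.006, 1.02, 0.992 };
      printf ("%.3f\n", geometric_mean (sample, 4));
      return 0;
    }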

 sysdeps/x86_64/multiarch/strchr-avx2.S | 204 +++++++++++++------------
 1 file changed, 107 insertions(+), 97 deletions(-)
  

Comments

H.J. Lu March 24, 2022, 6:53 p.m. UTC | #1
On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Small code cleanup for size: -53 bytes.
>
> Add a comment justifying the use of a branch for the NULL/non-NULL return.


Do you have follow-up patches to improve its performance?  We are
backporting all x86-64 improvements to Intel release branches:

https://gitlab.com/x86-glibc/glibc/-/wikis/home

Patches without performance improvements are undesirable.

> All string/memory tests pass and no regressions in benchtests.
>
> geometric_mean(N=20) of all benchmarks Original / New: 1.00
> ---
> [...]
>
>  sysdeps/x86_64/multiarch/strchr-avx2.S | 204 +++++++++++++------------
>  1 file changed, 107 insertions(+), 97 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> index 086cabf76a..1a916cc951 100644
> --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> @@ -48,13 +48,13 @@
>  # define PAGE_SIZE 4096
>
>         .section SECTION(.text),"ax",@progbits
> -ENTRY (STRCHR)
> +ENTRY_P2ALIGN (STRCHR, 5)
>         /* Broadcast CHAR to YMM0.      */
>         vmovd   %esi, %xmm0
>         movl    %edi, %eax
>         andl    $(PAGE_SIZE - 1), %eax
>         VPBROADCAST     %xmm0, %ymm0
> -       vpxor   %xmm9, %xmm9, %xmm9
> +       vpxor   %xmm1, %xmm1, %xmm1
>
>         /* Check if we cross page boundary with one vector load.  */
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> @@ -62,37 +62,29 @@ ENTRY (STRCHR)
>
>         /* Check the first VEC_SIZE bytes.      Search for both CHAR and the
>            null byte.  */
> -       vmovdqu (%rdi), %ymm8
> -       VPCMPEQ %ymm8, %ymm0, %ymm1
> -       VPCMPEQ %ymm8, %ymm9, %ymm2
> -       vpor    %ymm1, %ymm2, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       vmovdqu (%rdi), %ymm2
> +       VPCMPEQ %ymm2, %ymm0, %ymm3
> +       VPCMPEQ %ymm2, %ymm1, %ymm2
> +       vpor    %ymm3, %ymm2, %ymm3
> +       vpmovmskb %ymm3, %eax
>         testl   %eax, %eax
>         jz      L(aligned_more)
>         tzcntl  %eax, %eax
>  # ifndef USE_AS_STRCHRNUL
> -       /* Found CHAR or the null byte.  */
> -       cmp     (%rdi, %rax), %CHAR_REG
> -       jne     L(zero)
> -# endif
> -       addq    %rdi, %rax
> -       VZEROUPPER_RETURN
> -
> -       /* .p2align 5 helps keep performance more consistent if ENTRY()
> -          alignment % 32 was either 16 or 0. As well this makes the
> -          alignment % 32 of the loop_4x_vec fixed which makes tuning it
> -          easier.  */
> -       .p2align 5
> -L(first_vec_x4):
> -       tzcntl  %eax, %eax
> -       addq    $(VEC_SIZE * 3 + 1), %rdi
> -# ifndef USE_AS_STRCHRNUL
> -       /* Found CHAR or the null byte.  */
> +       /* Found CHAR or the null byte.  */
>         cmp     (%rdi, %rax), %CHAR_REG
> +       /* NB: Use a branch instead of cmovcc here. The expectation is
> +          that with strchr the user will branch based on input being
> +          null. Since this branch will be 100% predictive of the user
> +          branch a branch miss here should save what otherwise would
> +          be branch miss in the user code. Otherwise using a branch 1)
> +          saves code size and 2) is faster in highly predictable
> +          environments.  */
>         jne     L(zero)
>  # endif
>         addq    %rdi, %rax
> -       VZEROUPPER_RETURN
> +L(return_vzeroupper):
> +       ZERO_UPPER_VEC_REGISTERS_RETURN
>
>  # ifndef USE_AS_STRCHRNUL
>  L(zero):
> @@ -103,7 +95,8 @@ L(zero):
>
>         .p2align 4
>  L(first_vec_x1):
> -       tzcntl  %eax, %eax
> +       /* Use bsf to save code size.  */
> +       bsfl    %eax, %eax
>         incq    %rdi
>  # ifndef USE_AS_STRCHRNUL
>         /* Found CHAR or the null byte.  */
> @@ -113,9 +106,10 @@ L(first_vec_x1):
>         addq    %rdi, %rax
>         VZEROUPPER_RETURN
>
> -       .p2align 4
> +       .p2align 4,, 10
>  L(first_vec_x2):
> -       tzcntl  %eax, %eax
> +       /* Use bsf to save code size.  */
> +       bsfl    %eax, %eax
>         addq    $(VEC_SIZE + 1), %rdi
>  # ifndef USE_AS_STRCHRNUL
>         /* Found CHAR or the null byte.  */
> @@ -125,9 +119,10 @@ L(first_vec_x2):
>         addq    %rdi, %rax
>         VZEROUPPER_RETURN
>
> -       .p2align 4
> +       .p2align 4,, 8
>  L(first_vec_x3):
> -       tzcntl  %eax, %eax
> +       /* Use bsf to save code size.  */
> +       bsfl    %eax, %eax
>         addq    $(VEC_SIZE * 2 + 1), %rdi
>  # ifndef USE_AS_STRCHRNUL
>         /* Found CHAR or the null byte.  */
> @@ -137,6 +132,21 @@ L(first_vec_x3):
>         addq    %rdi, %rax
>         VZEROUPPER_RETURN
>
> +       .p2align 4,, 10
> +L(first_vec_x4):
> +       /* Use bsf to save code size.  */
> +       bsfl    %eax, %eax
> +       addq    $(VEC_SIZE * 3 + 1), %rdi
> +# ifndef USE_AS_STRCHRNUL
> +       /* Found CHAR or the null byte.  */
> +       cmp     (%rdi, %rax), %CHAR_REG
> +       jne     L(zero)
> +# endif
> +       addq    %rdi, %rax
> +       VZEROUPPER_RETURN
> +
> +
> +
>         .p2align 4
>  L(aligned_more):
>         /* Align data to VEC_SIZE - 1. This is the same number of
> @@ -146,90 +156,92 @@ L(aligned_more):
>  L(cross_page_continue):
>         /* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
>            since data is only aligned to VEC_SIZE.  */
> -       vmovdqa 1(%rdi), %ymm8
> -       VPCMPEQ %ymm8, %ymm0, %ymm1
> -       VPCMPEQ %ymm8, %ymm9, %ymm2
> -       vpor    %ymm1, %ymm2, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       vmovdqa 1(%rdi), %ymm2
> +       VPCMPEQ %ymm2, %ymm0, %ymm3
> +       VPCMPEQ %ymm2, %ymm1, %ymm2
> +       vpor    %ymm3, %ymm2, %ymm3
> +       vpmovmskb %ymm3, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x1)
>
> -       vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8
> -       VPCMPEQ %ymm8, %ymm0, %ymm1
> -       VPCMPEQ %ymm8, %ymm9, %ymm2
> -       vpor    %ymm1, %ymm2, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       vmovdqa (VEC_SIZE + 1)(%rdi), %ymm2
> +       VPCMPEQ %ymm2, %ymm0, %ymm3
> +       VPCMPEQ %ymm2, %ymm1, %ymm2
> +       vpor    %ymm3, %ymm2, %ymm3
> +       vpmovmskb %ymm3, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x2)
>
> -       vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8
> -       VPCMPEQ %ymm8, %ymm0, %ymm1
> -       VPCMPEQ %ymm8, %ymm9, %ymm2
> -       vpor    %ymm1, %ymm2, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm2
> +       VPCMPEQ %ymm2, %ymm0, %ymm3
> +       VPCMPEQ %ymm2, %ymm1, %ymm2
> +       vpor    %ymm3, %ymm2, %ymm3
> +       vpmovmskb %ymm3, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x3)
>
> -       vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8
> -       VPCMPEQ %ymm8, %ymm0, %ymm1
> -       VPCMPEQ %ymm8, %ymm9, %ymm2
> -       vpor    %ymm1, %ymm2, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm2
> +       VPCMPEQ %ymm2, %ymm0, %ymm3
> +       VPCMPEQ %ymm2, %ymm1, %ymm2
> +       vpor    %ymm3, %ymm2, %ymm3
> +       vpmovmskb %ymm3, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x4)
> -       /* Align data to VEC_SIZE * 4 - 1.      */
> -       addq    $(VEC_SIZE * 4 + 1), %rdi
> -       andq    $-(VEC_SIZE * 4), %rdi
> +       /* Align data to VEC_SIZE * 4 - 1.  */
> +       incq    %rdi
> +       orq     $(VEC_SIZE * 4 - 1), %rdi
>         .p2align 4
>  L(loop_4x_vec):
>         /* Compare 4 * VEC at a time forward.  */
> -       vmovdqa (%rdi), %ymm5
> -       vmovdqa (VEC_SIZE)(%rdi), %ymm6
> -       vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> -       vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> +       vmovdqa 1(%rdi), %ymm6
> +       vmovdqa (VEC_SIZE + 1)(%rdi), %ymm7
>
>         /* Leaves only CHARS matching esi as 0.  */
> -       vpxor   %ymm5, %ymm0, %ymm1
>         vpxor   %ymm6, %ymm0, %ymm2
>         vpxor   %ymm7, %ymm0, %ymm3
> -       vpxor   %ymm8, %ymm0, %ymm4
>
> -       VPMINU  %ymm1, %ymm5, %ymm1
>         VPMINU  %ymm2, %ymm6, %ymm2
>         VPMINU  %ymm3, %ymm7, %ymm3
> -       VPMINU  %ymm4, %ymm8, %ymm4
>
> -       VPMINU  %ymm1, %ymm2, %ymm5
> -       VPMINU  %ymm3, %ymm4, %ymm6
> +       vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm6
> +       vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm7
> +
> +       vpxor   %ymm6, %ymm0, %ymm4
> +       vpxor   %ymm7, %ymm0, %ymm5
> +
> +       VPMINU  %ymm4, %ymm6, %ymm4
> +       VPMINU  %ymm5, %ymm7, %ymm5
>
> -       VPMINU  %ymm5, %ymm6, %ymm6
> +       VPMINU  %ymm2, %ymm3, %ymm6
> +       VPMINU  %ymm4, %ymm5, %ymm7
>
> -       VPCMPEQ %ymm6, %ymm9, %ymm6
> -       vpmovmskb %ymm6, %ecx
> +       VPMINU  %ymm6, %ymm7, %ymm7
> +
> +       VPCMPEQ %ymm7, %ymm1, %ymm7
> +       vpmovmskb %ymm7, %ecx
>         subq    $-(VEC_SIZE * 4), %rdi
>         testl   %ecx, %ecx
>         jz      L(loop_4x_vec)
>
> -
> -       VPCMPEQ %ymm1, %ymm9, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       VPCMPEQ %ymm2, %ymm1, %ymm2
> +       vpmovmskb %ymm2, %eax
>         testl   %eax, %eax
>         jnz     L(last_vec_x0)
>
>
> -       VPCMPEQ %ymm5, %ymm9, %ymm2
> -       vpmovmskb %ymm2, %eax
> +       VPCMPEQ %ymm3, %ymm1, %ymm3
> +       vpmovmskb %ymm3, %eax
>         testl   %eax, %eax
>         jnz     L(last_vec_x1)
>
> -       VPCMPEQ %ymm3, %ymm9, %ymm3
> -       vpmovmskb %ymm3, %eax
> +       VPCMPEQ %ymm4, %ymm1, %ymm4
> +       vpmovmskb %ymm4, %eax
>         /* rcx has combined result from all 4 VEC. It will only be used
>            if the first 3 other VEC all did not contain a match.  */
>         salq    $32, %rcx
>         orq     %rcx, %rax
>         tzcntq  %rax, %rax
> -       subq    $(VEC_SIZE * 2), %rdi
> +       subq    $(VEC_SIZE * 2 - 1), %rdi
>  # ifndef USE_AS_STRCHRNUL
>         /* Found CHAR or the null byte.  */
>         cmp     (%rdi, %rax), %CHAR_REG
> @@ -239,10 +251,11 @@ L(loop_4x_vec):
>         VZEROUPPER_RETURN
>
>
> -       .p2align 4
> +       .p2align 4,, 10
>  L(last_vec_x0):
> -       tzcntl  %eax, %eax
> -       addq    $-(VEC_SIZE * 4), %rdi
> +       /* Use bsf to save code size.  */
> +       bsfl    %eax, %eax
> +       addq    $-(VEC_SIZE * 4 - 1), %rdi
>  # ifndef USE_AS_STRCHRNUL
>         /* Found CHAR or the null byte.  */
>         cmp     (%rdi, %rax), %CHAR_REG
> @@ -251,16 +264,11 @@ L(last_vec_x0):
>         addq    %rdi, %rax
>         VZEROUPPER_RETURN
>
> -# ifndef USE_AS_STRCHRNUL
> -L(zero_end):
> -       xorl    %eax, %eax
> -       VZEROUPPER_RETURN
> -# endif
>
> -       .p2align 4
> +       .p2align 4,, 10
>  L(last_vec_x1):
>         tzcntl  %eax, %eax
> -       subq    $(VEC_SIZE * 3), %rdi
> +       subq    $(VEC_SIZE * 3 - 1), %rdi
>  # ifndef USE_AS_STRCHRNUL
>         /* Found CHAR or the null byte.  */
>         cmp     (%rdi, %rax), %CHAR_REG
> @@ -269,18 +277,23 @@ L(last_vec_x1):
>         addq    %rdi, %rax
>         VZEROUPPER_RETURN
>
> +# ifndef USE_AS_STRCHRNUL
> +L(zero_end):
> +       xorl    %eax, %eax
> +       VZEROUPPER_RETURN
> +# endif
>
>         /* Cold case for crossing page with first load.  */
> -       .p2align 4
> +       .p2align 4,, 8
>  L(cross_page_boundary):
>         movq    %rdi, %rdx
>         /* Align rdi to VEC_SIZE - 1.  */
>         orq     $(VEC_SIZE - 1), %rdi
> -       vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8
> -       VPCMPEQ %ymm8, %ymm0, %ymm1
> -       VPCMPEQ %ymm8, %ymm9, %ymm2
> -       vpor    %ymm1, %ymm2, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm2
> +       VPCMPEQ %ymm2, %ymm0, %ymm3
> +       VPCMPEQ %ymm2, %ymm1, %ymm2
> +       vpor    %ymm3, %ymm2, %ymm3
> +       vpmovmskb %ymm3, %eax
>         /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
>            so no need to manually mod edx.  */
>         sarxl   %edx, %eax, %eax
> @@ -291,13 +304,10 @@ L(cross_page_boundary):
>         xorl    %ecx, %ecx
>         /* Found CHAR or the null byte.  */
>         cmp     (%rdx, %rax), %CHAR_REG
> -       leaq    (%rdx, %rax), %rax
> -       cmovne  %rcx, %rax
> -# else
> -       addq    %rdx, %rax
> +       jne     L(zero_end)
>  # endif
> -L(return_vzeroupper):
> -       ZERO_UPPER_VEC_REGISTERS_RETURN
> +       addq    %rdx, %rax
> +       VZEROUPPER_RETURN
>
>  END (STRCHR)
> -# endif
> +#endif
> --
> 2.25.1
>
  
Noah Goldstein March 24, 2022, 7:20 p.m. UTC | #2
On Thu, Mar 24, 2022 at 1:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Small code cleanup for size: -53 bytes.
> >
> > Add a comment justifying the use of a branch for the NULL/non-NULL return.
>
>
> Do you have followup patches to improve its performance?  We are
> backporting all x86-64 improvements to Intel release branches:
>
> https://gitlab.com/x86-glibc/glibc/-/wikis/home
>
> Patches without performance improvements are undesirable.

No further changes planned at the moment; the code size savings
seem worth it for master, though. I'm also in favor of adding the comment,
as I think it's non-intuitive.

>
> > All string/memory tests pass and no regressions in benchtests.
> >
> > geometric_mean(N=20) of all benchmarks Original / New: 1.00
> > ---
> > [...]
> >
> >  sysdeps/x86_64/multiarch/strchr-avx2.S | 204 +++++++++++++------------
> >  1 file changed, 107 insertions(+), 97 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > index 086cabf76a..1a916cc951 100644
> > --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> > @@ -48,13 +48,13 @@
> >  # define PAGE_SIZE 4096
> >
> >         .section SECTION(.text),"ax",@progbits
> > -ENTRY (STRCHR)
> > +ENTRY_P2ALIGN (STRCHR, 5)
> >         /* Broadcast CHAR to YMM0.      */
> >         vmovd   %esi, %xmm0
> >         movl    %edi, %eax
> >         andl    $(PAGE_SIZE - 1), %eax
> >         VPBROADCAST     %xmm0, %ymm0
> > -       vpxor   %xmm9, %xmm9, %xmm9
> > +       vpxor   %xmm1, %xmm1, %xmm1
> >
> >         /* Check if we cross page boundary with one vector load.  */
> >         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > @@ -62,37 +62,29 @@ ENTRY (STRCHR)
> >
> >         /* Check the first VEC_SIZE bytes.      Search for both CHAR and the
> >            null byte.  */
> > -       vmovdqu (%rdi), %ymm8
> > -       VPCMPEQ %ymm8, %ymm0, %ymm1
> > -       VPCMPEQ %ymm8, %ymm9, %ymm2
> > -       vpor    %ymm1, %ymm2, %ymm1
> > -       vpmovmskb %ymm1, %eax
> > +       vmovdqu (%rdi), %ymm2
> > +       VPCMPEQ %ymm2, %ymm0, %ymm3
> > +       VPCMPEQ %ymm2, %ymm1, %ymm2
> > +       vpor    %ymm3, %ymm2, %ymm3
> > +       vpmovmskb %ymm3, %eax
> >         testl   %eax, %eax
> >         jz      L(aligned_more)
> >         tzcntl  %eax, %eax
> >  # ifndef USE_AS_STRCHRNUL
> > -       /* Found CHAR or the null byte.  */
> > -       cmp     (%rdi, %rax), %CHAR_REG
> > -       jne     L(zero)
> > -# endif
> > -       addq    %rdi, %rax
> > -       VZEROUPPER_RETURN
> > -
> > -       /* .p2align 5 helps keep performance more consistent if ENTRY()
> > -          alignment % 32 was either 16 or 0. As well this makes the
> > -          alignment % 32 of the loop_4x_vec fixed which makes tuning it
> > -          easier.  */
> > -       .p2align 5
> > -L(first_vec_x4):
> > -       tzcntl  %eax, %eax
> > -       addq    $(VEC_SIZE * 3 + 1), %rdi
> > -# ifndef USE_AS_STRCHRNUL
> > -       /* Found CHAR or the null byte.  */
> > +       /* Found CHAR or the null byte.  */
> >         cmp     (%rdi, %rax), %CHAR_REG
> > +       /* NB: Use a branch instead of cmovcc here. The expectation is
> > +          that with strchr the user will branch based on input being
> > +          null. Since this branch will be 100% predictive of the user
> > +          branch a branch miss here should save what otherwise would
> > +          be branch miss in the user code. Otherwise using a branch 1)
> > +          saves code size and 2) is faster in highly predictable
> > +          environments.  */
> >         jne     L(zero)
> >  # endif
> >         addq    %rdi, %rax
> > -       VZEROUPPER_RETURN
> > +L(return_vzeroupper):
> > +       ZERO_UPPER_VEC_REGISTERS_RETURN
> >
> >  # ifndef USE_AS_STRCHRNUL
> >  L(zero):
> > @@ -103,7 +95,8 @@ L(zero):
> >
> >         .p2align 4
> >  L(first_vec_x1):
> > -       tzcntl  %eax, %eax
> > +       /* Use bsf to save code size.  */
> > +       bsfl    %eax, %eax
> >         incq    %rdi
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Found CHAR or the null byte.  */
> > @@ -113,9 +106,10 @@ L(first_vec_x1):
> >         addq    %rdi, %rax
> >         VZEROUPPER_RETURN
> >
> > -       .p2align 4
> > +       .p2align 4,, 10
> >  L(first_vec_x2):
> > -       tzcntl  %eax, %eax
> > +       /* Use bsf to save code size.  */
> > +       bsfl    %eax, %eax
> >         addq    $(VEC_SIZE + 1), %rdi
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Found CHAR or the null byte.  */
> > @@ -125,9 +119,10 @@ L(first_vec_x2):
> >         addq    %rdi, %rax
> >         VZEROUPPER_RETURN
> >
> > -       .p2align 4
> > +       .p2align 4,, 8
> >  L(first_vec_x3):
> > -       tzcntl  %eax, %eax
> > +       /* Use bsf to save code size.  */
> > +       bsfl    %eax, %eax
> >         addq    $(VEC_SIZE * 2 + 1), %rdi
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Found CHAR or the null byte.  */
> > @@ -137,6 +132,21 @@ L(first_vec_x3):
> >         addq    %rdi, %rax
> >         VZEROUPPER_RETURN
> >
> > +       .p2align 4,, 10
> > +L(first_vec_x4):
> > +       /* Use bsf to save code size.  */
> > +       bsfl    %eax, %eax
> > +       addq    $(VEC_SIZE * 3 + 1), %rdi
> > +# ifndef USE_AS_STRCHRNUL
> > +       /* Found CHAR or the null byte.  */
> > +       cmp     (%rdi, %rax), %CHAR_REG
> > +       jne     L(zero)
> > +# endif
> > +       addq    %rdi, %rax
> > +       VZEROUPPER_RETURN
> > +
> > +
> > +
> >         .p2align 4
> >  L(aligned_more):
> >         /* Align data to VEC_SIZE - 1. This is the same number of
> > @@ -146,90 +156,92 @@ L(aligned_more):
> >  L(cross_page_continue):
> >         /* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> >            since data is only aligned to VEC_SIZE.  */
> > -       vmovdqa 1(%rdi), %ymm8
> > -       VPCMPEQ %ymm8, %ymm0, %ymm1
> > -       VPCMPEQ %ymm8, %ymm9, %ymm2
> > -       vpor    %ymm1, %ymm2, %ymm1
> > -       vpmovmskb %ymm1, %eax
> > +       vmovdqa 1(%rdi), %ymm2
> > +       VPCMPEQ %ymm2, %ymm0, %ymm3
> > +       VPCMPEQ %ymm2, %ymm1, %ymm2
> > +       vpor    %ymm3, %ymm2, %ymm3
> > +       vpmovmskb %ymm3, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x1)
> >
> > -       vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8
> > -       VPCMPEQ %ymm8, %ymm0, %ymm1
> > -       VPCMPEQ %ymm8, %ymm9, %ymm2
> > -       vpor    %ymm1, %ymm2, %ymm1
> > -       vpmovmskb %ymm1, %eax
> > +       vmovdqa (VEC_SIZE + 1)(%rdi), %ymm2
> > +       VPCMPEQ %ymm2, %ymm0, %ymm3
> > +       VPCMPEQ %ymm2, %ymm1, %ymm2
> > +       vpor    %ymm3, %ymm2, %ymm3
> > +       vpmovmskb %ymm3, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x2)
> >
> > -       vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8
> > -       VPCMPEQ %ymm8, %ymm0, %ymm1
> > -       VPCMPEQ %ymm8, %ymm9, %ymm2
> > -       vpor    %ymm1, %ymm2, %ymm1
> > -       vpmovmskb %ymm1, %eax
> > +       vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm2
> > +       VPCMPEQ %ymm2, %ymm0, %ymm3
> > +       VPCMPEQ %ymm2, %ymm1, %ymm2
> > +       vpor    %ymm3, %ymm2, %ymm3
> > +       vpmovmskb %ymm3, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x3)
> >
> > -       vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8
> > -       VPCMPEQ %ymm8, %ymm0, %ymm1
> > -       VPCMPEQ %ymm8, %ymm9, %ymm2
> > -       vpor    %ymm1, %ymm2, %ymm1
> > -       vpmovmskb %ymm1, %eax
> > +       vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm2
> > +       VPCMPEQ %ymm2, %ymm0, %ymm3
> > +       VPCMPEQ %ymm2, %ymm1, %ymm2
> > +       vpor    %ymm3, %ymm2, %ymm3
> > +       vpmovmskb %ymm3, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x4)
> > -       /* Align data to VEC_SIZE * 4 - 1.      */
> > -       addq    $(VEC_SIZE * 4 + 1), %rdi
> > -       andq    $-(VEC_SIZE * 4), %rdi
> > +       /* Align data to VEC_SIZE * 4 - 1.  */
> > +       incq    %rdi
> > +       orq     $(VEC_SIZE * 4 - 1), %rdi
> >         .p2align 4
> >  L(loop_4x_vec):
> >         /* Compare 4 * VEC at a time forward.  */
> > -       vmovdqa (%rdi), %ymm5
> > -       vmovdqa (VEC_SIZE)(%rdi), %ymm6
> > -       vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> > -       vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> > +       vmovdqa 1(%rdi), %ymm6
> > +       vmovdqa (VEC_SIZE + 1)(%rdi), %ymm7
> >
> >         /* Leaves only CHARS matching esi as 0.  */
> > -       vpxor   %ymm5, %ymm0, %ymm1
> >         vpxor   %ymm6, %ymm0, %ymm2
> >         vpxor   %ymm7, %ymm0, %ymm3
> > -       vpxor   %ymm8, %ymm0, %ymm4
> >
> > -       VPMINU  %ymm1, %ymm5, %ymm1
> >         VPMINU  %ymm2, %ymm6, %ymm2
> >         VPMINU  %ymm3, %ymm7, %ymm3
> > -       VPMINU  %ymm4, %ymm8, %ymm4
> >
> > -       VPMINU  %ymm1, %ymm2, %ymm5
> > -       VPMINU  %ymm3, %ymm4, %ymm6
> > +       vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm6
> > +       vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm7
> > +
> > +       vpxor   %ymm6, %ymm0, %ymm4
> > +       vpxor   %ymm7, %ymm0, %ymm5
> > +
> > +       VPMINU  %ymm4, %ymm6, %ymm4
> > +       VPMINU  %ymm5, %ymm7, %ymm5
> >
> > -       VPMINU  %ymm5, %ymm6, %ymm6
> > +       VPMINU  %ymm2, %ymm3, %ymm6
> > +       VPMINU  %ymm4, %ymm5, %ymm7
> >
> > -       VPCMPEQ %ymm6, %ymm9, %ymm6
> > -       vpmovmskb %ymm6, %ecx
> > +       VPMINU  %ymm6, %ymm7, %ymm7
> > +
> > +       VPCMPEQ %ymm7, %ymm1, %ymm7
> > +       vpmovmskb %ymm7, %ecx
> >         subq    $-(VEC_SIZE * 4), %rdi
> >         testl   %ecx, %ecx
> >         jz      L(loop_4x_vec)
> >
> > -
> > -       VPCMPEQ %ymm1, %ymm9, %ymm1
> > -       vpmovmskb %ymm1, %eax
> > +       VPCMPEQ %ymm2, %ymm1, %ymm2
> > +       vpmovmskb %ymm2, %eax
> >         testl   %eax, %eax
> >         jnz     L(last_vec_x0)
> >
> >
> > -       VPCMPEQ %ymm5, %ymm9, %ymm2
> > -       vpmovmskb %ymm2, %eax
> > +       VPCMPEQ %ymm3, %ymm1, %ymm3
> > +       vpmovmskb %ymm3, %eax
> >         testl   %eax, %eax
> >         jnz     L(last_vec_x1)
> >
> > -       VPCMPEQ %ymm3, %ymm9, %ymm3
> > -       vpmovmskb %ymm3, %eax
> > +       VPCMPEQ %ymm4, %ymm1, %ymm4
> > +       vpmovmskb %ymm4, %eax
> >         /* rcx has combined result from all 4 VEC. It will only be used
> >            if the first 3 other VEC all did not contain a match.  */
> >         salq    $32, %rcx
> >         orq     %rcx, %rax
> >         tzcntq  %rax, %rax
> > -       subq    $(VEC_SIZE * 2), %rdi
> > +       subq    $(VEC_SIZE * 2 - 1), %rdi
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Found CHAR or the null byte.  */
> >         cmp     (%rdi, %rax), %CHAR_REG
> > @@ -239,10 +251,11 @@ L(loop_4x_vec):
> >         VZEROUPPER_RETURN
> >
> >
> > -       .p2align 4
> > +       .p2align 4,, 10
> >  L(last_vec_x0):
> > -       tzcntl  %eax, %eax
> > -       addq    $-(VEC_SIZE * 4), %rdi
> > +       /* Use bsf to save code size.  */
> > +       bsfl    %eax, %eax
> > +       addq    $-(VEC_SIZE * 4 - 1), %rdi
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Found CHAR or the null byte.  */
> >         cmp     (%rdi, %rax), %CHAR_REG
> > @@ -251,16 +264,11 @@ L(last_vec_x0):
> >         addq    %rdi, %rax
> >         VZEROUPPER_RETURN
> >
> > -# ifndef USE_AS_STRCHRNUL
> > -L(zero_end):
> > -       xorl    %eax, %eax
> > -       VZEROUPPER_RETURN
> > -# endif
> >
> > -       .p2align 4
> > +       .p2align 4,, 10
> >  L(last_vec_x1):
> >         tzcntl  %eax, %eax
> > -       subq    $(VEC_SIZE * 3), %rdi
> > +       subq    $(VEC_SIZE * 3 - 1), %rdi
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Found CHAR or the null byte.  */
> >         cmp     (%rdi, %rax), %CHAR_REG
> > @@ -269,18 +277,23 @@ L(last_vec_x1):
> >         addq    %rdi, %rax
> >         VZEROUPPER_RETURN
> >
> > +# ifndef USE_AS_STRCHRNUL
> > +L(zero_end):
> > +       xorl    %eax, %eax
> > +       VZEROUPPER_RETURN
> > +# endif
> >
> >         /* Cold case for crossing page with first load.  */
> > -       .p2align 4
> > +       .p2align 4,, 8
> >  L(cross_page_boundary):
> >         movq    %rdi, %rdx
> >         /* Align rdi to VEC_SIZE - 1.  */
> >         orq     $(VEC_SIZE - 1), %rdi
> > -       vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8
> > -       VPCMPEQ %ymm8, %ymm0, %ymm1
> > -       VPCMPEQ %ymm8, %ymm9, %ymm2
> > -       vpor    %ymm1, %ymm2, %ymm1
> > -       vpmovmskb %ymm1, %eax
> > +       vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm2
> > +       VPCMPEQ %ymm2, %ymm0, %ymm3
> > +       VPCMPEQ %ymm2, %ymm1, %ymm2
> > +       vpor    %ymm3, %ymm2, %ymm3
> > +       vpmovmskb %ymm3, %eax
> >         /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> >            so no need to manually mod edx.  */
> >         sarxl   %edx, %eax, %eax
> > @@ -291,13 +304,10 @@ L(cross_page_boundary):
> >         xorl    %ecx, %ecx
> >         /* Found CHAR or the null byte.  */
> >         cmp     (%rdx, %rax), %CHAR_REG
> > -       leaq    (%rdx, %rax), %rax
> > -       cmovne  %rcx, %rax
> > -# else
> > -       addq    %rdx, %rax
> > +       jne     L(zero_end)
> >  # endif
> > -L(return_vzeroupper):
> > -       ZERO_UPPER_VEC_REGISTERS_RETURN
> > +       addq    %rdx, %rax
> > +       VZEROUPPER_RETURN
> >
> >  END (STRCHR)
> > -# endif
> > +#endif
> > --
> > 2.25.1
> >
>
>
> --
> H.J.
  
H.J. Lu March 24, 2022, 7:36 p.m. UTC | #3
On Thu, Mar 24, 2022 at 12:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Mar 24, 2022 at 1:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > Small code cleanup for size: -53 bytes.
> > >
> > > Add comment justifying using a branch to do NULL/non-null return.
> >
> >
> > Do you have follow-up patches to improve its performance?  We are
> > backporting all x86-64 improvements to Intel release branches:
> >
> > https://gitlab.com/x86-glibc/glibc/-/wikis/home
> >
> > Patches without performance improvements are undesirable.
>
> No further changes planned at the moment; the code size savings
> seem worth it for master, though. Also in favor of adding the comment,
> as I think it's non-intuitive.
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
  
Sunil Pandey May 12, 2022, 7:31 p.m. UTC | #4
On Thu, Mar 24, 2022 at 12:37 PM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Thu, Mar 24, 2022 at 12:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Thu, Mar 24, 2022 at 1:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > Small code cleanup for size: -53 bytes.
> > > >
> > > > Add comment justifying using a branch to do NULL/non-null return.
> > >
> > >
> > > Do you have follow-up patches to improve its performance?  We are
> > > backporting all x86-64 improvements to Intel release branches:
> > >
> > > https://gitlab.com/x86-glibc/glibc/-/wikis/home
> > >
> > > Patches without performance improvements are undesirable.
> >
> > No further changes planned at the moment; the code size savings
> > seem worth it for master, though. Also in favor of adding the comment,
> > as I think it's non-intuitive.
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
  

Patch

diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index 086cabf76a..1a916cc951 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -48,13 +48,13 @@ 
 # define PAGE_SIZE 4096
 
 	.section SECTION(.text),"ax",@progbits
-ENTRY (STRCHR)
+ENTRY_P2ALIGN (STRCHR, 5)
 	/* Broadcast CHAR to YMM0.	*/
 	vmovd	%esi, %xmm0
 	movl	%edi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
 	VPBROADCAST	%xmm0, %ymm0
-	vpxor	%xmm9, %xmm9, %xmm9
+	vpxor	%xmm1, %xmm1, %xmm1
 
 	/* Check if we cross page boundary with one vector load.  */
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
@@ -62,37 +62,29 @@  ENTRY (STRCHR)
 
 	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
 	   null byte.  */
-	vmovdqu	(%rdi), %ymm8
-	VPCMPEQ	%ymm8, %ymm0, %ymm1
-	VPCMPEQ	%ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqu	(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jz	L(aligned_more)
 	tzcntl	%eax, %eax
 # ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
-	cmp	(%rdi, %rax), %CHAR_REG
-	jne	L(zero)
-# endif
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
-
-	/* .p2align 5 helps keep performance more consistent if ENTRY()
-	   alignment % 32 was either 16 or 0. As well this makes the
-	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
-	   easier.  */
-	.p2align 5
-L(first_vec_x4):
-	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 3 + 1), %rdi
-# ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
+	/* Found CHAR or the null byte.  */
 	cmp	(%rdi, %rax), %CHAR_REG
+	/* NB: Use a branch instead of cmovcc here. The expectation is
+	   that with strchr the user will branch based on input being
+	   null. Since this branch will be 100% predictive of the user
+	   branch a branch miss here should save what otherwise would
+	   be branch miss in the user code. Otherwise using a branch 1)
+	   saves code size and 2) is faster in highly predictable
+	   environments.  */
 	jne	L(zero)
 # endif
 	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 # ifndef USE_AS_STRCHRNUL
 L(zero):
@@ -103,7 +95,8 @@  L(zero):
 
 	.p2align 4
 L(first_vec_x1):
-	tzcntl	%eax, %eax
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
 	incq	%rdi
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
@@ -113,9 +106,10 @@  L(first_vec_x1):
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
-	.p2align 4
+	.p2align 4,, 10
 L(first_vec_x2):
-	tzcntl	%eax, %eax
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
 	addq	$(VEC_SIZE + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
@@ -125,9 +119,10 @@  L(first_vec_x2):
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
-	.p2align 4
+	.p2align 4,, 8
 L(first_vec_x3):
-	tzcntl	%eax, %eax
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
 	addq	$(VEC_SIZE * 2 + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
@@ -137,6 +132,21 @@  L(first_vec_x3):
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
+	.p2align 4,, 10
+L(first_vec_x4):
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+
+
 	.p2align 4
 L(aligned_more):
 	/* Align data to VEC_SIZE - 1. This is the same number of
@@ -146,90 +156,92 @@  L(aligned_more):
 L(cross_page_continue):
 	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
-	vmovdqa	1(%rdi), %ymm8
-	VPCMPEQ	%ymm8, %ymm0, %ymm1
-	VPCMPEQ	%ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqa	1(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
-	VPCMPEQ	%ymm8, %ymm0, %ymm1
-	VPCMPEQ	%ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
-	VPCMPEQ	%ymm8, %ymm0, %ymm1
-	VPCMPEQ	%ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
-	VPCMPEQ	%ymm8, %ymm0, %ymm1
-	VPCMPEQ	%ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x4)
-	/* Align data to VEC_SIZE * 4 - 1.	*/
-	addq	$(VEC_SIZE * 4 + 1), %rdi
-	andq	$-(VEC_SIZE * 4), %rdi
+	/* Align data to VEC_SIZE * 4 - 1.  */
+	incq	%rdi
+	orq	$(VEC_SIZE * 4 - 1), %rdi
 	.p2align 4
 L(loop_4x_vec):
 	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa	(%rdi), %ymm5
-	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
+	vmovdqa	1(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm7
 
 	/* Leaves only CHARS matching esi as 0.	 */
-	vpxor	%ymm5, %ymm0, %ymm1
 	vpxor	%ymm6, %ymm0, %ymm2
 	vpxor	%ymm7, %ymm0, %ymm3
-	vpxor	%ymm8, %ymm0, %ymm4
 
-	VPMINU	%ymm1, %ymm5, %ymm1
 	VPMINU	%ymm2, %ymm6, %ymm2
 	VPMINU	%ymm3, %ymm7, %ymm3
-	VPMINU	%ymm4, %ymm8, %ymm4
 
-	VPMINU	%ymm1, %ymm2, %ymm5
-	VPMINU	%ymm3, %ymm4, %ymm6
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm7
+
+	vpxor	%ymm6, %ymm0, %ymm4
+	vpxor	%ymm7, %ymm0, %ymm5
+
+	VPMINU	%ymm4, %ymm6, %ymm4
+	VPMINU	%ymm5, %ymm7, %ymm5
 
-	VPMINU	%ymm5, %ymm6, %ymm6
+	VPMINU	%ymm2, %ymm3, %ymm6
+	VPMINU	%ymm4, %ymm5, %ymm7
 
-	VPCMPEQ	%ymm6, %ymm9, %ymm6
-	vpmovmskb %ymm6, %ecx
+	VPMINU	%ymm6, %ymm7, %ymm7
+
+	VPCMPEQ	%ymm7, %ymm1, %ymm7
+	vpmovmskb %ymm7, %ecx
 	subq	$-(VEC_SIZE * 4), %rdi
 	testl	%ecx, %ecx
 	jz	L(loop_4x_vec)
 
-
-	VPCMPEQ	%ymm1, %ymm9, %ymm1
-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x0)
 
 
-	VPCMPEQ	%ymm5, %ymm9, %ymm2
-	vpmovmskb %ymm2, %eax
+	VPCMPEQ	%ymm3, %ymm1, %ymm3
+	vpmovmskb %ymm3, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x1)
 
-	VPCMPEQ	%ymm3, %ymm9, %ymm3
-	vpmovmskb %ymm3, %eax
+	VPCMPEQ	%ymm4, %ymm1, %ymm4
+	vpmovmskb %ymm4, %eax
 	/* rcx has combined result from all 4 VEC. It will only be used
 	   if the first 3 other VEC all did not contain a match.  */
 	salq	$32, %rcx
 	orq	%rcx, %rax
 	tzcntq	%rax, %rax
-	subq	$(VEC_SIZE * 2), %rdi
+	subq	$(VEC_SIZE * 2 - 1), %rdi
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
 	cmp	(%rdi, %rax), %CHAR_REG
@@ -239,10 +251,11 @@  L(loop_4x_vec):
 	VZEROUPPER_RETURN
 
 
-	.p2align 4
+	.p2align 4,, 10
 L(last_vec_x0):
-	tzcntl	%eax, %eax
-	addq	$-(VEC_SIZE * 4), %rdi
+	/* Use bsf to save code size.  */
+	bsfl	%eax, %eax
+	addq	$-(VEC_SIZE * 4 - 1), %rdi
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
 	cmp	(%rdi, %rax), %CHAR_REG
@@ -251,16 +264,11 @@  L(last_vec_x0):
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
-# ifndef USE_AS_STRCHRNUL
-L(zero_end):
-	xorl	%eax, %eax
-	VZEROUPPER_RETURN
-# endif
 
-	.p2align 4
+	.p2align 4,, 10
 L(last_vec_x1):
 	tzcntl	%eax, %eax
-	subq	$(VEC_SIZE * 3), %rdi
+	subq	$(VEC_SIZE * 3 - 1), %rdi
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
 	cmp	(%rdi, %rax), %CHAR_REG
@@ -269,18 +277,23 @@  L(last_vec_x1):
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+# endif
 
 	/* Cold case for crossing page with first load.	 */
-	.p2align 4
+	.p2align 4,, 8
 L(cross_page_boundary):
 	movq	%rdi, %rdx
 	/* Align rdi to VEC_SIZE - 1.  */
 	orq	$(VEC_SIZE - 1), %rdi
-	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
-	VPCMPEQ	%ymm8, %ymm0, %ymm1
-	VPCMPEQ	%ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
+	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm3
+	VPCMPEQ	%ymm2, %ymm1, %ymm2
+	vpor	%ymm3, %ymm2, %ymm3
+	vpmovmskb %ymm3, %eax
 	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
 	   so no need to manually mod edx.  */
 	sarxl	%edx, %eax, %eax
@@ -291,13 +304,10 @@  L(cross_page_boundary):
 	xorl	%ecx, %ecx
 	/* Found CHAR or the null byte.	 */
 	cmp	(%rdx, %rax), %CHAR_REG
-	leaq	(%rdx, %rax), %rax
-	cmovne	%rcx, %rax
-# else
-	addq	%rdx, %rax
+	jne	L(zero_end)
 # endif
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	addq	%rdx, %rax
+	VZEROUPPER_RETURN
 
 END (STRCHR)
-# endif
+#endif
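
The NB comment added in the first hunk assumes that essentially every caller
of strchr immediately branches on whether the result is NULL.  Below is a
minimal C sketch of that caller pattern; the option-parsing function, its
name, and the '=' delimiter are illustrative only and are not part of the
patch or of glibc.

#include <stdio.h>
#include <string.h>

/* Hypothetical caller of strchr, for illustration only.  Splits a
   "key=value" argument, or treats the whole string as a flag when no
   '=' is present.  */
static void
parse_option (const char *arg)
{
  const char *eq = strchr (arg, '=');

  /* This is the user-level branch the NB comment refers to: it tests
     exactly the condition (CHAR found vs. null terminator found) that
     strchr's internal branch already resolved, so the two branches
     tend to predict together.  */
  if (eq == NULL)
    printf ("flag: %s\n", arg);
  else
    printf ("key: %.*s value: %s\n", (int) (eq - arg), arg, eq + 1);
}

int
main (void)
{
  parse_option ("color=blue");
  parse_option ("verbose");
  return 0;
}

With cmovcc the NULL/non-null selection inside strchr would be branch-free,
but the caller's "if (eq == NULL)" test still has to predict the same
condition, so the conditional move mostly adds a data dependency on the
return value without eliminating a branch anywhere.  Whether the branch is a
net win at a given call site depends on how predictable the NULL outcome is
there, which is why the trade-off is recorded as a comment in the source
rather than claimed to be universally faster.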