[v1,04/23] x86: Code cleanup in strchr-evex and comment justifying branch

Message ID 20220323215734.3927131-4-goldstein.w.n@gmail.com (mailing list archive)
State Accepted, archived
Headers
Series [v1,01/23] benchtests: Use json-lib in bench-strchr.c |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Noah Goldstein March 23, 2022, 9:57 p.m. UTC
  Small code cleanup for size: -81 bytes.

Add comment justifying using a branch to do NULL/non-null return.

All string/memory tests pass and no regressions in benchtests.

geometric_mean(N=20) of all benchmarks New / Original: .985
---
Geometric Mean N=20 runs; All functions page aligned
length, alignment,  pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time
  2048,         0,   32,    0,               23,                127,               0.878
  2048,         1,   32,    0,               23,                127,                0.88
  2048,         0,   64,    0,               23,                127,               0.997
  2048,         2,   64,    0,               23,                127,               1.001
  2048,         0,  128,    0,               23,                127,               0.973
  2048,         3,  128,    0,               23,                127,               0.971
  2048,         0,  256,    0,               23,                127,               0.976
  2048,         4,  256,    0,               23,                127,               0.973
  2048,         0,  512,    0,               23,                127,               1.001
  2048,         5,  512,    0,               23,                127,               1.004
  2048,         0, 1024,    0,               23,                127,               1.005
  2048,         6, 1024,    0,               23,                127,               1.007
  2048,         0, 2048,    0,               23,                127,               1.035
  2048,         7, 2048,    0,               23,                127,                1.03
  4096,         0,   32,    0,               23,                127,               0.889
  4096,         1,   32,    0,               23,                127,               0.891
  4096,         0,   64,    0,               23,                127,               1.012
  4096,         2,   64,    0,               23,                127,               1.017
  4096,         0,  128,    0,               23,                127,               0.975
  4096,         3,  128,    0,               23,                127,               0.974
  4096,         0,  256,    0,               23,                127,               0.974
  4096,         4,  256,    0,               23,                127,               0.972
  4096,         0,  512,    0,               23,                127,               1.002
  4096,         5,  512,    0,               23,                127,               1.016
  4096,         0, 1024,    0,               23,                127,               1.009
  4096,         6, 1024,    0,               23,                127,               1.008
  4096,         0, 2048,    0,               23,                127,               1.003
  4096,         7, 2048,    0,               23,                127,               1.004
   256,         1,   64,    0,               23,                127,               0.993
   256,         2,   64,    0,               23,                127,               0.999
   256,         3,   64,    0,               23,                127,               0.992
   256,         4,   64,    0,               23,                127,                0.99
   256,         5,   64,    0,               23,                127,                0.99
   256,         6,   64,    0,               23,                127,               0.994
   256,         7,   64,    0,               23,                127,               0.991
   512,         0,  256,    0,               23,                127,               0.971
   512,        16,  256,    0,               23,                127,               0.971
   512,        32,  256,    0,               23,                127,               1.005
   512,        48,  256,    0,               23,                127,               0.998
   512,        64,  256,    0,               23,                127,               1.001
   512,        80,  256,    0,               23,                127,               1.002
   512,        96,  256,    0,               23,                127,               1.005
   512,       112,  256,    0,               23,                127,               1.012
     1,         0,    0,    0,               23,                127,               1.024
     2,         0,    1,    0,               23,                127,               0.991
     3,         0,    2,    0,               23,                127,               0.997
     4,         0,    3,    0,               23,                127,               0.984
     5,         0,    4,    0,               23,                127,               0.993
     6,         0,    5,    0,               23,                127,               0.985
     7,         0,    6,    0,               23,                127,               0.979
     8,         0,    7,    0,               23,                127,               0.975
     9,         0,    8,    0,               23,                127,               0.965
    10,         0,    9,    0,               23,                127,               0.957
    11,         0,   10,    0,               23,                127,               0.979
    12,         0,   11,    0,               23,                127,               0.987
    13,         0,   12,    0,               23,                127,               1.023
    14,         0,   13,    0,               23,                127,               0.997
    15,         0,   14,    0,               23,                127,               0.983
    16,         0,   15,    0,               23,                127,               0.987
    17,         0,   16,    0,               23,                127,               0.993
    18,         0,   17,    0,               23,                127,               0.985
    19,         0,   18,    0,               23,                127,               0.999
    20,         0,   19,    0,               23,                127,               0.998
    21,         0,   20,    0,               23,                127,               0.983
    22,         0,   21,    0,               23,                127,               0.983
    23,         0,   22,    0,               23,                127,               1.002
    24,         0,   23,    0,               23,                127,                 1.0
    25,         0,   24,    0,               23,                127,               1.002
    26,         0,   25,    0,               23,                127,               0.984
    27,         0,   26,    0,               23,                127,               0.994
    28,         0,   27,    0,               23,                127,               0.995
    29,         0,   28,    0,               23,                127,               1.017
    30,         0,   29,    0,               23,                127,               1.009
    31,         0,   30,    0,               23,                127,               1.001
    32,         0,   31,    0,               23,                127,               1.021
  2048,         0,   32,    0,                0,                127,               0.899
  2048,         1,   32,    0,                0,                127,                0.93
  2048,         0,   64,    0,                0,                127,               1.009
  2048,         2,   64,    0,                0,                127,               1.023
  2048,         0,  128,    0,                0,                127,               0.973
  2048,         3,  128,    0,                0,                127,               0.975
  2048,         0,  256,    0,                0,                127,               0.974
  2048,         4,  256,    0,                0,                127,                0.97
  2048,         0,  512,    0,                0,                127,               0.999
  2048,         5,  512,    0,                0,                127,               1.004
  2048,         0, 1024,    0,                0,                127,               1.008
  2048,         6, 1024,    0,                0,                127,               1.008
  2048,         0, 2048,    0,                0,                127,               0.996
  2048,         7, 2048,    0,                0,                127,               1.002
  4096,         0,   32,    0,                0,                127,               0.872
  4096,         1,   32,    0,                0,                127,               0.881
  4096,         0,   64,    0,                0,                127,               1.006
  4096,         2,   64,    0,                0,                127,               1.005
  4096,         0,  128,    0,                0,                127,               0.973
  4096,         3,  128,    0,                0,                127,               0.974
  4096,         0,  256,    0,                0,                127,               0.969
  4096,         4,  256,    0,                0,                127,               0.971
  4096,         0,  512,    0,                0,                127,                 1.0
  4096,         5,  512,    0,                0,                127,               1.005
  4096,         0, 1024,    0,                0,                127,               1.007
  4096,         6, 1024,    0,                0,                127,               1.009
  4096,         0, 2048,    0,                0,                127,               1.005
  4096,         7, 2048,    0,                0,                127,               1.007
   256,         1,   64,    0,                0,                127,               0.994
   256,         2,   64,    0,                0,                127,               1.008
   256,         3,   64,    0,                0,                127,               1.019
   256,         4,   64,    0,                0,                127,               0.991
   256,         5,   64,    0,                0,                127,               0.992
   256,         6,   64,    0,                0,                127,               0.991
   256,         7,   64,    0,                0,                127,               0.988
   512,         0,  256,    0,                0,                127,               0.971
   512,        16,  256,    0,                0,                127,               0.967
   512,        32,  256,    0,                0,                127,               1.005
   512,        48,  256,    0,                0,                127,               1.001
   512,        64,  256,    0,                0,                127,               1.009
   512,        80,  256,    0,                0,                127,               1.008
   512,        96,  256,    0,                0,                127,               1.009
   512,       112,  256,    0,                0,                127,               1.016
     1,         0,    0,    0,                0,                127,               1.038
     2,         0,    1,    0,                0,                127,               1.009
     3,         0,    2,    0,                0,                127,               0.992
     4,         0,    3,    0,                0,                127,               1.004
     5,         0,    4,    0,                0,                127,               0.966
     6,         0,    5,    0,                0,                127,               0.968
     7,         0,    6,    0,                0,                127,               1.004
     8,         0,    7,    0,                0,                127,                0.99
     9,         0,    8,    0,                0,                127,               0.958
    10,         0,    9,    0,                0,                127,                0.96
    11,         0,   10,    0,                0,                127,               0.948
    12,         0,   11,    0,                0,                127,               0.984
    13,         0,   12,    0,                0,                127,               0.967
    14,         0,   13,    0,                0,                127,               0.993
    15,         0,   14,    0,                0,                127,               0.991
    16,         0,   15,    0,                0,                127,                 1.0
    17,         0,   16,    0,                0,                127,               0.982
    18,         0,   17,    0,                0,                127,               0.977
    19,         0,   18,    0,                0,                127,               0.987
    20,         0,   19,    0,                0,                127,               0.978
    21,         0,   20,    0,                0,                127,                 1.0
    22,         0,   21,    0,                0,                127,                0.99
    23,         0,   22,    0,                0,                127,               0.988
    24,         0,   23,    0,                0,                127,               0.997
    25,         0,   24,    0,                0,                127,               1.003
    26,         0,   25,    0,                0,                127,               1.004
    27,         0,   26,    0,                0,                127,               0.982
    28,         0,   27,    0,                0,                127,               0.972
    29,         0,   28,    0,                0,                127,               0.978
    30,         0,   29,    0,                0,                127,               0.992
    31,         0,   30,    0,                0,                127,               0.986
    32,         0,   31,    0,                0,                127,                 1.0

    16,         0,   15,    1,                1,                  0,               0.997
    16,         0,   15,    1,                0,                  0,               1.001
    16,         0,   15,    1,                1,                0.1,               0.984
    16,         0,   15,    1,                0,                0.1,               0.999
    16,         0,   15,    1,                1,               0.25,               0.929
    16,         0,   15,    1,                0,               0.25,               1.001
    16,         0,   15,    1,                1,               0.33,               0.892
    16,         0,   15,    1,                0,               0.33,               0.996
    16,         0,   15,    1,                1,                0.5,               0.897
    16,         0,   15,    1,                0,                0.5,               1.009
    16,         0,   15,    1,                1,               0.66,               0.882
    16,         0,   15,    1,                0,               0.66,               0.967
    16,         0,   15,    1,                1,               0.75,               0.919
    16,         0,   15,    1,                0,               0.75,               1.027
    16,         0,   15,    1,                1,                0.9,               0.949
    16,         0,   15,    1,                0,                0.9,               1.021
    16,         0,   15,    1,                1,                  1,               0.998
    16,         0,   15,    1,                0,                  1,               0.999

 sysdeps/x86_64/multiarch/strchr-evex.S | 146 ++++++++++++++-----------
 1 file changed, 80 insertions(+), 66 deletions(-)
  

Comments

H.J. Lu March 24, 2022, 6:54 p.m. UTC | #1
On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Small code cleanup for size: -81 bytes.
>
> Add comment justifying using a branch to do NULL/non-null return.
>
> All string/memory tests pass and no regressions in benchtests.
>
> geometric_mean(N=20) of all benchmarks New / Original: .985
> ---
> Geometric Mean N=20 runs; All functions page aligned
> length, alignment,  pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time
>   2048,         0,   32,    0,               23,                127,               0.878
>   2048,         1,   32,    0,               23,                127,                0.88
>   2048,         0,   64,    0,               23,                127,               0.997
>   2048,         2,   64,    0,               23,                127,               1.001
>   2048,         0,  128,    0,               23,                127,               0.973
>   2048,         3,  128,    0,               23,                127,               0.971
>   2048,         0,  256,    0,               23,                127,               0.976
>   2048,         4,  256,    0,               23,                127,               0.973
>   2048,         0,  512,    0,               23,                127,               1.001
>   2048,         5,  512,    0,               23,                127,               1.004
>   2048,         0, 1024,    0,               23,                127,               1.005
>   2048,         6, 1024,    0,               23,                127,               1.007
>   2048,         0, 2048,    0,               23,                127,               1.035
>   2048,         7, 2048,    0,               23,                127,                1.03
>   4096,         0,   32,    0,               23,                127,               0.889
>   4096,         1,   32,    0,               23,                127,               0.891
>   4096,         0,   64,    0,               23,                127,               1.012
>   4096,         2,   64,    0,               23,                127,               1.017
>   4096,         0,  128,    0,               23,                127,               0.975
>   4096,         3,  128,    0,               23,                127,               0.974
>   4096,         0,  256,    0,               23,                127,               0.974
>   4096,         4,  256,    0,               23,                127,               0.972
>   4096,         0,  512,    0,               23,                127,               1.002
>   4096,         5,  512,    0,               23,                127,               1.016
>   4096,         0, 1024,    0,               23,                127,               1.009
>   4096,         6, 1024,    0,               23,                127,               1.008
>   4096,         0, 2048,    0,               23,                127,               1.003
>   4096,         7, 2048,    0,               23,                127,               1.004
>    256,         1,   64,    0,               23,                127,               0.993
>    256,         2,   64,    0,               23,                127,               0.999
>    256,         3,   64,    0,               23,                127,               0.992
>    256,         4,   64,    0,               23,                127,                0.99
>    256,         5,   64,    0,               23,                127,                0.99
>    256,         6,   64,    0,               23,                127,               0.994
>    256,         7,   64,    0,               23,                127,               0.991
>    512,         0,  256,    0,               23,                127,               0.971
>    512,        16,  256,    0,               23,                127,               0.971
>    512,        32,  256,    0,               23,                127,               1.005
>    512,        48,  256,    0,               23,                127,               0.998
>    512,        64,  256,    0,               23,                127,               1.001
>    512,        80,  256,    0,               23,                127,               1.002
>    512,        96,  256,    0,               23,                127,               1.005
>    512,       112,  256,    0,               23,                127,               1.012
>      1,         0,    0,    0,               23,                127,               1.024
>      2,         0,    1,    0,               23,                127,               0.991
>      3,         0,    2,    0,               23,                127,               0.997
>      4,         0,    3,    0,               23,                127,               0.984
>      5,         0,    4,    0,               23,                127,               0.993
>      6,         0,    5,    0,               23,                127,               0.985
>      7,         0,    6,    0,               23,                127,               0.979
>      8,         0,    7,    0,               23,                127,               0.975
>      9,         0,    8,    0,               23,                127,               0.965
>     10,         0,    9,    0,               23,                127,               0.957
>     11,         0,   10,    0,               23,                127,               0.979
>     12,         0,   11,    0,               23,                127,               0.987
>     13,         0,   12,    0,               23,                127,               1.023
>     14,         0,   13,    0,               23,                127,               0.997
>     15,         0,   14,    0,               23,                127,               0.983
>     16,         0,   15,    0,               23,                127,               0.987
>     17,         0,   16,    0,               23,                127,               0.993
>     18,         0,   17,    0,               23,                127,               0.985
>     19,         0,   18,    0,               23,                127,               0.999
>     20,         0,   19,    0,               23,                127,               0.998
>     21,         0,   20,    0,               23,                127,               0.983
>     22,         0,   21,    0,               23,                127,               0.983
>     23,         0,   22,    0,               23,                127,               1.002
>     24,         0,   23,    0,               23,                127,                 1.0
>     25,         0,   24,    0,               23,                127,               1.002
>     26,         0,   25,    0,               23,                127,               0.984
>     27,         0,   26,    0,               23,                127,               0.994
>     28,         0,   27,    0,               23,                127,               0.995
>     29,         0,   28,    0,               23,                127,               1.017
>     30,         0,   29,    0,               23,                127,               1.009
>     31,         0,   30,    0,               23,                127,               1.001
>     32,         0,   31,    0,               23,                127,               1.021
>   2048,         0,   32,    0,                0,                127,               0.899
>   2048,         1,   32,    0,                0,                127,                0.93
>   2048,         0,   64,    0,                0,                127,               1.009
>   2048,         2,   64,    0,                0,                127,               1.023
>   2048,         0,  128,    0,                0,                127,               0.973
>   2048,         3,  128,    0,                0,                127,               0.975
>   2048,         0,  256,    0,                0,                127,               0.974
>   2048,         4,  256,    0,                0,                127,                0.97
>   2048,         0,  512,    0,                0,                127,               0.999
>   2048,         5,  512,    0,                0,                127,               1.004
>   2048,         0, 1024,    0,                0,                127,               1.008
>   2048,         6, 1024,    0,                0,                127,               1.008
>   2048,         0, 2048,    0,                0,                127,               0.996
>   2048,         7, 2048,    0,                0,                127,               1.002
>   4096,         0,   32,    0,                0,                127,               0.872
>   4096,         1,   32,    0,                0,                127,               0.881
>   4096,         0,   64,    0,                0,                127,               1.006
>   4096,         2,   64,    0,                0,                127,               1.005
>   4096,         0,  128,    0,                0,                127,               0.973
>   4096,         3,  128,    0,                0,                127,               0.974
>   4096,         0,  256,    0,                0,                127,               0.969
>   4096,         4,  256,    0,                0,                127,               0.971
>   4096,         0,  512,    0,                0,                127,                 1.0
>   4096,         5,  512,    0,                0,                127,               1.005
>   4096,         0, 1024,    0,                0,                127,               1.007
>   4096,         6, 1024,    0,                0,                127,               1.009
>   4096,         0, 2048,    0,                0,                127,               1.005
>   4096,         7, 2048,    0,                0,                127,               1.007
>    256,         1,   64,    0,                0,                127,               0.994
>    256,         2,   64,    0,                0,                127,               1.008
>    256,         3,   64,    0,                0,                127,               1.019
>    256,         4,   64,    0,                0,                127,               0.991
>    256,         5,   64,    0,                0,                127,               0.992
>    256,         6,   64,    0,                0,                127,               0.991
>    256,         7,   64,    0,                0,                127,               0.988
>    512,         0,  256,    0,                0,                127,               0.971
>    512,        16,  256,    0,                0,                127,               0.967
>    512,        32,  256,    0,                0,                127,               1.005
>    512,        48,  256,    0,                0,                127,               1.001
>    512,        64,  256,    0,                0,                127,               1.009
>    512,        80,  256,    0,                0,                127,               1.008
>    512,        96,  256,    0,                0,                127,               1.009
>    512,       112,  256,    0,                0,                127,               1.016
>      1,         0,    0,    0,                0,                127,               1.038
>      2,         0,    1,    0,                0,                127,               1.009
>      3,         0,    2,    0,                0,                127,               0.992
>      4,         0,    3,    0,                0,                127,               1.004
>      5,         0,    4,    0,                0,                127,               0.966
>      6,         0,    5,    0,                0,                127,               0.968
>      7,         0,    6,    0,                0,                127,               1.004
>      8,         0,    7,    0,                0,                127,                0.99
>      9,         0,    8,    0,                0,                127,               0.958
>     10,         0,    9,    0,                0,                127,                0.96
>     11,         0,   10,    0,                0,                127,               0.948
>     12,         0,   11,    0,                0,                127,               0.984
>     13,         0,   12,    0,                0,                127,               0.967
>     14,         0,   13,    0,                0,                127,               0.993
>     15,         0,   14,    0,                0,                127,               0.991
>     16,         0,   15,    0,                0,                127,                 1.0
>     17,         0,   16,    0,                0,                127,               0.982
>     18,         0,   17,    0,                0,                127,               0.977
>     19,         0,   18,    0,                0,                127,               0.987
>     20,         0,   19,    0,                0,                127,               0.978
>     21,         0,   20,    0,                0,                127,                 1.0
>     22,         0,   21,    0,                0,                127,                0.99
>     23,         0,   22,    0,                0,                127,               0.988
>     24,         0,   23,    0,                0,                127,               0.997
>     25,         0,   24,    0,                0,                127,               1.003
>     26,         0,   25,    0,                0,                127,               1.004
>     27,         0,   26,    0,                0,                127,               0.982
>     28,         0,   27,    0,                0,                127,               0.972
>     29,         0,   28,    0,                0,                127,               0.978
>     30,         0,   29,    0,                0,                127,               0.992
>     31,         0,   30,    0,                0,                127,               0.986
>     32,         0,   31,    0,                0,                127,                 1.0
>
>     16,         0,   15,    1,                1,                  0,               0.997
>     16,         0,   15,    1,                0,                  0,               1.001
>     16,         0,   15,    1,                1,                0.1,               0.984
>     16,         0,   15,    1,                0,                0.1,               0.999
>     16,         0,   15,    1,                1,               0.25,               0.929
>     16,         0,   15,    1,                0,               0.25,               1.001
>     16,         0,   15,    1,                1,               0.33,               0.892
>     16,         0,   15,    1,                0,               0.33,               0.996
>     16,         0,   15,    1,                1,                0.5,               0.897
>     16,         0,   15,    1,                0,                0.5,               1.009
>     16,         0,   15,    1,                1,               0.66,               0.882
>     16,         0,   15,    1,                0,               0.66,               0.967
>     16,         0,   15,    1,                1,               0.75,               0.919
>     16,         0,   15,    1,                0,               0.75,               1.027
>     16,         0,   15,    1,                1,                0.9,               0.949
>     16,         0,   15,    1,                0,                0.9,               1.021
>     16,         0,   15,    1,                1,                  1,               0.998
>     16,         0,   15,    1,                0,                  1,               0.999
>
>  sysdeps/x86_64/multiarch/strchr-evex.S | 146 ++++++++++++++-----------
>  1 file changed, 80 insertions(+), 66 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
> index f62cd9d144..ec739fb8f9 100644
> --- a/sysdeps/x86_64/multiarch/strchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/strchr-evex.S
> @@ -30,6 +30,7 @@
>  # ifdef USE_AS_WCSCHR
>  #  define VPBROADCAST  vpbroadcastd
>  #  define VPCMP                vpcmpd
> +#  define VPTESTN      vptestnmd
>  #  define VPMINU       vpminud
>  #  define CHAR_REG     esi
>  #  define SHIFT_REG    ecx
> @@ -37,6 +38,7 @@
>  # else
>  #  define VPBROADCAST  vpbroadcastb
>  #  define VPCMP                vpcmpb
> +#  define VPTESTN      vptestnmb
>  #  define VPMINU       vpminub
>  #  define CHAR_REG     sil
>  #  define SHIFT_REG    edx
> @@ -61,13 +63,11 @@
>  # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
>
>         .section .text.evex,"ax",@progbits
> -ENTRY (STRCHR)
> +ENTRY_P2ALIGN (STRCHR, 5)
>         /* Broadcast CHAR to YMM0.      */
>         VPBROADCAST     %esi, %YMM0
>         movl    %edi, %eax
>         andl    $(PAGE_SIZE - 1), %eax
> -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> -
>         /* Check if we cross page boundary with one vector load.
>            Otherwise it is safe to use an unaligned load.  */
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> @@ -81,49 +81,35 @@ ENTRY (STRCHR)
>         vpxorq  %YMM1, %YMM0, %YMM2
>         VPMINU  %YMM2, %YMM1, %YMM2
>         /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> +       VPTESTN %YMM2, %YMM2, %k0
>         kmovd   %k0, %eax
>         testl   %eax, %eax
>         jz      L(aligned_more)
>         tzcntl  %eax, %eax
> +# ifndef USE_AS_STRCHRNUL
> +       /* Found CHAR or the null byte.  */
> +       cmp     (%rdi, %rax, CHAR_SIZE), %CHAR_REG
> +       /* NB: Use a branch instead of cmovcc here. The expectation is
> +          that with strchr the user will branch based on input being
> +          null. Since this branch will be 100% predictive of the user
> +          branch a branch miss here should save what otherwise would
> +          be a branch miss in the user code. Otherwise using a branch 1)
> +          saves code size and 2) is faster in highly predictable
> +          environments.  */
> +       jne     L(zero)
> +# endif
>  # ifdef USE_AS_WCSCHR
>         /* NB: Multiply wchar_t count by 4 to get the number of bytes.
>          */
>         leaq    (%rdi, %rax, CHAR_SIZE), %rax
>  # else
>         addq    %rdi, %rax
> -# endif
> -# ifndef USE_AS_STRCHRNUL
> -       /* Found CHAR or the null byte.  */
> -       cmp     (%rax), %CHAR_REG
> -       jne     L(zero)
>  # endif
>         ret
>
> -       /* .p2align 5 helps keep performance more consistent if ENTRY()
> -          alignment % 32 was either 16 or 0. As well this makes the
> -          alignment % 32 of the loop_4x_vec fixed which makes tuning it
> -          easier.  */
> -       .p2align 5
> -L(first_vec_x3):
> -       tzcntl  %eax, %eax
> -# ifndef USE_AS_STRCHRNUL
> -       /* Found CHAR or the null byte.  */
> -       cmp     (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> -       jne     L(zero)
> -# endif
> -       /* NB: Multiply sizeof char type (1 or 4) to get the number of
> -          bytes.  */
> -       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> -       ret
>
> -# ifndef USE_AS_STRCHRNUL
> -L(zero):
> -       xorl    %eax, %eax
> -       ret
> -# endif
>
> -       .p2align 4
> +       .p2align 4,, 10
>  L(first_vec_x4):
>  # ifndef USE_AS_STRCHRNUL
>         /* Check to see if first match was CHAR (k0) or null (k1).  */
> @@ -144,9 +130,18 @@ L(first_vec_x4):
>         leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> +# ifndef USE_AS_STRCHRNUL
> +L(zero):
> +       xorl    %eax, %eax
> +       ret
> +# endif
> +
> +
>         .p2align 4
>  L(first_vec_x1):
> -       tzcntl  %eax, %eax
> +       /* Use bsf here to save 1 byte, keeping the block in 1x
> +          fetch block. eax guaranteed non-zero.  */
> +       bsfl    %eax, %eax
>  # ifndef USE_AS_STRCHRNUL
>         /* Found CHAR or the null byte.  */
>         cmp     (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> @@ -158,7 +153,7 @@ L(first_vec_x1):
>         leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4
> +       .p2align 4,, 10
>  L(first_vec_x2):
>  # ifndef USE_AS_STRCHRNUL
>         /* Check to see if first match was CHAR (k0) or null (k1).  */
> @@ -179,6 +174,21 @@ L(first_vec_x2):
>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> +       .p2align 4,, 10
> +L(first_vec_x3):
> +       /* Use bsf here to save 1 byte, keeping the block in 1x
> +          fetch block. eax guaranteed non-zero.  */
> +       bsfl    %eax, %eax
> +# ifndef USE_AS_STRCHRNUL
> +       /* Found CHAR or the null byte.  */
> +       cmp     (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> +       jne     L(zero)
> +# endif
> +       /* NB: Multiply sizeof char type (1 or 4) to get the number of
> +          bytes.  */
> +       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
> +
>         .p2align 4
>  L(aligned_more):
>         /* Align data to VEC_SIZE.  */
> @@ -195,7 +205,7 @@ L(cross_page_continue):
>         vpxorq  %YMM1, %YMM0, %YMM2
>         VPMINU  %YMM2, %YMM1, %YMM2
>         /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> +       VPTESTN %YMM2, %YMM2, %k0
>         kmovd   %k0, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x1)
> @@ -206,7 +216,7 @@ L(cross_page_continue):
>         /* Each bit in K0 represents a CHAR in YMM1.  */
>         VPCMP   $0, %YMM1, %YMM0, %k0
>         /* Each bit in K1 represents a CHAR in YMM1.  */
> -       VPCMP   $0, %YMM1, %YMMZERO, %k1
> +       VPTESTN %YMM1, %YMM1, %k1
>         kortestd        %k0, %k1
>         jnz     L(first_vec_x2)
>
> @@ -215,7 +225,7 @@ L(cross_page_continue):
>         vpxorq  %YMM1, %YMM0, %YMM2
>         VPMINU  %YMM2, %YMM1, %YMM2
>         /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> +       VPTESTN %YMM2, %YMM2, %k0
>         kmovd   %k0, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x3)
> @@ -224,7 +234,7 @@ L(cross_page_continue):
>         /* Each bit in K0 represents a CHAR in YMM1.  */
>         VPCMP   $0, %YMM1, %YMM0, %k0
>         /* Each bit in K1 represents a CHAR in YMM1.  */
> -       VPCMP   $0, %YMM1, %YMMZERO, %k1
> +       VPTESTN %YMM1, %YMM1, %k1
>         kortestd        %k0, %k1
>         jnz     L(first_vec_x4)
>
> @@ -265,33 +275,33 @@ L(loop_4x_vec):
>         VPMINU  %YMM3, %YMM4, %YMM4
>         VPMINU  %YMM2, %YMM4, %YMM4{%k4}{z}
>
> -       VPCMP   $0, %YMMZERO, %YMM4, %k1
> +       VPTESTN %YMM4, %YMM4, %k1
>         kmovd   %k1, %ecx
>         subq    $-(VEC_SIZE * 4), %rdi
>         testl   %ecx, %ecx
>         jz      L(loop_4x_vec)
>
> -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> +       VPTESTN %YMM1, %YMM1, %k0
>         kmovd   %k0, %eax
>         testl   %eax, %eax
>         jnz     L(last_vec_x1)
>
> -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> +       VPTESTN %YMM2, %YMM2, %k0
>         kmovd   %k0, %eax
>         testl   %eax, %eax
>         jnz     L(last_vec_x2)
>
> -       VPCMP   $0, %YMMZERO, %YMM3, %k0
> +       VPTESTN %YMM3, %YMM3, %k0
>         kmovd   %k0, %eax
>         /* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
>  # ifdef USE_AS_WCSCHR
>         sall    $8, %ecx
>         orl     %ecx, %eax
> -       tzcntl  %eax, %eax
> +       bsfl    %eax, %eax
>  # else
>         salq    $32, %rcx
>         orq     %rcx, %rax
> -       tzcntq  %rax, %rax
> +       bsfq    %rax, %rax
>  # endif
>  # ifndef USE_AS_STRCHRNUL
>         /* Check if match was CHAR or null.  */
> @@ -303,28 +313,28 @@ L(loop_4x_vec):
>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> -# ifndef USE_AS_STRCHRNUL
> -L(zero_end):
> -       xorl    %eax, %eax
> -       ret
> +       .p2align 4,, 8
> +L(last_vec_x1):
> +       bsfl    %eax, %eax
> +# ifdef USE_AS_WCSCHR
> +       /* NB: Multiply wchar_t count by 4 to get the number of bytes.
> +          */
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +       addq    %rdi, %rax
>  # endif
>
> -       .p2align 4
> -L(last_vec_x1):
> -       tzcntl  %eax, %eax
>  # ifndef USE_AS_STRCHRNUL
>         /* Check if match was null.  */
> -       cmp     (%rdi, %rax, CHAR_SIZE), %CHAR_REG
> +       cmp     (%rax), %CHAR_REG
>         jne     L(zero_end)
>  # endif
> -       /* NB: Multiply sizeof char type (1 or 4) to get the number of
> -          bytes.  */
> -       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +
>         ret
>
> -       .p2align 4
> +       .p2align 4,, 8
>  L(last_vec_x2):
> -       tzcntl  %eax, %eax
> +       bsfl    %eax, %eax
>  # ifndef USE_AS_STRCHRNUL
>         /* Check if match was null.  */
>         cmp     (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> @@ -336,7 +346,7 @@ L(last_vec_x2):
>         ret
>
>         /* Cold case for crossing page with first load.  */
> -       .p2align 4
> +       .p2align 4,, 8
>  L(cross_page_boundary):
>         movq    %rdi, %rdx
>         /* Align rdi.  */
> @@ -346,9 +356,9 @@ L(cross_page_boundary):
>         vpxorq  %YMM1, %YMM0, %YMM2
>         VPMINU  %YMM2, %YMM1, %YMM2
>         /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> +       VPTESTN %YMM2, %YMM2, %k0
>         kmovd   %k0, %eax
> -       /* Remove the leading bits.      */
> +       /* Remove the leading bits.  */
>  # ifdef USE_AS_WCSCHR
>         movl    %edx, %SHIFT_REG
>         /* NB: Divide shift count by 4 since each bit in K1 represent 4
> @@ -360,20 +370,24 @@ L(cross_page_boundary):
>         /* If eax is zero continue.  */
>         testl   %eax, %eax
>         jz      L(cross_page_continue)
> -       tzcntl  %eax, %eax
> -# ifndef USE_AS_STRCHRNUL
> -       /* Check to see if match was CHAR or null.  */
> -       cmp     (%rdx, %rax, CHAR_SIZE), %CHAR_REG
> -       jne     L(zero_end)
> -# endif
> +       bsfl    %eax, %eax
> +
>  # ifdef USE_AS_WCSCHR
>         /* NB: Multiply wchar_t count by 4 to get the number of
>            bytes.  */
>         leaq    (%rdx, %rax, CHAR_SIZE), %rax
>  # else
>         addq    %rdx, %rax
> +# endif
> +# ifndef USE_AS_STRCHRNUL
> +       /* Check to see if match was CHAR or null.  */
> +       cmp     (%rax), %CHAR_REG
> +       je      L(cross_page_ret)
> +L(zero_end):
> +       xorl    %eax, %eax
> +L(cross_page_ret):
>  # endif
>         ret
>
>  END (STRCHR)
> -# endif
> +#endif
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
  
Sunil Pandey May 12, 2022, 7:32 p.m. UTC | #2
On Thu, Mar 24, 2022 at 11:55 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 2:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Small code cleanup for size: -81 bytes.
> >
> > Add comment justifying using a branch to do NULL/non-null return.
> >
> > All string/memory tests pass and no regressions in benchtests.
> >
> > geometric_mean(N=20) of all benchmarks New / Original: .985
> > ---
> > > Geometric Mean N=20 runs; All functions page aligned
> > length, alignment,  pos, rand, seek_char/branch, max_char/perc-zero, New Time / Old Time
> >   2048,         0,   32,    0,               23,                127,               0.878
> >   2048,         1,   32,    0,               23,                127,                0.88
> >   2048,         0,   64,    0,               23,                127,               0.997
> >   2048,         2,   64,    0,               23,                127,               1.001
> >   2048,         0,  128,    0,               23,                127,               0.973
> >   2048,         3,  128,    0,               23,                127,               0.971
> >   2048,         0,  256,    0,               23,                127,               0.976
> >   2048,         4,  256,    0,               23,                127,               0.973
> >   2048,         0,  512,    0,               23,                127,               1.001
> >   2048,         5,  512,    0,               23,                127,               1.004
> >   2048,         0, 1024,    0,               23,                127,               1.005
> >   2048,         6, 1024,    0,               23,                127,               1.007
> >   2048,         0, 2048,    0,               23,                127,               1.035
> >   2048,         7, 2048,    0,               23,                127,                1.03
> >   4096,         0,   32,    0,               23,                127,               0.889
> >   4096,         1,   32,    0,               23,                127,               0.891
> >   4096,         0,   64,    0,               23,                127,               1.012
> >   4096,         2,   64,    0,               23,                127,               1.017
> >   4096,         0,  128,    0,               23,                127,               0.975
> >   4096,         3,  128,    0,               23,                127,               0.974
> >   4096,         0,  256,    0,               23,                127,               0.974
> >   4096,         4,  256,    0,               23,                127,               0.972
> >   4096,         0,  512,    0,               23,                127,               1.002
> >   4096,         5,  512,    0,               23,                127,               1.016
> >   4096,         0, 1024,    0,               23,                127,               1.009
> >   4096,         6, 1024,    0,               23,                127,               1.008
> >   4096,         0, 2048,    0,               23,                127,               1.003
> >   4096,         7, 2048,    0,               23,                127,               1.004
> >    256,         1,   64,    0,               23,                127,               0.993
> >    256,         2,   64,    0,               23,                127,               0.999
> >    256,         3,   64,    0,               23,                127,               0.992
> >    256,         4,   64,    0,               23,                127,                0.99
> >    256,         5,   64,    0,               23,                127,                0.99
> >    256,         6,   64,    0,               23,                127,               0.994
> >    256,         7,   64,    0,               23,                127,               0.991
> >    512,         0,  256,    0,               23,                127,               0.971
> >    512,        16,  256,    0,               23,                127,               0.971
> >    512,        32,  256,    0,               23,                127,               1.005
> >    512,        48,  256,    0,               23,                127,               0.998
> >    512,        64,  256,    0,               23,                127,               1.001
> >    512,        80,  256,    0,               23,                127,               1.002
> >    512,        96,  256,    0,               23,                127,               1.005
> >    512,       112,  256,    0,               23,                127,               1.012
> >      1,         0,    0,    0,               23,                127,               1.024
> >      2,         0,    1,    0,               23,                127,               0.991
> >      3,         0,    2,    0,               23,                127,               0.997
> >      4,         0,    3,    0,               23,                127,               0.984
> >      5,         0,    4,    0,               23,                127,               0.993
> >      6,         0,    5,    0,               23,                127,               0.985
> >      7,         0,    6,    0,               23,                127,               0.979
> >      8,         0,    7,    0,               23,                127,               0.975
> >      9,         0,    8,    0,               23,                127,               0.965
> >     10,         0,    9,    0,               23,                127,               0.957
> >     11,         0,   10,    0,               23,                127,               0.979
> >     12,         0,   11,    0,               23,                127,               0.987
> >     13,         0,   12,    0,               23,                127,               1.023
> >     14,         0,   13,    0,               23,                127,               0.997
> >     15,         0,   14,    0,               23,                127,               0.983
> >     16,         0,   15,    0,               23,                127,               0.987
> >     17,         0,   16,    0,               23,                127,               0.993
> >     18,         0,   17,    0,               23,                127,               0.985
> >     19,         0,   18,    0,               23,                127,               0.999
> >     20,         0,   19,    0,               23,                127,               0.998
> >     21,         0,   20,    0,               23,                127,               0.983
> >     22,         0,   21,    0,               23,                127,               0.983
> >     23,         0,   22,    0,               23,                127,               1.002
> >     24,         0,   23,    0,               23,                127,                 1.0
> >     25,         0,   24,    0,               23,                127,               1.002
> >     26,         0,   25,    0,               23,                127,               0.984
> >     27,         0,   26,    0,               23,                127,               0.994
> >     28,         0,   27,    0,               23,                127,               0.995
> >     29,         0,   28,    0,               23,                127,               1.017
> >     30,         0,   29,    0,               23,                127,               1.009
> >     31,         0,   30,    0,               23,                127,               1.001
> >     32,         0,   31,    0,               23,                127,               1.021
> >   2048,         0,   32,    0,                0,                127,               0.899
> >   2048,         1,   32,    0,                0,                127,                0.93
> >   2048,         0,   64,    0,                0,                127,               1.009
> >   2048,         2,   64,    0,                0,                127,               1.023
> >   2048,         0,  128,    0,                0,                127,               0.973
> >   2048,         3,  128,    0,                0,                127,               0.975
> >   2048,         0,  256,    0,                0,                127,               0.974
> >   2048,         4,  256,    0,                0,                127,                0.97
> >   2048,         0,  512,    0,                0,                127,               0.999
> >   2048,         5,  512,    0,                0,                127,               1.004
> >   2048,         0, 1024,    0,                0,                127,               1.008
> >   2048,         6, 1024,    0,                0,                127,               1.008
> >   2048,         0, 2048,    0,                0,                127,               0.996
> >   2048,         7, 2048,    0,                0,                127,               1.002
> >   4096,         0,   32,    0,                0,                127,               0.872
> >   4096,         1,   32,    0,                0,                127,               0.881
> >   4096,         0,   64,    0,                0,                127,               1.006
> >   4096,         2,   64,    0,                0,                127,               1.005
> >   4096,         0,  128,    0,                0,                127,               0.973
> >   4096,         3,  128,    0,                0,                127,               0.974
> >   4096,         0,  256,    0,                0,                127,               0.969
> >   4096,         4,  256,    0,                0,                127,               0.971
> >   4096,         0,  512,    0,                0,                127,                 1.0
> >   4096,         5,  512,    0,                0,                127,               1.005
> >   4096,         0, 1024,    0,                0,                127,               1.007
> >   4096,         6, 1024,    0,                0,                127,               1.009
> >   4096,         0, 2048,    0,                0,                127,               1.005
> >   4096,         7, 2048,    0,                0,                127,               1.007
> >    256,         1,   64,    0,                0,                127,               0.994
> >    256,         2,   64,    0,                0,                127,               1.008
> >    256,         3,   64,    0,                0,                127,               1.019
> >    256,         4,   64,    0,                0,                127,               0.991
> >    256,         5,   64,    0,                0,                127,               0.992
> >    256,         6,   64,    0,                0,                127,               0.991
> >    256,         7,   64,    0,                0,                127,               0.988
> >    512,         0,  256,    0,                0,                127,               0.971
> >    512,        16,  256,    0,                0,                127,               0.967
> >    512,        32,  256,    0,                0,                127,               1.005
> >    512,        48,  256,    0,                0,                127,               1.001
> >    512,        64,  256,    0,                0,                127,               1.009
> >    512,        80,  256,    0,                0,                127,               1.008
> >    512,        96,  256,    0,                0,                127,               1.009
> >    512,       112,  256,    0,                0,                127,               1.016
> >      1,         0,    0,    0,                0,                127,               1.038
> >      2,         0,    1,    0,                0,                127,               1.009
> >      3,         0,    2,    0,                0,                127,               0.992
> >      4,         0,    3,    0,                0,                127,               1.004
> >      5,         0,    4,    0,                0,                127,               0.966
> >      6,         0,    5,    0,                0,                127,               0.968
> >      7,         0,    6,    0,                0,                127,               1.004
> >      8,         0,    7,    0,                0,                127,                0.99
> >      9,         0,    8,    0,                0,                127,               0.958
> >     10,         0,    9,    0,                0,                127,                0.96
> >     11,         0,   10,    0,                0,                127,               0.948
> >     12,         0,   11,    0,                0,                127,               0.984
> >     13,         0,   12,    0,                0,                127,               0.967
> >     14,         0,   13,    0,                0,                127,               0.993
> >     15,         0,   14,    0,                0,                127,               0.991
> >     16,         0,   15,    0,                0,                127,                 1.0
> >     17,         0,   16,    0,                0,                127,               0.982
> >     18,         0,   17,    0,                0,                127,               0.977
> >     19,         0,   18,    0,                0,                127,               0.987
> >     20,         0,   19,    0,                0,                127,               0.978
> >     21,         0,   20,    0,                0,                127,                 1.0
> >     22,         0,   21,    0,                0,                127,                0.99
> >     23,         0,   22,    0,                0,                127,               0.988
> >     24,         0,   23,    0,                0,                127,               0.997
> >     25,         0,   24,    0,                0,                127,               1.003
> >     26,         0,   25,    0,                0,                127,               1.004
> >     27,         0,   26,    0,                0,                127,               0.982
> >     28,         0,   27,    0,                0,                127,               0.972
> >     29,         0,   28,    0,                0,                127,               0.978
> >     30,         0,   29,    0,                0,                127,               0.992
> >     31,         0,   30,    0,                0,                127,               0.986
> >     32,         0,   31,    0,                0,                127,                 1.0
> >
> >     16,         0,   15,    1,                1,                  0,               0.997
> >     16,         0,   15,    1,                0,                  0,               1.001
> >     16,         0,   15,    1,                1,                0.1,               0.984
> >     16,         0,   15,    1,                0,                0.1,               0.999
> >     16,         0,   15,    1,                1,               0.25,               0.929
> >     16,         0,   15,    1,                0,               0.25,               1.001
> >     16,         0,   15,    1,                1,               0.33,               0.892
> >     16,         0,   15,    1,                0,               0.33,               0.996
> >     16,         0,   15,    1,                1,                0.5,               0.897
> >     16,         0,   15,    1,                0,                0.5,               1.009
> >     16,         0,   15,    1,                1,               0.66,               0.882
> >     16,         0,   15,    1,                0,               0.66,               0.967
> >     16,         0,   15,    1,                1,               0.75,               0.919
> >     16,         0,   15,    1,                0,               0.75,               1.027
> >     16,         0,   15,    1,                1,                0.9,               0.949
> >     16,         0,   15,    1,                0,                0.9,               1.021
> >     16,         0,   15,    1,                1,                  1,               0.998
> >     16,         0,   15,    1,                0,                  1,               0.999
> >
> >  sysdeps/x86_64/multiarch/strchr-evex.S | 146 ++++++++++++++-----------
> >  1 file changed, 80 insertions(+), 66 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
> > index f62cd9d144..ec739fb8f9 100644
> > --- a/sysdeps/x86_64/multiarch/strchr-evex.S
> > +++ b/sysdeps/x86_64/multiarch/strchr-evex.S
> > @@ -30,6 +30,7 @@
> >  # ifdef USE_AS_WCSCHR
> >  #  define VPBROADCAST  vpbroadcastd
> >  #  define VPCMP                vpcmpd
> > +#  define VPTESTN      vptestnmd
> >  #  define VPMINU       vpminud
> >  #  define CHAR_REG     esi
> >  #  define SHIFT_REG    ecx
> > @@ -37,6 +38,7 @@
> >  # else
> >  #  define VPBROADCAST  vpbroadcastb
> >  #  define VPCMP                vpcmpb
> > +#  define VPTESTN      vptestnmb
> >  #  define VPMINU       vpminub
> >  #  define CHAR_REG     sil
> >  #  define SHIFT_REG    edx
> > @@ -61,13 +63,11 @@
> >  # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> >
> >         .section .text.evex,"ax",@progbits
> > -ENTRY (STRCHR)
> > +ENTRY_P2ALIGN (STRCHR, 5)
> >         /* Broadcast CHAR to YMM0.      */
> >         VPBROADCAST     %esi, %YMM0
> >         movl    %edi, %eax
> >         andl    $(PAGE_SIZE - 1), %eax
> > -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> > -
> >         /* Check if we cross page boundary with one vector load.
> >            Otherwise it is safe to use an unaligned load.  */
> >         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > @@ -81,49 +81,35 @@ ENTRY (STRCHR)
> >         vpxorq  %YMM1, %YMM0, %YMM2
> >         VPMINU  %YMM2, %YMM1, %YMM2
> >         /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> > +       VPTESTN %YMM2, %YMM2, %k0
> >         kmovd   %k0, %eax
> >         testl   %eax, %eax
> >         jz      L(aligned_more)
> >         tzcntl  %eax, %eax
> > +# ifndef USE_AS_STRCHRNUL
> > +       /* Found CHAR or the null byte.  */
> > +       cmp     (%rdi, %rax, CHAR_SIZE), %CHAR_REG
> > +       /* NB: Use a branch instead of cmovcc here. The expectation is
> > +          that with strchr the user will branch based on input being
> > +          null. Since this branch will be 100% predictive of the user
> > +          branch a branch miss here should save what otherwise would
> > > +          be a branch miss in the user code. Otherwise using a branch 1)
> > +          saves code size and 2) is faster in highly predictable
> > +          environments.  */
> > +       jne     L(zero)
> > +# endif
> >  # ifdef USE_AS_WCSCHR
> >         /* NB: Multiply wchar_t count by 4 to get the number of bytes.
> >          */
> >         leaq    (%rdi, %rax, CHAR_SIZE), %rax
> >  # else
> >         addq    %rdi, %rax
> > -# endif
> > -# ifndef USE_AS_STRCHRNUL
> > -       /* Found CHAR or the null byte.  */
> > -       cmp     (%rax), %CHAR_REG
> > -       jne     L(zero)
> >  # endif
> >         ret
> >
> > -       /* .p2align 5 helps keep performance more consistent if ENTRY()
> > -          alignment % 32 was either 16 or 0. As well this makes the
> > -          alignment % 32 of the loop_4x_vec fixed which makes tuning it
> > -          easier.  */
> > -       .p2align 5
> > -L(first_vec_x3):
> > -       tzcntl  %eax, %eax
> > -# ifndef USE_AS_STRCHRNUL
> > -       /* Found CHAR or the null byte.  */
> > -       cmp     (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> > -       jne     L(zero)
> > -# endif
> > -       /* NB: Multiply sizeof char type (1 or 4) to get the number of
> > -          bytes.  */
> > -       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > -       ret
> >
> > -# ifndef USE_AS_STRCHRNUL
> > -L(zero):
> > -       xorl    %eax, %eax
> > -       ret
> > -# endif
> >
> > -       .p2align 4
> > +       .p2align 4,, 10
> >  L(first_vec_x4):
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Check to see if first match was CHAR (k0) or null (k1).  */
> > @@ -144,9 +130,18 @@ L(first_vec_x4):
> >         leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> >         ret
> >
> > +# ifndef USE_AS_STRCHRNUL
> > +L(zero):
> > +       xorl    %eax, %eax
> > +       ret
> > +# endif
> > +
> > +
> >         .p2align 4
> >  L(first_vec_x1):
> > -       tzcntl  %eax, %eax
> > > +       /* Use bsf here to save 1 byte, keeping the block in 1x
> > > +          fetch block. eax guaranteed non-zero.  */
> > +       bsfl    %eax, %eax
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Found CHAR or the null byte.  */
> >         cmp     (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> > @@ -158,7 +153,7 @@ L(first_vec_x1):
> >         leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> >         ret
> >
> > -       .p2align 4
> > +       .p2align 4,, 10
> >  L(first_vec_x2):
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Check to see if first match was CHAR (k0) or null (k1).  */
> > @@ -179,6 +174,21 @@ L(first_vec_x2):
> >         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> >         ret
> >
> > +       .p2align 4,, 10
> > +L(first_vec_x3):
> > > +       /* Use bsf here to save 1 byte, keeping the block in 1x
> > > +          fetch block. eax guaranteed non-zero.  */
> > +       bsfl    %eax, %eax
> > +# ifndef USE_AS_STRCHRNUL
> > +       /* Found CHAR or the null byte.  */
> > +       cmp     (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> > +       jne     L(zero)
> > +# endif
> > +       /* NB: Multiply sizeof char type (1 or 4) to get the number of
> > +          bytes.  */
> > +       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > +       ret
> > +
> >         .p2align 4
> >  L(aligned_more):
> >         /* Align data to VEC_SIZE.  */
> > @@ -195,7 +205,7 @@ L(cross_page_continue):
> >         vpxorq  %YMM1, %YMM0, %YMM2
> >         VPMINU  %YMM2, %YMM1, %YMM2
> >         /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> > +       VPTESTN %YMM2, %YMM2, %k0
> >         kmovd   %k0, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x1)
> > @@ -206,7 +216,7 @@ L(cross_page_continue):
> >         /* Each bit in K0 represents a CHAR in YMM1.  */
> >         VPCMP   $0, %YMM1, %YMM0, %k0
> >         /* Each bit in K1 represents a CHAR in YMM1.  */
> > -       VPCMP   $0, %YMM1, %YMMZERO, %k1
> > +       VPTESTN %YMM1, %YMM1, %k1
> >         kortestd        %k0, %k1
> >         jnz     L(first_vec_x2)
> >
> > @@ -215,7 +225,7 @@ L(cross_page_continue):
> >         vpxorq  %YMM1, %YMM0, %YMM2
> >         VPMINU  %YMM2, %YMM1, %YMM2
> >         /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> > +       VPTESTN %YMM2, %YMM2, %k0
> >         kmovd   %k0, %eax
> >         testl   %eax, %eax
> >         jnz     L(first_vec_x3)
> > @@ -224,7 +234,7 @@ L(cross_page_continue):
> >         /* Each bit in K0 represents a CHAR in YMM1.  */
> >         VPCMP   $0, %YMM1, %YMM0, %k0
> >         /* Each bit in K1 represents a CHAR in YMM1.  */
> > -       VPCMP   $0, %YMM1, %YMMZERO, %k1
> > +       VPTESTN %YMM1, %YMM1, %k1
> >         kortestd        %k0, %k1
> >         jnz     L(first_vec_x4)
> >
> > @@ -265,33 +275,33 @@ L(loop_4x_vec):
> >         VPMINU  %YMM3, %YMM4, %YMM4
> >         VPMINU  %YMM2, %YMM4, %YMM4{%k4}{z}
> >
> > -       VPCMP   $0, %YMMZERO, %YMM4, %k1
> > +       VPTESTN %YMM4, %YMM4, %k1
> >         kmovd   %k1, %ecx
> >         subq    $-(VEC_SIZE * 4), %rdi
> >         testl   %ecx, %ecx
> >         jz      L(loop_4x_vec)
> >
> > -       VPCMP   $0, %YMMZERO, %YMM1, %k0
> > +       VPTESTN %YMM1, %YMM1, %k0
> >         kmovd   %k0, %eax
> >         testl   %eax, %eax
> >         jnz     L(last_vec_x1)
> >
> > -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> > +       VPTESTN %YMM2, %YMM2, %k0
> >         kmovd   %k0, %eax
> >         testl   %eax, %eax
> >         jnz     L(last_vec_x2)
> >
> > -       VPCMP   $0, %YMMZERO, %YMM3, %k0
> > +       VPTESTN %YMM3, %YMM3, %k0
> >         kmovd   %k0, %eax
> >         /* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
> >  # ifdef USE_AS_WCSCHR
> >         sall    $8, %ecx
> >         orl     %ecx, %eax
> > -       tzcntl  %eax, %eax
> > +       bsfl    %eax, %eax
> >  # else
> >         salq    $32, %rcx
> >         orq     %rcx, %rax
> > -       tzcntq  %rax, %rax
> > +       bsfq    %rax, %rax
> >  # endif
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Check if match was CHAR or null.  */
> > @@ -303,28 +313,28 @@ L(loop_4x_vec):
> >         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> >         ret
> >
> > -# ifndef USE_AS_STRCHRNUL
> > -L(zero_end):
> > -       xorl    %eax, %eax
> > -       ret
> > +       .p2align 4,, 8
> > +L(last_vec_x1):
> > +       bsfl    %eax, %eax
> > +# ifdef USE_AS_WCSCHR
> > +       /* NB: Multiply wchar_t count by 4 to get the number of bytes.
> > +          */
> > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > +       addq    %rdi, %rax
> >  # endif
> >
> > -       .p2align 4
> > -L(last_vec_x1):
> > -       tzcntl  %eax, %eax
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Check if match was null.  */
> > -       cmp     (%rdi, %rax, CHAR_SIZE), %CHAR_REG
> > +       cmp     (%rax), %CHAR_REG
> >         jne     L(zero_end)
> >  # endif
> > -       /* NB: Multiply sizeof char type (1 or 4) to get the number of
> > -          bytes.  */
> > -       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +
> >         ret
> >
> > -       .p2align 4
> > +       .p2align 4,, 8
> >  L(last_vec_x2):
> > -       tzcntl  %eax, %eax
> > +       bsfl    %eax, %eax
> >  # ifndef USE_AS_STRCHRNUL
> >         /* Check if match was null.  */
> >         cmp     (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> > @@ -336,7 +346,7 @@ L(last_vec_x2):
> >         ret
> >
> >         /* Cold case for crossing page with first load.  */
> > -       .p2align 4
> > +       .p2align 4,, 8
> >  L(cross_page_boundary):
> >         movq    %rdi, %rdx
> >         /* Align rdi.  */
> > @@ -346,9 +356,9 @@ L(cross_page_boundary):
> >         vpxorq  %YMM1, %YMM0, %YMM2
> >         VPMINU  %YMM2, %YMM1, %YMM2
> >         /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> > -       VPCMP   $0, %YMMZERO, %YMM2, %k0
> > +       VPTESTN %YMM2, %YMM2, %k0
> >         kmovd   %k0, %eax
> > -       /* Remove the leading bits.      */
> > +       /* Remove the leading bits.  */
> >  # ifdef USE_AS_WCSCHR
> >         movl    %edx, %SHIFT_REG
> >         /* NB: Divide shift count by 4 since each bit in K1 represent 4
> > @@ -360,20 +370,24 @@ L(cross_page_boundary):
> >         /* If eax is zero continue.  */
> >         testl   %eax, %eax
> >         jz      L(cross_page_continue)
> > -       tzcntl  %eax, %eax
> > -# ifndef USE_AS_STRCHRNUL
> > -       /* Check to see if match was CHAR or null.  */
> > -       cmp     (%rdx, %rax, CHAR_SIZE), %CHAR_REG
> > -       jne     L(zero_end)
> > -# endif
> > +       bsfl    %eax, %eax
> > +
> >  # ifdef USE_AS_WCSCHR
> >         /* NB: Multiply wchar_t count by 4 to get the number of
> >            bytes.  */
> >         leaq    (%rdx, %rax, CHAR_SIZE), %rax
> >  # else
> >         addq    %rdx, %rax
> > +# endif
> > +# ifndef USE_AS_STRCHRNUL
> > +       /* Check to see if match was CHAR or null.  */
> > +       cmp     (%rax), %CHAR_REG
> > +       je      L(cross_page_ret)
> > +L(zero_end):
> > +       xorl    %eax, %eax
> > +L(cross_page_ret):
> >  # endif
> >         ret
> >
> >  END (STRCHR)
> > -# endif
> > +#endif
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
  

Patch

diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
index f62cd9d144..ec739fb8f9 100644
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -30,6 +30,7 @@ 
 # ifdef USE_AS_WCSCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMP		vpcmpd
+#  define VPTESTN	vptestnmd
 #  define VPMINU	vpminud
 #  define CHAR_REG	esi
 #  define SHIFT_REG	ecx
@@ -37,6 +38,7 @@ 
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMP		vpcmpb
+#  define VPTESTN	vptestnmb
 #  define VPMINU	vpminub
 #  define CHAR_REG	sil
 #  define SHIFT_REG	edx
@@ -61,13 +63,11 @@ 
 # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
 
 	.section .text.evex,"ax",@progbits
-ENTRY (STRCHR)
+ENTRY_P2ALIGN (STRCHR, 5)
 	/* Broadcast CHAR to YMM0.	*/
 	VPBROADCAST	%esi, %YMM0
 	movl	%edi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-
 	/* Check if we cross page boundary with one vector load.
 	   Otherwise it is safe to use an unaligned load.  */
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
@@ -81,49 +81,35 @@  ENTRY (STRCHR)
 	vpxorq	%YMM1, %YMM0, %YMM2
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jz	L(aligned_more)
 	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.  */
+	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	/* NB: Use a branch instead of cmovcc here. The expectation is
+	   that with strchr the user will branch based on the return
+	   value being NULL. Since this branch will be 100% predictive
+	   of the user branch, a branch miss here should save what
+	   otherwise would be a branch miss in the user code. Otherwise
+	   using a branch 1) saves code size and 2) is faster in highly
+	   predictable environments.  */
+	jne	L(zero)
+# endif
 # ifdef USE_AS_WCSCHR
 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
 	 */
 	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
 	addq	%rdi, %rax
-# endif
-# ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
-	cmp	(%rax), %CHAR_REG
-	jne	L(zero)
 # endif
 	ret
 
-	/* .p2align 5 helps keep performance more consistent if ENTRY()
-	   alignment % 32 was either 16 or 0. As well this makes the
-	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
-	   easier.  */
-	.p2align 5
-L(first_vec_x3):
-	tzcntl	%eax, %eax
-# ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
-	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
-	jne	L(zero)
-# endif
-	/* NB: Multiply sizeof char type (1 or 4) to get the number of
-	   bytes.  */
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
-	ret
 
-# ifndef USE_AS_STRCHRNUL
-L(zero):
-	xorl	%eax, %eax
-	ret
-# endif
 
-	.p2align 4
+	.p2align 4,, 10
 L(first_vec_x4):
 # ifndef USE_AS_STRCHRNUL
 	/* Check to see if first match was CHAR (k0) or null (k1).  */
@@ -144,9 +130,18 @@  L(first_vec_x4):
 	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
+# ifndef USE_AS_STRCHRNUL
+L(zero):
+	xorl	%eax, %eax
+	ret
+# endif
+
+
 	.p2align 4
 L(first_vec_x1):
-	tzcntl	%eax, %eax
+	/* Use bsf here to save 1 byte, keeping the block in 1x
+	   fetch block. eax is guaranteed non-zero.  */
+	bsfl	%eax, %eax
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
 	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
@@ -158,7 +153,7 @@  L(first_vec_x1):
 	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
+	.p2align 4,, 10
 L(first_vec_x2):
 # ifndef USE_AS_STRCHRNUL
 	/* Check to see if first match was CHAR (k0) or null (k1).  */
@@ -179,6 +174,21 @@  L(first_vec_x2):
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
+	.p2align 4,, 10
+L(first_vec_x3):
+	/* Use bsf here to save 1 byte, keeping the block in 1x
+	   fetch block. eax is guaranteed non-zero.  */
+	bsfl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero)
+# endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
 	.p2align 4
 L(aligned_more):
 	/* Align data to VEC_SIZE.  */
@@ -195,7 +205,7 @@  L(cross_page_continue):
 	vpxorq	%YMM1, %YMM0, %YMM2
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
@@ -206,7 +216,7 @@  L(cross_page_continue):
 	/* Each bit in K0 represents a CHAR in YMM1.  */
 	VPCMP	$0, %YMM1, %YMM0, %k0
 	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMM1, %YMMZERO, %k1
+	VPTESTN	%YMM1, %YMM1, %k1
 	kortestd	%k0, %k1
 	jnz	L(first_vec_x2)
 
@@ -215,7 +225,7 @@  L(cross_page_continue):
 	vpxorq	%YMM1, %YMM0, %YMM2
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
@@ -224,7 +234,7 @@  L(cross_page_continue):
 	/* Each bit in K0 represents a CHAR in YMM1.  */
 	VPCMP	$0, %YMM1, %YMM0, %k0
 	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMM1, %YMMZERO, %k1
+	VPTESTN	%YMM1, %YMM1, %k1
 	kortestd	%k0, %k1
 	jnz	L(first_vec_x4)
 
@@ -265,33 +275,33 @@  L(loop_4x_vec):
 	VPMINU	%YMM3, %YMM4, %YMM4
 	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
 
-	VPCMP	$0, %YMMZERO, %YMM4, %k1
+	VPTESTN	%YMM4, %YMM4, %k1
 	kmovd	%k1, %ecx
 	subq	$-(VEC_SIZE * 4), %rdi
 	testl	%ecx, %ecx
 	jz	L(loop_4x_vec)
 
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+	VPTESTN	%YMM1, %YMM1, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x1)
 
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x2)
 
-	VPCMP	$0, %YMMZERO, %YMM3, %k0
+	VPTESTN	%YMM3, %YMM3, %k0
 	kmovd	%k0, %eax
 	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
 # ifdef USE_AS_WCSCHR
 	sall	$8, %ecx
 	orl	%ecx, %eax
-	tzcntl	%eax, %eax
+	bsfl	%eax, %eax
 # else
 	salq	$32, %rcx
 	orq	%rcx, %rax
-	tzcntq	%rax, %rax
+	bsfq	%rax, %rax
 # endif
 # ifndef USE_AS_STRCHRNUL
 	/* Check if match was CHAR or null.  */
@@ -303,28 +313,28 @@  L(loop_4x_vec):
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-# ifndef USE_AS_STRCHRNUL
-L(zero_end):
-	xorl	%eax, %eax
-	ret
+	.p2align 4,, 8
+L(last_vec_x1):
+	bsfl	%eax, %eax
+# ifdef USE_AS_WCSCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+	   */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
 # endif
 
-	.p2align 4
-L(last_vec_x1):
-	tzcntl	%eax, %eax
 # ifndef USE_AS_STRCHRNUL
 	/* Check if match was null.  */
-	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	cmp	(%rax), %CHAR_REG
 	jne	L(zero_end)
 # endif
-	/* NB: Multiply sizeof char type (1 or 4) to get the number of
-	   bytes.  */
-	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+
 	ret
 
-	.p2align 4
+	.p2align 4,, 8
 L(last_vec_x2):
-	tzcntl	%eax, %eax
+	bsfl	%eax, %eax
 # ifndef USE_AS_STRCHRNUL
 	/* Check if match was null.  */
 	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
@@ -336,7 +346,7 @@  L(last_vec_x2):
 	ret
 
 	/* Cold case for crossing page with first load.	 */
-	.p2align 4
+	.p2align 4,, 8
 L(cross_page_boundary):
 	movq	%rdi, %rdx
 	/* Align rdi.  */
@@ -346,9 +356,9 @@  L(cross_page_boundary):
 	vpxorq	%YMM1, %YMM0, %YMM2
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %eax
-	/* Remove the leading bits.	 */
+	/* Remove the leading bits.  */
 # ifdef USE_AS_WCSCHR
 	movl	%edx, %SHIFT_REG
 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
@@ -360,20 +370,24 @@  L(cross_page_boundary):
 	/* If eax is zero continue.  */
 	testl	%eax, %eax
 	jz	L(cross_page_continue)
-	tzcntl	%eax, %eax
-# ifndef USE_AS_STRCHRNUL
-	/* Check to see if match was CHAR or null.  */
-	cmp	(%rdx, %rax, CHAR_SIZE), %CHAR_REG
-	jne	L(zero_end)
-# endif
+	bsfl	%eax, %eax
+
 # ifdef USE_AS_WCSCHR
 	/* NB: Multiply wchar_t count by 4 to get the number of
 	   bytes.  */
 	leaq	(%rdx, %rax, CHAR_SIZE), %rax
 # else
 	addq	%rdx, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if match was CHAR or null.  */
+	cmp	(%rax), %CHAR_REG
+	je	L(cross_page_ret)
+L(zero_end):
+	xorl	%eax, %eax
+L(cross_page_ret):
 # endif
 	ret
 
 END (STRCHR)
-# endif
+#endif