[v1,07/23] x86: Optimize strcspn and strpbrk in strcspn-c.c

Message ID 20220323215734.3927131-7-goldstein.w.n@gmail.com
State Accepted, archived
Headers
Series [v1,01/23] benchtests: Use json-lib in bench-strchr.c |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Noah Goldstein March 23, 2022, 9:57 p.m. UTC
  Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
_mm_cmpistri. Also change offset to unsigned to avoid unnecessary
sign extensions.

geometric_mean(N=20) of all benchmarks that don't fall back on
sse2/strlen; New / Original: .928

All string/memory tests pass.
---
Geometric Mean N=20 runs; All functions page aligned
len, align1, align2,  pos, New Time / Old Time
  0,      0,      0,  512,               1.207
  1,      0,      0,  512,               1.039
  1,      1,      0,  512,               0.997
  1,      0,      1,  512,               0.981
  1,      1,      1,  512,               0.977
  2,      0,      0,  512,                1.02
  2,      2,      0,  512,               0.979
  2,      0,      2,  512,               0.902
  2,      2,      2,  512,               0.958
  3,      0,      0,  512,               0.978
  3,      3,      0,  512,               0.988
  3,      0,      3,  512,               0.979
  3,      3,      3,  512,               0.955
  4,      0,      0,  512,               0.969
  4,      4,      0,  512,               0.991
  4,      0,      4,  512,                0.94
  4,      4,      4,  512,               0.958
  5,      0,      0,  512,               0.963
  5,      5,      0,  512,               1.004
  5,      0,      5,  512,               0.948
  5,      5,      5,  512,               0.971
  6,      0,      0,  512,               0.933
  6,      6,      0,  512,               1.007
  6,      0,      6,  512,               0.921
  6,      6,      6,  512,               0.969
  7,      0,      0,  512,               0.928
  7,      7,      0,  512,               0.976
  7,      0,      7,  512,               0.932
  7,      7,      7,  512,               0.995
  8,      0,      0,  512,               0.931
  8,      0,      8,  512,               0.766
  9,      0,      0,  512,               0.965
  9,      1,      0,  512,               0.999
  9,      0,      9,  512,               0.765
  9,      1,      9,  512,                0.97
 10,      0,      0,  512,               0.976
 10,      2,      0,  512,               0.991
 10,      0,     10,  512,               0.768
 10,      2,     10,  512,               0.926
 11,      0,      0,  512,               0.958
 11,      3,      0,  512,               1.006
 11,      0,     11,  512,               0.768
 11,      3,     11,  512,               0.908
 12,      0,      0,  512,               0.945
 12,      4,      0,  512,               0.896
 12,      0,     12,  512,               0.764
 12,      4,     12,  512,               0.785
 13,      0,      0,  512,               0.957
 13,      5,      0,  512,               1.019
 13,      0,     13,  512,                0.76
 13,      5,     13,  512,               0.785
 14,      0,      0,  512,               0.918
 14,      6,      0,  512,               1.004
 14,      0,     14,  512,                0.78
 14,      6,     14,  512,               0.711
 15,      0,      0,  512,               0.855
 15,      7,      0,  512,               0.985
 15,      0,     15,  512,               0.779
 15,      7,     15,  512,               0.772
 16,      0,      0,  512,               0.987
 16,      0,     16,  512,                0.99
 17,      0,      0,  512,               0.996
 17,      1,      0,  512,               0.979
 17,      0,     17,  512,               1.001
 17,      1,     17,  512,                1.03
 18,      0,      0,  512,               0.976
 18,      2,      0,  512,               0.989
 18,      0,     18,  512,               0.976
 18,      2,     18,  512,               0.992
 19,      0,      0,  512,               0.991
 19,      3,      0,  512,               0.988
 19,      0,     19,  512,               1.009
 19,      3,     19,  512,               1.018
 20,      0,      0,  512,               0.999
 20,      4,      0,  512,               1.005
 20,      0,     20,  512,               0.993
 20,      4,     20,  512,               0.983
 21,      0,      0,  512,               0.982
 21,      5,      0,  512,               0.988
 21,      0,     21,  512,               0.978
 21,      5,     21,  512,               0.984
 22,      0,      0,  512,               0.988
 22,      6,      0,  512,               0.979
 22,      0,     22,  512,               0.984
 22,      6,     22,  512,               0.983
 23,      0,      0,  512,               0.996
 23,      7,      0,  512,               0.998
 23,      0,     23,  512,               0.979
 23,      7,     23,  512,               0.987
 24,      0,      0,  512,                0.99
 24,      0,     24,  512,               0.979
 25,      0,      0,  512,               0.985
 25,      1,      0,  512,               0.988
 25,      0,     25,  512,                0.99
 25,      1,     25,  512,               0.986
 26,      0,      0,  512,               1.005
 26,      2,      0,  512,               0.995
 26,      0,     26,  512,               0.992
 26,      2,     26,  512,               0.983
 27,      0,      0,  512,               0.986
 27,      3,      0,  512,               0.978
 27,      0,     27,  512,               0.986
 27,      3,     27,  512,               0.973
 28,      0,      0,  512,               0.995
 28,      4,      0,  512,               0.993
 28,      0,     28,  512,               0.983
 28,      4,     28,  512,               1.005
 29,      0,      0,  512,               0.983
 29,      5,      0,  512,               0.982
 29,      0,     29,  512,               0.984
 29,      5,     29,  512,               1.005
 30,      0,      0,  512,               0.978
 30,      6,      0,  512,               0.985
 30,      0,     30,  512,               0.994
 30,      6,     30,  512,               0.993
 31,      0,      0,  512,               0.984
 31,      7,      0,  512,               0.983
 31,      0,     31,  512,                 1.0
 31,      7,     31,  512,               1.031
  4,      0,      0,   32,               0.916
  4,      1,      0,   32,               0.952
  4,      0,      1,   32,               0.927
  4,      1,      1,   32,               0.969
  4,      0,      0,   64,               0.961
  4,      2,      0,   64,               0.955
  4,      0,      2,   64,               0.975
  4,      2,      2,   64,               0.972
  4,      0,      0,  128,               0.971
  4,      3,      0,  128,               0.982
  4,      0,      3,  128,               0.945
  4,      3,      3,  128,               0.971
  4,      0,      0,  256,               1.004
  4,      4,      0,  256,               0.966
  4,      0,      4,  256,               0.961
  4,      4,      4,  256,               0.971
  4,      5,      0,  512,               0.929
  4,      0,      5,  512,               0.969
  4,      5,      5,  512,               0.985
  4,      0,      0, 1024,               1.003
  4,      6,      0, 1024,               1.009
  4,      0,      6, 1024,               1.005
  4,      6,      6, 1024,               0.999
  4,      0,      0, 2048,               0.917
  4,      7,      0, 2048,               1.015
  4,      0,      7, 2048,               1.011
  4,      7,      7, 2048,               0.907
 10,      1,      0,   64,               0.964
 10,      1,      1,   64,               0.966
 10,      2,      0,   64,               0.953
 10,      2,      2,   64,               0.972
 10,      3,      0,   64,               0.962
 10,      3,      3,   64,               0.969
 10,      4,      0,   64,               0.957
 10,      4,      4,   64,               0.969
 10,      5,      0,   64,               0.961
 10,      5,      5,   64,               0.965
 10,      6,      0,   64,               0.949
 10,      6,      6,   64,                 0.9
 10,      7,      0,   64,               0.957
 10,      7,      7,   64,               0.897
  6,      0,      0,    0,               0.991
  6,      0,      0,    1,               1.011
  6,      0,      1,    1,               0.939
  6,      0,      0,    2,               1.016
  6,      0,      2,    2,                0.94
  6,      0,      0,    3,               1.019
  6,      0,      3,    3,               0.941
  6,      0,      0,    4,               1.056
  6,      0,      4,    4,               0.884
  6,      0,      0,    5,               0.977
  6,      0,      5,    5,               0.934
  6,      0,      0,    6,               0.954
  6,      0,      6,    6,                0.93
  6,      0,      0,    7,               0.963
  6,      0,      7,    7,               0.916
  6,      0,      0,    8,               0.963
  6,      0,      8,    8,               0.945
  6,      0,      0,    9,               1.028
  6,      0,      9,    9,               0.942
  6,      0,      0,   10,               0.955
  6,      0,     10,   10,               0.831
  6,      0,      0,   11,               0.948
  6,      0,     11,   11,                0.82
  6,      0,      0,   12,               1.033
  6,      0,     12,   12,               0.873
  6,      0,      0,   13,               0.983
  6,      0,     13,   13,               0.852
  6,      0,      0,   14,               0.984
  6,      0,     14,   14,               0.853
  6,      0,      0,   15,               0.984
  6,      0,     15,   15,               0.882
  6,      0,      0,   16,               0.971
  6,      0,     16,   16,               0.958
  6,      0,      0,   17,               0.938
  6,      0,     17,   17,               0.947
  6,      0,      0,   18,                0.96
  6,      0,     18,   18,               0.938
  6,      0,      0,   19,               0.903
  6,      0,     19,   19,               0.943
  6,      0,      0,   20,               0.947
  6,      0,     20,   20,               0.951
  6,      0,      0,   21,               0.948
  6,      0,     21,   21,                0.96
  6,      0,      0,   22,               0.926
  6,      0,     22,   22,               0.951
  6,      0,      0,   23,               0.923
  6,      0,     23,   23,               0.959
  6,      0,      0,   24,               0.918
  6,      0,     24,   24,               0.952
  6,      0,      0,   25,                0.97
  6,      0,     25,   25,               0.952
  6,      0,      0,   26,               0.871
  6,      0,     26,   26,               0.869
  6,      0,      0,   27,               0.935
  6,      0,     27,   27,               0.836
  6,      0,      0,   28,               0.936
  6,      0,     28,   28,               0.857
  6,      0,      0,   29,               0.876
  6,      0,     29,   29,               0.859
  6,      0,      0,   30,               0.934
  6,      0,     30,   30,               0.857
  6,      0,      0,   31,               0.962
  6,      0,     31,   31,                0.86
  6,      0,      0,   32,               0.912
  6,      0,     32,   32,                0.94
  6,      0,      0,   33,               0.903
  6,      0,     33,   33,               0.968
  6,      0,      0,   34,               0.913
  6,      0,     34,   34,               0.896
  6,      0,      0,   35,               0.904
  6,      0,     35,   35,               0.913
  6,      0,      0,   36,               0.905
  6,      0,     36,   36,               0.907
  6,      0,      0,   37,               0.899
  6,      0,     37,   37,                 0.9
  6,      0,      0,   38,               0.912
  6,      0,     38,   38,               0.919
  6,      0,      0,   39,               0.925
  6,      0,     39,   39,               0.927
  6,      0,      0,   40,               0.923
  6,      0,     40,   40,               0.972
  6,      0,      0,   41,                0.92
  6,      0,     41,   41,               0.966
  6,      0,      0,   42,               0.915
  6,      0,     42,   42,               0.834
  6,      0,      0,   43,                0.92
  6,      0,     43,   43,               0.856
  6,      0,      0,   44,               0.908
  6,      0,     44,   44,               0.858
  6,      0,      0,   45,               0.932
  6,      0,     45,   45,               0.847
  6,      0,      0,   46,               0.927
  6,      0,     46,   46,               0.859
  6,      0,      0,   47,               0.902
  6,      0,     47,   47,               0.855
  6,      0,      0,   48,               0.949
  6,      0,     48,   48,               0.934
  6,      0,      0,   49,               0.907
  6,      0,     49,   49,               0.943
  6,      0,      0,   50,               0.934
  6,      0,     50,   50,               0.943
  6,      0,      0,   51,               0.933
  6,      0,     51,   51,               0.939
  6,      0,      0,   52,               0.944
  6,      0,     52,   52,               0.944
  6,      0,      0,   53,               0.939
  6,      0,     53,   53,               0.938
  6,      0,      0,   54,                 0.9
  6,      0,     54,   54,               0.923
  6,      0,      0,   55,                 0.9
  6,      0,     55,   55,               0.927
  6,      0,      0,   56,                 0.9
  6,      0,     56,   56,               0.917
  6,      0,      0,   57,                 0.9
  6,      0,     57,   57,               0.916
  6,      0,      0,   58,               0.914
  6,      0,     58,   58,               0.784
  6,      0,      0,   59,               0.863
  6,      0,     59,   59,               0.846
  6,      0,      0,   60,                0.88
  6,      0,     60,   60,               0.827
  6,      0,      0,   61,               0.896
  6,      0,     61,   61,               0.847
  6,      0,      0,   62,               0.894
  6,      0,     62,   62,               0.865
  6,      0,      0,   63,               0.934
  6,      0,     63,   63,               0.866

 sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++---------------
 1 file changed, 37 insertions(+), 46 deletions(-)
  

Comments

H.J. Lu March 24, 2022, 6:55 p.m. UTC | #1
On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
> _mm_cmpistri. Also change offset to unsigned to avoid unnecessary
> sign extensions.
>
> geometric_mean(N=20) of all benchmarks that don't fall back on
> sse2/strlen; New / Original: .928
>
> All string/memory tests pass.
> ---
> Geometric Mean N=20 runs; All functions page aligned
> len, align1, align2,  pos, New Time / Old Time
>   0,      0,      0,  512,               1.207
>   1,      0,      0,  512,               1.039
>   1,      1,      0,  512,               0.997
>   1,      0,      1,  512,               0.981
>   1,      1,      1,  512,               0.977
>   2,      0,      0,  512,                1.02
>   2,      2,      0,  512,               0.979
>   2,      0,      2,  512,               0.902
>   2,      2,      2,  512,               0.958
>   3,      0,      0,  512,               0.978
>   3,      3,      0,  512,               0.988
>   3,      0,      3,  512,               0.979
>   3,      3,      3,  512,               0.955
>   4,      0,      0,  512,               0.969
>   4,      4,      0,  512,               0.991
>   4,      0,      4,  512,                0.94
>   4,      4,      4,  512,               0.958
>   5,      0,      0,  512,               0.963
>   5,      5,      0,  512,               1.004
>   5,      0,      5,  512,               0.948
>   5,      5,      5,  512,               0.971
>   6,      0,      0,  512,               0.933
>   6,      6,      0,  512,               1.007
>   6,      0,      6,  512,               0.921
>   6,      6,      6,  512,               0.969
>   7,      0,      0,  512,               0.928
>   7,      7,      0,  512,               0.976
>   7,      0,      7,  512,               0.932
>   7,      7,      7,  512,               0.995
>   8,      0,      0,  512,               0.931
>   8,      0,      8,  512,               0.766
>   9,      0,      0,  512,               0.965
>   9,      1,      0,  512,               0.999
>   9,      0,      9,  512,               0.765
>   9,      1,      9,  512,                0.97
>  10,      0,      0,  512,               0.976
>  10,      2,      0,  512,               0.991
>  10,      0,     10,  512,               0.768
>  10,      2,     10,  512,               0.926
>  11,      0,      0,  512,               0.958
>  11,      3,      0,  512,               1.006
>  11,      0,     11,  512,               0.768
>  11,      3,     11,  512,               0.908
>  12,      0,      0,  512,               0.945
>  12,      4,      0,  512,               0.896
>  12,      0,     12,  512,               0.764
>  12,      4,     12,  512,               0.785
>  13,      0,      0,  512,               0.957
>  13,      5,      0,  512,               1.019
>  13,      0,     13,  512,                0.76
>  13,      5,     13,  512,               0.785
>  14,      0,      0,  512,               0.918
>  14,      6,      0,  512,               1.004
>  14,      0,     14,  512,                0.78
>  14,      6,     14,  512,               0.711
>  15,      0,      0,  512,               0.855
>  15,      7,      0,  512,               0.985
>  15,      0,     15,  512,               0.779
>  15,      7,     15,  512,               0.772
>  16,      0,      0,  512,               0.987
>  16,      0,     16,  512,                0.99
>  17,      0,      0,  512,               0.996
>  17,      1,      0,  512,               0.979
>  17,      0,     17,  512,               1.001
>  17,      1,     17,  512,                1.03
>  18,      0,      0,  512,               0.976
>  18,      2,      0,  512,               0.989
>  18,      0,     18,  512,               0.976
>  18,      2,     18,  512,               0.992
>  19,      0,      0,  512,               0.991
>  19,      3,      0,  512,               0.988
>  19,      0,     19,  512,               1.009
>  19,      3,     19,  512,               1.018
>  20,      0,      0,  512,               0.999
>  20,      4,      0,  512,               1.005
>  20,      0,     20,  512,               0.993
>  20,      4,     20,  512,               0.983
>  21,      0,      0,  512,               0.982
>  21,      5,      0,  512,               0.988
>  21,      0,     21,  512,               0.978
>  21,      5,     21,  512,               0.984
>  22,      0,      0,  512,               0.988
>  22,      6,      0,  512,               0.979
>  22,      0,     22,  512,               0.984
>  22,      6,     22,  512,               0.983
>  23,      0,      0,  512,               0.996
>  23,      7,      0,  512,               0.998
>  23,      0,     23,  512,               0.979
>  23,      7,     23,  512,               0.987
>  24,      0,      0,  512,                0.99
>  24,      0,     24,  512,               0.979
>  25,      0,      0,  512,               0.985
>  25,      1,      0,  512,               0.988
>  25,      0,     25,  512,                0.99
>  25,      1,     25,  512,               0.986
>  26,      0,      0,  512,               1.005
>  26,      2,      0,  512,               0.995
>  26,      0,     26,  512,               0.992
>  26,      2,     26,  512,               0.983
>  27,      0,      0,  512,               0.986
>  27,      3,      0,  512,               0.978
>  27,      0,     27,  512,               0.986
>  27,      3,     27,  512,               0.973
>  28,      0,      0,  512,               0.995
>  28,      4,      0,  512,               0.993
>  28,      0,     28,  512,               0.983
>  28,      4,     28,  512,               1.005
>  29,      0,      0,  512,               0.983
>  29,      5,      0,  512,               0.982
>  29,      0,     29,  512,               0.984
>  29,      5,     29,  512,               1.005
>  30,      0,      0,  512,               0.978
>  30,      6,      0,  512,               0.985
>  30,      0,     30,  512,               0.994
>  30,      6,     30,  512,               0.993
>  31,      0,      0,  512,               0.984
>  31,      7,      0,  512,               0.983
>  31,      0,     31,  512,                 1.0
>  31,      7,     31,  512,               1.031
>   4,      0,      0,   32,               0.916
>   4,      1,      0,   32,               0.952
>   4,      0,      1,   32,               0.927
>   4,      1,      1,   32,               0.969
>   4,      0,      0,   64,               0.961
>   4,      2,      0,   64,               0.955
>   4,      0,      2,   64,               0.975
>   4,      2,      2,   64,               0.972
>   4,      0,      0,  128,               0.971
>   4,      3,      0,  128,               0.982
>   4,      0,      3,  128,               0.945
>   4,      3,      3,  128,               0.971
>   4,      0,      0,  256,               1.004
>   4,      4,      0,  256,               0.966
>   4,      0,      4,  256,               0.961
>   4,      4,      4,  256,               0.971
>   4,      5,      0,  512,               0.929
>   4,      0,      5,  512,               0.969
>   4,      5,      5,  512,               0.985
>   4,      0,      0, 1024,               1.003
>   4,      6,      0, 1024,               1.009
>   4,      0,      6, 1024,               1.005
>   4,      6,      6, 1024,               0.999
>   4,      0,      0, 2048,               0.917
>   4,      7,      0, 2048,               1.015
>   4,      0,      7, 2048,               1.011
>   4,      7,      7, 2048,               0.907
>  10,      1,      0,   64,               0.964
>  10,      1,      1,   64,               0.966
>  10,      2,      0,   64,               0.953
>  10,      2,      2,   64,               0.972
>  10,      3,      0,   64,               0.962
>  10,      3,      3,   64,               0.969
>  10,      4,      0,   64,               0.957
>  10,      4,      4,   64,               0.969
>  10,      5,      0,   64,               0.961
>  10,      5,      5,   64,               0.965
>  10,      6,      0,   64,               0.949
>  10,      6,      6,   64,                 0.9
>  10,      7,      0,   64,               0.957
>  10,      7,      7,   64,               0.897
>   6,      0,      0,    0,               0.991
>   6,      0,      0,    1,               1.011
>   6,      0,      1,    1,               0.939
>   6,      0,      0,    2,               1.016
>   6,      0,      2,    2,                0.94
>   6,      0,      0,    3,               1.019
>   6,      0,      3,    3,               0.941
>   6,      0,      0,    4,               1.056
>   6,      0,      4,    4,               0.884
>   6,      0,      0,    5,               0.977
>   6,      0,      5,    5,               0.934
>   6,      0,      0,    6,               0.954
>   6,      0,      6,    6,                0.93
>   6,      0,      0,    7,               0.963
>   6,      0,      7,    7,               0.916
>   6,      0,      0,    8,               0.963
>   6,      0,      8,    8,               0.945
>   6,      0,      0,    9,               1.028
>   6,      0,      9,    9,               0.942
>   6,      0,      0,   10,               0.955
>   6,      0,     10,   10,               0.831
>   6,      0,      0,   11,               0.948
>   6,      0,     11,   11,                0.82
>   6,      0,      0,   12,               1.033
>   6,      0,     12,   12,               0.873
>   6,      0,      0,   13,               0.983
>   6,      0,     13,   13,               0.852
>   6,      0,      0,   14,               0.984
>   6,      0,     14,   14,               0.853
>   6,      0,      0,   15,               0.984
>   6,      0,     15,   15,               0.882
>   6,      0,      0,   16,               0.971
>   6,      0,     16,   16,               0.958
>   6,      0,      0,   17,               0.938
>   6,      0,     17,   17,               0.947
>   6,      0,      0,   18,                0.96
>   6,      0,     18,   18,               0.938
>   6,      0,      0,   19,               0.903
>   6,      0,     19,   19,               0.943
>   6,      0,      0,   20,               0.947
>   6,      0,     20,   20,               0.951
>   6,      0,      0,   21,               0.948
>   6,      0,     21,   21,                0.96
>   6,      0,      0,   22,               0.926
>   6,      0,     22,   22,               0.951
>   6,      0,      0,   23,               0.923
>   6,      0,     23,   23,               0.959
>   6,      0,      0,   24,               0.918
>   6,      0,     24,   24,               0.952
>   6,      0,      0,   25,                0.97
>   6,      0,     25,   25,               0.952
>   6,      0,      0,   26,               0.871
>   6,      0,     26,   26,               0.869
>   6,      0,      0,   27,               0.935
>   6,      0,     27,   27,               0.836
>   6,      0,      0,   28,               0.936
>   6,      0,     28,   28,               0.857
>   6,      0,      0,   29,               0.876
>   6,      0,     29,   29,               0.859
>   6,      0,      0,   30,               0.934
>   6,      0,     30,   30,               0.857
>   6,      0,      0,   31,               0.962
>   6,      0,     31,   31,                0.86
>   6,      0,      0,   32,               0.912
>   6,      0,     32,   32,                0.94
>   6,      0,      0,   33,               0.903
>   6,      0,     33,   33,               0.968
>   6,      0,      0,   34,               0.913
>   6,      0,     34,   34,               0.896
>   6,      0,      0,   35,               0.904
>   6,      0,     35,   35,               0.913
>   6,      0,      0,   36,               0.905
>   6,      0,     36,   36,               0.907
>   6,      0,      0,   37,               0.899
>   6,      0,     37,   37,                 0.9
>   6,      0,      0,   38,               0.912
>   6,      0,     38,   38,               0.919
>   6,      0,      0,   39,               0.925
>   6,      0,     39,   39,               0.927
>   6,      0,      0,   40,               0.923
>   6,      0,     40,   40,               0.972
>   6,      0,      0,   41,                0.92
>   6,      0,     41,   41,               0.966
>   6,      0,      0,   42,               0.915
>   6,      0,     42,   42,               0.834
>   6,      0,      0,   43,                0.92
>   6,      0,     43,   43,               0.856
>   6,      0,      0,   44,               0.908
>   6,      0,     44,   44,               0.858
>   6,      0,      0,   45,               0.932
>   6,      0,     45,   45,               0.847
>   6,      0,      0,   46,               0.927
>   6,      0,     46,   46,               0.859
>   6,      0,      0,   47,               0.902
>   6,      0,     47,   47,               0.855
>   6,      0,      0,   48,               0.949
>   6,      0,     48,   48,               0.934
>   6,      0,      0,   49,               0.907
>   6,      0,     49,   49,               0.943
>   6,      0,      0,   50,               0.934
>   6,      0,     50,   50,               0.943
>   6,      0,      0,   51,               0.933
>   6,      0,     51,   51,               0.939
>   6,      0,      0,   52,               0.944
>   6,      0,     52,   52,               0.944
>   6,      0,      0,   53,               0.939
>   6,      0,     53,   53,               0.938
>   6,      0,      0,   54,                 0.9
>   6,      0,     54,   54,               0.923
>   6,      0,      0,   55,                 0.9
>   6,      0,     55,   55,               0.927
>   6,      0,      0,   56,                 0.9
>   6,      0,     56,   56,               0.917
>   6,      0,      0,   57,                 0.9
>   6,      0,     57,   57,               0.916
>   6,      0,      0,   58,               0.914
>   6,      0,     58,   58,               0.784
>   6,      0,      0,   59,               0.863
>   6,      0,     59,   59,               0.846
>   6,      0,      0,   60,                0.88
>   6,      0,     60,   60,               0.827
>   6,      0,      0,   61,               0.896
>   6,      0,     61,   61,               0.847
>   6,      0,      0,   62,               0.894
>   6,      0,     62,   62,               0.865
>   6,      0,      0,   63,               0.934
>   6,      0,     63,   63,               0.866
>
>  sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++---------------
>  1 file changed, 37 insertions(+), 46 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
> index 013aebf797..c312fab8b1 100644
> --- a/sysdeps/x86_64/multiarch/strcspn-c.c
> +++ b/sysdeps/x86_64/multiarch/strcspn-c.c
> @@ -84,83 +84,74 @@ STRCSPN_SSE42 (const char *s, const char *a)
>      RETURN (NULL, strlen (s));
>
>    const char *aligned;
> -  __m128i mask;
> -  int offset = (int) ((size_t) a & 15);
> +  __m128i mask, maskz, zero;
> +  unsigned int maskz_bits;
> +  unsigned int offset = (unsigned int) ((size_t) a & 15);
> +  zero = _mm_set1_epi8 (0);
>    if (offset != 0)
>      {
>        /* Load masks.  */
>        aligned = (const char *) ((size_t) a & -16L);
>        __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> -
> -      mask = __m128i_shift_right (mask0, offset);
> +      maskz = _mm_cmpeq_epi8 (mask0, zero);
>
>        /* Find where the NULL terminator is.  */
> -      int length = _mm_cmpistri (mask, mask, 0x3a);
> -      if (length == 16 - offset)
> -       {
> -         /* There is no NULL terminator.  */
> -         __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
> -         int index = _mm_cmpistri (mask1, mask1, 0x3a);
> -         length += index;
> -
> -         /* Don't use SSE4.2 if the length of A > 16.  */
> -         if (length > 16)
> -           return STRCSPN_SSE2 (s, a);
> -
> -         if (index != 0)
> -           {
> -             /* Combine mask0 and mask1.  We could play games with
> -                palignr, but frankly this data should be in L1 now
> -                so do the merge via an unaligned load.  */
> -             mask = _mm_loadu_si128 ((__m128i *) a);
> -           }
> -       }
> +      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> +      if (maskz_bits != 0)
> +        {
> +          mask = __m128i_shift_right (mask0, offset);
> +          offset = (unsigned int) ((size_t) s & 15);
> +          if (offset)
> +            goto start_unaligned;
> +
> +          aligned = s;
> +          goto start_loop;
> +        }
>      }
> -  else
> -    {
> -      /* A is aligned.  */
> -      mask = _mm_load_si128 ((__m128i *) a);
>
> -      /* Find where the NULL terminator is.  */
> -      int length = _mm_cmpistri (mask, mask, 0x3a);
> -      if (length == 16)
> -       {
> -         /* There is no NULL terminator.  Don't use SSE4.2 if the length
> -            of A > 16.  */
> -         if (a[16] != 0)
> -           return STRCSPN_SSE2 (s, a);
> -       }
> +  /* A is aligned.  */
> +  mask = _mm_loadu_si128 ((__m128i *) a);
> +  /* Find where the NULL terminator is.  */
> +  maskz = _mm_cmpeq_epi8 (mask, zero);
> +  maskz_bits = _mm_movemask_epi8 (maskz);
> +  if (maskz_bits == 0)
> +    {
> +      /* There is no NULL terminator.  Don't use SSE4.2 if the length
> +         of A > 16.  */
> +      if (a[16] != 0)
> +        return STRCSPN_SSE2 (s, a);
>      }
>
> -  offset = (int) ((size_t) s & 15);
> +  aligned = s;
> +  offset = (unsigned int) ((size_t) s & 15);
>    if (offset != 0)
>      {
> +    start_unaligned:
>        /* Check partial string.  */
>        aligned = (const char *) ((size_t) s & -16L);
>        __m128i value = _mm_load_si128 ((__m128i *) aligned);
>
>        value = __m128i_shift_right (value, offset);
>
> -      int length = _mm_cmpistri (mask, value, 0x2);
> +      unsigned int length = _mm_cmpistri (mask, value, 0x2);
>        /* No need to check ZFlag since ZFlag is always 1.  */
> -      int cflag = _mm_cmpistrc (mask, value, 0x2);
> +      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
>        if (cflag)
>         RETURN ((char *) (s + length), length);
>        /* Find where the NULL terminator is.  */
> -      int index = _mm_cmpistri (value, value, 0x3a);
> +      unsigned int index = _mm_cmpistri (value, value, 0x3a);
>        if (index < 16 - offset)
>         RETURN (NULL, index);
>        aligned += 16;
>      }
> -  else
> -    aligned = s;
>
> +start_loop:
>    while (1)
>      {
>        __m128i value = _mm_load_si128 ((__m128i *) aligned);
> -      int index = _mm_cmpistri (mask, value, 0x2);
> -      int cflag = _mm_cmpistrc (mask, value, 0x2);
> -      int zflag = _mm_cmpistrz (mask, value, 0x2);
> +      unsigned int index = _mm_cmpistri (mask, value, 0x2);
> +      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> +      unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
>        if (cflag)
>         RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
>        if (zflag)
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.
  
Sunil Pandey May 12, 2022, 7:34 p.m. UTC | #2
On Thu, Mar 24, 2022 at 11:57 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Wed, Mar 23, 2022 at 2:59 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
> > _mm_cmpistri. Also change offset to unsigned to avoid unnecessary
> > sign extensions.
> >
> > geometric_mean(N=20) of all benchmarks that don't fall back on
> > sse2/strlen; New / Original: .928
> >
> > All string/memory tests pass.
> > ---
> > Geometric Mean N=20 runs; All functions page aligned
> > len, align1, align2,  pos, New Time / Old Time
> >   0,      0,      0,  512,               1.207
> >   1,      0,      0,  512,               1.039
> >   1,      1,      0,  512,               0.997
> >   1,      0,      1,  512,               0.981
> >   1,      1,      1,  512,               0.977
> >   2,      0,      0,  512,                1.02
> >   2,      2,      0,  512,               0.979
> >   2,      0,      2,  512,               0.902
> >   2,      2,      2,  512,               0.958
> >   3,      0,      0,  512,               0.978
> >   3,      3,      0,  512,               0.988
> >   3,      0,      3,  512,               0.979
> >   3,      3,      3,  512,               0.955
> >   4,      0,      0,  512,               0.969
> >   4,      4,      0,  512,               0.991
> >   4,      0,      4,  512,                0.94
> >   4,      4,      4,  512,               0.958
> >   5,      0,      0,  512,               0.963
> >   5,      5,      0,  512,               1.004
> >   5,      0,      5,  512,               0.948
> >   5,      5,      5,  512,               0.971
> >   6,      0,      0,  512,               0.933
> >   6,      6,      0,  512,               1.007
> >   6,      0,      6,  512,               0.921
> >   6,      6,      6,  512,               0.969
> >   7,      0,      0,  512,               0.928
> >   7,      7,      0,  512,               0.976
> >   7,      0,      7,  512,               0.932
> >   7,      7,      7,  512,               0.995
> >   8,      0,      0,  512,               0.931
> >   8,      0,      8,  512,               0.766
> >   9,      0,      0,  512,               0.965
> >   9,      1,      0,  512,               0.999
> >   9,      0,      9,  512,               0.765
> >   9,      1,      9,  512,                0.97
> >  10,      0,      0,  512,               0.976
> >  10,      2,      0,  512,               0.991
> >  10,      0,     10,  512,               0.768
> >  10,      2,     10,  512,               0.926
> >  11,      0,      0,  512,               0.958
> >  11,      3,      0,  512,               1.006
> >  11,      0,     11,  512,               0.768
> >  11,      3,     11,  512,               0.908
> >  12,      0,      0,  512,               0.945
> >  12,      4,      0,  512,               0.896
> >  12,      0,     12,  512,               0.764
> >  12,      4,     12,  512,               0.785
> >  13,      0,      0,  512,               0.957
> >  13,      5,      0,  512,               1.019
> >  13,      0,     13,  512,                0.76
> >  13,      5,     13,  512,               0.785
> >  14,      0,      0,  512,               0.918
> >  14,      6,      0,  512,               1.004
> >  14,      0,     14,  512,                0.78
> >  14,      6,     14,  512,               0.711
> >  15,      0,      0,  512,               0.855
> >  15,      7,      0,  512,               0.985
> >  15,      0,     15,  512,               0.779
> >  15,      7,     15,  512,               0.772
> >  16,      0,      0,  512,               0.987
> >  16,      0,     16,  512,                0.99
> >  17,      0,      0,  512,               0.996
> >  17,      1,      0,  512,               0.979
> >  17,      0,     17,  512,               1.001
> >  17,      1,     17,  512,                1.03
> >  18,      0,      0,  512,               0.976
> >  18,      2,      0,  512,               0.989
> >  18,      0,     18,  512,               0.976
> >  18,      2,     18,  512,               0.992
> >  19,      0,      0,  512,               0.991
> >  19,      3,      0,  512,               0.988
> >  19,      0,     19,  512,               1.009
> >  19,      3,     19,  512,               1.018
> >  20,      0,      0,  512,               0.999
> >  20,      4,      0,  512,               1.005
> >  20,      0,     20,  512,               0.993
> >  20,      4,     20,  512,               0.983
> >  21,      0,      0,  512,               0.982
> >  21,      5,      0,  512,               0.988
> >  21,      0,     21,  512,               0.978
> >  21,      5,     21,  512,               0.984
> >  22,      0,      0,  512,               0.988
> >  22,      6,      0,  512,               0.979
> >  22,      0,     22,  512,               0.984
> >  22,      6,     22,  512,               0.983
> >  23,      0,      0,  512,               0.996
> >  23,      7,      0,  512,               0.998
> >  23,      0,     23,  512,               0.979
> >  23,      7,     23,  512,               0.987
> >  24,      0,      0,  512,                0.99
> >  24,      0,     24,  512,               0.979
> >  25,      0,      0,  512,               0.985
> >  25,      1,      0,  512,               0.988
> >  25,      0,     25,  512,                0.99
> >  25,      1,     25,  512,               0.986
> >  26,      0,      0,  512,               1.005
> >  26,      2,      0,  512,               0.995
> >  26,      0,     26,  512,               0.992
> >  26,      2,     26,  512,               0.983
> >  27,      0,      0,  512,               0.986
> >  27,      3,      0,  512,               0.978
> >  27,      0,     27,  512,               0.986
> >  27,      3,     27,  512,               0.973
> >  28,      0,      0,  512,               0.995
> >  28,      4,      0,  512,               0.993
> >  28,      0,     28,  512,               0.983
> >  28,      4,     28,  512,               1.005
> >  29,      0,      0,  512,               0.983
> >  29,      5,      0,  512,               0.982
> >  29,      0,     29,  512,               0.984
> >  29,      5,     29,  512,               1.005
> >  30,      0,      0,  512,               0.978
> >  30,      6,      0,  512,               0.985
> >  30,      0,     30,  512,               0.994
> >  30,      6,     30,  512,               0.993
> >  31,      0,      0,  512,               0.984
> >  31,      7,      0,  512,               0.983
> >  31,      0,     31,  512,                 1.0
> >  31,      7,     31,  512,               1.031
> >   4,      0,      0,   32,               0.916
> >   4,      1,      0,   32,               0.952
> >   4,      0,      1,   32,               0.927
> >   4,      1,      1,   32,               0.969
> >   4,      0,      0,   64,               0.961
> >   4,      2,      0,   64,               0.955
> >   4,      0,      2,   64,               0.975
> >   4,      2,      2,   64,               0.972
> >   4,      0,      0,  128,               0.971
> >   4,      3,      0,  128,               0.982
> >   4,      0,      3,  128,               0.945
> >   4,      3,      3,  128,               0.971
> >   4,      0,      0,  256,               1.004
> >   4,      4,      0,  256,               0.966
> >   4,      0,      4,  256,               0.961
> >   4,      4,      4,  256,               0.971
> >   4,      5,      0,  512,               0.929
> >   4,      0,      5,  512,               0.969
> >   4,      5,      5,  512,               0.985
> >   4,      0,      0, 1024,               1.003
> >   4,      6,      0, 1024,               1.009
> >   4,      0,      6, 1024,               1.005
> >   4,      6,      6, 1024,               0.999
> >   4,      0,      0, 2048,               0.917
> >   4,      7,      0, 2048,               1.015
> >   4,      0,      7, 2048,               1.011
> >   4,      7,      7, 2048,               0.907
> >  10,      1,      0,   64,               0.964
> >  10,      1,      1,   64,               0.966
> >  10,      2,      0,   64,               0.953
> >  10,      2,      2,   64,               0.972
> >  10,      3,      0,   64,               0.962
> >  10,      3,      3,   64,               0.969
> >  10,      4,      0,   64,               0.957
> >  10,      4,      4,   64,               0.969
> >  10,      5,      0,   64,               0.961
> >  10,      5,      5,   64,               0.965
> >  10,      6,      0,   64,               0.949
> >  10,      6,      6,   64,                 0.9
> >  10,      7,      0,   64,               0.957
> >  10,      7,      7,   64,               0.897
> >   6,      0,      0,    0,               0.991
> >   6,      0,      0,    1,               1.011
> >   6,      0,      1,    1,               0.939
> >   6,      0,      0,    2,               1.016
> >   6,      0,      2,    2,                0.94
> >   6,      0,      0,    3,               1.019
> >   6,      0,      3,    3,               0.941
> >   6,      0,      0,    4,               1.056
> >   6,      0,      4,    4,               0.884
> >   6,      0,      0,    5,               0.977
> >   6,      0,      5,    5,               0.934
> >   6,      0,      0,    6,               0.954
> >   6,      0,      6,    6,                0.93
> >   6,      0,      0,    7,               0.963
> >   6,      0,      7,    7,               0.916
> >   6,      0,      0,    8,               0.963
> >   6,      0,      8,    8,               0.945
> >   6,      0,      0,    9,               1.028
> >   6,      0,      9,    9,               0.942
> >   6,      0,      0,   10,               0.955
> >   6,      0,     10,   10,               0.831
> >   6,      0,      0,   11,               0.948
> >   6,      0,     11,   11,                0.82
> >   6,      0,      0,   12,               1.033
> >   6,      0,     12,   12,               0.873
> >   6,      0,      0,   13,               0.983
> >   6,      0,     13,   13,               0.852
> >   6,      0,      0,   14,               0.984
> >   6,      0,     14,   14,               0.853
> >   6,      0,      0,   15,               0.984
> >   6,      0,     15,   15,               0.882
> >   6,      0,      0,   16,               0.971
> >   6,      0,     16,   16,               0.958
> >   6,      0,      0,   17,               0.938
> >   6,      0,     17,   17,               0.947
> >   6,      0,      0,   18,                0.96
> >   6,      0,     18,   18,               0.938
> >   6,      0,      0,   19,               0.903
> >   6,      0,     19,   19,               0.943
> >   6,      0,      0,   20,               0.947
> >   6,      0,     20,   20,               0.951
> >   6,      0,      0,   21,               0.948
> >   6,      0,     21,   21,                0.96
> >   6,      0,      0,   22,               0.926
> >   6,      0,     22,   22,               0.951
> >   6,      0,      0,   23,               0.923
> >   6,      0,     23,   23,               0.959
> >   6,      0,      0,   24,               0.918
> >   6,      0,     24,   24,               0.952
> >   6,      0,      0,   25,                0.97
> >   6,      0,     25,   25,               0.952
> >   6,      0,      0,   26,               0.871
> >   6,      0,     26,   26,               0.869
> >   6,      0,      0,   27,               0.935
> >   6,      0,     27,   27,               0.836
> >   6,      0,      0,   28,               0.936
> >   6,      0,     28,   28,               0.857
> >   6,      0,      0,   29,               0.876
> >   6,      0,     29,   29,               0.859
> >   6,      0,      0,   30,               0.934
> >   6,      0,     30,   30,               0.857
> >   6,      0,      0,   31,               0.962
> >   6,      0,     31,   31,                0.86
> >   6,      0,      0,   32,               0.912
> >   6,      0,     32,   32,                0.94
> >   6,      0,      0,   33,               0.903
> >   6,      0,     33,   33,               0.968
> >   6,      0,      0,   34,               0.913
> >   6,      0,     34,   34,               0.896
> >   6,      0,      0,   35,               0.904
> >   6,      0,     35,   35,               0.913
> >   6,      0,      0,   36,               0.905
> >   6,      0,     36,   36,               0.907
> >   6,      0,      0,   37,               0.899
> >   6,      0,     37,   37,                 0.9
> >   6,      0,      0,   38,               0.912
> >   6,      0,     38,   38,               0.919
> >   6,      0,      0,   39,               0.925
> >   6,      0,     39,   39,               0.927
> >   6,      0,      0,   40,               0.923
> >   6,      0,     40,   40,               0.972
> >   6,      0,      0,   41,                0.92
> >   6,      0,     41,   41,               0.966
> >   6,      0,      0,   42,               0.915
> >   6,      0,     42,   42,               0.834
> >   6,      0,      0,   43,                0.92
> >   6,      0,     43,   43,               0.856
> >   6,      0,      0,   44,               0.908
> >   6,      0,     44,   44,               0.858
> >   6,      0,      0,   45,               0.932
> >   6,      0,     45,   45,               0.847
> >   6,      0,      0,   46,               0.927
> >   6,      0,     46,   46,               0.859
> >   6,      0,      0,   47,               0.902
> >   6,      0,     47,   47,               0.855
> >   6,      0,      0,   48,               0.949
> >   6,      0,     48,   48,               0.934
> >   6,      0,      0,   49,               0.907
> >   6,      0,     49,   49,               0.943
> >   6,      0,      0,   50,               0.934
> >   6,      0,     50,   50,               0.943
> >   6,      0,      0,   51,               0.933
> >   6,      0,     51,   51,               0.939
> >   6,      0,      0,   52,               0.944
> >   6,      0,     52,   52,               0.944
> >   6,      0,      0,   53,               0.939
> >   6,      0,     53,   53,               0.938
> >   6,      0,      0,   54,                 0.9
> >   6,      0,     54,   54,               0.923
> >   6,      0,      0,   55,                 0.9
> >   6,      0,     55,   55,               0.927
> >   6,      0,      0,   56,                 0.9
> >   6,      0,     56,   56,               0.917
> >   6,      0,      0,   57,                 0.9
> >   6,      0,     57,   57,               0.916
> >   6,      0,      0,   58,               0.914
> >   6,      0,     58,   58,               0.784
> >   6,      0,      0,   59,               0.863
> >   6,      0,     59,   59,               0.846
> >   6,      0,      0,   60,                0.88
> >   6,      0,     60,   60,               0.827
> >   6,      0,      0,   61,               0.896
> >   6,      0,     61,   61,               0.847
> >   6,      0,      0,   62,               0.894
> >   6,      0,     62,   62,               0.865
> >   6,      0,      0,   63,               0.934
> >   6,      0,     63,   63,               0.866
> >
> >  sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++---------------
> >  1 file changed, 37 insertions(+), 46 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
> > index 013aebf797..c312fab8b1 100644
> > --- a/sysdeps/x86_64/multiarch/strcspn-c.c
> > +++ b/sysdeps/x86_64/multiarch/strcspn-c.c
> > @@ -84,83 +84,74 @@ STRCSPN_SSE42 (const char *s, const char *a)
> >      RETURN (NULL, strlen (s));
> >
> >    const char *aligned;
> > -  __m128i mask;
> > -  int offset = (int) ((size_t) a & 15);
> > +  __m128i mask, maskz, zero;
> > +  unsigned int maskz_bits;
> > +  unsigned int offset = (unsigned int) ((size_t) a & 15);
> > +  zero = _mm_set1_epi8 (0);
> >    if (offset != 0)
> >      {
> >        /* Load masks.  */
> >        aligned = (const char *) ((size_t) a & -16L);
> >        __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
> > -
> > -      mask = __m128i_shift_right (mask0, offset);
> > +      maskz = _mm_cmpeq_epi8 (mask0, zero);
> >
> >        /* Find where the NULL terminator is.  */
> > -      int length = _mm_cmpistri (mask, mask, 0x3a);
> > -      if (length == 16 - offset)
> > -       {
> > -         /* There is no NULL terminator.  */
> > -         __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
> > -         int index = _mm_cmpistri (mask1, mask1, 0x3a);
> > -         length += index;
> > -
> > -         /* Don't use SSE4.2 if the length of A > 16.  */
> > -         if (length > 16)
> > -           return STRCSPN_SSE2 (s, a);
> > -
> > -         if (index != 0)
> > -           {
> > -             /* Combine mask0 and mask1.  We could play games with
> > -                palignr, but frankly this data should be in L1 now
> > -                so do the merge via an unaligned load.  */
> > -             mask = _mm_loadu_si128 ((__m128i *) a);
> > -           }
> > -       }
> > +      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
> > +      if (maskz_bits != 0)
> > +        {
> > +          mask = __m128i_shift_right (mask0, offset);
> > +          offset = (unsigned int) ((size_t) s & 15);
> > +          if (offset)
> > +            goto start_unaligned;
> > +
> > +          aligned = s;
> > +          goto start_loop;
> > +        }
> >      }
> > -  else
> > -    {
> > -      /* A is aligned.  */
> > -      mask = _mm_load_si128 ((__m128i *) a);
> >
> > -      /* Find where the NULL terminator is.  */
> > -      int length = _mm_cmpistri (mask, mask, 0x3a);
> > -      if (length == 16)
> > -       {
> > -         /* There is no NULL terminator.  Don't use SSE4.2 if the length
> > -            of A > 16.  */
> > -         if (a[16] != 0)
> > -           return STRCSPN_SSE2 (s, a);
> > -       }
> > +  /* A is aligned.  */
> > +  mask = _mm_loadu_si128 ((__m128i *) a);
> > +  /* Find where the NULL terminator is.  */
> > +  maskz = _mm_cmpeq_epi8 (mask, zero);
> > +  maskz_bits = _mm_movemask_epi8 (maskz);
> > +  if (maskz_bits == 0)
> > +    {
> > +      /* There is no NULL terminator.  Don't use SSE4.2 if the length
> > +         of A > 16.  */
> > +      if (a[16] != 0)
> > +        return STRCSPN_SSE2 (s, a);
> >      }
> >
> > -  offset = (int) ((size_t) s & 15);
> > +  aligned = s;
> > +  offset = (unsigned int) ((size_t) s & 15);
> >    if (offset != 0)
> >      {
> > +    start_unaligned:
> >        /* Check partial string.  */
> >        aligned = (const char *) ((size_t) s & -16L);
> >        __m128i value = _mm_load_si128 ((__m128i *) aligned);
> >
> >        value = __m128i_shift_right (value, offset);
> >
> > -      int length = _mm_cmpistri (mask, value, 0x2);
> > +      unsigned int length = _mm_cmpistri (mask, value, 0x2);
> >        /* No need to check ZFlag since ZFlag is always 1.  */
> > -      int cflag = _mm_cmpistrc (mask, value, 0x2);
> > +      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> >        if (cflag)
> >         RETURN ((char *) (s + length), length);
> >        /* Find where the NULL terminator is.  */
> > -      int index = _mm_cmpistri (value, value, 0x3a);
> > +      unsigned int index = _mm_cmpistri (value, value, 0x3a);
> >        if (index < 16 - offset)
> >         RETURN (NULL, index);
> >        aligned += 16;
> >      }
> > -  else
> > -    aligned = s;
> >
> > +start_loop:
> >    while (1)
> >      {
> >        __m128i value = _mm_load_si128 ((__m128i *) aligned);
> > -      int index = _mm_cmpistri (mask, value, 0x2);
> > -      int cflag = _mm_cmpistrc (mask, value, 0x2);
> > -      int zflag = _mm_cmpistrz (mask, value, 0x2);
> > +      unsigned int index = _mm_cmpistri (mask, value, 0x2);
> > +      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
> > +      unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
> >        if (cflag)
> >         RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
> >        if (zflag)
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
  

Patch

diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c
index 013aebf797..c312fab8b1 100644
--- a/sysdeps/x86_64/multiarch/strcspn-c.c
+++ b/sysdeps/x86_64/multiarch/strcspn-c.c
@@ -84,83 +84,74 @@  STRCSPN_SSE42 (const char *s, const char *a)
     RETURN (NULL, strlen (s));
 
   const char *aligned;
-  __m128i mask;
-  int offset = (int) ((size_t) a & 15);
+  __m128i mask, maskz, zero;
+  unsigned int maskz_bits;
+  unsigned int offset = (unsigned int) ((size_t) a & 15);
+  zero = _mm_set1_epi8 (0);
   if (offset != 0)
     {
       /* Load masks.  */
       aligned = (const char *) ((size_t) a & -16L);
       __m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
-      mask = __m128i_shift_right (mask0, offset);
+      maskz = _mm_cmpeq_epi8 (mask0, zero);
 
       /* Find where the NULL terminator is.  */
-      int length = _mm_cmpistri (mask, mask, 0x3a);
-      if (length == 16 - offset)
-	{
-	  /* There is no NULL terminator.  */
-	  __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
-	  int index = _mm_cmpistri (mask1, mask1, 0x3a);
-	  length += index;
-
-	  /* Don't use SSE4.2 if the length of A > 16.  */
-	  if (length > 16)
-	    return STRCSPN_SSE2 (s, a);
-
-	  if (index != 0)
-	    {
-	      /* Combine mask0 and mask1.  We could play games with
-		 palignr, but frankly this data should be in L1 now
-		 so do the merge via an unaligned load.  */
-	      mask = _mm_loadu_si128 ((__m128i *) a);
-	    }
-	}
+      maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+      if (maskz_bits != 0)
+        {
+          mask = __m128i_shift_right (mask0, offset);
+          offset = (unsigned int) ((size_t) s & 15);
+          if (offset)
+            goto start_unaligned;
+
+          aligned = s;
+          goto start_loop;
+        }
     }
-  else
-    {
-      /* A is aligned.  */
-      mask = _mm_load_si128 ((__m128i *) a);
 
-      /* Find where the NULL terminator is.  */
-      int length = _mm_cmpistri (mask, mask, 0x3a);
-      if (length == 16)
-	{
-	  /* There is no NULL terminator.  Don't use SSE4.2 if the length
-	     of A > 16.  */
-	  if (a[16] != 0)
-	    return STRCSPN_SSE2 (s, a);
-	}
+  /* A is aligned.  */
+  mask = _mm_loadu_si128 ((__m128i *) a);
+  /* Find where the NULL terminator is.  */
+  maskz = _mm_cmpeq_epi8 (mask, zero);
+  maskz_bits = _mm_movemask_epi8 (maskz);
+  if (maskz_bits == 0)
+    {
+      /* There is no NULL terminator.  Don't use SSE4.2 if the length
+         of A > 16.  */
+      if (a[16] != 0)
+        return STRCSPN_SSE2 (s, a);
     }
 
-  offset = (int) ((size_t) s & 15);
+  aligned = s;
+  offset = (unsigned int) ((size_t) s & 15);
   if (offset != 0)
     {
+    start_unaligned:
       /* Check partial string.  */
       aligned = (const char *) ((size_t) s & -16L);
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
 
       value = __m128i_shift_right (value, offset);
 
-      int length = _mm_cmpistri (mask, value, 0x2);
+      unsigned int length = _mm_cmpistri (mask, value, 0x2);
       /* No need to check ZFlag since ZFlag is always 1.  */
-      int cflag = _mm_cmpistrc (mask, value, 0x2);
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
       if (cflag)
 	RETURN ((char *) (s + length), length);
       /* Find where the NULL terminator is.  */
-      int index = _mm_cmpistri (value, value, 0x3a);
+      unsigned int index = _mm_cmpistri (value, value, 0x3a);
       if (index < 16 - offset)
 	RETURN (NULL, index);
       aligned += 16;
     }
-  else
-    aligned = s;
 
+start_loop:
   while (1)
     {
       __m128i value = _mm_load_si128 ((__m128i *) aligned);
-      int index = _mm_cmpistri (mask, value, 0x2);
-      int cflag = _mm_cmpistrc (mask, value, 0x2);
-      int zflag = _mm_cmpistrz (mask, value, 0x2);
+      unsigned int index = _mm_cmpistri (mask, value, 0x2);
+      unsigned int cflag = _mm_cmpistrc (mask, value, 0x2);
+      unsigned int zflag = _mm_cmpistrz (mask, value, 0x2);
       if (cflag)
 	RETURN ((char *) (aligned + index), (size_t) (aligned + index - s));
       if (zflag)