[v1,5/5] x86: Optimize {str|wcs}rchr-evex

Message ID 20220421031410.2142238-5-goldstein.w.n@gmail.com
State Superseded
Series [v1,1/5] benchtests: Improve bench-strrchr

Checks

Context          Check    Description
dj/TryBot-32bit  success  Build for i686

Commit Message

Noah Goldstein April 21, 2022, 3:14 a.m. UTC
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the number of comparisons against the search CHAR.
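
As a point of reference, a minimal scalar sketch of the semantics the
vectorized code must preserve (illustration only, not the patched
routine):

#include <stddef.h>

/* Return a pointer to the last occurrence of C in S, or NULL if C does
   not occur.  C == '\0' matches the terminating null byte.  */
char *
strrchr_ref (const char *s, int c)
{
  const char *last = NULL;
  do
    if (*s == (char) c)
      last = s;
  while (*s++ != '\0');
  return (char *) last;
}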

Geometric Mean of all benchmarks New / Old: 0.755
See email for all results.

Full xcheck passes on x86_64 with and without multiarch enabled.
---
Results For: strrchr

Geometric Mean of N=30 runs.

Geometric Mean of all benchmarks New / Old: 0.755
Benchmark performance measured on Tigerlake:
https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
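
The "New Time / Old Time" column below gives per-configuration ratios;
the headline number is their geometric mean.  A minimal sketch of that
computation (the helper name is illustrative, not part of the benchmark
harness):

#include <math.h>
#include <stddef.h>

/* Geometric mean of N new/old time ratios, computed via the mean of
   logs to avoid overflow/underflow for large N.  */
static double
geometric_mean (const double *ratios, size_t n)
{
  double log_sum = 0.0;
  for (size_t i = 0; i < n; i++)
    log_sum += log (ratios[i]);
  return exp (log_sum / n);
}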

 len, align,  pos, seek, max_char, freq, New Time / Old Time
2048,     0,   32,    0,      127,    1,               0.669
2048,     1,   32,    0,      127,    1,               0.672
2048,     0,   64,    0,      127,    1,               0.579
2048,     2,   64,    0,      127,    1,               0.579
2048,     0,  128,    0,      127,    1,               0.828
2048,     3,  128,    0,      127,    1,               0.827
2048,     0,  256,    0,      127,    1,               0.693
2048,     4,  256,    0,      127,    1,               0.692
2048,     0,  512,    0,      127,    1,               0.619
2048,     5,  512,    0,      127,    1,               0.622
2048,     0, 1024,    0,      127,    1,               0.626
2048,     6, 1024,    0,      127,    1,               0.627
2048,     0, 2048,    0,      127,    1,                0.85
2048,     7, 2048,    0,      127,    1,               0.855
2048,     0, 4096,    0,      127,    1,               0.849
2048,     8, 4096,    0,      127,    1,               0.848
 256,     1,   64,    0,      127,    1,               0.579
 256,    15,   64,    0,      127,    1,               0.579
 256,     2,   64,    0,      127,    1,               0.579
 256,    30,   64,    0,      127,    1,               0.579
 256,     3,   64,    0,      127,    1,               0.579
 256,    45,   64,    0,      127,    1,               0.551
 256,     4,   64,    0,      127,    1,               0.579
 256,    60,   64,    0,      127,    1,               0.553
 256,     5,   64,    0,      127,    1,               0.579
 256,    75,   64,    0,      127,    1,               0.578
 256,     6,   64,    0,      127,    1,               0.578
 256,    90,   64,    0,      127,    1,               0.579
 256,     7,   64,    0,      127,    1,               0.579
 256,   105,   64,    0,      127,    1,                0.55
   1,     0,    0,    0,      127,    1,               0.795
   2,     0,    1,    0,      127,    1,               0.797
   3,     0,    2,    0,      127,    1,               0.796
   4,     0,    3,    0,      127,    1,               0.792
   5,     0,    4,    0,      127,    1,               0.789
   6,     0,    5,    0,      127,    1,               0.791
   7,     0,    6,    0,      127,    1,               0.793
   8,     0,    7,    0,      127,    1,               0.789
   9,     0,    8,    0,      127,    1,               0.797
  10,     0,    9,    0,      127,    1,               0.788
  11,     0,   10,    0,      127,    1,               0.796
  12,     0,   11,    0,      127,    1,               0.793
  13,     0,   12,    0,      127,    1,               0.797
  14,     0,   13,    0,      127,    1,               0.795
  15,     0,   14,    0,      127,    1,               0.795
  16,     0,   15,    0,      127,    1,               0.791
  17,     0,   16,    0,      127,    1,               0.798
  18,     0,   17,    0,      127,    1,                 0.8
  19,     0,   18,    0,      127,    1,               0.797
  20,     0,   19,    0,      127,    1,               0.798
  21,     0,   20,    0,      127,    1,               0.797
  22,     0,   21,    0,      127,    1,               0.796
  23,     0,   22,    0,      127,    1,               0.792
  24,     0,   23,    0,      127,    1,               0.791
  25,     0,   24,    0,      127,    1,               0.794
  26,     0,   25,    0,      127,    1,               0.797
  27,     0,   26,    0,      127,    1,               0.793
  28,     0,   27,    0,      127,    1,                0.79
  29,     0,   28,    0,      127,    1,                0.79
  30,     0,   29,    0,      127,    1,               0.791
  31,     0,   30,    0,      127,    1,               0.791
  32,     0,   31,    0,      127,    1,                0.79
2048,     0,   32,   23,      127,    1,               0.734
2048,     1,   32,   23,      127,    1,               0.748
2048,     0,   64,   23,      127,    1,               0.759
2048,     2,   64,   23,      127,    1,               0.753
2048,     0,  128,   23,      127,    1,               0.834
2048,     3,  128,   23,      127,    1,               0.835
2048,     0,  256,   23,      127,    1,               0.789
2048,     4,  256,   23,      127,    1,               0.791
2048,     0,  512,   23,      127,    1,               0.882
2048,     5,  512,   23,      127,    1,               0.861
2048,     0, 1024,   23,      127,    1,               0.643
2048,     6, 1024,   23,      127,    1,               0.643
2048,     0, 2048,   23,      127,    1,               0.931
2048,     7, 2048,   23,      127,    1,               0.929
2048,     0, 4096,   23,      127,    1,               0.922
2048,     8, 4096,   23,      127,    1,               0.934
 256,     1,   64,   23,      127,    1,                0.73
 256,    15,   64,   23,      127,    1,               0.729
 256,     2,   64,   23,      127,    1,               0.725
 256,    30,   64,   23,      127,    1,               0.728
 256,     3,   64,   23,      127,    1,               0.727
 256,    45,   64,   23,      127,    1,               0.749
 256,     4,   64,   23,      127,    1,                0.73
 256,    60,   64,   23,      127,    1,               0.752
 256,     5,   64,   23,      127,    1,               0.729
 256,    75,   64,   23,      127,    1,               0.727
 256,     6,   64,   23,      127,    1,               0.693
 256,    90,   64,   23,      127,    1,                0.73
 256,     7,   64,   23,      127,    1,                0.73
 256,   105,   64,   23,      127,    1,               0.751
   1,     0,    0,   23,      127,    1,               0.797
   2,     0,    1,   23,      127,    1,               0.794
   3,     0,    2,   23,      127,    1,               0.797
   4,     0,    3,   23,      127,    1,               0.792
   5,     0,    4,   23,      127,    1,               0.781
   6,     0,    5,   23,      127,    1,               0.783
   7,     0,    6,   23,      127,    1,                0.79
   8,     0,    7,   23,      127,    1,               0.791
   9,     0,    8,   23,      127,    1,               0.794
  10,     0,    9,   23,      127,    1,               0.795
  11,     0,   10,   23,      127,    1,               0.795
  12,     0,   11,   23,      127,    1,               0.795
  13,     0,   12,   23,      127,    1,               0.794
  14,     0,   13,   23,      127,    1,               0.792
  15,     0,   14,   23,      127,    1,                0.79
  16,     0,   15,   23,      127,    1,               0.793
  17,     0,   16,   23,      127,    1,               0.795
  18,     0,   17,   23,      127,    1,               0.797
  19,     0,   18,   23,      127,    1,               0.796
  20,     0,   19,   23,      127,    1,               0.796
  21,     0,   20,   23,      127,    1,               0.794
  22,     0,   21,   23,      127,    1,               0.794
  23,     0,   22,   23,      127,    1,               0.793
  24,     0,   23,   23,      127,    1,               0.792
  25,     0,   24,   23,      127,    1,               0.795
  26,     0,   25,   23,      127,    1,               0.792
  27,     0,   26,   23,      127,    1,               0.789
  28,     0,   27,   23,      127,    1,               0.794
  29,     0,   28,   23,      127,    1,               0.793
  30,     0,   29,   23,      127,    1,               0.795
  31,     0,   30,   23,      127,    1,               0.797
  32,     0,   31,   23,      127,    1,               0.775
2048,     0,   32,   23,      127,    2,               0.736
2048,     1,   32,   23,      127,    2,               0.738
2048,     0,   64,   23,      127,    2,               0.895
2048,     2,   64,   23,      127,    2,               0.897
2048,     0,  128,   23,      127,    2,               0.852
2048,     3,  128,   23,      127,    2,               0.845
2048,     0,  256,   23,      127,    2,               0.755
2048,     4,  256,   23,      127,    2,               0.712
2048,     0,  512,   23,      127,    2,               0.857
2048,     5,  512,   23,      127,    2,               0.849
2048,     0, 1024,   23,      127,    2,               0.626
2048,     6, 1024,   23,      127,    2,               0.661
2048,     0, 2048,   23,      127,    2,                0.67
2048,     7, 2048,   23,      127,    2,                0.67
2048,     0, 4096,   23,      127,    2,               0.928
2048,     8, 4096,   23,      127,    2,               0.935
 256,     1,   64,   23,      127,    2,               0.693
 256,    15,   64,   23,      127,    2,               0.692
 256,     2,   64,   23,      127,    2,               0.693
 256,    30,   64,   23,      127,    2,               0.692
 256,     3,   64,   23,      127,    2,               0.692
 256,    45,   64,   23,      127,    2,               0.701
 256,     4,   64,   23,      127,    2,               0.692
 256,    60,   64,   23,      127,    2,               0.701
 256,     5,   64,   23,      127,    2,                0.69
 256,    75,   64,   23,      127,    2,               0.693
 256,     6,   64,   23,      127,    2,               0.691
 256,    90,   64,   23,      127,    2,               0.692
 256,     7,   64,   23,      127,    2,               0.693
 256,   105,   64,   23,      127,    2,               0.701
   1,     0,    0,   23,      127,    2,               0.797
   2,     0,    1,   23,      127,    2,               0.787
   3,     0,    2,   23,      127,    2,               0.797
   4,     0,    3,   23,      127,    2,               0.793
   5,     0,    4,   23,      127,    2,               0.792
   6,     0,    5,   23,      127,    2,               0.795
   7,     0,    6,   23,      127,    2,               0.791
   8,     0,    7,   23,      127,    2,               0.792
   9,     0,    8,   23,      127,    2,               0.796
  10,     0,    9,   23,      127,    2,               0.797
  11,     0,   10,   23,      127,    2,               0.797
  12,     0,   11,   23,      127,    2,               0.798
  13,     0,   12,   23,      127,    2,               0.799
  14,     0,   13,   23,      127,    2,               0.796
  15,     0,   14,   23,      127,    2,               0.796
  16,     0,   15,   23,      127,    2,               0.794
  17,     0,   16,   23,      127,    2,               0.795
  18,     0,   17,   23,      127,    2,               0.797
  19,     0,   18,   23,      127,    2,               0.793
  20,     0,   19,   23,      127,    2,               0.795
  21,     0,   20,   23,      127,    2,               0.794
  22,     0,   21,   23,      127,    2,               0.794
  23,     0,   22,   23,      127,    2,               0.796
  24,     0,   23,   23,      127,    2,               0.794
  25,     0,   24,   23,      127,    2,               0.794
  26,     0,   25,   23,      127,    2,               0.794
  27,     0,   26,   23,      127,    2,               0.788
  28,     0,   27,   23,      127,    2,               0.791
  29,     0,   28,   23,      127,    2,               0.791
  30,     0,   29,   23,      127,    2,               0.793
  31,     0,   30,   23,      127,    2,               0.796
  32,     0,   31,   23,      127,    2,               0.628
2048,     0,   32,   23,      127,    4,               0.742
2048,     1,   32,   23,      127,    4,               0.742
2048,     0,   64,   23,      127,    4,               0.899
2048,     2,   64,   23,      127,    4,               0.912
2048,     0,  128,   23,      127,    4,               0.783
2048,     3,  128,   23,      127,    4,               0.815
2048,     0,  256,   23,      127,    4,               0.854
2048,     4,  256,   23,      127,    4,               0.858
2048,     0,  512,   23,      127,    4,               0.907
2048,     5,  512,   23,      127,    4,               0.873
2048,     0, 1024,   23,      127,    4,               0.657
2048,     6, 1024,   23,      127,    4,               0.653
2048,     0, 2048,   23,      127,    4,               0.666
2048,     7, 2048,   23,      127,    4,               0.667
2048,     0, 4096,   23,      127,    4,                0.67
2048,     8, 4096,   23,      127,    4,                0.67
 256,     1,   64,   23,      127,    4,               0.686
 256,    15,   64,   23,      127,    4,               0.687
 256,     2,   64,   23,      127,    4,               0.687
 256,    30,   64,   23,      127,    4,               0.687
 256,     3,   64,   23,      127,    4,               0.687
 256,    45,   64,   23,      127,    4,               0.672
 256,     4,   64,   23,      127,    4,               0.687
 256,    60,   64,   23,      127,    4,               0.701
 256,     5,   64,   23,      127,    4,               0.687
 256,    75,   64,   23,      127,    4,               0.686
 256,     6,   64,   23,      127,    4,               0.687
 256,    90,   64,   23,      127,    4,               0.686
 256,     7,   64,   23,      127,    4,                0.69
 256,   105,   64,   23,      127,    4,               0.672
   1,     0,    0,   23,      127,    4,               0.798
   2,     0,    1,   23,      127,    4,               0.791
   3,     0,    2,   23,      127,    4,               0.792
   4,     0,    3,   23,      127,    4,               0.795
   5,     0,    4,   23,      127,    4,               0.791
   6,     0,    5,   23,      127,    4,               0.793
   7,     0,    6,   23,      127,    4,                0.78
   8,     0,    7,   23,      127,    4,               0.791
   9,     0,    8,   23,      127,    4,               0.788
  10,     0,    9,   23,      127,    4,               0.798
  11,     0,   10,   23,      127,    4,               0.796
  12,     0,   11,   23,      127,    4,               0.794
  13,     0,   12,   23,      127,    4,               0.795
  14,     0,   13,   23,      127,    4,               0.793
  15,     0,   14,   23,      127,    4,                 0.8
  16,     0,   15,   23,      127,    4,               0.796
  17,     0,   16,   23,      127,    4,               0.796
  18,     0,   17,   23,      127,    4,               0.796
  19,     0,   18,   23,      127,    4,               0.798
  20,     0,   19,   23,      127,    4,               0.796
  21,     0,   20,   23,      127,    4,               0.796
  22,     0,   21,   23,      127,    4,               0.796
  23,     0,   22,   23,      127,    4,               0.801
  24,     0,   23,   23,      127,    4,               0.799
  25,     0,   24,   23,      127,    4,               0.795
  26,     0,   25,   23,      127,    4,               0.793
  27,     0,   26,   23,      127,    4,               0.796
  28,     0,   27,   23,      127,    4,               0.794
  29,     0,   28,   23,      127,    4,               0.798
  30,     0,   29,   23,      127,    4,               0.795
  31,     0,   30,   23,      127,    4,               0.797
  32,     0,   31,   23,      127,    4,               0.628
2048,     0,   32,   23,      127,    8,               0.738
2048,     1,   32,   23,      127,    8,               0.747
2048,     0,   64,   23,      127,    8,               0.905
2048,     2,   64,   23,      127,    8,               0.906
2048,     0,  128,   23,      127,    8,               0.822
2048,     3,  128,   23,      127,    8,               0.827
2048,     0,  256,   23,      127,    8,               0.825
2048,     4,  256,   23,      127,    8,               0.825
2048,     0,  512,   23,      127,    8,               0.851
2048,     5,  512,   23,      127,    8,               0.855
2048,     0, 1024,   23,      127,    8,               0.653
2048,     6, 1024,   23,      127,    8,               0.651
2048,     0, 2048,   23,      127,    8,               0.644
2048,     7, 2048,   23,      127,    8,               0.643
2048,     0, 4096,   23,      127,    8,                0.67
2048,     8, 4096,   23,      127,    8,                0.67
 256,     1,   64,   23,      127,    8,               0.686
 256,    15,   64,   23,      127,    8,               0.686
 256,     2,   64,   23,      127,    8,               0.686
 256,    30,   64,   23,      127,    8,               0.687
 256,     3,   64,   23,      127,    8,               0.686
 256,    45,   64,   23,      127,    8,               0.671
 256,     4,   64,   23,      127,    8,                0.69
 256,    60,   64,   23,      127,    8,               0.705
 256,     5,   64,   23,      127,    8,               0.688
 256,    75,   64,   23,      127,    8,               0.687
 256,     6,   64,   23,      127,    8,               0.692
 256,    90,   64,   23,      127,    8,               0.689
 256,     7,   64,   23,      127,    8,                0.69
 256,   105,   64,   23,      127,    8,               0.674
   1,     0,    0,   23,      127,    8,               0.798
   2,     0,    1,   23,      127,    8,               0.798
   3,     0,    2,   23,      127,    8,               0.797
   4,     0,    3,   23,      127,    8,               0.792
   5,     0,    4,   23,      127,    8,               0.795
   6,     0,    5,   23,      127,    8,               0.792
   7,     0,    6,   23,      127,    8,               0.792
   8,     0,    7,   23,      127,    8,               0.795
   9,     0,    8,   23,      127,    8,               0.799
  10,     0,    9,   23,      127,    8,               0.798
  11,     0,   10,   23,      127,    8,               0.795
  12,     0,   11,   23,      127,    8,               0.795
  13,     0,   12,   23,      127,    8,               0.797
  14,     0,   13,   23,      127,    8,               0.796
  15,     0,   14,   23,      127,    8,               0.795
  16,     0,   15,   23,      127,    8,               0.796
  17,     0,   16,   23,      127,    8,               0.798
  18,     0,   17,   23,      127,    8,               0.798
  19,     0,   18,   23,      127,    8,               0.795
  20,     0,   19,   23,      127,    8,               0.797
  21,     0,   20,   23,      127,    8,               0.797
  22,     0,   21,   23,      127,    8,               0.793
  23,     0,   22,   23,      127,    8,               0.797
  24,     0,   23,   23,      127,    8,                 0.8
  25,     0,   24,   23,      127,    8,               0.796
  26,     0,   25,   23,      127,    8,               0.796
  27,     0,   26,   23,      127,    8,               0.791
  28,     0,   27,   23,      127,    8,               0.795
  29,     0,   28,   23,      127,    8,               0.786
  30,     0,   29,   23,      127,    8,               0.797
  31,     0,   30,   23,      127,    8,               0.791
  32,     0,   31,   23,      127,    8,               0.628
2048,     0,   32,   23,      127,   16,               0.736
2048,     1,   32,   23,      127,   16,               0.737
2048,     0,   64,   23,      127,   16,               0.905
2048,     2,   64,   23,      127,   16,               0.908
2048,     0,  128,   23,      127,   16,               0.829
2048,     3,  128,   23,      127,   16,               0.824
2048,     0,  256,   23,      127,   16,               0.827
2048,     4,  256,   23,      127,   16,               0.825
2048,     0,  512,   23,      127,   16,               0.694
2048,     5,  512,   23,      127,   16,               0.687
2048,     0, 1024,   23,      127,   16,               0.568
2048,     6, 1024,   23,      127,   16,               0.667
2048,     0, 2048,   23,      127,   16,               0.766
2048,     7, 2048,   23,      127,   16,               0.781
2048,     0, 4096,   23,      127,   16,               0.646
2048,     8, 4096,   23,      127,   16,               0.646
 256,     1,   64,   23,      127,   16,               0.697
 256,    15,   64,   23,      127,   16,               0.686
 256,     2,   64,   23,      127,   16,               0.687
 256,    30,   64,   23,      127,   16,               0.687
 256,     3,   64,   23,      127,   16,               0.686
 256,    45,   64,   23,      127,   16,               0.672
 256,     4,   64,   23,      127,   16,               0.686
 256,    60,   64,   23,      127,   16,               0.701
 256,     5,   64,   23,      127,   16,               0.686
 256,    75,   64,   23,      127,   16,               0.686
 256,     6,   64,   23,      127,   16,               0.691
 256,    90,   64,   23,      127,   16,               0.687
 256,     7,   64,   23,      127,   16,               0.688
 256,   105,   64,   23,      127,   16,               0.674
   1,     0,    0,   23,      127,   16,               0.797
   2,     0,    1,   23,      127,   16,               0.798
   3,     0,    2,   23,      127,   16,               0.786
   4,     0,    3,   23,      127,   16,               0.792
   5,     0,    4,   23,      127,   16,               0.792
   6,     0,    5,   23,      127,   16,               0.795
   7,     0,    6,   23,      127,   16,               0.796
   8,     0,    7,   23,      127,   16,               0.798
   9,     0,    8,   23,      127,   16,               0.795
  10,     0,    9,   23,      127,   16,               0.797
  11,     0,   10,   23,      127,   16,               0.797
  12,     0,   11,   23,      127,   16,               0.797
  13,     0,   12,   23,      127,   16,               0.799
  14,     0,   13,   23,      127,   16,               0.798
  15,     0,   14,   23,      127,   16,               0.798
  16,     0,   15,   23,      127,   16,               0.796
  17,     0,   16,   23,      127,   16,               0.798
  18,     0,   17,   23,      127,   16,               0.796
  19,     0,   18,   23,      127,   16,               0.797
  20,     0,   19,   23,      127,   16,               0.797
  21,     0,   20,   23,      127,   16,               0.798
  22,     0,   21,   23,      127,   16,               0.797
  23,     0,   22,   23,      127,   16,               0.797
  24,     0,   23,   23,      127,   16,               0.797
  25,     0,   24,   23,      127,   16,               0.798
  26,     0,   25,   23,      127,   16,               0.794
  27,     0,   26,   23,      127,   16,               0.796
  28,     0,   27,   23,      127,   16,               0.796
  29,     0,   28,   23,      127,   16,               0.792
  30,     0,   29,   23,      127,   16,               0.788
  31,     0,   30,   23,      127,   16,                0.79
  32,     0,   31,   23,      127,   16,               0.628

 sysdeps/x86_64/multiarch/strrchr-evex.S | 441 ++++++++++++++----------
 1 file changed, 259 insertions(+), 182 deletions(-)
  

Patch

diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index adeddaed32..5cf9a8315b 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -24,242 +24,319 @@ 
 #  define STRRCHR	__strrchr_evex
 # endif
 
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+# define VMOVU	vmovdqu64
+# define VMOVA	vmovdqa64
 
 # ifdef USE_AS_WCSRCHR
+#  define SHIFT_REG	esi
+
+#  define kunpck	kunpckbw
+#  define kmov_2x	kmovd
+#  define maskz_2x	ecx
+#  define maskm_2x	eax
+#  define CHAR_SIZE	4
+#  define VPMIN	vpminud
+#  define VPTESTN	vptestnmd
 #  define VPBROADCAST	vpbroadcastd
-#  define VPCMP		vpcmpd
-#  define SHIFT_REG	r8d
+#  define VPCMP	vpcmpd
 # else
+#  define SHIFT_REG	edi
+
+#  define kunpck	kunpckdq
+#  define kmov_2x	kmovq
+#  define maskz_2x	rcx
+#  define maskm_2x	rax
+
+#  define CHAR_SIZE	1
+#  define VPMIN	vpminub
+#  define VPTESTN	vptestnmb
 #  define VPBROADCAST	vpbroadcastb
-#  define VPCMP		vpcmpb
-#  define SHIFT_REG	ecx
+#  define VPCMP	vpcmpb
 # endif
 
 # define XMMZERO	xmm16
 # define YMMZERO	ymm16
 # define YMMMATCH	ymm17
-# define YMM1		ymm18
+# define YMMSAVE	ymm18
+
+# define YMM1	ymm19
+# define YMM2	ymm20
+# define YMM3	ymm21
+# define YMM4	ymm22
+# define YMM5	ymm23
+# define YMM6	ymm24
+# define YMM7	ymm25
+# define YMM8	ymm26
 
-# define VEC_SIZE	32
 
-	.section .text.evex,"ax",@progbits
-ENTRY (STRRCHR)
-	movl	%edi, %ecx
+# define VEC_SIZE	32
+# define PAGE_SIZE	4096
+	.section .text.evex, "ax", @progbits
+ENTRY(STRRCHR)
+	movl	%edi, %eax
 	/* Broadcast CHAR to YMMMATCH.  */
 	VPBROADCAST %esi, %YMMMATCH
 
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(cross_page_boundary)
 
+L(page_cross_continue):
 	VMOVU	(%rdi), %YMM1
-
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	VPTESTN	%YMM1, %YMM1, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-
-	addq	$VEC_SIZE, %rdi
-
-	testl	%eax, %eax
-	jnz	L(first_vec)
-
 	testl	%ecx, %ecx
-	jnz	L(return_null)
-
-	andq	$-VEC_SIZE, %rdi
-	xorl	%edx, %edx
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(first_vec):
-	/* Check if there is a null byte.  */
-	testl	%ecx, %ecx
-	jnz	L(char_and_nul_in_first_vec)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rdi
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-
+	jz	L(aligned_more)
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
 # ifdef USE_AS_WCSRCHR
-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
-	   bytes.  */
-	movl	%ecx, %SHIFT_REG
-	sarl	$2, %SHIFT_REG
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
 # endif
+L(ret0):
+	ret
 
-	VMOVA	(%rdi), %YMM1
-
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
+	/* Returns for first vec x1/x2/x3 have a hard-coded backward
+	   search path for earlier matches.  */
+	.p2align 4,, 6
+L(first_vec_x1):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+	.p2align 4,, 4
+L(first_vec_x0_test):
 	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %edx
 	kmovd	%k1, %eax
-
-	shrxl	%SHIFT_REG, %edx, %edx
-	shrxl	%SHIFT_REG, %eax, %eax
-	addq	$VEC_SIZE, %rdi
-
-	/* Check if there is a CHAR.  */
 	testl	%eax, %eax
-	jnz	L(found_char)
-
-	testl	%edx, %edx
-	jnz	L(return_null)
-
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(found_char):
-	testl	%edx, %edx
-	jnz	L(char_and_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	leaq	(%rdi, %rcx), %rsi
+	jz	L(ret1)
+	bsrl	%eax, %eax
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rsi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rsi, %rax
+# endif
+L(ret1):
+	ret
 
-	.p2align 4
-L(aligned_loop):
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
+	.p2align 4,, 10
+L(first_vec_x1_or_x2):
+	VPCMP	$0, %YMM3, %YMMMATCH, %k3
+	VPCMP	$0, %YMM2, %YMMMATCH, %k2
+	kortestd %k2, %k3
+	jz	L(first_vec_x0_test)
+
+	kunpck	%k2, %k3, %k3
+	kmovq	%k3, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %ecx
+	.p2align 4,, 6
+L(first_vec_x3):
+	VPCMP	$0, %YMMMATCH, %YMM4, %k1
 	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_or_x2)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	VMOVA	(%rdi), %YMM1
-	add	$VEC_SIZE, %rdi
+	.p2align 4,, 6
+L(first_vec_x0_x1_test):
+	VPCMP	$0, %YMMMATCH, %YMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k0, %ecx
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMP	$0, %YMMMATCH, %YMM3, %k1
 	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	.p2align 4
+L(aligned_more):
+	/* Need to keep original pointer in case YMM1 has last match.  */
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rdi
+	VMOVU	VEC_SIZE(%rdi), %YMM2
+	VPTESTN	%YMM2, %YMM2, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
 
-	VMOVA	(%rdi), %YMM1
-	addq	$VEC_SIZE, %rdi
+	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM3
+	VPTESTN	%YMM3, %YMM3, %k0
+	kmovd	%k0, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
 
-	/* Each bit in K0 represents a null byte in YMM1.  */
-	VPCMP	$0, %YMMZERO, %YMM1, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM4
+	VPTESTN	%YMM4, %YMM4, %k0
 	kmovd	%k0, %ecx
-	kmovd	%k1, %eax
-	orl	%eax, %ecx
-	jz	L(aligned_loop)
+	movq	%rdi, %r8
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x3)
 
+	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
-L(char_nor_null):
-	/* Find a CHAR or a null byte in a loop.  */
+L(first_aligned_loop):
+	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
+	   they don't store a match.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM5
+	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM6
+
+	VPCMP	$0, %YMM5, %YMMMATCH, %k2
+	vpxord	%YMM6, %YMMMATCH, %YMM7
+
+	VPMIN	%YMM5, %YMM6, %YMM8
+	VPMIN	%YMM8, %YMM7, %YMM7
+
+	VPTESTN	%YMM7, %YMM7, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(first_aligned_loop)
+
+	VPCMP	$0, %YMM6, %YMMMATCH, %k3
+	VPTESTN	%YMM8, %YMM8, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_prep)
+
+	kortestd %k2, %k3
+	jnz	L(return_first_aligned_loop)
+
+	.p2align 4,, 6
+L(first_vec_x1_or_x2_or_x3):
+	VPCMP	$0, %YMM4, %YMMMATCH, %k4
+	kmovd	%k4, %eax
 	testl	%eax, %eax
-	jnz	L(match)
-L(return_value):
-	testl	%edx, %edx
-	jz	L(return_null)
-	movl	%edx, %eax
-	movq	%rsi, %rdi
+	jz	L(first_vec_x1_or_x2)
 	bsrl	%eax, %eax
-# ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-# endif
+	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
-L(match):
-	/* Find a CHAR.  Check if there is a null byte.  */
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
-	jnz	L(find_nul)
+	.p2align 4,, 8
+L(return_first_aligned_loop):
+	VPTESTN	%YMM5, %YMM5, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(first_vec_x1_or_x2_or_x3)
+
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
+	.p2align 4
+	/* We can throw away the work done for the first 4x checks here
+	   as we have a later match. This is the 'fast' path, so to speak.
+	 */
+L(second_aligned_loop_prep):
+L(second_aligned_loop_set_furthest_match):
 	movq	%rdi, %rsi
-	jmp	L(aligned_loop)
+	kunpck	%k2, %k3, %k4
 
 	.p2align 4
-L(find_nul):
-	/* Mask out any matching bits after the null byte.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
-	testl	%eax, %eax
-	/* If there is no CHAR here, return the remembered one.  */
-	jz	L(return_value)
-	bsrl	%eax, %eax
+L(second_aligned_loop):
+	VMOVU	(VEC_SIZE * 4)(%rdi), %YMM1
+	VMOVU	(VEC_SIZE * 5)(%rdi), %YMM2
+
+	VPCMP	$0, %YMM1, %YMMMATCH, %k2
+	vpxord	%YMM2, %YMMMATCH, %YMM3
+
+	VPMIN	%YMM1, %YMM2, %YMM4
+	VPMIN	%YMM3, %YMM4, %YMM3
+
+	VPTESTN	%YMM3, %YMM3, %k1
+	subq	$(VEC_SIZE * -2), %rdi
+	kortestd %k1, %k2
+	jz	L(second_aligned_loop)
+
+	VPCMP	$0, %YMM2, %YMMMATCH, %k3
+	VPTESTN	%YMM4, %YMM4, %k1
+	ktestd	%k1, %k1
+	jz	L(second_aligned_loop_set_furthest_match)
+
+	kortestd %k2, %k3
+	/* Branch here because there is a significant advantage in terms
+	   of the output dependency chain in using edx.  */
+	jnz	L(return_new_match)
+L(return_old_match):
+	kmovq	%k4, %rax
+	bsrq	%rax, %rax
+	leaq	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+	ret
+
+L(return_new_match):
+	VPTESTN	%YMM1, %YMM1, %k0
+	kunpck	%k0, %k1, %k0
+	kmov_2x	%k0, %maskz_2x
+
+	blsmsk	%maskz_2x, %maskz_2x
+	kunpck	%k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	and	%maskz_2x, %maskm_2x
+	jz	L(return_old_match)
+
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	/* This block is horribly aligned (% 16 == 15). This is
+	   intentional. The L(cross_page_boundary) block is exactly
+	   32 bytes of code size. Ultimately this is a cold path, so
+	   we save code size by leaving it misaligned.  */
+L(cross_page_boundary):
+	xorq	%rdi, %rax
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
+	VPTESTN	%YMM1, %YMM1, %k0
+	kmovd	%k0, %ecx
 # ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	movl	%edi, %esi
+	andl	$(VEC_SIZE - 1), %esi
+	shrl	$2, %esi
 # endif
-	ret
+	shrxl	%SHIFT_REG, %ecx, %ecx
 
-	.p2align 4
-L(char_and_nul):
-	/* Find both a CHAR and a null byte.  */
-	addq	%rcx, %rdi
-	movl	%edx, %ecx
-L(char_and_nul_in_first_vec):
-	/* Mask out any matching bits after the null byte.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
-	testl	%eax, %eax
-	/* Return null pointer if the null byte comes first.  */
-	jz	L(return_null)
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+	VPCMP	$0, %YMMMATCH, %YMM1, %k1
+	kmovd	%k1, %eax
+	shrxl	%SHIFT_REG, %eax, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret3)
 	bsrl	%eax, %eax
 # ifdef USE_AS_WCSRCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	-VEC_SIZE(%rdi, %rax, 4), %rax
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	addq	%rdi, %rax
 # endif
+L(ret3):
 	ret
 
-	.p2align 4
-L(return_null):
-	xorl	%eax, %eax
-	ret
-
-END (STRRCHR)
+END(STRRCHR)
 #endif
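
For readers less familiar with EVEX mask-register code, here is a rough
scalar analogue of the new loop strategy (illustrative only, not the
patched implementation): scan fixed-size blocks, remember the furthest
match from completed blocks, and resolve the answer only in the block
that contains the null terminator.  BLOCK is a stand-in for the
2 * VEC_SIZE bytes the assembly handles per iteration.

#include <stddef.h>

#define BLOCK 64	/* Illustrative; the assembly uses 2 * VEC_SIZE.  */

static const char *
strrchr_blockwise (const char *s, char c)
{
  const char *last = NULL;	/* Furthest match in completed blocks.  */

  for (;;)
    {
      const char *block_last = NULL;
      for (size_t i = 0; i < BLOCK; i++)
	{
	  if (s[i] == c)
	    block_last = s + i;
	  if (s[i] == '\0')
	    /* Only matches at or before the null are valid.  */
	    return block_last != NULL ? block_last : last;
	}
      if (block_last != NULL)
	last = block_last;
      s += BLOCK;
    }
}

In the assembly above, the hot loop additionally folds the CHAR and
null tests into a single kortestd and only resolves exact match
positions after the loop exits, which is where the reduction in
per-iteration comparisons comes from.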