powerpc: Optimized rawmemchr for POWER9

Message ID 20200514214916.5eb363bd@kryten.localdomain
State Committed
Headers
Series powerpc: Optimized rawmemchr for POWER9 |

Commit Message

Anton Blanchard May 14, 2020, 11:49 a.m. UTC
  This version uses vector instructions and is up to 60% faster on medium
matches and up to 90% faster on long matches, compared to the POWER7
version. A few examples:

                            __rawmemchr_power9  __rawmemchr_power7
Length   32, alignment  0:   2.27566             3.77765
Length   64, alignment  2:   2.46231             3.51064
Length 1024, alignment  0:  17.3059             32.6678
---
 .../powerpc/powerpc64/le/power9/rawmemchr.S   | 107 ++++++++++++++++++
 sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
 .../powerpc64/multiarch/ifunc-impl-list.c     |   5 +
 .../powerpc64/multiarch/rawmemchr-power9.S    |  21 ++++
 .../powerpc/powerpc64/multiarch/rawmemchr.c   |  12 +-
 5 files changed, 144 insertions(+), 3 deletions(-)
 create mode 100644 sysdeps/powerpc/powerpc64/le/power9/rawmemchr.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power9.S
  

Comments

Paul E Murphy May 18, 2020, 6:53 p.m. UTC | #1
On 5/14/20 6:49 AM, Anton Blanchard via Libc-alpha wrote:
> This version uses vector instructions and is up to 60% faster on medium
> matches and up to 90% faster on long matches, compared to the POWER7
> version. A few examples:
> 
>                              __rawmemchr_power9  __rawmemchr_power7
> Length   32, alignment  0:   2.27566             3.77765
> Length   64, alignment  2:   2.46231             3.51064
> Length 1024, alignment  0:  17.3059             32.6678

I think this looks OK.  With a few trivial modifications this could also
replace strlen.  I played around for a little bit and saw 17-60%
speedups over P8, until it becomes about 4% slower than P8 for large
strings (>2kB).  I am not sure if that tradeoff is acceptable.

Attached are full benchmarks for this patch, and a hypothetical patch to 
reuse this as __strlen_power9.
../benchtests/scripts/compare_strings.py -i benchtests/bench-strlen.out -s ../benchtests/scripts/benchout_strings.schema.json -a length,alignment -b __strlen_power8
Function: strlen
Variant: 
                                    generic_strlen	memchr_strlen	__strlen_power9	__strlen_power8	__strlen_power7	__strlen_ppc
========================================================================================================================
               length=1, alignment=1:         2.93 ( -8.62%)	        5.43 (-101.11%)	        2.23 ( 17.32%)	        2.70	        2.46 (  8.68%)	        2.92 ( -8.30%)	
               length=1, alignment=0:         3.58 (-32.06%)	        8.81 (-224.91%)	        2.23 ( 17.68%)	        2.71	        2.46 (  9.10%)	        2.91 ( -7.22%)	
               length=2, alignment=2:         4.36 (-145.95%)	        4.68 (-163.85%)	        1.47 ( 17.32%)	        1.77	        1.62 (  8.70%)	        1.91 ( -7.69%)	
               length=2, alignment=0:         2.48 (-40.00%)	        5.80 (-226.79%)	        1.47 ( 17.32%)	        1.77	        1.62 (  8.70%)	        1.91 ( -7.69%)	
               length=3, alignment=3:         3.27 (-84.42%)	        3.52 (-98.64%)	        1.48 ( 16.78%)	        1.77	        1.62 (  8.69%)	        1.91 ( -7.69%)	
               length=3, alignment=0:         2.50 (-41.05%)	        5.79 (-226.34%)	        1.47 ( 16.95%)	        1.77	        1.62 (  8.69%)	        1.91 ( -7.69%)	
               length=4, alignment=4:         4.20 (-119.67%)	        3.73 (-94.98%)	        1.47 ( 22.83%)	        1.91	        1.77 (  7.15%)	        2.07 ( -8.33%)	
               length=4, alignment=0:         2.87 (-61.53%)	        5.79 (-226.33%)	        1.48 ( 16.55%)	        1.77	        1.62 (  8.70%)	        1.91 ( -7.68%)	
               length=5, alignment=5:         4.41 (-130.63%)	        3.73 (-94.99%)	        1.47 ( 22.87%)	        1.91	        1.77 (  7.14%)	        2.07 ( -8.34%)	
               length=5, alignment=0:         2.50 (-41.06%)	        5.79 (-226.30%)	        1.48 ( 16.69%)	        1.77	        1.62 (  8.69%)	        1.91 ( -7.69%)	
               length=6, alignment=6:         3.95 (-106.78%)	        3.72 (-94.98%)	        1.48 ( 22.78%)	        1.91	        1.77 (  7.14%)	        2.07 ( -8.33%)	
               length=6, alignment=0:         2.66 (-49.99%)	        5.79 (-226.32%)	        1.48 ( 16.79%)	        1.77	        1.62 (  8.69%)	        1.91 ( -7.69%)	
               length=7, alignment=7:         2.94 (-53.19%)	        3.73 (-94.02%)	        1.47 ( 23.61%)	        1.92	        1.77 (  7.60%)	        2.07 ( -7.79%)	
               length=7, alignment=0:         3.01 (-69.62%)	        5.79 (-226.34%)	        1.47 ( 16.97%)	        1.77	        1.62 (  8.69%)	        1.91 ( -7.69%)	
               length=4, alignment=0:         2.50 (-40.54%)	        5.79 (-225.23%)	        1.47 ( 17.62%)	        1.78	        1.62 (  9.02%)	        1.91 ( -7.31%)	
               length=4, alignment=7:         2.79 (-46.25%)	        3.73 (-94.99%)	        1.47 ( 23.23%)	        1.91	        1.78 (  6.79%)	        2.07 ( -8.33%)	
               length=4, alignment=2:         3.72 (-109.89%)	        3.52 (-98.64%)	        1.47 ( 17.32%)	        1.77	        1.63 (  8.34%)	        1.91 ( -7.69%)	
               length=2, alignment=2:         2.84 (-60.00%)	        3.52 (-98.64%)	        1.47 ( 17.32%)	        1.77	        1.62 (  8.69%)	        2.09 (-18.08%)	
               length=8, alignment=0:         4.33 (-126.58%)	        5.79 (-202.96%)	        1.47 ( 23.23%)	        1.91	        1.78 (  6.80%)	        2.07 ( -8.33%)	
               length=8, alignment=7:         2.94 (-53.96%)	        3.72 (-95.00%)	        1.47 ( 23.22%)	        1.91	        1.77 (  7.14%)	        2.37 (-24.11%)	
               length=8, alignment=3:         5.01 (-162.51%)	        3.73 (-95.00%)	        1.47 ( 23.23%)	        1.91	        1.78 (  6.79%)	        2.07 ( -8.34%)	
               length=5, alignment=3:         4.84 (-153.36%)	        3.73 (-95.00%)	        1.47 ( 23.23%)	        1.91	        1.78 (  6.79%)	        2.07 ( -8.34%)	
              length=16, alignment=0:         4.02 (-61.27%)	        5.79 (-132.24%)	        1.80 ( 27.80%)	        2.49	        2.07 ( 16.98%)	        2.20 ( 11.62%)	
              length=16, alignment=7:         3.54 (-60.27%)	        4.39 (-98.58%)	        1.88 ( 14.96%)	        2.21	        2.07 (  6.35%)	        2.20 (  0.31%)	
              length=16, alignment=4:         4.86 (-95.67%)	        4.39 (-76.98%)	        1.77 ( 28.56%)	        2.48	        2.07 ( 16.66%)	        2.20 ( 11.29%)	
              length=10, alignment=4:         4.52 (-136.44%)	        3.73 (-95.33%)	        1.47 ( 23.23%)	        1.91	        1.77 (  7.15%)	        2.07 ( -8.33%)	
              length=32, alignment=0:         5.12 (-117.78%)	        5.80 (-146.33%)	        2.19 (  6.85%)	        2.35	        3.49 (-48.14%)	        2.71 (-15.38%)	
              length=32, alignment=7:         5.41 (-129.27%)	        4.85 (-105.50%)	        1.81 ( 23.28%)	        2.36	        3.47 (-47.25%)	        2.70 (-14.59%)	
              length=32, alignment=5:         6.91 (-193.61%)	        4.85 (-106.30%)	        1.81 ( 23.09%)	        2.35	        3.53 (-50.04%)	        2.70 (-14.91%)	
              length=21, alignment=5:         5.73 (-176.90%)	        4.39 (-112.06%)	        1.77 ( 14.29%)	        2.07	        2.07 (  0.00%)	        2.21 ( -6.61%)	
              length=64, alignment=0:         6.36 (  6.66%)	        6.73 (  1.28%)	        2.44 ( 64.23%)	        6.82	        3.96 ( 41.83%)	        4.55 ( 33.24%)	
              length=64, alignment=7:         6.88 ( -0.98%)	        8.14 (-19.36%)	        2.23 ( 67.36%)	        6.82	        3.31 ( 51.42%)	        4.55 ( 33.21%)	
              length=64, alignment=6:         7.29 ( -7.00%)	        8.13 (-19.36%)	        2.22 ( 67.35%)	        6.81	        3.31 ( 51.41%)	        4.55 ( 33.27%)	
              length=42, alignment=6:         6.16 (-128.02%)	        5.39 (-99.42%)	        2.03 ( 24.84%)	        2.70	        3.07 (-13.64%)	        4.01 (-48.36%)	
             length=128, alignment=0:        11.70 (-58.66%)	        7.94 ( -7.71%)	        3.44 ( 53.30%)	        7.37	        5.30 ( 28.08%)	        6.31 ( 14.37%)	
             length=128, alignment=7:        11.18 (-51.43%)	        9.12 (-23.52%)	        3.38 ( 54.27%)	        7.38	        5.28 ( 28.45%)	        6.33 ( 14.28%)	
             length=128, alignment=7:        11.21 (-51.82%)	        9.13 (-23.63%)	        3.30 ( 55.31%)	        7.38	        5.28 ( 28.46%)	        6.31 ( 14.47%)	
              length=85, alignment=7:        12.24 (-79.55%)	        8.14 (-19.38%)	        2.45 ( 64.10%)	        6.82	        3.51 ( 48.52%)	        5.20 ( 23.70%)	
             length=256, alignment=0:        25.19 (-173.67%)	       10.23 (-11.15%)	        5.69 ( 38.16%)	        9.20	        8.34 (  9.37%)	       10.81 (-17.46%)	
             length=256, alignment=7:        21.51 (-133.69%)	       11.42 (-24.12%)	        5.42 ( 41.07%)	        9.20	        8.33 (  9.43%)	       10.84 (-17.78%)	
             length=256, alignment=8:        22.41 (-155.01%)	       11.09 (-26.20%)	        5.42 ( 38.28%)	        8.79	        8.08 (  8.11%)	       10.49 (-19.38%)	
             length=170, alignment=8:        15.02 (-116.28%)	        8.79 (-26.59%)	        4.00 ( 42.37%)	        6.94	        5.68 ( 18.20%)	        8.59 (-23.69%)	
             length=512, alignment=0:        60.93 (-382.58%)	       14.90 (-17.98%)	        9.15 ( 27.53%)	       12.63	       17.00 (-34.63%)	       16.89 (-33.76%)	
             length=512, alignment=7:        50.68 (-301.25%)	       16.09 (-27.37%)	        8.83 ( 30.08%)	       12.63	       15.77 (-24.83%)	       17.00 (-34.62%)	
             length=512, alignment=9:        61.72 (-405.21%)	       15.75 (-28.96%)	        8.84 ( 27.65%)	       12.22	       16.11 (-31.85%)	       17.74 (-45.22%)	
             length=341, alignment=9:        29.93 (-211.24%)	       12.26 (-27.50%)	        6.50 ( 32.46%)	        9.62	       11.84 (-23.13%)	       12.65 (-31.49%)	
            length=1024, alignment=0:       107.30 (-447.19%)	       24.23 (-23.55%)	       16.22 ( 17.31%)	       19.61	       30.64 (-56.22%)	       35.84 (-82.78%)	
            length=1024, alignment=7:        89.54 (-356.77%)	       25.43 (-29.71%)	       16.10 ( 17.84%)	       19.60	       30.63 (-56.28%)	       36.33 (-85.35%)	
           length=1024, alignment=10:        92.07 (-379.95%)	       25.12 (-30.94%)	       16.09 ( 16.11%)	       19.18	       30.54 (-59.21%)	       35.22 (-83.59%)	
            length=682, alignment=10:        64.62 (-362.39%)	       18.09 (-29.43%)	       11.45 ( 18.04%)	       13.98	       22.90 (-63.88%)	       27.65 (-97.87%)	
            length=2048, alignment=0:       196.52 (-486.73%)	       42.91 (-28.12%)	       30.31 (  9.49%)	       33.49	       70.78 (-111.33%)	       64.64 (-92.99%)	
            length=2048, alignment=7:       171.50 (-412.03%)	       44.13 (-31.74%)	       31.64 (  5.53%)	       33.49	       70.49 (-110.45%)	       65.18 (-94.61%)	
           length=2048, alignment=11:       178.28 (-439.18%)	       43.79 (-32.44%)	       31.64 (  4.31%)	       33.07	       68.49 (-107.13%)	       63.54 (-92.15%)	
           length=1365, alignment=11:       138.60 (-486.11%)	       30.93 (-30.78%)	       20.83 ( 11.93%)	       23.65	       41.52 (-75.57%)	       45.88 (-94.01%)	
            length=4096, alignment=0:       364.28 (-492.57%)	       85.25 (-38.68%)	       63.66 ( -3.55%)	       61.47	      126.05 (-105.04%)	      121.66 (-97.90%)	
            length=4096, alignment=7:       335.96 (-446.51%)	       86.71 (-41.05%)	       63.51 ( -3.31%)	       61.47	      126.08 (-105.09%)	      121.27 (-97.27%)	
           length=4096, alignment=12:       350.54 (-473.97%)	       86.26 (-41.24%)	       63.53 ( -4.03%)	       61.07	      125.44 (-105.40%)	      120.05 (-96.56%)	
           length=2730, alignment=12:       230.73 (-448.69%)	       72.19 (-71.68%)	       43.72 ( -3.98%)	       42.05	       87.75 (-108.67%)	       82.98 (-97.32%)
generic_rawmemchr	__rawmemchr_power9	__rawmemchr_power7	__rawmemchr_ppc
Length   32, alignment  0:	8.58591	2.95789	3.51514	5.54899
Length   64, alignment  1:	7.98616	2.3189	3.31495	7.88867
Length   32, alignment  0:	3.54776	2.12725	3.52489	3.60254
Length   64, alignment  1:	3.7281	2.31716	3.31183	3.94692
Length   64, alignment  0:	6.57635	2.53432	3.31231	6.48307
Length   64, alignment  2:	7.96782	2.31863	3.31161	7.88694
Length   64, alignment  0:	3.72673	2.53292	3.31484	3.72669
Length   64, alignment  2:	3.72656	2.31818	3.31198	3.72661
Length  128, alignment  0:	7.8613	3.53673	5.28895	7.76962
Length   64, alignment  3:	7.99033	2.31707	3.31312	7.87987
Length  128, alignment  0:	4.50304	3.53528	5.2881	4.51603
Length   64, alignment  3:	3.72656	2.31671	3.31136	3.72718
Length  256, alignment  0:	10.1273	5.77793	8.35357	10.0325
Length   64, alignment  4:	7.97362	2.31668	3.31165	7.87519
Length  256, alignment  0:	6.43153	5.77725	8.35359	6.4362
Length   64, alignment  4:	3.72658	2.31826	3.312	3.94599
Length  512, alignment  0:	14.7778	9.25884	16.1109	14.706
Length   64, alignment  5:	7.99086	2.31711	3.3163	7.87734
Length  512, alignment  0:	9.89168	9.25762	15.8989	9.91191
Length   64, alignment  5:	3.72848	2.31657	3.31264	3.94673
Length 1024, alignment  0:	24.1056	16.3134	30.8402	24.0373
Length   64, alignment  6:	7.97495	2.31669	3.31309	7.90495
Length 1024, alignment  0:	16.9337	16.3131	30.7522	16.9331
Length   64, alignment  6:	3.72745	2.31668	3.31355	3.72733
Length    1, alignment  0:	5.63447	1.61322	1.61961	5.55166
Length    1, alignment  0:	3.0819	1.61321	1.61958	3.08123
Length    2, alignment  0:	5.63207	1.61322	1.61956	5.54972
Length    2, alignment  0:	3.0812	1.61322	1.62108	3.07972
Length    3, alignment  0:	5.63272	1.6132	1.62107	5.5476
Length    3, alignment  0:	3.08124	1.6132	1.62111	3.07972
Length    4, alignment  0:	5.63321	1.61319	1.62106	5.54706
Length    4, alignment  0:	3.08188	1.6132	1.62183	3.08416
Length    5, alignment  0:	5.63267	1.6132	1.62185	5.54831
Length    5, alignment  0:	3.08305	1.61322	1.62194	3.08117
Length    6, alignment  0:	5.63117	1.61472	1.61957	5.54776
Length    6, alignment  0:	3.08141	1.61872	1.61958	3.08136
Length    7, alignment  0:	5.63099	1.61629	1.61957	5.54885
Length    7, alignment  0:	3.08289	1.61549	1.61958	3.08124
Length    8, alignment  0:	5.63119	1.6147	1.77383	5.54863
Length    8, alignment  0:	3.07972	1.61473	1.77381	3.08176
Length    9, alignment  0:	5.63169	1.61567	1.7738	5.55121
Length    9, alignment  0:	3.0812	1.61321	1.77381	3.08126
Length   10, alignment  0:	5.63256	1.61321	1.77532	5.54814
Length   10, alignment  0:	3.08123	1.61322	1.77571	3.07972
Length   11, alignment  0:	5.63148	1.61318	1.77533	5.54778
Length   11, alignment  0:	3.08121	1.61321	1.77537	3.08177
Length   12, alignment  0:	5.63178	1.61321	1.77622	5.54894
Length   12, alignment  0:	3.08177	1.61474	1.77381	3.08187
Length   13, alignment  0:	5.63276	1.6156	1.7738	5.55147
Length   13, alignment  0:	3.07968	1.61563	1.77381	3.08274
Length   14, alignment  0:	5.63092	1.61471	1.77379	5.55216
Length   14, alignment  0:	3.0797	1.61483	1.77381	3.08126
Length   15, alignment  0:	5.63231	1.6132	1.7738	5.55169
Length   15, alignment  0:	3.08122	1.61319	1.77532	3.07972
Length   16, alignment  0:	5.63293	2.06979	2.07095	5.54899
Length   16, alignment  0:	3.38795	2.07126	2.06942	3.38803
Length   17, alignment  0:	5.63383	2.06969	2.07176	5.54989
Length   17, alignment  0:	3.38835	2.06977	2.07096	3.38799
Length   18, alignment  0:	5.63066	2.0712	2.06943	5.54854
Length   18, alignment  0:	3.38788	2.06976	2.07095	3.38888
Length   19, alignment  0:	5.63239	2.0723	2.0694	5.55062
Length   19, alignment  0:	3.38848	2.07124	2.06941	3.388
Length   20, alignment  0:	5.6326	2.0697	2.07093	5.54772
Length   20, alignment  0:	3.38783	2.0718	2.06945	3.38877
Length   21, alignment  0:	5.63132	2.07281	2.06943	5.54997
Length   21, alignment  0:	3.3879	2.06973	2.07107	3.38798
Length   22, alignment  0:	5.63151	2.07122	2.06942	5.54916
Length   22, alignment  0:	3.38846	2.06968	2.07098	3.72648
Length   23, alignment  0:	5.63097	2.07134	2.06942	5.54751
Length   23, alignment  0:	3.38787	2.07129	2.06941	3.38803
Length   24, alignment  0:	5.6351	2.06973	2.07171	5.54899
Length   24, alignment  0:	3.38881	2.07208	2.06942	3.52875
Length   25, alignment  0:	5.63102	2.07122	2.07092	5.54754
Length   25, alignment  0:	3.38853	2.06979	2.07173	3.48468
Length   26, alignment  0:	5.63251	2.07177	2.0694	5.54913
Length   26, alignment  0:	3.38784	2.06972	2.07095	3.38644
Length   27, alignment  0:	5.63079	2.06975	2.07095	5.54818
Length   27, alignment  0:	3.38786	2.07126	2.06945	3.38808
Length   28, alignment  0:	5.6328	2.06974	2.07158	5.549
Length   28, alignment  0:	3.38932	2.07203	2.06942	3.38798
Length   29, alignment  0:	5.63081	2.07126	2.06943	5.54868
Length   29, alignment  0:	3.38784	2.06971	2.07163	3.63224
Length   30, alignment  0:	5.63112	2.07288	2.06941	5.54999
Length   30, alignment  0:	3.38786	2.06977	2.07096	3.38644
Length   31, alignment  0:	5.63243	2.0697	2.07145	5.54951
Length   31, alignment  0:	3.38906	2.07255	2.06944	3.38924
  
Paul E Murphy May 18, 2020, 10:36 p.m. UTC | #2
On 5/18/20 1:53 PM, Paul E Murphy via Libc-alpha wrote:
> 
> 
> On 5/14/20 6:49 AM, Anton Blanchard via Libc-alpha wrote:
>> This version uses vector instructions and is up to 60% faster on medium
>> matches and up to 90% faster on long matches, compared to the POWER7
>> version. A few examples:
>>
>>                              __rawmemchr_power9  __rawmemchr_power7
>> Length   32, alignment  0:   2.27566             3.77765
>> Length   64, alignment  2:   2.46231             3.51064
>> Length 1024, alignment  0:  17.3059             32.6678
> 
> I think this looks OK.  With a few trivial modifications this could also
> replace strlen.  I played around for a little bit and saw 17-60%
> speedups over P8, until it becomes about 4% slower than P8 for large
> strings (>2kB).  I am not sure if that tradeoff is acceptable.
> 
> Attached are full benchmarks for this patch, and a hypothetical patch to 
> reuse this as __strlen_power9.

And pushed with a trivial merge conflict fixup in the Makefile from your 
earlier two patches.  Thank you for your contribution.
  

Patch

diff --git a/sysdeps/powerpc/powerpc64/le/power9/rawmemchr.S b/sysdeps/powerpc/powerpc64/le/power9/rawmemchr.S
new file mode 100644
index 0000000000..9d0276c931
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/rawmemchr.S
@@ -0,0 +1,107 @@ 
+/* Optimized rawmemchr implementation for PowerPC64/POWER9.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifndef RAWMEMCHR
+# define RAWMEMCHR __rawmemchr
+#endif
+
+/* Implements the function
+
+   void * [r3] rawmemchr (const void *s [r3], int c [r4])
+
+   The implementation can load bytes past a matching byte, but only
+   up to the next 16B boundary, so it never crosses a page.  */
+
+.machine power9
+ENTRY_TOCLESS (RAWMEMCHR, 4)
+	CALL_MCOUNT 2
+
+	xori	r5,r4,0xff	/* low byte of r5 = ~c, so it never equals c  */
+
+	mtvsrd	v18+32,r4	/* matching char in v18  */
+	mtvsrd	v19+32,r5	/* non matching char in v19  */
+
+	vspltb	v18,v18,7	/* replicate c into every byte of v18  */
+	vspltb	v19,v19,7	/* replicate ~c into every byte of v19  */
+
+	neg	r5,r3		/* r5 = -s  */
+	rldicl	r9,r5,0,60	/* How many bytes to get source 16B aligned?  */
+
+	/* Align data and fill bytes not loaded with non matching char  */
+	lvx	v0,0,r3		/* load the aligned 16B containing s  */
+	lvsr	v1,0,r3		/* permute control from the low bits of s  */
+	vperm	v0,v19,v0,v1	/* data from s onward, padded with v19  */
+
+	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */
+	beq	cr6,L(aligned)	/* taken when no byte matched  */
+
+	vctzlsbb r0,v6		/* r0 = byte offset of the first match  */
+	add	r3,r3,r0	/* return s + offset  */
+	blr
+
+L(aligned):
+	add	r3,r3,r9	/* r3 now 16B aligned; r9 is 0 if s already was  */
+
+L(loop):			/* scan 64B per iteration, 16B at a time  */
+	lxv	v0+32,0(r3)
+	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */
+	bne	cr6,L(tail1)	/* taken when at least one byte matched  */
+
+	lxv	v0+32,16(r3)
+	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */
+	bne	cr6,L(tail2)
+
+	lxv	v0+32,32(r3)
+	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */
+	bne	cr6,L(tail3)
+
+	lxv	v0+32,48(r3)
+	vcmpequb. v6,v0,v18	/* 0xff if byte matches, 0x00 otherwise  */
+	bne	cr6,L(tail4)
+
+	addi	r3,r3,64	/* no match in these 64B, move to the next  */
+	b	L(loop)
+
+L(tail1):			/* match in the chunk at offset 0  */
+	vctzlsbb r0,v6
+	add	r3,r3,r0
+	blr
+
+L(tail2):			/* match in the chunk at offset 16  */
+	vctzlsbb r0,v6
+	add	r3,r3,r0
+	addi	r3,r3,16
+	blr
+
+L(tail3):			/* match in the chunk at offset 32  */
+	vctzlsbb r0,v6
+	add	r3,r3,r0
+	addi	r3,r3,32
+	blr
+
+L(tail4):			/* match in the chunk at offset 48  */
+	vctzlsbb r0,v6
+	add	r3,r3,r0
+	addi	r3,r3,48
+	blr
+
+END (RAWMEMCHR)
+weak_alias (__rawmemchr,rawmemchr)
+libc_hidden_builtin_def (__rawmemchr)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index ea936bf9ed..3b04512f1d 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -32,7 +32,7 @@  sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 		   strncase-power8
 
 ifneq (,$(filter %le,$(config-machine)))
-sysdep_routines += strcmp-power9 strncmp-power9
+sysdep_routines += strcmp-power9 strncmp-power9 rawmemchr-power9
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index b9fef3f43c..386bb6936c 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -208,6 +208,11 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c.  */
   IFUNC_IMPL (i, name, rawmemchr,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __rawmemchr_power9)
+#endif
 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
 			      hwcap & PPC_FEATURE_HAS_VSX,
 			      __rawmemchr_power7)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power9.S b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power9.S
new file mode 100644
index 0000000000..bac0a9090e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power9.S
@@ -0,0 +1,21 @@ 
+/* Optimized rawmemchr implementation for PowerPC64/POWER9.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define RAWMEMCHR __rawmemchr_power9	/* Entry-point name used by the included code.  */
+
+#include <sysdeps/powerpc/powerpc64/le/power9/rawmemchr.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
index 847157e5f0..c49c0c51ff 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
@@ -24,13 +24,21 @@ 
 
 extern __typeof (__rawmemchr) __rawmemchr_ppc attribute_hidden;
 extern __typeof (__rawmemchr) __rawmemchr_power7 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden;
+# endif
+
 # undef __rawmemchr
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 libc_ifunc_redirected (__redirect___rawmemchr, __rawmemchr,
-		       (hwcap & PPC_FEATURE_HAS_VSX)
-		       ? __rawmemchr_power7
+# ifdef __LITTLE_ENDIAN__
+		       (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+		       ? __rawmemchr_power9 :
+# endif
+		         (hwcap & PPC_FEATURE_HAS_VSX)
+		         ? __rawmemchr_power7
 		       : __rawmemchr_ppc);
 
 weak_alias (__rawmemchr, rawmemchr)