powerpc: Optimized rawmemchr for POWER9
Commit Message
This version uses vector instructions and is up to 60% faster on medium
matches and up to 90% faster on long matches, compared to the POWER7
version. A few examples:
__rawmemchr_power9 __rawmemchr_power7
Length 32, alignment 0: 2.27566 3.77765
Length 64, alignment 2: 2.46231 3.51064
Length 1024, alignment 0: 17.3059 32.6678
---
.../powerpc/powerpc64/le/power9/rawmemchr.S | 107 ++++++++++++++++++
sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
.../powerpc64/multiarch/ifunc-impl-list.c | 5 +
.../powerpc64/multiarch/rawmemchr-power9.S | 21 ++++
.../powerpc/powerpc64/multiarch/rawmemchr.c | 12 +-
5 files changed, 144 insertions(+), 3 deletions(-)
create mode 100644 sysdeps/powerpc/powerpc64/le/power9/rawmemchr.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power9.S
Comments
On 5/14/20 6:49 AM, Anton Blanchard via Libc-alpha wrote:
> This version uses vector instructions and is up to 60% faster on medium
> matches and up to 90% faster on long matches, compared to the POWER7
> version. A few examples:
>
> __rawmemchr_power9 __rawmemchr_power7
> Length 32, alignment 0: 2.27566 3.77765
> Length 64, alignment 2: 2.46231 3.51064
> Length 1024, alignment 0: 17.3059 32.6678
I think this looks OK. A few trivial modifications and this could also
replace strlen. I played around for a little bit and saw 17-60%
speedups above P8 until it drops to about -4% of P8 for large strings
(>2kB). I am not sure if that tradeoff is acceptable.
Attached are full benchmarks for this patch, and a hypothetical patch to
reuse this as __strlen_power9.
../benchtests/scripts/compare_strings.py -i benchtests/bench-strlen.out -s ../benchtests/scripts/benchout_strings.schema.json -a length,alignment -b __strlen_power8
Function: strlen
Variant:
generic_strlen memchr_strlen __strlen_power9 __strlen_power8 __strlen_power7 __strlen_ppc
========================================================================================================================
length=1, alignment=1: 2.93 ( -8.62%) 5.43 (-101.11%) 2.23 ( 17.32%) 2.70 2.46 ( 8.68%) 2.92 ( -8.30%)
length=1, alignment=0: 3.58 (-32.06%) 8.81 (-224.91%) 2.23 ( 17.68%) 2.71 2.46 ( 9.10%) 2.91 ( -7.22%)
length=2, alignment=2: 4.36 (-145.95%) 4.68 (-163.85%) 1.47 ( 17.32%) 1.77 1.62 ( 8.70%) 1.91 ( -7.69%)
length=2, alignment=0: 2.48 (-40.00%) 5.80 (-226.79%) 1.47 ( 17.32%) 1.77 1.62 ( 8.70%) 1.91 ( -7.69%)
length=3, alignment=3: 3.27 (-84.42%) 3.52 (-98.64%) 1.48 ( 16.78%) 1.77 1.62 ( 8.69%) 1.91 ( -7.69%)
length=3, alignment=0: 2.50 (-41.05%) 5.79 (-226.34%) 1.47 ( 16.95%) 1.77 1.62 ( 8.69%) 1.91 ( -7.69%)
length=4, alignment=4: 4.20 (-119.67%) 3.73 (-94.98%) 1.47 ( 22.83%) 1.91 1.77 ( 7.15%) 2.07 ( -8.33%)
length=4, alignment=0: 2.87 (-61.53%) 5.79 (-226.33%) 1.48 ( 16.55%) 1.77 1.62 ( 8.70%) 1.91 ( -7.68%)
length=5, alignment=5: 4.41 (-130.63%) 3.73 (-94.99%) 1.47 ( 22.87%) 1.91 1.77 ( 7.14%) 2.07 ( -8.34%)
length=5, alignment=0: 2.50 (-41.06%) 5.79 (-226.30%) 1.48 ( 16.69%) 1.77 1.62 ( 8.69%) 1.91 ( -7.69%)
length=6, alignment=6: 3.95 (-106.78%) 3.72 (-94.98%) 1.48 ( 22.78%) 1.91 1.77 ( 7.14%) 2.07 ( -8.33%)
length=6, alignment=0: 2.66 (-49.99%) 5.79 (-226.32%) 1.48 ( 16.79%) 1.77 1.62 ( 8.69%) 1.91 ( -7.69%)
length=7, alignment=7: 2.94 (-53.19%) 3.73 (-94.02%) 1.47 ( 23.61%) 1.92 1.77 ( 7.60%) 2.07 ( -7.79%)
length=7, alignment=0: 3.01 (-69.62%) 5.79 (-226.34%) 1.47 ( 16.97%) 1.77 1.62 ( 8.69%) 1.91 ( -7.69%)
length=4, alignment=0: 2.50 (-40.54%) 5.79 (-225.23%) 1.47 ( 17.62%) 1.78 1.62 ( 9.02%) 1.91 ( -7.31%)
length=4, alignment=7: 2.79 (-46.25%) 3.73 (-94.99%) 1.47 ( 23.23%) 1.91 1.78 ( 6.79%) 2.07 ( -8.33%)
length=4, alignment=2: 3.72 (-109.89%) 3.52 (-98.64%) 1.47 ( 17.32%) 1.77 1.63 ( 8.34%) 1.91 ( -7.69%)
length=2, alignment=2: 2.84 (-60.00%) 3.52 (-98.64%) 1.47 ( 17.32%) 1.77 1.62 ( 8.69%) 2.09 (-18.08%)
length=8, alignment=0: 4.33 (-126.58%) 5.79 (-202.96%) 1.47 ( 23.23%) 1.91 1.78 ( 6.80%) 2.07 ( -8.33%)
length=8, alignment=7: 2.94 (-53.96%) 3.72 (-95.00%) 1.47 ( 23.22%) 1.91 1.77 ( 7.14%) 2.37 (-24.11%)
length=8, alignment=3: 5.01 (-162.51%) 3.73 (-95.00%) 1.47 ( 23.23%) 1.91 1.78 ( 6.79%) 2.07 ( -8.34%)
length=5, alignment=3: 4.84 (-153.36%) 3.73 (-95.00%) 1.47 ( 23.23%) 1.91 1.78 ( 6.79%) 2.07 ( -8.34%)
length=16, alignment=0: 4.02 (-61.27%) 5.79 (-132.24%) 1.80 ( 27.80%) 2.49 2.07 ( 16.98%) 2.20 ( 11.62%)
length=16, alignment=7: 3.54 (-60.27%) 4.39 (-98.58%) 1.88 ( 14.96%) 2.21 2.07 ( 6.35%) 2.20 ( 0.31%)
length=16, alignment=4: 4.86 (-95.67%) 4.39 (-76.98%) 1.77 ( 28.56%) 2.48 2.07 ( 16.66%) 2.20 ( 11.29%)
length=10, alignment=4: 4.52 (-136.44%) 3.73 (-95.33%) 1.47 ( 23.23%) 1.91 1.77 ( 7.15%) 2.07 ( -8.33%)
length=32, alignment=0: 5.12 (-117.78%) 5.80 (-146.33%) 2.19 ( 6.85%) 2.35 3.49 (-48.14%) 2.71 (-15.38%)
length=32, alignment=7: 5.41 (-129.27%) 4.85 (-105.50%) 1.81 ( 23.28%) 2.36 3.47 (-47.25%) 2.70 (-14.59%)
length=32, alignment=5: 6.91 (-193.61%) 4.85 (-106.30%) 1.81 ( 23.09%) 2.35 3.53 (-50.04%) 2.70 (-14.91%)
length=21, alignment=5: 5.73 (-176.90%) 4.39 (-112.06%) 1.77 ( 14.29%) 2.07 2.07 ( 0.00%) 2.21 ( -6.61%)
length=64, alignment=0: 6.36 ( 6.66%) 6.73 ( 1.28%) 2.44 ( 64.23%) 6.82 3.96 ( 41.83%) 4.55 ( 33.24%)
length=64, alignment=7: 6.88 ( -0.98%) 8.14 (-19.36%) 2.23 ( 67.36%) 6.82 3.31 ( 51.42%) 4.55 ( 33.21%)
length=64, alignment=6: 7.29 ( -7.00%) 8.13 (-19.36%) 2.22 ( 67.35%) 6.81 3.31 ( 51.41%) 4.55 ( 33.27%)
length=42, alignment=6: 6.16 (-128.02%) 5.39 (-99.42%) 2.03 ( 24.84%) 2.70 3.07 (-13.64%) 4.01 (-48.36%)
length=128, alignment=0: 11.70 (-58.66%) 7.94 ( -7.71%) 3.44 ( 53.30%) 7.37 5.30 ( 28.08%) 6.31 ( 14.37%)
length=128, alignment=7: 11.18 (-51.43%) 9.12 (-23.52%) 3.38 ( 54.27%) 7.38 5.28 ( 28.45%) 6.33 ( 14.28%)
length=128, alignment=7: 11.21 (-51.82%) 9.13 (-23.63%) 3.30 ( 55.31%) 7.38 5.28 ( 28.46%) 6.31 ( 14.47%)
length=85, alignment=7: 12.24 (-79.55%) 8.14 (-19.38%) 2.45 ( 64.10%) 6.82 3.51 ( 48.52%) 5.20 ( 23.70%)
length=256, alignment=0: 25.19 (-173.67%) 10.23 (-11.15%) 5.69 ( 38.16%) 9.20 8.34 ( 9.37%) 10.81 (-17.46%)
length=256, alignment=7: 21.51 (-133.69%) 11.42 (-24.12%) 5.42 ( 41.07%) 9.20 8.33 ( 9.43%) 10.84 (-17.78%)
length=256, alignment=8: 22.41 (-155.01%) 11.09 (-26.20%) 5.42 ( 38.28%) 8.79 8.08 ( 8.11%) 10.49 (-19.38%)
length=170, alignment=8: 15.02 (-116.28%) 8.79 (-26.59%) 4.00 ( 42.37%) 6.94 5.68 ( 18.20%) 8.59 (-23.69%)
length=512, alignment=0: 60.93 (-382.58%) 14.90 (-17.98%) 9.15 ( 27.53%) 12.63 17.00 (-34.63%) 16.89 (-33.76%)
length=512, alignment=7: 50.68 (-301.25%) 16.09 (-27.37%) 8.83 ( 30.08%) 12.63 15.77 (-24.83%) 17.00 (-34.62%)
length=512, alignment=9: 61.72 (-405.21%) 15.75 (-28.96%) 8.84 ( 27.65%) 12.22 16.11 (-31.85%) 17.74 (-45.22%)
length=341, alignment=9: 29.93 (-211.24%) 12.26 (-27.50%) 6.50 ( 32.46%) 9.62 11.84 (-23.13%) 12.65 (-31.49%)
length=1024, alignment=0: 107.30 (-447.19%) 24.23 (-23.55%) 16.22 ( 17.31%) 19.61 30.64 (-56.22%) 35.84 (-82.78%)
length=1024, alignment=7: 89.54 (-356.77%) 25.43 (-29.71%) 16.10 ( 17.84%) 19.60 30.63 (-56.28%) 36.33 (-85.35%)
length=1024, alignment=10: 92.07 (-379.95%) 25.12 (-30.94%) 16.09 ( 16.11%) 19.18 30.54 (-59.21%) 35.22 (-83.59%)
length=682, alignment=10: 64.62 (-362.39%) 18.09 (-29.43%) 11.45 ( 18.04%) 13.98 22.90 (-63.88%) 27.65 (-97.87%)
length=2048, alignment=0: 196.52 (-486.73%) 42.91 (-28.12%) 30.31 ( 9.49%) 33.49 70.78 (-111.33%) 64.64 (-92.99%)
length=2048, alignment=7: 171.50 (-412.03%) 44.13 (-31.74%) 31.64 ( 5.53%) 33.49 70.49 (-110.45%) 65.18 (-94.61%)
length=2048, alignment=11: 178.28 (-439.18%) 43.79 (-32.44%) 31.64 ( 4.31%) 33.07 68.49 (-107.13%) 63.54 (-92.15%)
length=1365, alignment=11: 138.60 (-486.11%) 30.93 (-30.78%) 20.83 ( 11.93%) 23.65 41.52 (-75.57%) 45.88 (-94.01%)
length=4096, alignment=0: 364.28 (-492.57%) 85.25 (-38.68%) 63.66 ( -3.55%) 61.47 126.05 (-105.04%) 121.66 (-97.90%)
length=4096, alignment=7: 335.96 (-446.51%) 86.71 (-41.05%) 63.51 ( -3.31%) 61.47 126.08 (-105.09%) 121.27 (-97.27%)
length=4096, alignment=12: 350.54 (-473.97%) 86.26 (-41.24%) 63.53 ( -4.03%) 61.07 125.44 (-105.40%) 120.05 (-96.56%)
length=2730, alignment=12: 230.73 (-448.69%) 72.19 (-71.68%) 43.72 ( -3.98%) 42.05 87.75 (-108.67%) 82.98 (-97.32%)
generic_rawmemchr __rawmemchr_power9 __rawmemchr_power7 __rawmemchr_ppc
Length 32, alignment 0: 8.58591 2.95789 3.51514 5.54899
Length 64, alignment 1: 7.98616 2.3189 3.31495 7.88867
Length 32, alignment 0: 3.54776 2.12725 3.52489 3.60254
Length 64, alignment 1: 3.7281 2.31716 3.31183 3.94692
Length 64, alignment 0: 6.57635 2.53432 3.31231 6.48307
Length 64, alignment 2: 7.96782 2.31863 3.31161 7.88694
Length 64, alignment 0: 3.72673 2.53292 3.31484 3.72669
Length 64, alignment 2: 3.72656 2.31818 3.31198 3.72661
Length 128, alignment 0: 7.8613 3.53673 5.28895 7.76962
Length 64, alignment 3: 7.99033 2.31707 3.31312 7.87987
Length 128, alignment 0: 4.50304 3.53528 5.2881 4.51603
Length 64, alignment 3: 3.72656 2.31671 3.31136 3.72718
Length 256, alignment 0: 10.1273 5.77793 8.35357 10.0325
Length 64, alignment 4: 7.97362 2.31668 3.31165 7.87519
Length 256, alignment 0: 6.43153 5.77725 8.35359 6.4362
Length 64, alignment 4: 3.72658 2.31826 3.312 3.94599
Length 512, alignment 0: 14.7778 9.25884 16.1109 14.706
Length 64, alignment 5: 7.99086 2.31711 3.3163 7.87734
Length 512, alignment 0: 9.89168 9.25762 15.8989 9.91191
Length 64, alignment 5: 3.72848 2.31657 3.31264 3.94673
Length 1024, alignment 0: 24.1056 16.3134 30.8402 24.0373
Length 64, alignment 6: 7.97495 2.31669 3.31309 7.90495
Length 1024, alignment 0: 16.9337 16.3131 30.7522 16.9331
Length 64, alignment 6: 3.72745 2.31668 3.31355 3.72733
Length 1, alignment 0: 5.63447 1.61322 1.61961 5.55166
Length 1, alignment 0: 3.0819 1.61321 1.61958 3.08123
Length 2, alignment 0: 5.63207 1.61322 1.61956 5.54972
Length 2, alignment 0: 3.0812 1.61322 1.62108 3.07972
Length 3, alignment 0: 5.63272 1.6132 1.62107 5.5476
Length 3, alignment 0: 3.08124 1.6132 1.62111 3.07972
Length 4, alignment 0: 5.63321 1.61319 1.62106 5.54706
Length 4, alignment 0: 3.08188 1.6132 1.62183 3.08416
Length 5, alignment 0: 5.63267 1.6132 1.62185 5.54831
Length 5, alignment 0: 3.08305 1.61322 1.62194 3.08117
Length 6, alignment 0: 5.63117 1.61472 1.61957 5.54776
Length 6, alignment 0: 3.08141 1.61872 1.61958 3.08136
Length 7, alignment 0: 5.63099 1.61629 1.61957 5.54885
Length 7, alignment 0: 3.08289 1.61549 1.61958 3.08124
Length 8, alignment 0: 5.63119 1.6147 1.77383 5.54863
Length 8, alignment 0: 3.07972 1.61473 1.77381 3.08176
Length 9, alignment 0: 5.63169 1.61567 1.7738 5.55121
Length 9, alignment 0: 3.0812 1.61321 1.77381 3.08126
Length 10, alignment 0: 5.63256 1.61321 1.77532 5.54814
Length 10, alignment 0: 3.08123 1.61322 1.77571 3.07972
Length 11, alignment 0: 5.63148 1.61318 1.77533 5.54778
Length 11, alignment 0: 3.08121 1.61321 1.77537 3.08177
Length 12, alignment 0: 5.63178 1.61321 1.77622 5.54894
Length 12, alignment 0: 3.08177 1.61474 1.77381 3.08187
Length 13, alignment 0: 5.63276 1.6156 1.7738 5.55147
Length 13, alignment 0: 3.07968 1.61563 1.77381 3.08274
Length 14, alignment 0: 5.63092 1.61471 1.77379 5.55216
Length 14, alignment 0: 3.0797 1.61483 1.77381 3.08126
Length 15, alignment 0: 5.63231 1.6132 1.7738 5.55169
Length 15, alignment 0: 3.08122 1.61319 1.77532 3.07972
Length 16, alignment 0: 5.63293 2.06979 2.07095 5.54899
Length 16, alignment 0: 3.38795 2.07126 2.06942 3.38803
Length 17, alignment 0: 5.63383 2.06969 2.07176 5.54989
Length 17, alignment 0: 3.38835 2.06977 2.07096 3.38799
Length 18, alignment 0: 5.63066 2.0712 2.06943 5.54854
Length 18, alignment 0: 3.38788 2.06976 2.07095 3.38888
Length 19, alignment 0: 5.63239 2.0723 2.0694 5.55062
Length 19, alignment 0: 3.38848 2.07124 2.06941 3.388
Length 20, alignment 0: 5.6326 2.0697 2.07093 5.54772
Length 20, alignment 0: 3.38783 2.0718 2.06945 3.38877
Length 21, alignment 0: 5.63132 2.07281 2.06943 5.54997
Length 21, alignment 0: 3.3879 2.06973 2.07107 3.38798
Length 22, alignment 0: 5.63151 2.07122 2.06942 5.54916
Length 22, alignment 0: 3.38846 2.06968 2.07098 3.72648
Length 23, alignment 0: 5.63097 2.07134 2.06942 5.54751
Length 23, alignment 0: 3.38787 2.07129 2.06941 3.38803
Length 24, alignment 0: 5.6351 2.06973 2.07171 5.54899
Length 24, alignment 0: 3.38881 2.07208 2.06942 3.52875
Length 25, alignment 0: 5.63102 2.07122 2.07092 5.54754
Length 25, alignment 0: 3.38853 2.06979 2.07173 3.48468
Length 26, alignment 0: 5.63251 2.07177 2.0694 5.54913
Length 26, alignment 0: 3.38784 2.06972 2.07095 3.38644
Length 27, alignment 0: 5.63079 2.06975 2.07095 5.54818
Length 27, alignment 0: 3.38786 2.07126 2.06945 3.38808
Length 28, alignment 0: 5.6328 2.06974 2.07158 5.549
Length 28, alignment 0: 3.38932 2.07203 2.06942 3.38798
Length 29, alignment 0: 5.63081 2.07126 2.06943 5.54868
Length 29, alignment 0: 3.38784 2.06971 2.07163 3.63224
Length 30, alignment 0: 5.63112 2.07288 2.06941 5.54999
Length 30, alignment 0: 3.38786 2.06977 2.07096 3.38644
Length 31, alignment 0: 5.63243 2.0697 2.07145 5.54951
Length 31, alignment 0: 3.38906 2.07255 2.06944 3.38924
On 5/18/20 1:53 PM, Paul E Murphy via Libc-alpha wrote:
>
>
> On 5/14/20 6:49 AM, Anton Blanchard via Libc-alpha wrote:
>> This version uses vector instructions and is up to 60% faster on medium
>> matches and up to 90% faster on long matches, compared to the POWER7
>> version. A few examples:
>>
>> __rawmemchr_power9 __rawmemchr_power7
>> Length 32, alignment 0: 2.27566 3.77765
>> Length 64, alignment 2: 2.46231 3.51064
>> Length 1024, alignment 0: 17.3059 32.6678
>
> I think this looks OK. A few trivial modifications and this could also
> replace strlen. I played around for a little bit and saw 17-60%
> speedups above P8 until it drops to about -4% of P8 for large strings
> (>2kB). I am not sure if that tradeoff is acceptable.
>
> Attached are full benchmarks for this patch, and a hypothetical patch to
> reuse this as __strlen_power9.
And pushed with a trivial merge conflict fixup in the Makefile from your
earlier two patches. Thank you for your contribution.
new file mode 100644
@@ -0,0 +1,107 @@
+/* Optimized rawmemchr implementation for PowerPC64/POWER9.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifndef RAWMEMCHR
+# define RAWMEMCHR __rawmemchr
+#endif
+
+/* Implements the function
+
+ void * [r3] rawmemchr (const void *s [r3], int c [r4])
+
+ The implementation can load bytes past a matching byte, but only
+ up to the next 16B boundary, so it never crosses a page. */
+
+.machine power9
+ENTRY_TOCLESS (RAWMEMCHR, 4) /* in: r3 = s, r4 = c; out: r3 = address of first byte equal to c */
+ CALL_MCOUNT 2
+
+ xori r5,r4,0xff /* r5 = c ^ 0xff: low byte differs from c's low byte in every bit */
+
+ mtvsrd v18+32,r4 /* matching char in v18 */
+ mtvsrd v19+32,r5 /* non matching char in v19 */
+
+ vspltb v18,v18,7 /* replicate */
+ vspltb v19,v19,7 /* replicate */
+
+ neg r5,r3 /* r5 = -s */
+ rldicl r9,r5,0,60 /* How many bytes to get source 16B aligned? */
+
+ /* Align data and fill bytes not loaded with non matching char */
+ lvx v0,0,r3 /* lvx ignores the low 4 address bits: reads the aligned 16B block containing s */
+ lvsr v1,0,r3 /* permute control derived from the low 4 bits of s */
+ vperm v0,v19,v0,v1
+
+ vcmpequb. v6,v0,v18 /* 0xff if byte matches, 0x00 otherwise */
+ beq cr6,L(aligned) /* CR6 "eq" is set when no byte matched: go scan aligned data */
+
+ vctzlsbb r0,v6 /* r0 = byte offset of the first match, relative to s */
+ add r3,r3,r0 /* return s + offset */
+ blr
+
+L(aligned):
+ add r3,r3,r9 /* advance to the 16B boundary; r9 is 0 if s was already aligned, so that block is compared again */
+
+L(loop): /* unrolled: four aligned 16B compares (64 bytes) per iteration */
+ lxv v0+32,0(r3)
+ vcmpequb. v6,v0,v18 /* 0xff if byte matches, 0x00 otherwise */
+ bne cr6,L(tail1) /* CR6 "eq" clear: at least one byte matched */
+
+ lxv v0+32,16(r3)
+ vcmpequb. v6,v0,v18 /* 0xff if byte matches, 0x00 otherwise */
+ bne cr6,L(tail2)
+
+ lxv v0+32,32(r3)
+ vcmpequb. v6,v0,v18 /* 0xff if byte matches, 0x00 otherwise */
+ bne cr6,L(tail3)
+
+ lxv v0+32,48(r3)
+ vcmpequb. v6,v0,v18 /* 0xff if byte matches, 0x00 otherwise */
+ bne cr6,L(tail4)
+
+ addi r3,r3,64 /* no match in these 64 bytes: move to the next group */
+ b L(loop)
+
+L(tail1): /* match in bytes 0..15 of the current group */
+ vctzlsbb r0,v6 /* r0 = offset of the first match within the 16B just compared */
+ add r3,r3,r0
+ blr
+
+L(tail2): /* match in bytes 16..31 of the current group */
+ vctzlsbb r0,v6
+ add r3,r3,r0
+ addi r3,r3,16 /* account for the offset of the second load */
+ blr
+
+L(tail3): /* match in bytes 32..47 of the current group */
+ vctzlsbb r0,v6
+ add r3,r3,r0
+ addi r3,r3,32 /* account for the offset of the third load */
+ blr
+
+L(tail4): /* match in bytes 48..63 of the current group */
+ vctzlsbb r0,v6
+ add r3,r3,r0
+ addi r3,r3,48 /* account for the offset of the fourth load */
+ blr
+
+END (RAWMEMCHR)
+weak_alias (__rawmemchr,rawmemchr)
+libc_hidden_builtin_def (__rawmemchr)
@@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
strncase-power8
ifneq (,$(filter %le,$(config-machine)))
-sysdep_routines += strcmp-power9 strncmp-power9
+sysdep_routines += strcmp-power9 strncmp-power9 rawmemchr-power9
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
@@ -208,6 +208,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c. */
IFUNC_IMPL (i, name, rawmemchr,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, rawmemchr,
+ hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ __rawmemchr_power9)
+#endif
IFUNC_IMPL_ADD (array, i, rawmemchr,
hwcap & PPC_FEATURE_HAS_VSX,
__rawmemchr_power7)
new file mode 100644
@@ -0,0 +1,21 @@
+/* Optimized rawmemchr implementation for PowerPC64/POWER9.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define RAWMEMCHR __rawmemchr_power9 /* entry-point name used by the included implementation instead of its default __rawmemchr */
+
+#include <sysdeps/powerpc/powerpc64/le/power9/rawmemchr.S> /* the POWER9 implementation; it only defines RAWMEMCHR if not already set */
@@ -24,13 +24,21 @@
extern __typeof (__rawmemchr) __rawmemchr_ppc attribute_hidden;
extern __typeof (__rawmemchr) __rawmemchr_power7 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden;
+# endif
+
# undef __rawmemchr
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc_redirected (__redirect___rawmemchr, __rawmemchr,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __rawmemchr_power7
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ ? __rawmemchr_power9 :
+# endif
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __rawmemchr_power7
: __rawmemchr_ppc);
weak_alias (__rawmemchr, rawmemchr)