[v3,7/9] powerpc64: Add optimized chacha20

Message ID 20220419212812.2688764-8-adhemerval.zanella@linaro.org
State Superseded
Headers
Series Add arc4random support |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent

Commit Message

Adhemerval Zanella Netto April 19, 2022, 9:28 p.m. UTC
  It adds a vectorized ChaCha20 implementation based on libgcrypt
cipher/chacha20-ppc.c.  It targets POWER8 and is used by
default for LE.

On a POWER8 it shows the following improvements (using
formatted bench-arc4random data):

GENERIC (powerpc64-linux-gnu)
Function                                 MB/s
--------------------------------------------------
arc4random [single-thread]               71.08
arc4random_buf(16) [single-thread]       141.26
arc4random_buf(32) [single-thread]       198.31
arc4random_buf(48) [single-thread]       226.78
arc4random_buf(64) [single-thread]       246.69
arc4random_buf(80) [single-thread]       257.23
arc4random_buf(96) [single-thread]       268.06
arc4random_buf(112) [single-thread]      274.50
arc4random_buf(128) [single-thread]      279.56
--------------------------------------------------

POWER8
Function                                 MB/s
--------------------------------------------------
arc4random [single-thread]               84.68
arc4random_buf(16) [single-thread]       210.75
arc4random_buf(32) [single-thread]       366.11
arc4random_buf(48) [single-thread]       471.99
arc4random_buf(64) [single-thread]       567.06
arc4random_buf(80) [single-thread]       633.79
arc4random_buf(96) [single-thread]       693.16
arc4random_buf(112) [single-thread]      737.77
arc4random_buf(128) [single-thread]      774.38
--------------------------------------------------

Checked on powerpc64-linux-gnu and powerpc64le-linux-gnu.
---
 LICENSES                                  |   6 +-
 sysdeps/powerpc/powerpc64/Makefile        |   3 +
 sysdeps/powerpc/powerpc64/chacha20-ppc.c  | 236 ++++++++++++++++++++++
 sysdeps/powerpc/powerpc64/chacha20_arch.h |  47 +++++
 4 files changed, 289 insertions(+), 3 deletions(-)
 create mode 100644 sysdeps/powerpc/powerpc64/chacha20-ppc.c
 create mode 100644 sysdeps/powerpc/powerpc64/chacha20_arch.h
  

Comments

Paul E Murphy April 20, 2022, 6:38 p.m. UTC | #1
On 4/19/22 4:28 PM, Adhemerval Zanella via Libc-alpha wrote:
> It adds vectorized ChaCha20 implementation based on libgcrypt
> cipher/chacha20-ppc.c.  It targets POWER8 and it is used on
> default for LE.

> diff --git a/sysdeps/powerpc/powerpc64/chacha20-ppc.c b/sysdeps/powerpc/powerpc64/chacha20-ppc.c
> new file mode 100644
> index 0000000000..e2567c379a
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/chacha20-ppc.c

How difficult is it to keep this synchronized with the upstream version
in libgcrypt?  Also, this seems like it would be better placed in the
power8 subdirectory.

> diff --git a/sysdeps/powerpc/powerpc64/chacha20_arch.h b/sysdeps/powerpc/powerpc64/chacha20_arch.h
> new file mode 100644
> index 0000000000..a18115392f
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/chacha20_arch.h
> @@ -0,0 +1,47 @@
> +/* PowerPC optimization for ChaCha20.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <stdbool.h>
> +#include <ldsodefs.h>
> +
> +unsigned int __chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst,
> +					const uint8_t *src, size_t nblks)
> +     attribute_hidden;
> +
> +static void
> +chacha20_crypt (uint32_t *state, uint8_t *dst,
> +		const uint8_t *src, size_t bytes)
> +{
> +  _Static_assert (CHACHA20_BUFSIZE % 4 == 0,
> +		  "CHACHA20_BUFSIZE not multiple of 4");
> +  _Static_assert (CHACHA20_BUFSIZE >= CHACHA20_BLOCK_SIZE * 4,
> +		  "CHACHA20_BUFSIZE < CHACHA20_BLOCK_SIZE * 4");
> +
> +#ifdef __LITTLE_ENDIAN__
> +  __chacha20_power8_blocks4 (state, dst, src,
> +			     CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
> +#else
> +  unsigned long int hwcap = GLRO(dl_hwcap);
> +  unsigned long int hwcap2 = GLRO(dl_hwcap2);
> +  if (hwcap2 & PPC_FEATURE2_ARCH_2_07 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
> +    __chacha20_power8_blocks4 (state, dst, src,
> +			       CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
> +  else
> +    chacha20_crypt_generic (state, dst, src, bytes);
> +#endif

This file doesn't seem to obey the multiarch conventions of other 
powerpc64 specific bits. Is it possible to implement multiarch support 
similar to the libc/libm routines?
  
Adhemerval Zanella Netto April 20, 2022, 7:23 p.m. UTC | #2
On 20/04/2022 15:38, Paul E Murphy wrote:
> 
> 
> On 4/19/22 4:28 PM, Adhemerval Zanella via Libc-alpha wrote:
>> It adds vectorized ChaCha20 implementation based on libgcrypt
>> cipher/chacha20-ppc.c.  It targets POWER8 and it is used on
>> default for LE.
> 
>> diff --git a/sysdeps/powerpc/powerpc64/chacha20-ppc.c b/sysdeps/powerpc/powerpc64/chacha20-ppc.c
>> new file mode 100644
>> index 0000000000..e2567c379a
>> --- /dev/null
>> +++ b/sysdeps/powerpc/powerpc64/chacha20-ppc.c
> 
> How difficult is it to keep this synchronized with the upstream version in libgcrypt?  Also, this seems like it would be better placed in the power8 subdirectory.

It would be somewhat complicated because libgcrypt also implements
poly1305 in the same file (which uses common macros and definitions
for chacha20) and it adds a final XOR based on the input stream (which
is not required for arc4random usage since it does not add any
hardening).

It would require refactoring the libgcrypt code a bit to split
chacha and poly1305 and to add a macro to XOR the input.

> 
>> diff --git a/sysdeps/powerpc/powerpc64/chacha20_arch.h b/sysdeps/powerpc/powerpc64/chacha20_arch.h
>> new file mode 100644
>> index 0000000000..a18115392f
>> --- /dev/null
>> +++ b/sysdeps/powerpc/powerpc64/chacha20_arch.h
>> @@ -0,0 +1,47 @@
>> +/* PowerPC optimization for ChaCha20.
>> +   Copyright (C) 2022 Free Software Foundation, Inc.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <http://www.gnu.org/licenses/>.  */
>> +
>> +#include <stdbool.h>
>> +#include <ldsodefs.h>
>> +
>> +unsigned int __chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst,
>> +                    const uint8_t *src, size_t nblks)
>> +     attribute_hidden;
>> +
>> +static void
>> +chacha20_crypt (uint32_t *state, uint8_t *dst,
>> +        const uint8_t *src, size_t bytes)
>> +{
>> +  _Static_assert (CHACHA20_BUFSIZE % 4 == 0,
>> +          "CHACHA20_BUFSIZE not multiple of 4");
>> +  _Static_assert (CHACHA20_BUFSIZE >= CHACHA20_BLOCK_SIZE * 4,
>> +          "CHACHA20_BUFSIZE < CHACHA20_BLOCK_SIZE * 4");
>> +
>> +#ifdef __LITTLE_ENDIAN__
>> +  __chacha20_power8_blocks4 (state, dst, src,
>> +                 CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
>> +#else
>> +  unsigned long int hwcap = GLRO(dl_hwcap);
>> +  unsigned long int hwcap2 = GLRO(dl_hwcap2);
>> +  if (hwcap2 & PPC_FEATURE2_ARCH_2_07 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
>> +    __chacha20_power8_blocks4 (state, dst, src,
>> +                   CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
>> +  else
>> +    chacha20_crypt_generic (state, dst, src, bytes);
>> +#endif
> 
> This file doesn't seem to obey the multiarch conventions of other powerpc64 specific bits. Is it possible to implement multiarch support similar to the libc/libm routines?

I am not very fond of the powerpc multiarch convention and it would
require some more boilerplate code to handle BE, but it is doable.

So LE will continue to use __chacha20_power8_blocks4 as the
default, while BE will just select it if --with-arch=power8 is defined
for the default build.  With --disable-multi-arch the power8 version will
be selected iff --with-arch=power8 is set.

---

diff --git a/sysdeps/powerpc/powerpc64/Makefile b/sysdeps/powerpc/powerpc64/Makefile
index 18943ef09e..679d5e49ba 100644
--- a/sysdeps/powerpc/powerpc64/Makefile
+++ b/sysdeps/powerpc/powerpc64/Makefile
@@ -66,9 +66,6 @@ tst-setjmp-bug21895-static-ENV = \
 endif
 
 ifeq ($(subdir),stdlib)
-sysdep_routines += chacha20-ppc
-CFLAGS-chacha20-ppc.c += -mcpu=power8
-
 CFLAGS-tst-ucontext-ppc64-vscr.c += -maltivec
 tests += tst-ucontext-ppc64-vscr
 endif
diff --git a/sysdeps/powerpc/powerpc64/be/multiarch/Makefile b/sysdeps/powerpc/powerpc64/be/multiarch/Makefile
new file mode 100644
index 0000000000..8c75165f7f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/be/multiarch/Makefile
@@ -0,0 +1,4 @@
+ifeq ($(subdir),stdlib)
+sysdep_routines += chacha20-ppc
+CFLAGS-chacha20-ppc.c += -mcpu=power8
+endif
diff --git a/sysdeps/powerpc/powerpc64/be/multiarch/chacha20-ppc.c b/sysdeps/powerpc/powerpc64/be/multiarch/chacha20-ppc.c
new file mode 100644
index 0000000000..cf9e735326
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/be/multiarch/chacha20-ppc.c
@@ -0,0 +1 @@
+#include <sysdeps/powerpc/powerpc64/power8/chacha20-ppc.c>
diff --git a/sysdeps/powerpc/powerpc64/chacha20_arch.h b/sysdeps/powerpc/powerpc64/be/multiarch/chacha20_arch.h
similarity index 92%
rename from sysdeps/powerpc/powerpc64/chacha20_arch.h
rename to sysdeps/powerpc/powerpc64/be/multiarch/chacha20_arch.h
index a18115392f..6d2762d82b 100644
--- a/sysdeps/powerpc/powerpc64/chacha20_arch.h
+++ b/sysdeps/powerpc/powerpc64/be/multiarch/chacha20_arch.h
@@ -32,10 +32,6 @@ chacha20_crypt (uint32_t *state, uint8_t *dst,
   _Static_assert (CHACHA20_BUFSIZE >= CHACHA20_BLOCK_SIZE * 4,
 		  "CHACHA20_BUFSIZE < CHACHA20_BLOCK_SIZE * 4");
 
-#ifdef __LITTLE_ENDIAN__
-  __chacha20_power8_blocks4 (state, dst, src,
-			     CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
-#else
   unsigned long int hwcap = GLRO(dl_hwcap);
   unsigned long int hwcap2 = GLRO(dl_hwcap2);
   if (hwcap2 & PPC_FEATURE2_ARCH_2_07 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
@@ -43,5 +39,4 @@ chacha20_crypt (uint32_t *state, uint8_t *dst,
 			       CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
   else
     chacha20_crypt_generic (state, dst, src, bytes);
-#endif
 }
diff --git a/sysdeps/powerpc/powerpc64/power8/Makefile b/sysdeps/powerpc/powerpc64/power8/Makefile
index 71a59529f3..abb0aa3f11 100644
--- a/sysdeps/powerpc/powerpc64/power8/Makefile
+++ b/sysdeps/powerpc/powerpc64/power8/Makefile
@@ -1,3 +1,8 @@
 ifeq ($(subdir),string)
 sysdep_routines += strcasestr-ppc64
 endif
+
+ifeq ($(subdir),stdlib)
+sysdep_routines += chacha20-ppc
+CFLAGS-chacha20-ppc.c += -mcpu=power8
+endif
diff --git a/sysdeps/powerpc/powerpc64/chacha20-ppc.c b/sysdeps/powerpc/powerpc64/power8/chacha20-ppc.c
similarity index 100%
rename from sysdeps/powerpc/powerpc64/chacha20-ppc.c
rename to sysdeps/powerpc/powerpc64/power8/chacha20-ppc.c
diff --git a/sysdeps/powerpc/powerpc64/power8/chacha20_arch.h b/sysdeps/powerpc/powerpc64/power8/chacha20_arch.h
new file mode 100644
index 0000000000..270c71130f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/chacha20_arch.h
@@ -0,0 +1,37 @@
+/* PowerPC optimization for ChaCha20.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdbool.h>
+#include <ldsodefs.h>
+
+unsigned int __chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst,
+					const uint8_t *src, size_t nblks)
+     attribute_hidden;
+
+static void
+chacha20_crypt (uint32_t *state, uint8_t *dst,
+		const uint8_t *src, size_t bytes)
+{
+  _Static_assert (CHACHA20_BUFSIZE % 4 == 0,
+		  "CHACHA20_BUFSIZE not multiple of 4");
+  _Static_assert (CHACHA20_BUFSIZE >= CHACHA20_BLOCK_SIZE * 4,
+		  "CHACHA20_BUFSIZE < CHACHA20_BLOCK_SIZE * 4");
+
+  __chacha20_power8_blocks4 (state, dst, src,
+			     CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
+}
  
Paul E Murphy April 22, 2022, 9:09 p.m. UTC | #3
On 4/20/22 2:23 PM, Adhemerval Zanella wrote:
> 
> 
> On 20/04/2022 15:38, Paul E Murphy wrote:
>>
>>
>> On 4/19/22 4:28 PM, Adhemerval Zanella via Libc-alpha wrote:
>>> It adds vectorized ChaCha20 implementation based on libgcrypt
>>> cipher/chacha20-ppc.c.  It targets POWER8 and it is used on
>>> default for LE.
>>
>>> diff --git a/sysdeps/powerpc/powerpc64/chacha20-ppc.c b/sysdeps/powerpc/powerpc64/chacha20-ppc.c
>>> new file mode 100644
>>> index 0000000000..e2567c379a
>>> --- /dev/null
>>> +++ b/sysdeps/powerpc/powerpc64/chacha20-ppc.c
>>
>> How difficult is it to keep this synchronized with the upstream version in libgcrypt?  Also, this seems like it would be better placed in the power8 subdirectory.
> 
> It would be somewhat complicated because libgcrypt also implements
> poly1305 in the same file (which uses common macros and definitions
> for chacha20) and it adds a final XOR based on the input stream (which
> is not required for arc4random usage since it does not add any
> hardening).
> 
> It would require refactoring the libgcrypt code a bit to split
> chacha and poly1305 and to add a macro to XOR the input.

I think this is OK. Thanks for the explanation.

> 
>>
>>> diff --git a/sysdeps/powerpc/powerpc64/chacha20_arch.h b/sysdeps/powerpc/powerpc64/chacha20_arch.h
>>> new file mode 100644
>>> index 0000000000..a18115392f
>>> --- /dev/null
>>> +++ b/sysdeps/powerpc/powerpc64/chacha20_arch.h
>>> @@ -0,0 +1,47 @@
>>> +/* PowerPC optimization for ChaCha20.
>>> +   Copyright (C) 2022 Free Software Foundation, Inc.
>>> +   This file is part of the GNU C Library.
>>> +
>>> +   The GNU C Library is free software; you can redistribute it and/or
>>> +   modify it under the terms of the GNU Lesser General Public
>>> +   License as published by the Free Software Foundation; either
>>> +   version 2.1 of the License, or (at your option) any later version.
>>> +
>>> +   The GNU C Library is distributed in the hope that it will be useful,
>>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> +   Lesser General Public License for more details.
>>> +
>>> +   You should have received a copy of the GNU Lesser General Public
>>> +   License along with the GNU C Library; if not, see
>>> +   <http://www.gnu.org/licenses/>.  */
>>> +
>>> +#include <stdbool.h>
>>> +#include <ldsodefs.h>
>>> +
>>> +unsigned int __chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst,
>>> +                    const uint8_t *src, size_t nblks)
>>> +     attribute_hidden;
>>> +
>>> +static void
>>> +chacha20_crypt (uint32_t *state, uint8_t *dst,
>>> +        const uint8_t *src, size_t bytes)
>>> +{
>>> +  _Static_assert (CHACHA20_BUFSIZE % 4 == 0,
>>> +          "CHACHA20_BUFSIZE not multiple of 4");
>>> +  _Static_assert (CHACHA20_BUFSIZE >= CHACHA20_BLOCK_SIZE * 4,
>>> +          "CHACHA20_BUFSIZE < CHACHA20_BLOCK_SIZE * 4");
>>> +
>>> +#ifdef __LITTLE_ENDIAN__
>>> +  __chacha20_power8_blocks4 (state, dst, src,
>>> +                 CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
>>> +#else
>>> +  unsigned long int hwcap = GLRO(dl_hwcap);
>>> +  unsigned long int hwcap2 = GLRO(dl_hwcap2);
>>> +  if (hwcap2 & PPC_FEATURE2_ARCH_2_07 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
>>> +    __chacha20_power8_blocks4 (state, dst, src,
>>> +                   CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
>>> +  else
>>> +    chacha20_crypt_generic (state, dst, src, bytes);
>>> +#endif
>>
>> This file doesn't seem to obey the multiarch conventions of other powerpc64 specific bits. Is it possible to implement multiarch support similar to the libc/libm routines?
> 
> I am not very fond of the powerpc multiarch convention and it would
> require some more boilerplate code to handle BE, but it is doable.
> 
> So LE will continue to use __chacha20_power8_blocks4 as the
> default, while BE will just select it if --with-arch=power8 is defined
> for the default build.  With --disable-multi-arch the power8 version will
> be selected iff --with-arch=power8 is set.
> 
> ---
> 
> diff --git a/sysdeps/powerpc/powerpc64/Makefile b/sysdeps/powerpc/powerpc64/Makefile
> index 18943ef09e..679d5e49ba 100644
> --- a/sysdeps/powerpc/powerpc64/Makefile
> +++ b/sysdeps/powerpc/powerpc64/Makefile
> @@ -66,9 +66,6 @@ tst-setjmp-bug21895-static-ENV = \
>   endif
>   
>   ifeq ($(subdir),stdlib)
> -sysdep_routines += chacha20-ppc
> -CFLAGS-chacha20-ppc.c += -mcpu=power8
> -
>   CFLAGS-tst-ucontext-ppc64-vscr.c += -maltivec
>   tests += tst-ucontext-ppc64-vscr
>   endif
> diff --git a/sysdeps/powerpc/powerpc64/be/multiarch/Makefile b/sysdeps/powerpc/powerpc64/be/multiarch/Makefile
> new file mode 100644
> index 0000000000..8c75165f7f
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/be/multiarch/Makefile
> @@ -0,0 +1,4 @@
> +ifeq ($(subdir),stdlib)
> +sysdep_routines += chacha20-ppc
> +CFLAGS-chacha20-ppc.c += -mcpu=power8
> +endif
> diff --git a/sysdeps/powerpc/powerpc64/be/multiarch/chacha20-ppc.c b/sysdeps/powerpc/powerpc64/be/multiarch/chacha20-ppc.c
> new file mode 100644
> index 0000000000..cf9e735326
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/be/multiarch/chacha20-ppc.c
> @@ -0,0 +1 @@
> +#include <sysdeps/powerpc/powerpc64/power8/chacha20-ppc.c>
> diff --git a/sysdeps/powerpc/powerpc64/chacha20_arch.h b/sysdeps/powerpc/powerpc64/be/multiarch/chacha20_arch.h
> similarity index 92%
> rename from sysdeps/powerpc/powerpc64/chacha20_arch.h
> rename to sysdeps/powerpc/powerpc64/be/multiarch/chacha20_arch.h
> index a18115392f..6d2762d82b 100644
> --- a/sysdeps/powerpc/powerpc64/chacha20_arch.h
> +++ b/sysdeps/powerpc/powerpc64/be/multiarch/chacha20_arch.h
> @@ -32,10 +32,6 @@ chacha20_crypt (uint32_t *state, uint8_t *dst,
>     _Static_assert (CHACHA20_BUFSIZE >= CHACHA20_BLOCK_SIZE * 4,
>   		  "CHACHA20_BUFSIZE < CHACHA20_BLOCK_SIZE * 4");
>   
> -#ifdef __LITTLE_ENDIAN__
> -  __chacha20_power8_blocks4 (state, dst, src,
> -			     CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
> -#else
>     unsigned long int hwcap = GLRO(dl_hwcap);
>     unsigned long int hwcap2 = GLRO(dl_hwcap2);
>     if (hwcap2 & PPC_FEATURE2_ARCH_2_07 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
> @@ -43,5 +39,4 @@ chacha20_crypt (uint32_t *state, uint8_t *dst,
>   			       CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
>     else
>       chacha20_crypt_generic (state, dst, src, bytes);
> -#endif
>   }
> diff --git a/sysdeps/powerpc/powerpc64/power8/Makefile b/sysdeps/powerpc/powerpc64/power8/Makefile
> index 71a59529f3..abb0aa3f11 100644
> --- a/sysdeps/powerpc/powerpc64/power8/Makefile
> +++ b/sysdeps/powerpc/powerpc64/power8/Makefile
> @@ -1,3 +1,8 @@
>   ifeq ($(subdir),string)
>   sysdep_routines += strcasestr-ppc64
>   endif
> +
> +ifeq ($(subdir),stdlib)
> +sysdep_routines += chacha20-ppc
> +CFLAGS-chacha20-ppc.c += -mcpu=power8

Is it required to specify mcpu=power8 here?  I am thinking about the 
case of building glibc for power9 (or newer), which could benefit from 
improved instruction selection when using the VSX builtins.

I think this is improved over V3, and seems OK. Thanks. It would be nice 
to refactor the multiarch/multi-cpu code on powerpc, I agree it is not 
ideal in its current implementation.
  

Patch

diff --git a/LICENSES b/LICENSES
index 05a5c07fcf..1c6c5d73e6 100644
--- a/LICENSES
+++ b/LICENSES
@@ -390,9 +390,9 @@  Copyright 2001 by Stephen L. Moshier <moshier@na-net.ornl.gov>
  License along with this library; if not, see
  <https://www.gnu.org/licenses/>.  */
 
-sysdeps/aarch64/chacha20.S, sysdeps/x86_64/chacha20-sse2.S, and
-sysdeps/x86_64/chacha20-avx2.S import code from libgcrypt, with the
-following notices:
+sysdeps/aarch64/chacha20.S, sysdeps/x86_64/chacha20-sse2.S,
+sysdeps/x86_64/chacha20-avx2.S, and sysdeps/powerpc/powerpc64/chacha20-ppc.c
+import code from libgcrypt, with the following notices:
 
 Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 
diff --git a/sysdeps/powerpc/powerpc64/Makefile b/sysdeps/powerpc/powerpc64/Makefile
index 679d5e49ba..18943ef09e 100644
--- a/sysdeps/powerpc/powerpc64/Makefile
+++ b/sysdeps/powerpc/powerpc64/Makefile
@@ -66,6 +66,9 @@  tst-setjmp-bug21895-static-ENV = \
 endif
 
 ifeq ($(subdir),stdlib)
+sysdep_routines += chacha20-ppc
+CFLAGS-chacha20-ppc.c += -mcpu=power8
+
 CFLAGS-tst-ucontext-ppc64-vscr.c += -maltivec
 tests += tst-ucontext-ppc64-vscr
 endif
diff --git a/sysdeps/powerpc/powerpc64/chacha20-ppc.c b/sysdeps/powerpc/powerpc64/chacha20-ppc.c
new file mode 100644
index 0000000000..e2567c379a
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/chacha20-ppc.c
@@ -0,0 +1,236 @@ 
+/* Optimized PowerPC implementation of ChaCha20 cipher.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <altivec.h>
+#include <endian.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/cdefs.h>
+
+typedef vector unsigned char vector16x_u8;
+typedef vector unsigned int vector4x_u32;
+typedef vector unsigned long long vector2x_u64;
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+static const vector16x_u8 le_bswap_const =
+  { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+#endif
+
+static inline vector4x_u32
+vec_rol_elems (vector4x_u32 v, unsigned int idx)
+{
+#if __BYTE_ORDER != __BIG_ENDIAN
+  return vec_sld (v, v, (16 - (4 * idx)) & 15);
+#else
+  return vec_sld (v, v, (4 * idx) & 15);
+#endif
+}
+
+static inline vector4x_u32
+vec_load_le (unsigned long offset, const unsigned char *ptr)
+{
+  vector4x_u32 vec;
+  vec = vec_vsx_ld (offset, (const uint32_t *)ptr);
+#if __BYTE_ORDER == __BIG_ENDIAN
+  vec = (vector4x_u32) vec_perm ((vector16x_u8)vec, (vector16x_u8)vec,
+				 le_bswap_const);
+#endif
+  return vec;
+}
+
+static inline void
+vec_store_le (vector4x_u32 vec, unsigned long offset, unsigned char *ptr)
+{
+#if __BYTE_ORDER == __BIG_ENDIAN
+  vec = (vector4x_u32)vec_perm((vector16x_u8)vec, (vector16x_u8)vec,
+			       le_bswap_const);
+#endif
+  vec_vsx_st (vec, offset, (uint32_t *)ptr);
+}
+
+
+static inline vector4x_u32
+vec_add_ctr_u64 (vector4x_u32 v, vector4x_u32 a)
+{
+#if __BYTE_ORDER == __BIG_ENDIAN
+  static const vector16x_u8 swap32 =
+    { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 };
+  vector2x_u64 vec, add, sum;
+
+  vec = (vector2x_u64)vec_perm ((vector16x_u8)v, (vector16x_u8)v, swap32);
+  add = (vector2x_u64)vec_perm ((vector16x_u8)a, (vector16x_u8)a, swap32);
+  sum = vec + add;
+  return (vector4x_u32)vec_perm ((vector16x_u8)sum, (vector16x_u8)sum, swap32);
+#else
+  return (vector4x_u32)((vector2x_u64)(v) + (vector2x_u64)(a));
+#endif
+}
+
+/**********************************************************************
+  4-way chacha20
+ **********************************************************************/
+
+#define ROTATE(v1,rolv)			\
+	__asm__ ("vrlw %0,%1,%2\n\t" : "=v" (v1) : "v" (v1), "v" (rolv))
+
+#define PLUS(ds,s) \
+	((ds) += (s))
+
+#define XOR(ds,s) \
+	((ds) ^= (s))
+
+#define ADD_U64(v,a) \
+	(v = vec_add_ctr_u64(v, a))
+
+/* 4x4 32-bit integer matrix transpose */
+#define transpose_4x4(x0, x1, x2, x3) ({ \
+	vector4x_u32 t1 = vec_mergeh(x0, x2); \
+	vector4x_u32 t2 = vec_mergel(x0, x2); \
+	vector4x_u32 t3 = vec_mergeh(x1, x3); \
+	x3 = vec_mergel(x1, x3); \
+	x0 = vec_mergeh(t1, t3); \
+	x1 = vec_mergel(t1, t3); \
+	x2 = vec_mergeh(t2, x3); \
+	x3 = vec_mergel(t2, x3); \
+      })
+
+#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2)			\
+	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
+	    ROTATE(d1, rotate_16); ROTATE(d2, rotate_16);	\
+	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
+	    ROTATE(b1, rotate_12); ROTATE(b2, rotate_12);	\
+	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
+	    ROTATE(d1, rotate_8); ROTATE(d2, rotate_8);		\
+	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
+	    ROTATE(b1, rotate_7); ROTATE(b2, rotate_7);
+
+unsigned int attribute_hidden
+__chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst, const uint8_t *src,
+			   size_t nblks)
+{
+  vector4x_u32 counters_0123 = { 0, 1, 2, 3 };
+  vector4x_u32 counter_4 = { 4, 0, 0, 0 };
+  vector4x_u32 rotate_16 = { 16, 16, 16, 16 };
+  vector4x_u32 rotate_12 = { 12, 12, 12, 12 };
+  vector4x_u32 rotate_8 = { 8, 8, 8, 8 };
+  vector4x_u32 rotate_7 = { 7, 7, 7, 7 };
+  vector4x_u32 state0, state1, state2, state3;
+  vector4x_u32 v0, v1, v2, v3, v4, v5, v6, v7;
+  vector4x_u32 v8, v9, v10, v11, v12, v13, v14, v15;
+  vector4x_u32 tmp;
+  int i;
+
+  /* Force preload of constants to vector registers.  */
+  __asm__ ("": "+v" (counters_0123) :: "memory");
+  __asm__ ("": "+v" (counter_4) :: "memory");
+  __asm__ ("": "+v" (rotate_16) :: "memory");
+  __asm__ ("": "+v" (rotate_12) :: "memory");
+  __asm__ ("": "+v" (rotate_8) :: "memory");
+  __asm__ ("": "+v" (rotate_7) :: "memory");
+
+  state0 = vec_vsx_ld (0 * 16, state);
+  state1 = vec_vsx_ld (1 * 16, state);
+  state2 = vec_vsx_ld (2 * 16, state);
+  state3 = vec_vsx_ld (3 * 16, state);
+
+  do
+    {
+      v0 = vec_splat (state0, 0);
+      v1 = vec_splat (state0, 1);
+      v2 = vec_splat (state0, 2);
+      v3 = vec_splat (state0, 3);
+      v4 = vec_splat (state1, 0);
+      v5 = vec_splat (state1, 1);
+      v6 = vec_splat (state1, 2);
+      v7 = vec_splat (state1, 3);
+      v8 = vec_splat (state2, 0);
+      v9 = vec_splat (state2, 1);
+      v10 = vec_splat (state2, 2);
+      v11 = vec_splat (state2, 3);
+      v12 = vec_splat (state3, 0);
+      v13 = vec_splat (state3, 1);
+      v14 = vec_splat (state3, 2);
+      v15 = vec_splat (state3, 3);
+
+      v12 += counters_0123;
+      v13 -= vec_cmplt (v12, counters_0123);
+
+      for (i = 20; i > 0; i -= 2)
+	{
+	  QUARTERROUND2 (v0, v4,  v8, v12,   v1, v5,  v9, v13)
+	  QUARTERROUND2 (v2, v6, v10, v14,   v3, v7, v11, v15)
+	  QUARTERROUND2 (v0, v5, v10, v15,   v1, v6, v11, v12)
+	  QUARTERROUND2 (v2, v7,  v8, v13,   v3, v4,  v9, v14)
+	}
+
+      v0 += vec_splat (state0, 0);
+      v1 += vec_splat (state0, 1);
+      v2 += vec_splat (state0, 2);
+      v3 += vec_splat (state0, 3);
+      v4 += vec_splat (state1, 0);
+      v5 += vec_splat (state1, 1);
+      v6 += vec_splat (state1, 2);
+      v7 += vec_splat (state1, 3);
+      v8 += vec_splat (state2, 0);
+      v9 += vec_splat (state2, 1);
+      v10 += vec_splat (state2, 2);
+      v11 += vec_splat (state2, 3);
+      tmp = vec_splat( state3, 0);
+      tmp += counters_0123;
+      v12 += tmp;
+      v13 += vec_splat (state3, 1) - vec_cmplt (tmp, counters_0123);
+      v14 += vec_splat (state3, 2);
+      v15 += vec_splat (state3, 3);
+      ADD_U64 (state3, counter_4);
+
+      transpose_4x4 (v0, v1, v2, v3);
+      transpose_4x4 (v4, v5, v6, v7);
+      transpose_4x4 (v8, v9, v10, v11);
+      transpose_4x4 (v12, v13, v14, v15);
+
+      vec_store_le (v0, (64 * 0 + 16 * 0), dst);
+      vec_store_le (v1, (64 * 1 + 16 * 0), dst);
+      vec_store_le (v2, (64 * 2 + 16 * 0), dst);
+      vec_store_le (v3, (64 * 3 + 16 * 0), dst);
+
+      vec_store_le (v4, (64 * 0 + 16 * 1), dst);
+      vec_store_le (v5, (64 * 1 + 16 * 1), dst);
+      vec_store_le (v6, (64 * 2 + 16 * 1), dst);
+      vec_store_le (v7, (64 * 3 + 16 * 1), dst);
+
+      vec_store_le (v8, (64 * 0 + 16 * 2), dst);
+      vec_store_le (v9, (64 * 1 + 16 * 2), dst);
+      vec_store_le (v10, (64 * 2 + 16 * 2), dst);
+      vec_store_le (v11, (64 * 3 + 16 * 2), dst);
+
+      vec_store_le (v12, (64 * 0 + 16 * 3), dst);
+      vec_store_le (v13, (64 * 1 + 16 * 3), dst);
+      vec_store_le (v14, (64 * 2 + 16 * 3), dst);
+      vec_store_le (v15, (64 * 3 + 16 * 3), dst);
+
+      src += 4*64;
+      dst += 4*64;
+
+      nblks -= 4;
+    }
+  while (nblks);
+
+  vec_vsx_st (state3, 3 * 16, state);
+
+  return 0;
+}
diff --git a/sysdeps/powerpc/powerpc64/chacha20_arch.h b/sysdeps/powerpc/powerpc64/chacha20_arch.h
new file mode 100644
index 0000000000..a18115392f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/chacha20_arch.h
@@ -0,0 +1,47 @@ 
+/* PowerPC optimization for ChaCha20.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdbool.h>
+#include <ldsodefs.h>
+
+unsigned int __chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst,
+					const uint8_t *src, size_t nblks)
+     attribute_hidden;
+
+static void
+chacha20_crypt (uint32_t *state, uint8_t *dst,
+		const uint8_t *src, size_t bytes)
+{
+  _Static_assert (CHACHA20_BUFSIZE % 4 == 0,
+		  "CHACHA20_BUFSIZE not multiple of 4");
+  _Static_assert (CHACHA20_BUFSIZE >= CHACHA20_BLOCK_SIZE * 4,
+		  "CHACHA20_BUFSIZE < CHACHA20_BLOCK_SIZE * 4");
+
+#ifdef __LITTLE_ENDIAN__
+  __chacha20_power8_blocks4 (state, dst, src,
+			     CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
+#else
+  unsigned long int hwcap = GLRO(dl_hwcap);
+  unsigned long int hwcap2 = GLRO(dl_hwcap2);
+  if (hwcap2 & PPC_FEATURE2_ARCH_2_07 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
+    __chacha20_power8_blocks4 (state, dst, src,
+			       CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
+  else
+    chacha20_crypt_generic (state, dst, src, bytes);
+#endif
+}