On 20/04/2022 15:38, Paul E Murphy wrote:
>
>
> On 4/19/22 4:28 PM, Adhemerval Zanella via Libc-alpha wrote:
>> It adds vectorized ChaCha20 implementation based on libgcrypt
>> cipher/chacha20-ppc.c. It targets POWER8 and it is used on
>> default for LE.
>
>> diff --git a/sysdeps/powerpc/powerpc64/chacha20-ppc.c b/sysdeps/powerpc/powerpc64/chacha20-ppc.c
>> new file mode 100644
>> index 0000000000..e2567c379a
>> --- /dev/null
>> +++ b/sysdeps/powerpc/powerpc64/chacha20-ppc.c
>
> How difficult is it to keep this synchronized with the upstream version in libgcrypt? Also, this seems like it would be a better placed in the power8 subdirectory.
It would be somewhat complicated because libgcrypt also implements the
poly1305 on the same file (which uses common macros and definition
for chacha20) and it adds final XOR based on input stream (which
for arc4random usage is not required since it does not add any
hardening).
It would require to refactor libgcrypt code a bit to split the
chacha and poly1305 and to add a macro to XOR the input.
>
>> diff --git a/sysdeps/powerpc/powerpc64/chacha20_arch.h b/sysdeps/powerpc/powerpc64/chacha20_arch.h
>> new file mode 100644
>> index 0000000000..a18115392f
>> --- /dev/null
>> +++ b/sysdeps/powerpc/powerpc64/chacha20_arch.h
>> @@ -0,0 +1,47 @@
>> +/* PowerPC optimization for ChaCha20.
>> + Copyright (C) 2022 Free Software Foundation, Inc.
>> + This file is part of the GNU C Library.
>> +
>> + The GNU C Library is free software; you can redistribute it and/or
>> + modify it under the terms of the GNU Lesser General Public
>> + License as published by the Free Software Foundation; either
>> + version 2.1 of the License, or (at your option) any later version.
>> +
>> + The GNU C Library is distributed in the hope that it will be useful,
>> + but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> + Lesser General Public License for more details.
>> +
>> + You should have received a copy of the GNU Lesser General Public
>> + License along with the GNU C Library; if not, see
>> + <http://www.gnu.org/licenses/>. */
>> +
>> +#include <stdbool.h>
>> +#include <ldsodefs.h>
>> +
>> +unsigned int __chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst,
>> + const uint8_t *src, size_t nblks)
>> + attribute_hidden;
>> +
>> +static void
>> +chacha20_crypt (uint32_t *state, uint8_t *dst,
>> + const uint8_t *src, size_t bytes)
>> +{
>> + _Static_assert (CHACHA20_BUFSIZE % 4 == 0,
>> + "CHACHA20_BUFSIZE not multiple of 4");
>> + _Static_assert (CHACHA20_BUFSIZE >= CHACHA20_BLOCK_SIZE * 4,
>> + "CHACHA20_BUFSIZE < CHACHA20_BLOCK_SIZE * 4");
>> +
>> +#ifdef __LITTLE_ENDIAN__
>> + __chacha20_power8_blocks4 (state, dst, src,
>> + CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
>> +#else
>> + unsigned long int hwcap = GLRO(dl_hwcap);
>> + unsigned long int hwcap2 = GLRO(dl_hwcap2);
>> + if (hwcap2 & PPC_FEATURE2_ARCH_2_07 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
>> + __chacha20_power8_blocks4 (state, dst, src,
>> + CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
>> + else
>> + chacha20_crypt_generic (state, dst, src, bytes);
>> +#endif
>
> This file doesn't seem to obey the multiarch conventions of other powerpc64 specific bits. Is it possible to implement multiarch support similar to the libc/libm routines?
I am not very fond of the powerpc multiarch convention and it would
require some more boilerplate code to handle BE, but it is doable.
So LE will continue to use __chacha20_power8_blocks4 as
default, while BE will just select it if --with-arch=power8 is defined
for the default build. With --disable-multi-arch the power8 will be
selected iff --with-arch=power8 is set.
---
diff --git a/sysdeps/powerpc/powerpc64/Makefile b/sysdeps/powerpc/powerpc64/Makefile
index 18943ef09e..679d5e49ba 100644
--- a/sysdeps/powerpc/powerpc64/Makefile
+++ b/sysdeps/powerpc/powerpc64/Makefile
@@ -66,9 +66,6 @@ tst-setjmp-bug21895-static-ENV = \
endif
ifeq ($(subdir),stdlib)
-sysdep_routines += chacha20-ppc
-CFLAGS-chacha20-ppc.c += -mcpu=power8
-
CFLAGS-tst-ucontext-ppc64-vscr.c += -maltivec
tests += tst-ucontext-ppc64-vscr
endif
diff --git a/sysdeps/powerpc/powerpc64/be/multiarch/Makefile b/sysdeps/powerpc/powerpc64/be/multiarch/Makefile
new file mode 100644
index 0000000000..8c75165f7f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/be/multiarch/Makefile
@@ -0,0 +1,4 @@
+ifeq ($(subdir),stdlib)
+sysdep_routines += chacha20-ppc
+CFLAGS-chacha20-ppc.c += -mcpu=power8
+endif
diff --git a/sysdeps/powerpc/powerpc64/be/multiarch/chacha20-ppc.c b/sysdeps/powerpc/powerpc64/be/multiarch/chacha20-ppc.c
new file mode 100644
index 0000000000..cf9e735326
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/be/multiarch/chacha20-ppc.c
@@ -0,0 +1 @@
+#include <sysdeps/powerpc/powerpc64/power8/chacha20-ppc.c>
diff --git a/sysdeps/powerpc/powerpc64/chacha20_arch.h b/sysdeps/powerpc/powerpc64/be/multiarch/chacha20_arch.h
similarity index 92%
rename from sysdeps/powerpc/powerpc64/chacha20_arch.h
rename to sysdeps/powerpc/powerpc64/be/multiarch/chacha20_arch.h
index a18115392f..6d2762d82b 100644
--- a/sysdeps/powerpc/powerpc64/chacha20_arch.h
+++ b/sysdeps/powerpc/powerpc64/be/multiarch/chacha20_arch.h
@@ -32,10 +32,6 @@ chacha20_crypt (uint32_t *state, uint8_t *dst,
_Static_assert (CHACHA20_BUFSIZE >= CHACHA20_BLOCK_SIZE * 4,
"CHACHA20_BUFSIZE < CHACHA20_BLOCK_SIZE * 4");
-#ifdef __LITTLE_ENDIAN__
- __chacha20_power8_blocks4 (state, dst, src,
- CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
-#else
unsigned long int hwcap = GLRO(dl_hwcap);
unsigned long int hwcap2 = GLRO(dl_hwcap2);
if (hwcap2 & PPC_FEATURE2_ARCH_2_07 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
@@ -43,5 +39,4 @@ chacha20_crypt (uint32_t *state, uint8_t *dst,
CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
else
chacha20_crypt_generic (state, dst, src, bytes);
-#endif
}
diff --git a/sysdeps/powerpc/powerpc64/power8/Makefile b/sysdeps/powerpc/powerpc64/power8/Makefile
index 71a59529f3..abb0aa3f11 100644
--- a/sysdeps/powerpc/powerpc64/power8/Makefile
+++ b/sysdeps/powerpc/powerpc64/power8/Makefile
@@ -1,3 +1,8 @@
ifeq ($(subdir),string)
sysdep_routines += strcasestr-ppc64
endif
+
+ifeq ($(subdir),stdlib)
+sysdep_routines += chacha20-ppc
+CFLAGS-chacha20-ppc.c += -mcpu=power8
+endif
diff --git a/sysdeps/powerpc/powerpc64/chacha20-ppc.c b/sysdeps/powerpc/powerpc64/power8/chacha20-ppc.c
similarity index 100%
rename from sysdeps/powerpc/powerpc64/chacha20-ppc.c
rename to sysdeps/powerpc/powerpc64/power8/chacha20-ppc.c
diff --git a/sysdeps/powerpc/powerpc64/power8/chacha20_arch.h b/sysdeps/powerpc/powerpc64/power8/chacha20_arch.h
new file mode 100644
index 0000000000..270c71130f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/chacha20_arch.h
@@ -0,0 +1,37 @@
+/* PowerPC optimization for ChaCha20.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stdbool.h>
+#include <ldsodefs.h>
+
+unsigned int __chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst,
+ const uint8_t *src, size_t nblks)
+ attribute_hidden;
+
+static void
+chacha20_crypt (uint32_t *state, uint8_t *dst,
+ const uint8_t *src, size_t bytes)
+{
+ _Static_assert (CHACHA20_BUFSIZE % 4 == 0,
+ "CHACHA20_BUFSIZE not multiple of 4");
+ _Static_assert (CHACHA20_BUFSIZE >= CHACHA20_BLOCK_SIZE * 4,
+ "CHACHA20_BUFSIZE < CHACHA20_BLOCK_SIZE * 4");
+
+ __chacha20_power8_blocks4 (state, dst, src,
+ CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
+}
On 4/20/22 2:23 PM, Adhemerval Zanella wrote:
>
>
> On 20/04/2022 15:38, Paul E Murphy wrote:
>>
>>
>> On 4/19/22 4:28 PM, Adhemerval Zanella via Libc-alpha wrote:
>>> It adds vectorized ChaCha20 implementation based on libgcrypt
>>> cipher/chacha20-ppc.c. It targets POWER8 and it is used on
>>> default for LE.
>>
>>> diff --git a/sysdeps/powerpc/powerpc64/chacha20-ppc.c b/sysdeps/powerpc/powerpc64/chacha20-ppc.c
>>> new file mode 100644
>>> index 0000000000..e2567c379a
>>> --- /dev/null
>>> +++ b/sysdeps/powerpc/powerpc64/chacha20-ppc.c
>>
>> How difficult is it to keep this synchronized with the upstream version in libgcrypt? Also, this seems like it would be a better placed in the power8 subdirectory.
>
> It would be somewhat complicated because libgcrypt also implements the
> poly1305 on the same file (which uses common macros and definition
> for chacha20) and it adds final XOR based on input stream (which
> for arc4random usage is not required since it does not add any
> hardening).
>
> It would require to refactor libgcrypt code a bit to split the
> chacha and poly1305 and to add a macro to XOR the input.
I think this is OK. Thanks for the explanation.
>
>>
>>> diff --git a/sysdeps/powerpc/powerpc64/chacha20_arch.h b/sysdeps/powerpc/powerpc64/chacha20_arch.h
>>> new file mode 100644
>>> index 0000000000..a18115392f
>>> --- /dev/null
>>> +++ b/sysdeps/powerpc/powerpc64/chacha20_arch.h
>>> @@ -0,0 +1,47 @@
>>> +/* PowerPC optimization for ChaCha20.
>>> + Copyright (C) 2022 Free Software Foundation, Inc.
>>> + This file is part of the GNU C Library.
>>> +
>>> + The GNU C Library is free software; you can redistribute it and/or
>>> + modify it under the terms of the GNU Lesser General Public
>>> + License as published by the Free Software Foundation; either
>>> + version 2.1 of the License, or (at your option) any later version.
>>> +
>>> + The GNU C Library is distributed in the hope that it will be useful,
>>> + but WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>>> + Lesser General Public License for more details.
>>> +
>>> + You should have received a copy of the GNU Lesser General Public
>>> + License along with the GNU C Library; if not, see
>>> + <http://www.gnu.org/licenses/>. */
>>> +
>>> +#include <stdbool.h>
>>> +#include <ldsodefs.h>
>>> +
>>> +unsigned int __chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst,
>>> + const uint8_t *src, size_t nblks)
>>> + attribute_hidden;
>>> +
>>> +static void
>>> +chacha20_crypt (uint32_t *state, uint8_t *dst,
>>> + const uint8_t *src, size_t bytes)
>>> +{
>>> + _Static_assert (CHACHA20_BUFSIZE % 4 == 0,
>>> + "CHACHA20_BUFSIZE not multiple of 4");
>>> + _Static_assert (CHACHA20_BUFSIZE >= CHACHA20_BLOCK_SIZE * 4,
>>> + "CHACHA20_BUFSIZE < CHACHA20_BLOCK_SIZE * 4");
>>> +
>>> +#ifdef __LITTLE_ENDIAN__
>>> + __chacha20_power8_blocks4 (state, dst, src,
>>> + CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
>>> +#else
>>> + unsigned long int hwcap = GLRO(dl_hwcap);
>>> + unsigned long int hwcap2 = GLRO(dl_hwcap2);
>>> + if (hwcap2 & PPC_FEATURE2_ARCH_2_07 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
>>> + __chacha20_power8_blocks4 (state, dst, src,
>>> + CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
>>> + else
>>> + chacha20_crypt_generic (state, dst, src, bytes);
>>> +#endif
>>
>> This file doesn't seem to obey the multiarch conventions of other powerpc64 specific bits. Is it possible to implement multiarch support similar to the libc/libm routines?
>
> I am not very fond of the powerpc multiarch convention and it would
> require some more boilerplate code to handle BE, but it is doable.
>
> So LE will continue to use __chacha20_power8_blocks4 as
> default, while BE will just select it if --with-arch=power8 is defined
> for the default build. With --disable-multi-arch the power8 will be
> selected iff --with-arch=power8 is set.
>
> ---
>
> diff --git a/sysdeps/powerpc/powerpc64/Makefile b/sysdeps/powerpc/powerpc64/Makefile
> index 18943ef09e..679d5e49ba 100644
> --- a/sysdeps/powerpc/powerpc64/Makefile
> +++ b/sysdeps/powerpc/powerpc64/Makefile
> @@ -66,9 +66,6 @@ tst-setjmp-bug21895-static-ENV = \
> endif
>
> ifeq ($(subdir),stdlib)
> -sysdep_routines += chacha20-ppc
> -CFLAGS-chacha20-ppc.c += -mcpu=power8
> -
> CFLAGS-tst-ucontext-ppc64-vscr.c += -maltivec
> tests += tst-ucontext-ppc64-vscr
> endif
> diff --git a/sysdeps/powerpc/powerpc64/be/multiarch/Makefile b/sysdeps/powerpc/powerpc64/be/multiarch/Makefile
> new file mode 100644
> index 0000000000..8c75165f7f
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/be/multiarch/Makefile
> @@ -0,0 +1,4 @@
> +ifeq ($(subdir),stdlib)
> +sysdep_routines += chacha20-ppc
> +CFLAGS-chacha20-ppc.c += -mcpu=power8
> +endif
> diff --git a/sysdeps/powerpc/powerpc64/be/multiarch/chacha20-ppc.c b/sysdeps/powerpc/powerpc64/be/multiarch/chacha20-ppc.c
> new file mode 100644
> index 0000000000..cf9e735326
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/be/multiarch/chacha20-ppc.c
> @@ -0,0 +1 @@
> +#include <sysdeps/powerpc/powerpc64/power8/chacha20-ppc.c>
> diff --git a/sysdeps/powerpc/powerpc64/chacha20_arch.h b/sysdeps/powerpc/powerpc64/be/multiarch/chacha20_arch.h
> similarity index 92%
> rename from sysdeps/powerpc/powerpc64/chacha20_arch.h
> rename to sysdeps/powerpc/powerpc64/be/multiarch/chacha20_arch.h
> index a18115392f..6d2762d82b 100644
> --- a/sysdeps/powerpc/powerpc64/chacha20_arch.h
> +++ b/sysdeps/powerpc/powerpc64/be/multiarch/chacha20_arch.h
> @@ -32,10 +32,6 @@ chacha20_crypt (uint32_t *state, uint8_t *dst,
> _Static_assert (CHACHA20_BUFSIZE >= CHACHA20_BLOCK_SIZE * 4,
> "CHACHA20_BUFSIZE < CHACHA20_BLOCK_SIZE * 4");
>
> -#ifdef __LITTLE_ENDIAN__
> - __chacha20_power8_blocks4 (state, dst, src,
> - CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
> -#else
> unsigned long int hwcap = GLRO(dl_hwcap);
> unsigned long int hwcap2 = GLRO(dl_hwcap2);
> if (hwcap2 & PPC_FEATURE2_ARCH_2_07 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
> @@ -43,5 +39,4 @@ chacha20_crypt (uint32_t *state, uint8_t *dst,
> CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
> else
> chacha20_crypt_generic (state, dst, src, bytes);
> -#endif
> }
> diff --git a/sysdeps/powerpc/powerpc64/power8/Makefile b/sysdeps/powerpc/powerpc64/power8/Makefile
> index 71a59529f3..abb0aa3f11 100644
> --- a/sysdeps/powerpc/powerpc64/power8/Makefile
> +++ b/sysdeps/powerpc/powerpc64/power8/Makefile
> @@ -1,3 +1,8 @@
> ifeq ($(subdir),string)
> sysdep_routines += strcasestr-ppc64
> endif
> +
> +ifeq ($(subdir),stdlib)
> +sysdep_routines += chacha20-ppc
> +CFLAGS-chacha20-ppc.c += -mcpu=power8
Is it required to specify mcpu=power8 here? I am thinking about the
case of building glibc for power9 (or newer), which could benefit from
improved instruction selection when using the VSX builtins.
I think this is improved over V3, and seems OK. Thanks. It would be nice
to refactor the multiarch/multi-cpu code on powerpc; I agree it is not
ideal in its current implementation.
@@ -390,9 +390,9 @@ Copyright 2001 by Stephen L. Moshier <moshier@na-net.ornl.gov>
License along with this library; if not, see
<https://www.gnu.org/licenses/>. */
-sysdeps/aarch64/chacha20.S, sysdeps/x86_64/chacha20-sse2.S, and
-sysdeps/x86_64/chacha20-avx2.S import code from libgcrypt, with the
-following notices:
+sysdeps/aarch64/chacha20.S, sysdeps/x86_64/chacha20-sse2.S,
+sysdeps/x86_64/chacha20-avx2.S, and sysdeps/powerpc/powerpc64/chacha20-ppc.c
+import code from libgcrypt, with the following notices:
Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
@@ -66,6 +66,9 @@ tst-setjmp-bug21895-static-ENV = \
endif
ifeq ($(subdir),stdlib)
+sysdep_routines += chacha20-ppc
+CFLAGS-chacha20-ppc.c += -mcpu=power8
+
CFLAGS-tst-ucontext-ppc64-vscr.c += -maltivec
tests += tst-ucontext-ppc64-vscr
endif
new file mode 100644
@@ -0,0 +1,236 @@
+/* Optimized PowerPC implementation of ChaCha20 cipher.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <altivec.h>
+#include <endian.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <sys/cdefs.h>
+
+typedef vector unsigned char vector16x_u8;
+typedef vector unsigned int vector4x_u32;
+typedef vector unsigned long long vector2x_u64;
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+static const vector16x_u8 le_bswap_const =
+ { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+#endif
+
+static inline vector4x_u32
+vec_rol_elems (vector4x_u32 v, unsigned int idx)
+{
+#if __BYTE_ORDER != __BIG_ENDIAN
+ return vec_sld (v, v, (16 - (4 * idx)) & 15);
+#else
+ return vec_sld (v, v, (4 * idx) & 15);
+#endif
+}
+
+static inline vector4x_u32
+vec_load_le (unsigned long offset, const unsigned char *ptr)
+{
+ vector4x_u32 vec;
+ vec = vec_vsx_ld (offset, (const uint32_t *)ptr);
+#if __BYTE_ORDER == __BIG_ENDIAN
+ vec = (vector4x_u32) vec_perm ((vector16x_u8)vec, (vector16x_u8)vec,
+ le_bswap_const);
+#endif
+ return vec;
+}
+
+static inline void
+vec_store_le (vector4x_u32 vec, unsigned long offset, unsigned char *ptr)
+{
+#if __BYTE_ORDER == __BIG_ENDIAN
+ vec = (vector4x_u32)vec_perm((vector16x_u8)vec, (vector16x_u8)vec,
+ le_bswap_const);
+#endif
+ vec_vsx_st (vec, offset, (uint32_t *)ptr);
+}
+
+
+static inline vector4x_u32
+vec_add_ctr_u64 (vector4x_u32 v, vector4x_u32 a)
+{
+#if __BYTE_ORDER == __BIG_ENDIAN
+ static const vector16x_u8 swap32 =
+ { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 };
+ vector2x_u64 vec, add, sum;
+
+ vec = (vector2x_u64)vec_perm ((vector16x_u8)v, (vector16x_u8)v, swap32);
+ add = (vector2x_u64)vec_perm ((vector16x_u8)a, (vector16x_u8)a, swap32);
+ sum = vec + add;
+ return (vector4x_u32)vec_perm ((vector16x_u8)sum, (vector16x_u8)sum, swap32);
+#else
+ return (vector4x_u32)((vector2x_u64)(v) + (vector2x_u64)(a));
+#endif
+}
+
+/**********************************************************************
+ 4-way chacha20
+ **********************************************************************/
+
+#define ROTATE(v1,rolv) \
+ __asm__ ("vrlw %0,%1,%2\n\t" : "=v" (v1) : "v" (v1), "v" (rolv))
+
+#define PLUS(ds,s) \
+ ((ds) += (s))
+
+#define XOR(ds,s) \
+ ((ds) ^= (s))
+
+#define ADD_U64(v,a) \
+ (v = vec_add_ctr_u64(v, a))
+
+/* 4x4 32-bit integer matrix transpose */
+#define transpose_4x4(x0, x1, x2, x3) ({ \
+ vector4x_u32 t1 = vec_mergeh(x0, x2); \
+ vector4x_u32 t2 = vec_mergel(x0, x2); \
+ vector4x_u32 t3 = vec_mergeh(x1, x3); \
+ x3 = vec_mergel(x1, x3); \
+ x0 = vec_mergeh(t1, t3); \
+ x1 = vec_mergel(t1, t3); \
+ x2 = vec_mergeh(t2, x3); \
+ x3 = vec_mergel(t2, x3); \
+ })
+
+#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2) \
+ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
+ ROTATE(d1, rotate_16); ROTATE(d2, rotate_16); \
+ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
+ ROTATE(b1, rotate_12); ROTATE(b2, rotate_12); \
+ PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2); \
+ ROTATE(d1, rotate_8); ROTATE(d2, rotate_8); \
+ PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2); \
+ ROTATE(b1, rotate_7); ROTATE(b2, rotate_7);
+
+unsigned int attribute_hidden
+__chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst, const uint8_t *src,
+ size_t nblks)
+{
+ vector4x_u32 counters_0123 = { 0, 1, 2, 3 };
+ vector4x_u32 counter_4 = { 4, 0, 0, 0 };
+ vector4x_u32 rotate_16 = { 16, 16, 16, 16 };
+ vector4x_u32 rotate_12 = { 12, 12, 12, 12 };
+ vector4x_u32 rotate_8 = { 8, 8, 8, 8 };
+ vector4x_u32 rotate_7 = { 7, 7, 7, 7 };
+ vector4x_u32 state0, state1, state2, state3;
+ vector4x_u32 v0, v1, v2, v3, v4, v5, v6, v7;
+ vector4x_u32 v8, v9, v10, v11, v12, v13, v14, v15;
+ vector4x_u32 tmp;
+ int i;
+
+ /* Force preload of constants to vector registers. */
+ __asm__ ("": "+v" (counters_0123) :: "memory");
+ __asm__ ("": "+v" (counter_4) :: "memory");
+ __asm__ ("": "+v" (rotate_16) :: "memory");
+ __asm__ ("": "+v" (rotate_12) :: "memory");
+ __asm__ ("": "+v" (rotate_8) :: "memory");
+ __asm__ ("": "+v" (rotate_7) :: "memory");
+
+ state0 = vec_vsx_ld (0 * 16, state);
+ state1 = vec_vsx_ld (1 * 16, state);
+ state2 = vec_vsx_ld (2 * 16, state);
+ state3 = vec_vsx_ld (3 * 16, state);
+
+ do
+ {
+ v0 = vec_splat (state0, 0);
+ v1 = vec_splat (state0, 1);
+ v2 = vec_splat (state0, 2);
+ v3 = vec_splat (state0, 3);
+ v4 = vec_splat (state1, 0);
+ v5 = vec_splat (state1, 1);
+ v6 = vec_splat (state1, 2);
+ v7 = vec_splat (state1, 3);
+ v8 = vec_splat (state2, 0);
+ v9 = vec_splat (state2, 1);
+ v10 = vec_splat (state2, 2);
+ v11 = vec_splat (state2, 3);
+ v12 = vec_splat (state3, 0);
+ v13 = vec_splat (state3, 1);
+ v14 = vec_splat (state3, 2);
+ v15 = vec_splat (state3, 3);
+
+ v12 += counters_0123;
+ v13 -= vec_cmplt (v12, counters_0123);
+
+ for (i = 20; i > 0; i -= 2)
+ {
+ QUARTERROUND2 (v0, v4, v8, v12, v1, v5, v9, v13)
+ QUARTERROUND2 (v2, v6, v10, v14, v3, v7, v11, v15)
+ QUARTERROUND2 (v0, v5, v10, v15, v1, v6, v11, v12)
+ QUARTERROUND2 (v2, v7, v8, v13, v3, v4, v9, v14)
+ }
+
+ v0 += vec_splat (state0, 0);
+ v1 += vec_splat (state0, 1);
+ v2 += vec_splat (state0, 2);
+ v3 += vec_splat (state0, 3);
+ v4 += vec_splat (state1, 0);
+ v5 += vec_splat (state1, 1);
+ v6 += vec_splat (state1, 2);
+ v7 += vec_splat (state1, 3);
+ v8 += vec_splat (state2, 0);
+ v9 += vec_splat (state2, 1);
+ v10 += vec_splat (state2, 2);
+ v11 += vec_splat (state2, 3);
+ tmp = vec_splat( state3, 0);
+ tmp += counters_0123;
+ v12 += tmp;
+ v13 += vec_splat (state3, 1) - vec_cmplt (tmp, counters_0123);
+ v14 += vec_splat (state3, 2);
+ v15 += vec_splat (state3, 3);
+ ADD_U64 (state3, counter_4);
+
+ transpose_4x4 (v0, v1, v2, v3);
+ transpose_4x4 (v4, v5, v6, v7);
+ transpose_4x4 (v8, v9, v10, v11);
+ transpose_4x4 (v12, v13, v14, v15);
+
+ vec_store_le (v0, (64 * 0 + 16 * 0), dst);
+ vec_store_le (v1, (64 * 1 + 16 * 0), dst);
+ vec_store_le (v2, (64 * 2 + 16 * 0), dst);
+ vec_store_le (v3, (64 * 3 + 16 * 0), dst);
+
+ vec_store_le (v4, (64 * 0 + 16 * 1), dst);
+ vec_store_le (v5, (64 * 1 + 16 * 1), dst);
+ vec_store_le (v6, (64 * 2 + 16 * 1), dst);
+ vec_store_le (v7, (64 * 3 + 16 * 1), dst);
+
+ vec_store_le (v8, (64 * 0 + 16 * 2), dst);
+ vec_store_le (v9, (64 * 1 + 16 * 2), dst);
+ vec_store_le (v10, (64 * 2 + 16 * 2), dst);
+ vec_store_le (v11, (64 * 3 + 16 * 2), dst);
+
+ vec_store_le (v12, (64 * 0 + 16 * 3), dst);
+ vec_store_le (v13, (64 * 1 + 16 * 3), dst);
+ vec_store_le (v14, (64 * 2 + 16 * 3), dst);
+ vec_store_le (v15, (64 * 3 + 16 * 3), dst);
+
+ src += 4*64;
+ dst += 4*64;
+
+ nblks -= 4;
+ }
+ while (nblks);
+
+ vec_vsx_st (state3, 3 * 16, state);
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,47 @@
+/* PowerPC optimization for ChaCha20.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <stdbool.h>
+#include <ldsodefs.h>
+
+unsigned int __chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst,
+ const uint8_t *src, size_t nblks)
+ attribute_hidden;
+
+static void
+chacha20_crypt (uint32_t *state, uint8_t *dst,
+ const uint8_t *src, size_t bytes)
+{
+ _Static_assert (CHACHA20_BUFSIZE % 4 == 0,
+ "CHACHA20_BUFSIZE not multiple of 4");
+ _Static_assert (CHACHA20_BUFSIZE >= CHACHA20_BLOCK_SIZE * 4,
+ "CHACHA20_BUFSIZE < CHACHA20_BLOCK_SIZE * 4");
+
+#ifdef __LITTLE_ENDIAN__
+ __chacha20_power8_blocks4 (state, dst, src,
+ CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
+#else
+ unsigned long int hwcap = GLRO(dl_hwcap);
+ unsigned long int hwcap2 = GLRO(dl_hwcap2);
+ if (hwcap2 & PPC_FEATURE2_ARCH_2_07 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
+ __chacha20_power8_blocks4 (state, dst, src,
+ CHACHA20_BUFSIZE / CHACHA20_BLOCK_SIZE);
+ else
+ chacha20_crypt_generic (state, dst, src, bytes);
+#endif
+}