[7/7] powerpc64: Add optimized chacha20

Message ID 20220413202401.408267-8-adhemerval.zanella@linaro.org
State Superseded
Series Add arc4random support

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686

Commit Message

Adhemerval Zanella Netto April 13, 2022, 8:24 p.m. UTC
It adds a vectorized ChaCha20 implementation based on libgcrypt
cipher/chacha20-ppc.c.  It targets POWER8 and is used by default for
little-endian (LE).
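
As a reference for the vector code below (an illustrative sketch only,
not part of the patch): the QUARTERROUND2 macro in chacha-ppc.c applies
the standard ChaCha20 quarter round to four blocks at once, one block
per vector lane.  A scalar version of a single quarter round looks like
this (rotl32 and quarter_round are illustration-only names):

  static inline uint32_t
  rotl32 (uint32_t v, unsigned int c)
  {
    return (v << c) | (v >> (32 - c));
  }

  /* One ChaCha20 quarter round over four 32-bit state words.  */
  static inline void
  quarter_round (uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
  {
    *a += *b; *d ^= *a; *d = rotl32 (*d, 16);
    *c += *d; *b ^= *c; *b = rotl32 (*b, 12);
    *a += *b; *d ^= *a; *d = rotl32 (*d, 8);
    *c += *d; *b ^= *c; *b = rotl32 (*b, 7);
  }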

On a POWER8 it shows the following improvements (using
formatted bench-arc4random data):

GENERIC (powerpc64-linux-gnu)
Function                                 MB/s
--------------------------------------------------
arc4random [single-thread]               70.05
arc4random_buf(0) [single-thread]        143.62
arc4random_buf(16) [single-thread]       200.85
arc4random_buf(32) [single-thread]       247.87
arc4random_buf(64) [single-thread]       277.19
--------------------------------------------------
arc4random [multi-thread]                69.99
arc4random_buf(0) [multi-thread]         143.52
arc4random_buf(16) [multi-thread]        200.31
arc4random_buf(32) [multi-thread]        248.63
arc4random_buf(64) [multi-thread]        279.66
--------------------------------------------------

POWER8
Function                                 MB/s
--------------------------------------------------
arc4random [single-thread]               86.91
arc4random_buf(0) [single-thread]        212.20
arc4random_buf(16) [single-thread]       373.42
arc4random_buf(32) [single-thread]       572.93
arc4random_buf(64) [single-thread]       772.87
--------------------------------------------------
arc4random [multi-thread]                84.43
arc4random_buf(0) [multi-thread]         211.93
arc4random_buf(16) [multi-thread]        373.58
arc4random_buf(32) [multi-thread]        573.80
arc4random_buf(64) [multi-thread]        772.96
--------------------------------------------------

Checked on powerpc64-linux-gnu and powerpc64le-linux-gnu.
---
 LICENSES                                  |   4 +-
 sysdeps/powerpc/powerpc64/Makefile        |   3 +
 sysdeps/powerpc/powerpc64/chacha-ppc.c    | 254 ++++++++++++++++++++++
 sysdeps/powerpc/powerpc64/chacha20_arch.h |  53 +++++
 4 files changed, 312 insertions(+), 2 deletions(-)
 create mode 100644 sysdeps/powerpc/powerpc64/chacha-ppc.c
 create mode 100644 sysdeps/powerpc/powerpc64/chacha20_arch.h
  

Patch

diff --git a/LICENSES b/LICENSES
index b0c43495cb..f7dc51c3a9 100644
--- a/LICENSES
+++ b/LICENSES
@@ -391,8 +391,8 @@  Copyright 2001 by Stephen L. Moshier <moshier@na-net.ornl.gov>
  <https://www.gnu.org/licenses/>.  */
 
-sysdeps/x86_64/chacha20-ssse3.S, sysdeps/x86_64/chacha20-avx2.S, and
-sysdeps/aarch64/chacha20.S import code from libgcrypt, with the
-following notices:
+sysdeps/x86_64/chacha20-ssse3.S, sysdeps/x86_64/chacha20-avx2.S,
+sysdeps/aarch64/chacha20.S, and sysdeps/powerpc/powerpc64/chacha-ppc.c
+import code from libgcrypt, with the following notices:
 
 Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 
diff --git a/sysdeps/powerpc/powerpc64/Makefile b/sysdeps/powerpc/powerpc64/Makefile
index 679d5e49ba..d213d23dc4 100644
--- a/sysdeps/powerpc/powerpc64/Makefile
+++ b/sysdeps/powerpc/powerpc64/Makefile
@@ -66,6 +66,9 @@  tst-setjmp-bug21895-static-ENV = \
 endif
 
 ifeq ($(subdir),stdlib)
+sysdep_routines += chacha-ppc
+CFLAGS-chacha-ppc.c += -mcpu=power8
+
 CFLAGS-tst-ucontext-ppc64-vscr.c += -maltivec
 tests += tst-ucontext-ppc64-vscr
 endif
diff --git a/sysdeps/powerpc/powerpc64/chacha-ppc.c b/sysdeps/powerpc/powerpc64/chacha-ppc.c
new file mode 100644
index 0000000000..db87aa5823
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/chacha-ppc.c
@@ -0,0 +1,254 @@ 
+/* Optimized PowerPC implementation of ChaCha20 cipher.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <altivec.h>
+#include <stddef.h>
+#include <stdint.h>
+
+typedef vector unsigned char vector16x_u8;
+typedef vector unsigned int vector4x_u32;
+typedef vector unsigned long long vector2x_u64;
+
+#ifdef WORDS_BIGENDIAN
+static const vector16x_u8 le_bswap_const =
+  { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
+#endif
+
+static inline vector4x_u32
+vec_rol_elems (vector4x_u32 v, unsigned int idx)
+{
+#ifndef WORDS_BIGENDIAN
+  return vec_sld (v, v, (16 - (4 * idx)) & 15);
+#else
+  return vec_sld (v, v, (4 * idx) & 15);
+#endif
+}
+
+static inline vector4x_u32
+vec_load_le (unsigned long offset, const unsigned char *ptr)
+{
+  vector4x_u32 vec;
+  vec = vec_vsx_ld (offset, (const uint32_t *)ptr);
+#ifdef WORDS_BIGENDIAN
+  vec = (vector4x_u32) vec_perm ((vector16x_u8)vec, (vector16x_u8)vec,
+				 le_bswap_const);
+#endif
+  return vec;
+}
+
+static inline void
+vec_store_le (vector4x_u32 vec, unsigned long offset, unsigned char *ptr)
+{
+#ifdef WORDS_BIGENDIAN
+  vec = (vector4x_u32)vec_perm((vector16x_u8)vec, (vector16x_u8)vec,
+			       le_bswap_const);
+#endif
+  vec_vsx_st (vec, offset, (uint32_t *)ptr);
+}
+
+
+static inline vector4x_u32
+vec_add_ctr_u64 (vector4x_u32 v, vector4x_u32 a)
+{
+#ifdef WORDS_BIGENDIAN
+  static const vector16x_u8 swap32 =
+    { 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 };
+  vector2x_u64 vec, add, sum;
+
+  vec = (vector2x_u64)vec_perm ((vector16x_u8)v, (vector16x_u8)v, swap32);
+  add = (vector2x_u64)vec_perm ((vector16x_u8)a, (vector16x_u8)a, swap32);
+  sum = vec + add;
+  return (vector4x_u32)vec_perm ((vector16x_u8)sum, (vector16x_u8)sum, swap32);
+#else
+  return (vector4x_u32)((vector2x_u64)(v) + (vector2x_u64)(a));
+#endif
+}
+
+/**********************************************************************
+  4-way chacha20
+ **********************************************************************/
+
+#define ROTATE(v1,rolv)			\
+	__asm__ ("vrlw %0,%1,%2\n\t" : "=v" (v1) : "v" (v1), "v" (rolv))
+
+#define PLUS(ds,s) \
+	((ds) += (s))
+
+#define XOR(ds,s) \
+	((ds) ^= (s))
+
+#define ADD_U64(v,a) \
+	(v = vec_add_ctr_u64(v, a))
+
+/* 4x4 32-bit integer matrix transpose */
+#define transpose_4x4(x0, x1, x2, x3) ({ \
+	vector4x_u32 t1 = vec_mergeh(x0, x2); \
+	vector4x_u32 t2 = vec_mergel(x0, x2); \
+	vector4x_u32 t3 = vec_mergeh(x1, x3); \
+	x3 = vec_mergel(x1, x3); \
+	x0 = vec_mergeh(t1, t3); \
+	x1 = vec_mergel(t1, t3); \
+	x2 = vec_mergeh(t2, x3); \
+	x3 = vec_mergel(t2, x3); \
+      })
+
+#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2)			\
+	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
+	    ROTATE(d1, rotate_16); ROTATE(d2, rotate_16);	\
+	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
+	    ROTATE(b1, rotate_12); ROTATE(b2, rotate_12);	\
+	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
+	    ROTATE(d1, rotate_8); ROTATE(d2, rotate_8);		\
+	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
+	    ROTATE(b1, rotate_7); ROTATE(b2, rotate_7);
+
+unsigned int
+__chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst, const uint8_t *src,
+			   size_t nblks)
+{
+  vector4x_u32 counters_0123 = { 0, 1, 2, 3 };
+  vector4x_u32 counter_4 = { 4, 0, 0, 0 };
+  vector4x_u32 rotate_16 = { 16, 16, 16, 16 };
+  vector4x_u32 rotate_12 = { 12, 12, 12, 12 };
+  vector4x_u32 rotate_8 = { 8, 8, 8, 8 };
+  vector4x_u32 rotate_7 = { 7, 7, 7, 7 };
+  vector4x_u32 state0, state1, state2, state3;
+  vector4x_u32 v0, v1, v2, v3, v4, v5, v6, v7;
+  vector4x_u32 v8, v9, v10, v11, v12, v13, v14, v15;
+  vector4x_u32 tmp;
+  int i;
+
+  /* Force preload of constants to vector registers.  */
+  __asm__ ("": "+v" (counters_0123) :: "memory");
+  __asm__ ("": "+v" (counter_4) :: "memory");
+  __asm__ ("": "+v" (rotate_16) :: "memory");
+  __asm__ ("": "+v" (rotate_12) :: "memory");
+  __asm__ ("": "+v" (rotate_8) :: "memory");
+  __asm__ ("": "+v" (rotate_7) :: "memory");
+
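+  /* Load the 16-word ChaCha20 state as four 4-word vectors.  */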
+  state0 = vec_vsx_ld (0 * 16, state);
+  state1 = vec_vsx_ld (1 * 16, state);
+  state2 = vec_vsx_ld (2 * 16, state);
+  state3 = vec_vsx_ld (3 * 16, state);
+
+  do
+    {
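+      /* Broadcast each of the 16 state words across a vector so four
+         blocks are processed in parallel, one block per lane.  */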
+      v0 = vec_splat (state0, 0);
+      v1 = vec_splat (state0, 1);
+      v2 = vec_splat (state0, 2);
+      v3 = vec_splat (state0, 3);
+      v4 = vec_splat (state1, 0);
+      v5 = vec_splat (state1, 1);
+      v6 = vec_splat (state1, 2);
+      v7 = vec_splat (state1, 3);
+      v8 = vec_splat (state2, 0);
+      v9 = vec_splat (state2, 1);
+      v10 = vec_splat (state2, 2);
+      v11 = vec_splat (state2, 3);
+      v12 = vec_splat (state3, 0);
+      v13 = vec_splat (state3, 1);
+      v14 = vec_splat (state3, 2);
+      v15 = vec_splat (state3, 3);
+
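+      /* Words 12 and 13 form the 64-bit block counter.  vec_cmplt
+         yields all-ones (-1) in lanes where the low word wrapped, so
+         subtracting the result propagates the carry into v13.  */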
+      v12 += counters_0123;
+      v13 -= vec_cmplt (v12, counters_0123);
+
+      for (i = 20; i > 0; i -= 2)
+	{
+	  QUARTERROUND2 (v0, v4,  v8, v12,   v1, v5,  v9, v13)
+	  QUARTERROUND2 (v2, v6, v10, v14,   v3, v7, v11, v15)
+	  QUARTERROUND2 (v0, v5, v10, v15,   v1, v6, v11, v12)
+	  QUARTERROUND2 (v2, v7,  v8, v13,   v3, v4,  v9, v14)
+	}
+
+      v0 += vec_splat (state0, 0);
+      v1 += vec_splat (state0, 1);
+      v2 += vec_splat (state0, 2);
+      v3 += vec_splat (state0, 3);
+      v4 += vec_splat (state1, 0);
+      v5 += vec_splat (state1, 1);
+      v6 += vec_splat (state1, 2);
+      v7 += vec_splat (state1, 3);
+      v8 += vec_splat (state2, 0);
+      v9 += vec_splat (state2, 1);
+      v10 += vec_splat (state2, 2);
+      v11 += vec_splat (state2, 3);
+      tmp = vec_splat (state3, 0);
+      tmp += counters_0123;
+      v12 += tmp;
+      v13 += vec_splat (state3, 1) - vec_cmplt (tmp, counters_0123);
+      v14 += vec_splat (state3, 2);
+      v15 += vec_splat (state3, 3);
+      ADD_U64 (state3, counter_4);
+
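+      /* Transpose so each vector holds 16 contiguous keystream bytes
+         of one block before xoring with the input.  */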
+      transpose_4x4 (v0, v1, v2, v3);
+      transpose_4x4 (v4, v5, v6, v7);
+      transpose_4x4 (v8, v9, v10, v11);
+      transpose_4x4 (v12, v13, v14, v15);
+
+      v0 ^= vec_load_le ((64 * 0 + 16 * 0), src);
+      v1 ^= vec_load_le ((64 * 1 + 16 * 0), src);
+      v2 ^= vec_load_le ((64 * 2 + 16 * 0), src);
+      v3 ^= vec_load_le ((64 * 3 + 16 * 0), src);
+
+      v4 ^= vec_load_le ((64 * 0 + 16 * 1), src);
+      v5 ^= vec_load_le ((64 * 1 + 16 * 1), src);
+      v6 ^= vec_load_le ((64 * 2 + 16 * 1), src);
+      v7 ^= vec_load_le ((64 * 3 + 16 * 1), src);
+
+      v8 ^= vec_load_le ((64 * 0 + 16 * 2), src);
+      v9 ^= vec_load_le ((64 * 1 + 16 * 2), src);
+      v10 ^= vec_load_le ((64 * 2 + 16 * 2), src);
+      v11 ^= vec_load_le ((64 * 3 + 16 * 2), src);
+
+      v12 ^= vec_load_le ((64 * 0 + 16 * 3), src);
+      v13 ^= vec_load_le ((64 * 1 + 16 * 3), src);
+      v14 ^= vec_load_le ((64 * 2 + 16 * 3), src);
+      v15 ^= vec_load_le ((64 * 3 + 16 * 3), src);
+
+      vec_store_le (v0, (64 * 0 + 16 * 0), dst);
+      vec_store_le (v1, (64 * 1 + 16 * 0), dst);
+      vec_store_le (v2, (64 * 2 + 16 * 0), dst);
+      vec_store_le (v3, (64 * 3 + 16 * 0), dst);
+
+      vec_store_le (v4, (64 * 0 + 16 * 1), dst);
+      vec_store_le (v5, (64 * 1 + 16 * 1), dst);
+      vec_store_le (v6, (64 * 2 + 16 * 1), dst);
+      vec_store_le (v7, (64 * 3 + 16 * 1), dst);
+
+      vec_store_le (v8, (64 * 0 + 16 * 2), dst);
+      vec_store_le (v9, (64 * 1 + 16 * 2), dst);
+      vec_store_le (v10, (64 * 2 + 16 * 2), dst);
+      vec_store_le (v11, (64 * 3 + 16 * 2), dst);
+
+      vec_store_le (v12, (64 * 0 + 16 * 3), dst);
+      vec_store_le (v13, (64 * 1 + 16 * 3), dst);
+      vec_store_le (v14, (64 * 2 + 16 * 3), dst);
+      vec_store_le (v15, (64 * 3 + 16 * 3), dst);
+
+      src += 4*64;
+      dst += 4*64;
+
+      nblks -= 4;
+    }
+  while (nblks);
+
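+  /* Only the block counter changed; store the last row (counter and
+     nonce) back into the caller's state.  */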
+  vec_vsx_st (state3, 3 * 16, state);
+
+  return 0;
+}
diff --git a/sysdeps/powerpc/powerpc64/chacha20_arch.h b/sysdeps/powerpc/powerpc64/chacha20_arch.h
new file mode 100644
index 0000000000..e958c73b3c
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/chacha20_arch.h
@@ -0,0 +1,53 @@ 
+/* PowerPC optimization for ChaCha20.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdbool.h>
+#include <ldsodefs.h>
+
+unsigned int __chacha20_power8_blocks4 (uint32_t *state, uint8_t *dst,
+					const uint8_t *src, size_t nblks);
+
+static inline bool
+is_power8 (void)
+{
+#ifdef __LITTLE_ENDIAN__
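+  /* The powerpc64le ABI baseline already requires POWER8.  */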
+  return true;
+#else
+  unsigned long int hwcap = GLRO(dl_hwcap);
+  unsigned long int hwcap2 = GLRO(dl_hwcap2);
+  return ((hwcap2 & PPC_FEATURE2_ARCH_2_07)
+          && (hwcap & PPC_FEATURE_HAS_ALTIVEC));
+#endif
+}
+
+static void
+chacha20_crypt (struct chacha20_state *state, uint8_t *dst,
+		const uint8_t *src, size_t bytes)
+{
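+  /* Use the 4-block POWER8 routine for whole groups of four blocks;
+     the generic code handles any remainder.  */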
+  if (is_power8 () && bytes >= CHACHA20_BLOCK_SIZE * 4)
+    {
+      size_t nblocks = bytes / CHACHA20_BLOCK_SIZE;
+      nblocks -= nblocks % 4;
+      __chacha20_power8_blocks4 (state->ctx, dst, src, nblocks);
+      bytes -= nblocks * CHACHA20_BLOCK_SIZE;
+      dst += nblocks * CHACHA20_BLOCK_SIZE;
+      src += nblocks * CHACHA20_BLOCK_SIZE;
+    }
+
+  if (bytes > 0)
+    chacha20_crypt_generic (state, dst, src, bytes);
+}