[4/7] x86: Add SSSE3 optimized chacha20

Message ID: 20220413202401.408267-5-adhemerval.zanella@linaro.org
State: Superseded
Series: Add arc4random support

Checks

dj/TryBot-apply_patch: success (Patch applied to master at the time it was sent)

Commit Message

Adhemerval Zanella April 13, 2022, 8:23 p.m. UTC
  It adds a vectorized ChaCha20 implementation based on libgcrypt
cipher/chacha20-amd64-ssse3.S.  It is used only if SSSE3 is supported
and enabled by the architecture.

On a Ryzen 9 5900X it shows the following improvements (using
formatted bench-arc4random data):

GENERIC
Function                                 MB/s
--------------------------------------------------
arc4random [single-thread]               375.06
arc4random_buf(0) [single-thread]        498.50
arc4random_buf(16) [single-thread]       576.86
arc4random_buf(32) [single-thread]       615.76
arc4random_buf(64) [single-thread]       633.97
--------------------------------------------------
arc4random [multi-thread]                359.86
arc4random_buf(0) [multi-thread]         479.27
arc4random_buf(16) [multi-thread]        543.65
arc4random_buf(32) [multi-thread]        581.98
arc4random_buf(64) [multi-thread]        603.01
--------------------------------------------------

SSSE3:
Function                                 MB/s
--------------------------------------------------
arc4random [single-thread]               576.55
arc4random_buf(0) [single-thread]        961.77
arc4random_buf(16) [single-thread]       1309.38
arc4random_buf(32) [single-thread]       1558.69
arc4random_buf(64) [single-thread]       1728.54
--------------------------------------------------
arc4random [multi-thread]                589.52
arc4random_buf(0) [multi-thread]         967.39
arc4random_buf(16) [multi-thread]        1319.27
arc4random_buf(32) [multi-thread]        1552.96
arc4random_buf(64) [multi-thread]        1734.27
--------------------------------------------------

Checked on x86_64-linux-gnu.
---
 LICENSES                        |  20 ++
 sysdeps/generic/chacha20_arch.h |  24 +++
 sysdeps/x86_64/Makefile         |   6 +
 sysdeps/x86_64/chacha20-ssse3.S | 330 ++++++++++++++++++++++++++++++++
 sysdeps/x86_64/chacha20_arch.h  |  42 ++++
 5 files changed, 422 insertions(+)
 create mode 100644 sysdeps/generic/chacha20_arch.h
 create mode 100644 sysdeps/x86_64/chacha20-ssse3.S
 create mode 100644 sysdeps/x86_64/chacha20_arch.h
  

Comments

Noah Goldstein April 13, 2022, 11:12 p.m. UTC | #1
On Wed, Apr 13, 2022 at 1:27 PM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> It adds a vectorized ChaCha20 implementation based on libgcrypt
> cipher/chacha20-amd64-ssse3.S.  It is used only if SSSE3 is supported
> and enabled by the architecture.
>
> On a Ryzen 9 5900X it shows the following improvements (using
> formatted bench-arc4random data):
>
> GENERIC
> Function                                 MB/s
> --------------------------------------------------
> arc4random [single-thread]               375.06
> arc4random_buf(0) [single-thread]        498.50
> arc4random_buf(16) [single-thread]       576.86
> arc4random_buf(32) [single-thread]       615.76
> arc4random_buf(64) [single-thread]       633.97
> --------------------------------------------------
> arc4random [multi-thread]                359.86
> arc4random_buf(0) [multi-thread]         479.27
> arc4random_buf(16) [multi-thread]        543.65
> arc4random_buf(32) [multi-thread]        581.98
> arc4random_buf(64) [multi-thread]        603.01
> --------------------------------------------------
>
> SSSE3:
> Function                                 MB/s
> --------------------------------------------------
> arc4random [single-thread]               576.55
> arc4random_buf(0) [single-thread]        961.77
> arc4random_buf(16) [single-thread]       1309.38
> arc4random_buf(32) [single-thread]       1558.69
> arc4random_buf(64) [single-thread]       1728.54
> --------------------------------------------------
> arc4random [multi-thread]                589.52
> arc4random_buf(0) [multi-thread]         967.39
> arc4random_buf(16) [multi-thread]        1319.27
> arc4random_buf(32) [multi-thread]        1552.96
> arc4random_buf(64) [multi-thread]        1734.27
> --------------------------------------------------
>
> Checked on x86_64-linux-gnu.
> ---
>  LICENSES                        |  20 ++
>  sysdeps/generic/chacha20_arch.h |  24 +++
>  sysdeps/x86_64/Makefile         |   6 +
>  sysdeps/x86_64/chacha20-ssse3.S | 330 ++++++++++++++++++++++++++++++++
>  sysdeps/x86_64/chacha20_arch.h  |  42 ++++
>  5 files changed, 422 insertions(+)
>  create mode 100644 sysdeps/generic/chacha20_arch.h
>  create mode 100644 sysdeps/x86_64/chacha20-ssse3.S
>  create mode 100644 sysdeps/x86_64/chacha20_arch.h
>
> diff --git a/LICENSES b/LICENSES
> index 530893b1dc..2563abd9e2 100644
> --- a/LICENSES
> +++ b/LICENSES
> @@ -389,3 +389,23 @@ Copyright 2001 by Stephen L. Moshier <moshier@na-net.ornl.gov>
>   You should have received a copy of the GNU Lesser General Public
>   License along with this library; if not, see
>   <https://www.gnu.org/licenses/>.  */
> +
> +sysdeps/x86_64/chacha20-ssse3.S import code from libgcrypt, with the
> +following notices:
> +
> +Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
> +
> +This file is part of Libgcrypt.
> +
> +Libgcrypt is free software; you can redistribute it and/or modify
> +it under the terms of the GNU Lesser General Public License as
> +published by the Free Software Foundation; either version 2.1 of
> +the License, or (at your option) any later version.
> +
> +Libgcrypt is distributed in the hope that it will be useful,
> +but WITHOUT ANY WARRANTY; without even the implied warranty of
> +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +GNU Lesser General Public License for more details.
> +
> +You should have received a copy of the GNU Lesser General Public
> +License along with this program; if not, see <http://www.gnu.org/licenses/>.
> diff --git a/sysdeps/generic/chacha20_arch.h b/sysdeps/generic/chacha20_arch.h
> new file mode 100644
> index 0000000000..d7200ac583
> --- /dev/null
> +++ b/sysdeps/generic/chacha20_arch.h
> @@ -0,0 +1,24 @@
> +/* Chacha20 implementation, generic interface.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +static inline void
> +chacha20_crypt (struct chacha20_state *state, uint8_t *dst,
> +               const uint8_t *src, size_t bytes)
> +{
> +  chacha20_crypt_generic (state, dst, src, bytes);
> +}
> diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
> index 79365aff2a..f43b6a1180 100644
> --- a/sysdeps/x86_64/Makefile
> +++ b/sysdeps/x86_64/Makefile
> @@ -5,6 +5,12 @@ ifeq ($(subdir),csu)
>  gen-as-const-headers += link-defines.sym
>  endif
>
> +ifeq ($(subdir),stdlib)
> +sysdep_routines += \
> +  chacha20-ssse3 \
> +  # sysdep_routines
> +endif
> +
>  ifeq ($(subdir),gmon)
>  sysdep_routines += _mcount
>  # We cannot compile _mcount.S with -pg because that would create
> diff --git a/sysdeps/x86_64/chacha20-ssse3.S b/sysdeps/x86_64/chacha20-ssse3.S
> new file mode 100644
> index 0000000000..f221daf634
> --- /dev/null
> +++ b/sysdeps/x86_64/chacha20-ssse3.S
> @@ -0,0 +1,330 @@
> +/* Optimized SSSE3 implementation of ChaCha20 cipher.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Based on D. J. Bernstein reference implementation at
> +   http://cr.yp.to/chacha.html:
> +
> +   chacha-regs.c version 20080118
> +   D. J. Bernstein
> +   Public domain.  */
> +
> +#include <sysdep.h>
> +
> +#ifdef PIC
> +#  define rRIP (%rip)
> +#else
> +#  define rRIP
> +#endif
> +
> +/* register macros */
> +#define INPUT %rdi
> +#define DST   %rsi
> +#define SRC   %rdx
> +#define NBLKS %rcx
> +#define ROUND %eax
> +
> +/* stack structure */
> +#define STACK_VEC_X12 (16)
> +#define STACK_VEC_X13 (16 + STACK_VEC_X12)
> +#define STACK_TMP     (16 + STACK_VEC_X13)
> +#define STACK_TMP1    (16 + STACK_TMP)
> +#define STACK_TMP2    (16 + STACK_TMP1)
> +
> +#define STACK_MAX     (16 + STACK_TMP2)
> +
> +/* vector registers */
> +#define X0 %xmm0
> +#define X1 %xmm1
> +#define X2 %xmm2
> +#define X3 %xmm3
> +#define X4 %xmm4
> +#define X5 %xmm5
> +#define X6 %xmm6
> +#define X7 %xmm7
> +#define X8 %xmm8
> +#define X9 %xmm9
> +#define X10 %xmm10
> +#define X11 %xmm11
> +#define X12 %xmm12
> +#define X13 %xmm13
> +#define X14 %xmm14
> +#define X15 %xmm15
> +
> +/**********************************************************************
> +  helper macros
> + **********************************************************************/
> +
> +/* 4x4 32-bit integer matrix transpose */
> +#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
> +       movdqa    x0, t2; \
> +       punpckhdq x1, t2; \
> +       punpckldq x1, x0; \
> +       \
> +       movdqa    x2, t1; \
> +       punpckldq x3, t1; \
> +       punpckhdq x3, x2; \
> +       \
> +       movdqa     x0, x1; \
> +       punpckhqdq t1, x1; \
> +       punpcklqdq t1, x0; \
> +       \
> +       movdqa     t2, x3; \
> +       punpckhqdq x2, x3; \
> +       punpcklqdq x2, t2; \
> +       movdqa     t2, x2;
> +
> +/* fill xmm register with 32-bit value from memory */
> +#define pbroadcastd(mem32, xreg) \
> +       movd mem32, xreg; \
> +       pshufd $0, xreg, xreg;
> +
> +/* xor with unaligned memory operand */
> +#define pxor_u(umem128, xreg, t) \
> +       movdqu umem128, t; \
> +       pxor t, xreg;
> +
> +/* xor register with unaligned src and save to unaligned dst */
> +#define xor_src_dst(dst, src, offset, xreg, t) \
> +       pxor_u(offset(src), xreg, t); \
> +       movdqu xreg, offset(dst);
> +
> +#define clear(x) pxor x,x;
> +
> +/**********************************************************************
> +  4-way chacha20
> + **********************************************************************/
> +
> +#define ROTATE2(v1,v2,c,tmp1,tmp2)     \
> +       movdqa v1, tmp1;                \
> +       movdqa v2, tmp2;                \
> +       psrld $(32 - (c)), v1;          \
> +       pslld $(c), tmp1;               \
> +       paddb tmp1, v1;                 \
> +       psrld $(32 - (c)), v2;          \
> +       pslld $(c), tmp2;               \
> +       paddb tmp2, v2;
> +
> +#define ROTATE_SHUF_2(v1,v2,shuf)      \
> +       pshufb shuf, v1;                \
> +       pshufb shuf, v2;
AFAICT this is the only SSSE3 code.

Can you replace this (possibly faster) shuffle-based rotate with
ROTATE2 so this can be SSE2?
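
(For reference, a sketch of an SSE2-only option, untested: the rol16
case can use the SSE2 word shuffles to swap the 16-bit halves of every
dword, and rol8 would fall back to the generic shift rotate.  The macro
name below is made up for illustration:)

/* Rotate each 32-bit lane left by 16 with SSE2 word shuffles;
   $0xb1 selects words [1,0,3,2], swapping the halves of every dword.  */
#define ROTATE16_SSE2_2(v1,v2)		\
	pshuflw $0xb1, v1, v1;		\
	pshufhw $0xb1, v1, v1;		\
	pshuflw $0xb1, v2, v2;		\
	pshufhw $0xb1, v2, v2;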
> +
> +#define XOR(ds,s) \
> +       pxor s, ds;
> +
> +#define PLUS(ds,s) \
> +       paddd s, ds;
> +
> +#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2,\
> +                     interleave_op1,interleave_op2)            \
> +       movdqa L(shuf_rol16) rRIP, tmp1;                        \
> +               interleave_op1;                                 \
> +       PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);       \
> +           ROTATE_SHUF_2(d1, d2, tmp1);                        \
> +       PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);       \
> +           ROTATE2(b1, b2, 12, tmp1, tmp2);                    \
> +       movdqa L(shuf_rol8) rRIP, tmp1;                         \
> +               interleave_op2;                                 \
> +       PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);       \
> +           ROTATE_SHUF_2(d1, d2, tmp1);                        \
> +       PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);       \
> +           ROTATE2(b1, b2,  7, tmp1, tmp2);
> +
> +       .text
> +
> +chacha20_data:
> +       .align 16
> +L(shuf_rol16):
> +       .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
> +L(shuf_rol8):
> +       .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
> +L(counter1):
> +       .long 1,0,0,0
> +L(inc_counter):
> +       .long 0,1,2,3
> +L(unsigned_cmp):
> +       .long 0x80000000,0x80000000,0x80000000,0x80000000
> +
> +ENTRY (__chacha20_ssse3_blocks8)
> +       /* input:
> +        *      %rdi: input
> +        *      %rsi: dst
> +        *      %rdx: src
> +        *      %rcx: nblks (multiple of 4)
> +        */
> +
> +       pushq %rbp;
> +       cfi_adjust_cfa_offset(8);
> +       cfi_rel_offset(rbp, 0)
> +       movq %rsp, %rbp;
> +       cfi_def_cfa_register(%rbp);
> +
> +       subq $STACK_MAX, %rsp;
> +       andq $~15, %rsp;
> +
> +L(loop4):
> +       mov $20, ROUND;
> +
> +       /* Construct counter vectors X12 and X13 */
> +       movdqa L(inc_counter) rRIP, X0;
> +       movdqa L(unsigned_cmp) rRIP, X2;
> +       pbroadcastd((12 * 4)(INPUT), X12);
> +       pbroadcastd((13 * 4)(INPUT), X13);
> +       paddd X0, X12;
> +       movdqa X12, X1;
> +       pxor X2, X0;
> +       pxor X2, X1;
> +       pcmpgtd X1, X0;
> +       psubd X0, X13;
> +       movdqa X12, (STACK_VEC_X12)(%rsp);
> +       movdqa X13, (STACK_VEC_X13)(%rsp);
> +
> +       /* Load vectors */
> +       pbroadcastd((0 * 4)(INPUT), X0);
> +       pbroadcastd((1 * 4)(INPUT), X1);
> +       pbroadcastd((2 * 4)(INPUT), X2);
> +       pbroadcastd((3 * 4)(INPUT), X3);
> +       pbroadcastd((4 * 4)(INPUT), X4);
> +       pbroadcastd((5 * 4)(INPUT), X5);
> +       pbroadcastd((6 * 4)(INPUT), X6);
> +       pbroadcastd((7 * 4)(INPUT), X7);
> +       pbroadcastd((8 * 4)(INPUT), X8);
> +       pbroadcastd((9 * 4)(INPUT), X9);
> +       pbroadcastd((10 * 4)(INPUT), X10);
> +       pbroadcastd((11 * 4)(INPUT), X11);
> +       pbroadcastd((14 * 4)(INPUT), X14);
> +       pbroadcastd((15 * 4)(INPUT), X15);
> +       movdqa X11, (STACK_TMP)(%rsp);
> +       movdqa X15, (STACK_TMP1)(%rsp);
> +
> +L(round2_4):
> +       QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,,)
> +       movdqa (STACK_TMP)(%rsp), X11;
> +       movdqa (STACK_TMP1)(%rsp), X15;
> +       movdqa X8, (STACK_TMP)(%rsp);
> +       movdqa X9, (STACK_TMP1)(%rsp);
> +       QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,,)
> +       QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,,)
> +       movdqa (STACK_TMP)(%rsp), X8;
> +       movdqa (STACK_TMP1)(%rsp), X9;
> +       movdqa X11, (STACK_TMP)(%rsp);
> +       movdqa X15, (STACK_TMP1)(%rsp);
> +       QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,,)
> +       sub $2, ROUND;
> +       jnz .Lround2_4;
> +
> +       /* tmp := X15 */
> +       movdqa (STACK_TMP)(%rsp), X11;
> +       pbroadcastd((0 * 4)(INPUT), X15);
> +       PLUS(X0, X15);
> +       pbroadcastd((1 * 4)(INPUT), X15);
> +       PLUS(X1, X15);
> +       pbroadcastd((2 * 4)(INPUT), X15);
> +       PLUS(X2, X15);
> +       pbroadcastd((3 * 4)(INPUT), X15);
> +       PLUS(X3, X15);
> +       pbroadcastd((4 * 4)(INPUT), X15);
> +       PLUS(X4, X15);
> +       pbroadcastd((5 * 4)(INPUT), X15);
> +       PLUS(X5, X15);
> +       pbroadcastd((6 * 4)(INPUT), X15);
> +       PLUS(X6, X15);
> +       pbroadcastd((7 * 4)(INPUT), X15);
> +       PLUS(X7, X15);
> +       pbroadcastd((8 * 4)(INPUT), X15);
> +       PLUS(X8, X15);
> +       pbroadcastd((9 * 4)(INPUT), X15);
> +       PLUS(X9, X15);
> +       pbroadcastd((10 * 4)(INPUT), X15);
> +       PLUS(X10, X15);
> +       pbroadcastd((11 * 4)(INPUT), X15);
> +       PLUS(X11, X15);
> +       movdqa (STACK_VEC_X12)(%rsp), X15;
> +       PLUS(X12, X15);
> +       movdqa (STACK_VEC_X13)(%rsp), X15;
> +       PLUS(X13, X15);
> +       movdqa X13, (STACK_TMP)(%rsp);
> +       pbroadcastd((14 * 4)(INPUT), X15);
> +       PLUS(X14, X15);
> +       movdqa (STACK_TMP1)(%rsp), X15;
> +       movdqa X14, (STACK_TMP1)(%rsp);
> +       pbroadcastd((15 * 4)(INPUT), X13);
> +       PLUS(X15, X13);
> +       movdqa X15, (STACK_TMP2)(%rsp);
> +
> +       /* Update counter */
> +       addq $4, (12 * 4)(INPUT);
> +
> +       transpose_4x4(X0, X1, X2, X3, X13, X14, X15);
> +       xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0, X15);
> +       xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1, X15);
> +       xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2, X15);
> +       xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3, X15);
> +       transpose_4x4(X4, X5, X6, X7, X0, X1, X2);
> +       movdqa (STACK_TMP)(%rsp), X13;
> +       movdqa (STACK_TMP1)(%rsp), X14;
> +       movdqa (STACK_TMP2)(%rsp), X15;
> +       xor_src_dst(DST, SRC, (64 * 0 + 16 * 1), X4, X0);
> +       xor_src_dst(DST, SRC, (64 * 1 + 16 * 1), X5, X0);
> +       xor_src_dst(DST, SRC, (64 * 2 + 16 * 1), X6, X0);
> +       xor_src_dst(DST, SRC, (64 * 3 + 16 * 1), X7, X0);
> +       transpose_4x4(X8, X9, X10, X11, X0, X1, X2);
> +       xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8, X0);
> +       xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9, X0);
> +       xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10, X0);
> +       xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11, X0);
> +       transpose_4x4(X12, X13, X14, X15, X0, X1, X2);
> +       xor_src_dst(DST, SRC, (64 * 0 + 16 * 3), X12, X0);
> +       xor_src_dst(DST, SRC, (64 * 1 + 16 * 3), X13, X0);
> +       xor_src_dst(DST, SRC, (64 * 2 + 16 * 3), X14, X0);
> +       xor_src_dst(DST, SRC, (64 * 3 + 16 * 3), X15, X0);
> +
> +       sub $4, NBLKS;
> +       lea (4 * 64)(DST), DST;
> +       lea (4 * 64)(SRC), SRC;
> +       jnz L(loop4);
> +
> +       /* clear the used vector registers and stack */
> +       clear(X0);
> +       movdqa X0, (STACK_VEC_X12)(%rsp);
> +       movdqa X0, (STACK_VEC_X13)(%rsp);
> +       movdqa X0, (STACK_TMP)(%rsp);
> +       movdqa X0, (STACK_TMP1)(%rsp);
> +       movdqa X0, (STACK_TMP2)(%rsp);
> +       clear(X1);
> +       clear(X2);
> +       clear(X3);
> +       clear(X4);
> +       clear(X5);
> +       clear(X6);
> +       clear(X7);
> +       clear(X8);
> +       clear(X9);
> +       clear(X10);
> +       clear(X11);
> +       clear(X12);
> +       clear(X13);
> +       clear(X14);
> +       clear(X15);
> +
> +       /* eax zeroed by round loop. */
> +       leave;
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_def_cfa_register(%rsp);
> +       ret;
> +       int3;
why int3?
> +END (__chacha20_ssse3_blocks8)
> diff --git a/sysdeps/x86_64/chacha20_arch.h b/sysdeps/x86_64/chacha20_arch.h
> new file mode 100644
> index 0000000000..37a4fdfb1f
> --- /dev/null
> +++ b/sysdeps/x86_64/chacha20_arch.h
> @@ -0,0 +1,42 @@
> +/* Chacha20 implementation, used on arc4random.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <ldsodefs.h>
> +#include <cpu-features.h>
> +#include <sys/param.h>
> +
> +unsigned int __chacha20_ssse3_blocks8 (uint32_t *state, uint8_t *dst,
> +                                      const uint8_t *src, size_t nblks);
> +
> +static inline void
> +chacha20_crypt (struct chacha20_state *state, uint8_t *dst, const uint8_t *src,
> +               size_t bytes)
> +{
> +  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) && bytes >= CHACHA20_BLOCK_SIZE * 4)

Can we make this an ifunc?
> +    {
> +      size_t nblocks = bytes / CHACHA20_BLOCK_SIZE;
> +      nblocks -= nblocks % 4;
> +      __chacha20_ssse3_blocks8 (state->ctx, dst, src, nblocks);
> +      bytes -= nblocks * CHACHA20_BLOCK_SIZE;
> +      dst += nblocks * CHACHA20_BLOCK_SIZE;
> +      src += nblocks * CHACHA20_BLOCK_SIZE;
> +    }
> +
> +  if (bytes > 0)
> +    chacha20_crypt_generic (state, dst, src, bytes);
> +}
> --
> 2.32.0
>
  
Adhemerval Zanella April 14, 2022, 5:03 p.m. UTC | #2
On 13/04/2022 20:12, Noah Goldstein wrote:
> On Wed, Apr 13, 2022 at 1:27 PM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>> +
>> +       /* eax zeroed by round loop. */
>> +       leave;
>> +       cfi_adjust_cfa_offset(-8)
>> +       cfi_def_cfa_register(%rsp);
>> +       ret;
>> +       int3;
> why int3?

It was originally added to libgcrypt by 11ade08efbfbc36dbf3571f1026946269950bc40
as straight-line speculation hardening.  It is what is emitted by clang 14 and
gcc 12 with -mharden-sls=return.

I am not sure if we need that kind of hardening, but I would prefer the first
version to be in sync with libgcrypt as much as possible, so that future
optimizations would be simpler to keep localized to glibc (if libgcrypt does
not want to backport them).
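
(For context, -mharden-sls=return simply places a trapping instruction
in the straight-line shadow of every return, so a CPU that mispredicts
and speculates past the ret hits int3 instead of executing whatever
bytes follow:)

	ret
	int3	/* straight-line speculation barrier */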

>> +END (__chacha20_ssse3_blocks8)
>> diff --git a/sysdeps/x86_64/chacha20_arch.h b/sysdeps/x86_64/chacha20_arch.h
>> new file mode 100644
>> index 0000000000..37a4fdfb1f
>> --- /dev/null
>> +++ b/sysdeps/x86_64/chacha20_arch.h
>> @@ -0,0 +1,42 @@
>> +/* Chacha20 implementation, used on arc4random.
>> +   Copyright (C) 2022 Free Software Foundation, Inc.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <http://www.gnu.org/licenses/>.  */
>> +
>> +#include <ldsodefs.h>
>> +#include <cpu-features.h>
>> +#include <sys/param.h>
>> +
>> +unsigned int __chacha20_ssse3_blocks8 (uint32_t *state, uint8_t *dst,
>> +                                      const uint8_t *src, size_t nblks);
>> +
>> +static inline void
>> +chacha20_crypt (struct chacha20_state *state, uint8_t *dst, const uint8_t *src,
>> +               size_t bytes)
>> +{
>> +  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) && bytes >= CHACHA20_BLOCK_SIZE * 4)
> 
> Can we make this an ifunc?

I thought about it, but if you check the arc4random implementation,
chacha20_crypt is called for the whole internal buffer once it is exhausted.
Assuming 1 cycle per byte (as indicated by libgcrypt's bench-slope on
my machine), it will take at least 1k cycles to encrypt each block.  I
am not sure if setting up an internal PLT call to save a couple of cycles
on an internal function will really show anything significant here (assuming
the PLT call won't in fact add more overhead).

Besides that, the code boilerplate to set up the internal ifunc is also
way more complex.
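
(To illustrate the boilerplate being weighed here, a minimal sketch of
what an x86_64 ifunc selector could look like; the resolver shape and
the __chacha20_crypt_ssse3 name are hypothetical, not part of this
patch:)

#include <init-arch.h>

/* Hypothetical SSSE3 variant of the generic routine.  */
extern __typeof (chacha20_crypt_generic) __chacha20_crypt_ssse3
  attribute_hidden;

static inline void *
chacha20_crypt_resolver (void)
{
  const struct cpu_features *cpu_features = __get_cpu_features ();

  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
    return __chacha20_crypt_ssse3;
  return chacha20_crypt_generic;
}

plus the matching Makefile and ifunc-impl-list.c updates.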
  
Noah Goldstein April 14, 2022, 5:10 p.m. UTC | #3
On Thu, Apr 14, 2022 at 12:03 PM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 13/04/2022 20:12, Noah Goldstein wrote:
> > On Wed, Apr 13, 2022 at 1:27 PM Adhemerval Zanella via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> >>
> >> +
> >> +       /* eax zeroed by round loop. */
> >> +       leave;
> >> +       cfi_adjust_cfa_offset(-8)
> >> +       cfi_def_cfa_register(%rsp);
> >> +       ret;
> >> +       int3;
> > why int3?
>
> It was originally added to libgcrypt by 11ade08efbfbc36dbf3571f1026946269950bc40
> as straight-line speculation hardening.  It is what is emitted by clang 14 and
> gcc 12 with -mharden-sls=return.
>
> I am not sure if we need that kind of hardening, but I would prefer the first
> version to be in sync with libgcrypt as much as possible, so that future
> optimizations would be simpler to keep localized to glibc (if libgcrypt does
> not want to backport them).

Okay, can keep for now. Any thoughts on changing it to sse2?


>
> >> +END (__chacha20_ssse3_blocks8)
> >> diff --git a/sysdeps/x86_64/chacha20_arch.h b/sysdeps/x86_64/chacha20_arch.h
> >> new file mode 100644
> >> index 0000000000..37a4fdfb1f
> >> --- /dev/null
> >> +++ b/sysdeps/x86_64/chacha20_arch.h
> >> @@ -0,0 +1,42 @@
> >> +/* Chacha20 implementation, used on arc4random.
> >> +   Copyright (C) 2022 Free Software Foundation, Inc.
> >> +   This file is part of the GNU C Library.
> >> +
> >> +   The GNU C Library is free software; you can redistribute it and/or
> >> +   modify it under the terms of the GNU Lesser General Public
> >> +   License as published by the Free Software Foundation; either
> >> +   version 2.1 of the License, or (at your option) any later version.
> >> +
> >> +   The GNU C Library is distributed in the hope that it will be useful,
> >> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> >> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> >> +   Lesser General Public License for more details.
> >> +
> >> +   You should have received a copy of the GNU Lesser General Public
> >> +   License along with the GNU C Library; if not, see
> >> +   <http://www.gnu.org/licenses/>.  */
> >> +
> >> +#include <ldsodefs.h>
> >> +#include <cpu-features.h>
> >> +#include <sys/param.h>
> >> +
> >> +unsigned int __chacha20_ssse3_blocks8 (uint32_t *state, uint8_t *dst,
> >> +                                      const uint8_t *src, size_t nblks);
> >> +
> >> +static inline void
> >> +chacha20_crypt (struct chacha20_state *state, uint8_t *dst, const uint8_t *src,
> >> +               size_t bytes)
> >> +{
> >> +  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) && bytes >= CHACHA20_BLOCK_SIZE * 4)
> >
> > Can we make this an ifunc?
>
> I thought about it, but if you check the arc4random implementation,
> chacha20_crypt is called for the whole internal buffer once it is exhausted.
> Assuming 1 cycle per byte (as indicated by libgcrypt's bench-slope on
> my machine), it will take at least 1k cycles to encrypt each block.  I
> am not sure if setting up an internal PLT call to save a couple of cycles
> on an internal function will really show anything significant here (assuming
> the PLT call won't in fact add more overhead).
>
> Besides that, the code boilerplate to set up the internal ifunc is also
> way more complex.

Okay for now, as long as we stay open to changing it later (not that we
necessarily will, but this isn't locking us into the decision).
  
Noah Goldstein April 14, 2022, 5:17 p.m. UTC | #4
On Wed, Apr 13, 2022 at 3:27 PM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> [...]
> +       /* clear the used vector registers and stack */
> +       clear(X0);
> +       movdqa X0, (STACK_VEC_X12)(%rsp);
> +       movdqa X0, (STACK_VEC_X13)(%rsp);
> +       movdqa X0, (STACK_TMP)(%rsp);
> +       movdqa X0, (STACK_TMP1)(%rsp);
> +       movdqa X0, (STACK_TMP2)(%rsp);
> +       clear(X1);
> +       clear(X2);
> +       clear(X3);
> +       clear(X4);
> +       clear(X5);
> +       clear(X6);
> +       clear(X7);
> +       clear(X8);
> +       clear(X9);
> +       clear(X10);
> +       clear(X11);
> +       clear(X12);
> +       clear(X13);
> +       clear(X14);
> +       clear(X15);

No need to change now, but out of curiosity (and possible future optimization),
do we need the clears for our purposes?
  
Adhemerval Zanella April 14, 2022, 5:18 p.m. UTC | #5
On 14/04/2022 14:10, Noah Goldstein wrote:
> On Thu, Apr 14, 2022 at 12:03 PM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>>
>>
>> On 13/04/2022 20:12, Noah Goldstein wrote:
>>> On Wed, Apr 13, 2022 at 1:27 PM Adhemerval Zanella via Libc-alpha
>>> <libc-alpha@sourceware.org> wrote:
>>>>
>>>> +
>>>> +       /* eax zeroed by round loop. */
>>>> +       leave;
>>>> +       cfi_adjust_cfa_offset(-8)
>>>> +       cfi_def_cfa_register(%rsp);
>>>> +       ret;
>>>> +       int3;
>>> why int3?
>>
>> It was originally added to libgcrypt by 11ade08efbfbc36dbf3571f1026946269950bc40
>> as straight-line speculation hardening.  It is what is emitted by clang 14 and
>> gcc 12 with -mharden-sls=return.
>>
>> I am not sure if we need that kind of hardening, but I would prefer the first
>> version to be in sync with libgcrypt as much as possible, so that future
>> optimizations would be simpler to keep localized to glibc (if libgcrypt does
>> not want to backport them).
> 
> Okay, can keep for now. Any thoughts on changing it to sse2?
> 

No strong feeling, I used the ssse3 one because it is readily available from
libgcrypt.

> 
>>
>>>> +END (__chacha20_ssse3_blocks8)
>>>> diff --git a/sysdeps/x86_64/chacha20_arch.h b/sysdeps/x86_64/chacha20_arch.h
>>>> new file mode 100644
>>>> index 0000000000..37a4fdfb1f
>>>> --- /dev/null
>>>> +++ b/sysdeps/x86_64/chacha20_arch.h
>>>> @@ -0,0 +1,42 @@
>>>> +/* Chacha20 implementation, used on arc4random.
>>>> +   Copyright (C) 2022 Free Software Foundation, Inc.
>>>> +   This file is part of the GNU C Library.
>>>> +
>>>> +   The GNU C Library is free software; you can redistribute it and/or
>>>> +   modify it under the terms of the GNU Lesser General Public
>>>> +   License as published by the Free Software Foundation; either
>>>> +   version 2.1 of the License, or (at your option) any later version.
>>>> +
>>>> +   The GNU C Library is distributed in the hope that it will be useful,
>>>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>>> +   Lesser General Public License for more details.
>>>> +
>>>> +   You should have received a copy of the GNU Lesser General Public
>>>> +   License along with the GNU C Library; if not, see
>>>> +   <http://www.gnu.org/licenses/>.  */
>>>> +
>>>> +#include <ldsodefs.h>
>>>> +#include <cpu-features.h>
>>>> +#include <sys/param.h>
>>>> +
>>>> +unsigned int __chacha20_ssse3_blocks8 (uint32_t *state, uint8_t *dst,
>>>> +                                      const uint8_t *src, size_t nblks);
>>>> +
>>>> +static inline void
>>>> +chacha20_crypt (struct chacha20_state *state, uint8_t *dst, const uint8_t *src,
>>>> +               size_t bytes)
>>>> +{
>>>> +  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) && bytes >= CHACHA20_BLOCK_SIZE * 4)
>>>
>>> Can we make this an ifunc?
>>
>> I thought about it, but if you check the arc4random implementation,
>> chacha20_crypt is called for the whole internal buffer once it is exhausted.
>> Assuming 1 cycle per byte (as indicated by libgcrypt's bench-slope on
>> my machine), it will take at least 1k cycles to encrypt each block.  I
>> am not sure if setting up an internal PLT call to save a couple of cycles
>> on an internal function will really show anything significant here (assuming
>> the PLT call won't in fact add more overhead).
>>
>> Besides that, the code boilerplate to set up the internal ifunc is also
>> way more complex.
> 
> Okay for now, as long as we stay open to changing it later (not that we
> necessarily will, but this isn't locking us into the decision).

For sure, if an ifunc does help in this case the change should be simple on
the generic code side.  The boilerplate is in fact for the x86_64 bits (to
set up the ifunc resolver, Makefile, etc.).
  
Noah Goldstein April 14, 2022, 5:22 p.m. UTC | #6
On Thu, Apr 14, 2022 at 12:19 PM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 14/04/2022 14:10, Noah Goldstein wrote:
> > On Thu, Apr 14, 2022 at 12:03 PM Adhemerval Zanella
> > <adhemerval.zanella@linaro.org> wrote:
> >>
> >>
> >>
> >> On 13/04/2022 20:12, Noah Goldstein wrote:
> >>> On Wed, Apr 13, 2022 at 1:27 PM Adhemerval Zanella via Libc-alpha
> >>> <libc-alpha@sourceware.org> wrote:
> >>>>
> >>>> +
> >>>> +       /* eax zeroed by round loop. */
> >>>> +       leave;
> >>>> +       cfi_adjust_cfa_offset(-8)
> >>>> +       cfi_def_cfa_register(%rsp);
> >>>> +       ret;
> >>>> +       int3;
> >>> why int3?
> >>
> >> It was originally added to libgcrypt by 11ade08efbfbc36dbf3571f1026946269950bc40
> >> as straight-line speculation hardening.  It is what is emitted by clang 14 and
> >> gcc 12 with -mharden-sls=return.
> >>
> >> I am not sure if we need that kind of hardening, but I would prefer the first
> >> version to be in sync with libgcrypt as much as possible, so that future
> >> optimizations would be simpler to keep localized to glibc (if libgcrypt does
> >> not want to backport them).
> >
> > Okay, can keep for now. Any thoughts on changing it to sse2?
> >
>
> No strong feeling, I used the ssse3 one because it is readily available from
> libgcrypt.

I think the only SSSE3 instruction is `pshufb`, so you can just replace the
shuffle-based rotates with the shift rotates and that will make it SSE2
(unless I'm missing an instruction).

Also, can you add the proper .text section here as well (or .sse2 or .ssse3)?
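
(i.e., a sketch, assuming the plain gas spelling of the directive:)

	.section .text.ssse3, "ax", @progbits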

>
> >
> >>
> >>>> +END (__chacha20_ssse3_blocks8)
> >>>> diff --git a/sysdeps/x86_64/chacha20_arch.h b/sysdeps/x86_64/chacha20_arch.h
> >>>> new file mode 100644
> >>>> index 0000000000..37a4fdfb1f
> >>>> --- /dev/null
> >>>> +++ b/sysdeps/x86_64/chacha20_arch.h
> >>>> @@ -0,0 +1,42 @@
> >>>> +/* Chacha20 implementation, used on arc4random.
> >>>> +   Copyright (C) 2022 Free Software Foundation, Inc.
> >>>> +   This file is part of the GNU C Library.
> >>>> +
> >>>> +   The GNU C Library is free software; you can redistribute it and/or
> >>>> +   modify it under the terms of the GNU Lesser General Public
> >>>> +   License as published by the Free Software Foundation; either
> >>>> +   version 2.1 of the License, or (at your option) any later version.
> >>>> +
> >>>> +   The GNU C Library is distributed in the hope that it will be useful,
> >>>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> >>>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> >>>> +   Lesser General Public License for more details.
> >>>> +
> >>>> +   You should have received a copy of the GNU Lesser General Public
> >>>> +   License along with the GNU C Library; if not, see
> >>>> +   <http://www.gnu.org/licenses/>.  */
> >>>> +
> >>>> +#include <ldsodefs.h>
> >>>> +#include <cpu-features.h>
> >>>> +#include <sys/param.h>
> >>>> +
> >>>> +unsigned int __chacha20_ssse3_blocks8 (uint32_t *state, uint8_t *dst,
> >>>> +                                      const uint8_t *src, size_t nblks);
> >>>> +
> >>>> +static inline void
> >>>> +chacha20_crypt (struct chacha20_state *state, uint8_t *dst, const uint8_t *src,
> >>>> +               size_t bytes)
> >>>> +{
> >>>> +  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) && bytes >= CHACHA20_BLOCK_SIZE * 4)
> >>>
> >>> Can we make this an ifunc?
> >>
> >> I thought about it, but if you check the arc4random implementation,
> >> chacha20_crypt is called for the whole internal buffer once it is exhausted.
> >> Assuming 1 cycle per byte (as indicated by libgcrypt's bench-slope on
> >> my machine), it will take at least 1k cycles to encrypt each block.  I
> >> am not sure if setting up an internal PLT call to save a couple of cycles
> >> on an internal function will really show anything significant here (assuming
> >> the PLT call won't in fact add more overhead).
> >>
> >> Besides that, the code boilerplate to set up the internal ifunc is also
> >> way more complex.
> >
> > Okay for now, as long as we stay open to changing it later (not that we
> > necessarily will, but this isn't locking us into the decision).
>
> For sure, if an ifunc does help in this case the change should be simple on
> the generic code side.  The boilerplate is in fact for the x86_64 bits (to
> set up the ifunc resolver, Makefile, etc.).
  
Adhemerval Zanella April 14, 2022, 6:11 p.m. UTC | #7
On 14/04/2022 14:17, Noah Goldstein wrote:
> On Wed, Apr 13, 2022 at 3:27 PM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>> +       clear(X1);
>> +       clear(X2);
>> +       clear(X3);
>> +       clear(X4);
>> +       clear(X5);
>> +       clear(X6);
>> +       clear(X7);
>> +       clear(X8);
>> +       clear(X9);
>> +       clear(X10);
>> +       clear(X11);
>> +       clear(X12);
>> +       clear(X13);
>> +       clear(X14);
>> +       clear(X15);
> 
> No need to change now, but out of curiosity (and possible future optimization),
> do we need the clears for our purposes?

That's a good question, and I am not sure.  Distros usually build glibc
with security options (such as stack protector and stack check) and keep
adding support for newer CPU security hardening (such as ARM PAC/BTI or
Intel CET).

We also use some more software-oriented hardening, such as explicit_memset
in some places.

I would expect that distros might use -mharden-sls, but I am not sure if
we should enforce it on all assembly implementations.
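
(As a sketch of that software-side idiom, using the public
explicit_bzero interface; the wipe_state helper is hypothetical:)

#include <string.h>

/* Hypothetical helper: wipe key material after use.  explicit_bzero
   keeps the compiler from eliding the store as dead.  */
static void
wipe_state (struct chacha20_state *state)
{
  explicit_bzero (state, sizeof *state);
}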
  
Adhemerval Zanella April 14, 2022, 6:25 p.m. UTC | #8
On 14/04/2022 14:22, Noah Goldstein wrote:
> On Thu, Apr 14, 2022 at 12:19 PM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>>
>>
>> On 14/04/2022 14:10, Noah Goldstein wrote:
>>> On Thu, Apr 14, 2022 at 12:03 PM Adhemerval Zanella
>>> <adhemerval.zanella@linaro.org> wrote:
>>>>
>>>>
>>>>
>>>> On 13/04/2022 20:12, Noah Goldstein wrote:
>>>>> On Wed, Apr 13, 2022 at 1:27 PM Adhemerval Zanella via Libc-alpha
>>>>> <libc-alpha@sourceware.org> wrote:
>>>>>>
>>>>>> +
>>>>>> +       /* eax zeroed by round loop. */
>>>>>> +       leave;
>>>>>> +       cfi_adjust_cfa_offset(-8)
>>>>>> +       cfi_def_cfa_register(%rsp);
>>>>>> +       ret;
>>>>>> +       int3;
>>>>> why int3?
>>>>
>>>> It was originally added to libgcrypt by commit
>>>> 11ade08efbfbc36dbf3571f1026946269950bc40 as straight-line speculation
>>>> hardening.  It is what is emitted by clang 14 and gcc 12 with
>>>> -mharden-sls=return.
>>>>
>>>> I am not sure if we need that kind of hardening, but I would prefer
>>>> the first version to be in sync with libgcrypt as much as possible,
>>>> so that future optimizations would be simpler to keep localized to
>>>> glibc (if libgcrypt does not want to backport them).
>>>
>>> Okay, we can keep it for now.  Any thoughts on changing it to SSE2?
>>>
>>
>> No strong feelings; I used the SSSE3 one because it is readily
>> available from libgcrypt.
> 
> I think the only SSSE3 instruction is `pshufb`, so you can just replace
> the optimized rotates with the shift-based rotates, and that will make
> it SSE2 (unless I'm missing an instruction).

Right, do you have a patch for it?  I can add it to the v2 I will send.

> 
> Also, can you add the proper .text section here as well (or .sse2 or .ssse3)?
Ack.
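
For reference, the shift-based rotate in question amounts to the
following; a sketch with SSE2 intrinsics (the function name is made up),
where the rotate count must be a compile-time constant:

  #include <emmintrin.h>  /* SSE2 */

  /* Rotate each 32-bit lane of V left by C with plain shifts; this is
     what ROTATE2 already does for counts 12 and 7, and it can replace
     the pshufb-based rotates for 16 and 8 as well.  */
  static inline __m128i
  rotl32_sse2 (__m128i v, const int c)
  {
    return _mm_or_si128 (_mm_slli_epi32 (v, c),
                         _mm_srli_epi32 (v, 32 - c));
  }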
  
Noah Goldstein April 14, 2022, 7:25 p.m. UTC | #9
On Wed, Apr 13, 2022 at 3:27 PM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> [...]
>
> diff --git a/sysdeps/x86_64/chacha20_arch.h b/sysdeps/x86_64/chacha20_arch.h
> [...]
> +static inline void
> +chacha20_crypt (struct chacha20_state *state, uint8_t *dst, const uint8_t *src,
> +               size_t bytes)
> +{
> +  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) && bytes >= CHACHA20_BLOCK_SIZE * 4)
> +    {
> +      size_t nblocks = bytes / CHACHA20_BLOCK_SIZE;
> +      nblocks -= nblocks % 4;

Are we locking ourselves into the API of __chacha20_* expecting
this precomputation?  I imagine we might want to move this to
assembly unless `nblocks` is a compile-time constant.

> +      __chacha20_ssse3_blocks8 (state->ctx, dst, src, nblocks);
> +      bytes -= nblocks * CHACHA20_BLOCK_SIZE;
> +      dst += nblocks * CHACHA20_BLOCK_SIZE;
> +      src += nblocks * CHACHA20_BLOCK_SIZE;
> +    }
> +
> +  if (bytes > 0)
> +    chacha20_crypt_generic (state, dst, src, bytes);
> +}
> --
> 2.32.0
>
  
Adhemerval Zanella April 14, 2022, 7:40 p.m. UTC | #10
On 14/04/2022 16:25, Noah Goldstein wrote:
> On Wed, Apr 13, 2022 at 3:27 PM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>> +
>> +static inline void
>> +chacha20_crypt (struct chacha20_state *state, uint8_t *dst, const uint8_t *src,
>> +               size_t bytes)
>> +{
>> +  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) && bytes >= CHACHA20_BLOCK_SIZE * 4)
>> +    {
>> +      size_t nblocks = bytes / CHACHA20_BLOCK_SIZE;
>> +      nblocks -= nblocks % 4;
> 
> Are we locking ourselves into the API of __chacha20_* expecting
> this precomputation?  I imagine we might want to move this to
> assembly unless `nblocks` is a compile-time constant.
> 
>> +      __chacha20_ssse3_blocks8 (state->ctx, dst, src, nblocks);
>> +      bytes -= nblocks * CHACHA20_BLOCK_SIZE;
>> +      dst += nblocks * CHACHA20_BLOCK_SIZE;
>> +      src += nblocks * CHACHA20_BLOCK_SIZE;
>> +    }
>> +
>> +  if (bytes > 0)
>> +    chacha20_crypt_generic (state, dst, src, bytes);
>> +}
>> --
>> 2.32.0
>>

I think it should be ok to _Static_assert that CHACHA20_BUFSIZE is a
multiple of the block count expected by the optimized version and just
call it without the need to compute nblocks at run time.

I will change this in v2.
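
A sketch of that simplification, assuming CHACHA20_BUFSIZE is the
arc4random internal buffer size and chacha20_crypt is only ever called
with full buffers:

  _Static_assert (CHACHA20_BUFSIZE % (CHACHA20_BLOCK_SIZE * 4) == 0,
                  "CHACHA20_BUFSIZE must be a multiple of four blocks");

  static inline void
  chacha20_crypt (struct chacha20_state *state, uint8_t *dst,
                  const uint8_t *src, size_t bytes)
  {
    /* bytes is always a whole number of 4-block groups here, so no
       remainder handling or nblocks adjustment is needed.  */
    if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
      __chacha20_ssse3_blocks8 (state->ctx, dst, src,
                                bytes / CHACHA20_BLOCK_SIZE);
    else
      chacha20_crypt_generic (state, dst, src, bytes);
  }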
  

Patch

diff --git a/LICENSES b/LICENSES
index 530893b1dc..2563abd9e2 100644
--- a/LICENSES
+++ b/LICENSES
@@ -389,3 +389,23 @@  Copyright 2001 by Stephen L. Moshier <moshier@na-net.ornl.gov>
  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, see
  <https://www.gnu.org/licenses/>.  */
+
+sysdeps/x86_64/chacha20-ssse3.S import code from libgcrypt, with the
+following notices:
+
+Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+
+This file is part of Libgcrypt.
+
+Libgcrypt is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as
+published by the Free Software Foundation; either version 2.1 of
+the License, or (at your option) any later version.
+
+Libgcrypt is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this program; if not, see <http://www.gnu.org/licenses/>.
diff --git a/sysdeps/generic/chacha20_arch.h b/sysdeps/generic/chacha20_arch.h
new file mode 100644
index 0000000000..d7200ac583
--- /dev/null
+++ b/sysdeps/generic/chacha20_arch.h
@@ -0,0 +1,24 @@ 
+/* Chacha20 implementation, generic interface.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+static inline void
+chacha20_crypt (struct chacha20_state *state, uint8_t *dst,
+		const uint8_t *src, size_t bytes)
+{
+  chacha20_crypt_generic (state, dst, src, bytes);
+}
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index 79365aff2a..f43b6a1180 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -5,6 +5,12 @@  ifeq ($(subdir),csu)
 gen-as-const-headers += link-defines.sym
 endif
 
+ifeq ($(subdir),stdlib)
+sysdep_routines += \
+  chacha20-ssse3 \
+  # sysdep_routines
+endif
+
 ifeq ($(subdir),gmon)
 sysdep_routines += _mcount
 # We cannot compile _mcount.S with -pg because that would create
diff --git a/sysdeps/x86_64/chacha20-ssse3.S b/sysdeps/x86_64/chacha20-ssse3.S
new file mode 100644
index 0000000000..f221daf634
--- /dev/null
+++ b/sysdeps/x86_64/chacha20-ssse3.S
@@ -0,0 +1,330 @@ 
+/* Optimized SSSE3 implementation of ChaCha20 cipher.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Based on D. J. Bernstein reference implementation at
+   http://cr.yp.to/chacha.html:
+
+   chacha-regs.c version 20080118
+   D. J. Bernstein
+   Public domain.  */
+
+#include <sysdep.h>
+
+#ifdef PIC
+#  define rRIP (%rip)
+#else
+#  define rRIP
+#endif
+
+/* register macros */
+#define INPUT %rdi
+#define DST   %rsi
+#define SRC   %rdx
+#define NBLKS %rcx
+#define ROUND %eax
+
+/* stack structure */
+#define STACK_VEC_X12 (16)
+#define STACK_VEC_X13 (16 + STACK_VEC_X12)
+#define STACK_TMP     (16 + STACK_VEC_X13)
+#define STACK_TMP1    (16 + STACK_TMP)
+#define STACK_TMP2    (16 + STACK_TMP1)
+
+#define STACK_MAX     (16 + STACK_TMP2)
+
+/* vector registers */
+#define X0 %xmm0
+#define X1 %xmm1
+#define X2 %xmm2
+#define X3 %xmm3
+#define X4 %xmm4
+#define X5 %xmm5
+#define X6 %xmm6
+#define X7 %xmm7
+#define X8 %xmm8
+#define X9 %xmm9
+#define X10 %xmm10
+#define X11 %xmm11
+#define X12 %xmm12
+#define X13 %xmm13
+#define X14 %xmm14
+#define X15 %xmm15
+
+/**********************************************************************
+  helper macros
+ **********************************************************************/
+
+/* 4x4 32-bit integer matrix transpose */
+#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
+	movdqa    x0, t2; \
+	punpckhdq x1, t2; \
+	punpckldq x1, x0; \
+	\
+	movdqa    x2, t1; \
+	punpckldq x3, t1; \
+	punpckhdq x3, x2; \
+	\
+	movdqa     x0, x1; \
+	punpckhqdq t1, x1; \
+	punpcklqdq t1, x0; \
+	\
+	movdqa     t2, x3; \
+	punpckhqdq x2, x3; \
+	punpcklqdq x2, t2; \
+	movdqa     t2, x2;
+
+/* fill xmm register with 32-bit value from memory */
+#define pbroadcastd(mem32, xreg) \
+	movd mem32, xreg; \
+	pshufd $0, xreg, xreg;
+
+/* xor with unaligned memory operand */
+#define pxor_u(umem128, xreg, t) \
+	movdqu umem128, t; \
+	pxor t, xreg;
+
+/* xor register with unaligned src and save to unaligned dst */
+#define xor_src_dst(dst, src, offset, xreg, t) \
+	pxor_u(offset(src), xreg, t); \
+	movdqu xreg, offset(dst);
+
+#define clear(x) pxor x,x;
+
+/**********************************************************************
+  4-way chacha20
+ **********************************************************************/
+
+#define ROTATE2(v1,v2,c,tmp1,tmp2)	\
+	movdqa v1, tmp1; 		\
+	movdqa v2, tmp2; 		\
+	psrld $(32 - (c)), v1;		\
+	pslld $(c), tmp1;		\
+	paddb tmp1, v1;			\
+	psrld $(32 - (c)), v2;		\
+	pslld $(c), tmp2;		\
+	paddb tmp2, v2;
+
+#define ROTATE_SHUF_2(v1,v2,shuf)	\
+	pshufb shuf, v1;		\
+	pshufb shuf, v2;
+
+#define XOR(ds,s) \
+	pxor s, ds;
+
+#define PLUS(ds,s) \
+	paddd s, ds;
+
+#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,tmp2,\
+		      interleave_op1,interleave_op2)		\
+	movdqa L(shuf_rol16) rRIP, tmp1;			\
+		interleave_op1;					\
+	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
+	    ROTATE_SHUF_2(d1, d2, tmp1);			\
+	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
+	    ROTATE2(b1, b2, 12, tmp1, tmp2);			\
+	movdqa L(shuf_rol8) rRIP, tmp1;				\
+		interleave_op2;					\
+	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
+	    ROTATE_SHUF_2(d1, d2, tmp1);			\
+	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
+	    ROTATE2(b1, b2,  7, tmp1, tmp2);
+
+	.text
+
+chacha20_data:
+	.align 16
+L(shuf_rol16):
+	.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+L(shuf_rol8):
+	.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+L(counter1):
+	.long 1,0,0,0
+L(inc_counter):
+	.long 0,1,2,3
+L(unsigned_cmp):
+	.long 0x80000000,0x80000000,0x80000000,0x80000000
+
+ENTRY (__chacha20_ssse3_blocks8)
+	/* input:
+	 *	%rdi: input
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: nblks (multiple of 4)
+	 */
+
+	pushq %rbp;
+	cfi_adjust_cfa_offset(8);
+	cfi_rel_offset(rbp, 0)
+	movq %rsp, %rbp;
+	cfi_def_cfa_register(%rbp);
+
+	subq $STACK_MAX, %rsp;
+	andq $~15, %rsp;
+
+L(loop4):
+	mov $20, ROUND;
+
+	/* Construct counter vectors X12 and X13 */
+	movdqa L(inc_counter) rRIP, X0;
+	movdqa L(unsigned_cmp) rRIP, X2;
+	pbroadcastd((12 * 4)(INPUT), X12);
+	pbroadcastd((13 * 4)(INPUT), X13);
+	paddd X0, X12;
+	movdqa X12, X1;
+	pxor X2, X0;
+	pxor X2, X1;
+	pcmpgtd X1, X0;
+	psubd X0, X13;
+	movdqa X12, (STACK_VEC_X12)(%rsp);
+	movdqa X13, (STACK_VEC_X13)(%rsp);
+
+	/* Load vectors */
+	pbroadcastd((0 * 4)(INPUT), X0);
+	pbroadcastd((1 * 4)(INPUT), X1);
+	pbroadcastd((2 * 4)(INPUT), X2);
+	pbroadcastd((3 * 4)(INPUT), X3);
+	pbroadcastd((4 * 4)(INPUT), X4);
+	pbroadcastd((5 * 4)(INPUT), X5);
+	pbroadcastd((6 * 4)(INPUT), X6);
+	pbroadcastd((7 * 4)(INPUT), X7);
+	pbroadcastd((8 * 4)(INPUT), X8);
+	pbroadcastd((9 * 4)(INPUT), X9);
+	pbroadcastd((10 * 4)(INPUT), X10);
+	pbroadcastd((11 * 4)(INPUT), X11);
+	pbroadcastd((14 * 4)(INPUT), X14);
+	pbroadcastd((15 * 4)(INPUT), X15);
+	movdqa X11, (STACK_TMP)(%rsp);
+	movdqa X15, (STACK_TMP1)(%rsp);
+
+L(round2_4):
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X11,X15,,)
+	movdqa (STACK_TMP)(%rsp), X11;
+	movdqa (STACK_TMP1)(%rsp), X15;
+	movdqa X8, (STACK_TMP)(%rsp);
+	movdqa X9, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,X9,,)
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,X9,,)
+	movdqa (STACK_TMP)(%rsp), X8;
+	movdqa (STACK_TMP1)(%rsp), X9;
+	movdqa X11, (STACK_TMP)(%rsp);
+	movdqa X15, (STACK_TMP1)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X11,X15,,)
+	sub $2, ROUND;
+	jnz L(round2_4);
+
+	/* tmp := X15 */
+	movdqa (STACK_TMP)(%rsp), X11;
+	pbroadcastd((0 * 4)(INPUT), X15);
+	PLUS(X0, X15);
+	pbroadcastd((1 * 4)(INPUT), X15);
+	PLUS(X1, X15);
+	pbroadcastd((2 * 4)(INPUT), X15);
+	PLUS(X2, X15);
+	pbroadcastd((3 * 4)(INPUT), X15);
+	PLUS(X3, X15);
+	pbroadcastd((4 * 4)(INPUT), X15);
+	PLUS(X4, X15);
+	pbroadcastd((5 * 4)(INPUT), X15);
+	PLUS(X5, X15);
+	pbroadcastd((6 * 4)(INPUT), X15);
+	PLUS(X6, X15);
+	pbroadcastd((7 * 4)(INPUT), X15);
+	PLUS(X7, X15);
+	pbroadcastd((8 * 4)(INPUT), X15);
+	PLUS(X8, X15);
+	pbroadcastd((9 * 4)(INPUT), X15);
+	PLUS(X9, X15);
+	pbroadcastd((10 * 4)(INPUT), X15);
+	PLUS(X10, X15);
+	pbroadcastd((11 * 4)(INPUT), X15);
+	PLUS(X11, X15);
+	movdqa (STACK_VEC_X12)(%rsp), X15;
+	PLUS(X12, X15);
+	movdqa (STACK_VEC_X13)(%rsp), X15;
+	PLUS(X13, X15);
+	movdqa X13, (STACK_TMP)(%rsp);
+	pbroadcastd((14 * 4)(INPUT), X15);
+	PLUS(X14, X15);
+	movdqa (STACK_TMP1)(%rsp), X15;
+	movdqa X14, (STACK_TMP1)(%rsp);
+	pbroadcastd((15 * 4)(INPUT), X13);
+	PLUS(X15, X13);
+	movdqa X15, (STACK_TMP2)(%rsp);
+
+	/* Update counter */
+	addq $4, (12 * 4)(INPUT);
+
+	transpose_4x4(X0, X1, X2, X3, X13, X14, X15);
+	xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0, X15);
+	xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1, X15);
+	xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2, X15);
+	xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3, X15);
+	transpose_4x4(X4, X5, X6, X7, X0, X1, X2);
+	movdqa (STACK_TMP)(%rsp), X13;
+	movdqa (STACK_TMP1)(%rsp), X14;
+	movdqa (STACK_TMP2)(%rsp), X15;
+	xor_src_dst(DST, SRC, (64 * 0 + 16 * 1), X4, X0);
+	xor_src_dst(DST, SRC, (64 * 1 + 16 * 1), X5, X0);
+	xor_src_dst(DST, SRC, (64 * 2 + 16 * 1), X6, X0);
+	xor_src_dst(DST, SRC, (64 * 3 + 16 * 1), X7, X0);
+	transpose_4x4(X8, X9, X10, X11, X0, X1, X2);
+	xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8, X0);
+	xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9, X0);
+	xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10, X0);
+	xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11, X0);
+	transpose_4x4(X12, X13, X14, X15, X0, X1, X2);
+	xor_src_dst(DST, SRC, (64 * 0 + 16 * 3), X12, X0);
+	xor_src_dst(DST, SRC, (64 * 1 + 16 * 3), X13, X0);
+	xor_src_dst(DST, SRC, (64 * 2 + 16 * 3), X14, X0);
+	xor_src_dst(DST, SRC, (64 * 3 + 16 * 3), X15, X0);
+
+	sub $4, NBLKS;
+	lea (4 * 64)(DST), DST;
+	lea (4 * 64)(SRC), SRC;
+	jnz L(loop4);
+
+	/* clear the used vector registers and stack */
+	clear(X0);
+	movdqa X0, (STACK_VEC_X12)(%rsp);
+	movdqa X0, (STACK_VEC_X13)(%rsp);
+	movdqa X0, (STACK_TMP)(%rsp);
+	movdqa X0, (STACK_TMP1)(%rsp);
+	movdqa X0, (STACK_TMP2)(%rsp);
+	clear(X1);
+	clear(X2);
+	clear(X3);
+	clear(X4);
+	clear(X5);
+	clear(X6);
+	clear(X7);
+	clear(X8);
+	clear(X9);
+	clear(X10);
+	clear(X11);
+	clear(X12);
+	clear(X13);
+	clear(X14);
+	clear(X15);
+
+	/* eax zeroed by round loop. */
+	leave;
+	cfi_adjust_cfa_offset(-8)
+	cfi_def_cfa_register(%rsp);
+	ret;
+	int3;
+END (__chacha20_ssse3_blocks8)
diff --git a/sysdeps/x86_64/chacha20_arch.h b/sysdeps/x86_64/chacha20_arch.h
new file mode 100644
index 0000000000..37a4fdfb1f
--- /dev/null
+++ b/sysdeps/x86_64/chacha20_arch.h
@@ -0,0 +1,42 @@ 
+/* Chacha20 implementation, used on arc4random.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <ldsodefs.h>
+#include <cpu-features.h>
+#include <sys/param.h>
+
+unsigned int __chacha20_ssse3_blocks8 (uint32_t *state, uint8_t *dst,
+				       const uint8_t *src, size_t nblks);
+
+static inline void
+chacha20_crypt (struct chacha20_state *state, uint8_t *dst, const uint8_t *src,
+		size_t bytes)
+{
+  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) && bytes >= CHACHA20_BLOCK_SIZE * 4)
+    {
+      size_t nblocks = bytes / CHACHA20_BLOCK_SIZE;
+      nblocks -= nblocks % 4;
+      __chacha20_ssse3_blocks8 (state->ctx, dst, src, nblocks);
+      bytes -= nblocks * CHACHA20_BLOCK_SIZE;
+      dst += nblocks * CHACHA20_BLOCK_SIZE;
+      src += nblocks * CHACHA20_BLOCK_SIZE;
+    }
+
+  if (bytes > 0)
+    chacha20_crypt_generic (state, dst, src, bytes);
+}