[5/7] x86: Add AVX2 optimized chacha20

Message ID 20220413202401.408267-6-adhemerval.zanella@linaro.org
State Superseded
Series Add arc4random support

Checks

Context                 Check     Description
dj/TryBot-apply_patch   success   Patch applied to master at the time it was sent

Commit Message

Adhemerval Zanella April 13, 2022, 8:23 p.m. UTC
  It adds a vectorized ChaCha20 implementation based on libgcrypt
cipher/chacha20-amd64-avx2.S.  It is used only if AVX2 is supported
and enabled by the architecture.

On a Ryzen 9 5900X it shows the following improvements (using
formatted bench-arc4random data):

SSSE3:
Function                                 MB/s
--------------------------------------------------
arc4random [single-thread]               576.55
arc4random_buf(0) [single-thread]        961.77
arc4random_buf(16) [single-thread]       1309.38
arc4random_buf(32) [single-thread]       1558.69
arc4random_buf(64) [single-thread]       1728.54
--------------------------------------------------
arc4random [multi-thread]                589.52
arc4random_buf(0) [multi-thread]         967.39
arc4random_buf(16) [multi-thread]        1319.27
arc4random_buf(32) [multi-thread]        1552.96
arc4random_buf(64) [multi-thread]        1734.27
--------------------------------------------------

AVX2:
Function                                 MB/s
--------------------------------------------------
arc4random [single-thread]               672.49
arc4random_buf(0) [single-thread]        1234.85
arc4random_buf(16) [single-thread]       1892.67
arc4random_buf(32) [single-thread]       2491.10
arc4random_buf(64) [single-thread]       2696.27
--------------------------------------------------
arc4random [multi-thread]                661.25
arc4random_buf(0) [multi-thread]         1214.65
arc4random_buf(16) [multi-thread]        1867.98
arc4random_buf(32) [multi-thread]        2474.70
arc4random_buf(64) [multi-thread]        2893.21
--------------------------------------------------

Checked on x86_64-linux-gnu.
---
 LICENSES                       |   4 +-
 stdlib/chacha20.c              |   7 +-
 sysdeps/x86_64/Makefile        |   1 +
 sysdeps/x86_64/chacha20-avx2.S | 317 +++++++++++++++++++++++++++++++++
 sysdeps/x86_64/chacha20_arch.h |  14 ++
 5 files changed, 339 insertions(+), 4 deletions(-)
 create mode 100644 sysdeps/x86_64/chacha20-avx2.S
  

Comments

Noah Goldstein April 13, 2022, 11:04 p.m. UTC | #1
On Wed, Apr 13, 2022 at 1:27 PM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> It adds vectorized ChaCha20 implementation based on libgcrypt
> cipher/chacha20-amd64-avx2.S.  It is used only if AVX2 is supported
> and enabled by the architecture.
>
> On a Ryzen 9 5900X it shows the following improvements (using
> formatted bench-arc4random data):
>
> SSSE3:
> Function                                 MB/s
> --------------------------------------------------
> arc4random [single-thread]               576.55
> arc4random_buf(0) [single-thread]        961.77
> arc4random_buf(16) [single-thread]       1309.38
> arc4random_buf(32) [single-thread]       1558.69
> arc4random_buf(64) [single-thread]       1728.54
> --------------------------------------------------
> arc4random [multi-thread]                589.52
> arc4random_buf(0) [multi-thread]         967.39
> arc4random_buf(16) [multi-thread]        1319.27
> arc4random_buf(32) [multi-thread]        1552.96
> arc4random_buf(64) [multi-thread]        1734.27
> --------------------------------------------------
>
> AVX2:
> Function                                 MB/s
> --------------------------------------------------
> arc4random [single-thread]               672.49
> arc4random_buf(0) [single-thread]        1234.85
> arc4random_buf(16) [single-thread]       1892.67
> arc4random_buf(32) [single-thread]       2491.10
> arc4random_buf(64) [single-thread]       2696.27
> --------------------------------------------------
> arc4random [multi-thread]                661.25
> arc4random_buf(0) [multi-thread]         1214.65
> arc4random_buf(16) [multi-thread]        1867.98
> arc4random_buf(32) [multi-thread]        2474.70
> arc4random_buf(64) [multi-thread]        2893.21
> --------------------------------------------------
>
> Checked on x86_64-linux-gnu.
> ---
>  LICENSES                       |   4 +-
>  stdlib/chacha20.c              |   7 +-
>  sysdeps/x86_64/Makefile        |   1 +
>  sysdeps/x86_64/chacha20-avx2.S | 317 +++++++++++++++++++++++++++++++++
>  sysdeps/x86_64/chacha20_arch.h |  14 ++
>  5 files changed, 339 insertions(+), 4 deletions(-)
>  create mode 100644 sysdeps/x86_64/chacha20-avx2.S
>
> diff --git a/LICENSES b/LICENSES
> index 2563abd9e2..8ef0f023d7 100644
> --- a/LICENSES
> +++ b/LICENSES
> @@ -390,8 +390,8 @@ Copyright 2001 by Stephen L. Moshier <moshier@na-net.ornl.gov>
>   License along with this library; if not, see
>   <https://www.gnu.org/licenses/>.  */
>
> -sysdeps/x86_64/chacha20-ssse3.S import code from libgcrypt, with the
> -following notices:
> +sysdeps/x86_64/chacha20-ssse3.S and sysdeps/x86_64/chacha20-avx2.S
> +import code from libgcrypt, with the following notices:
>
>  Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
>
> diff --git a/stdlib/chacha20.c b/stdlib/chacha20.c
> index dbd87bd942..8569e1e78d 100644
> --- a/stdlib/chacha20.c
> +++ b/stdlib/chacha20.c
> @@ -190,8 +190,8 @@ memxorcpy (uint8_t *dst, const uint8_t *src1, const uint8_t *src2, size_t len)
>  }
>
>  static void
> -chacha20_crypt (struct chacha20_state *state, uint8_t *dst,
> -               const uint8_t *src, size_t bytes)
> +chacha20_crypt_generic (struct chacha20_state *state, uint8_t *dst,
> +                       const uint8_t *src, size_t bytes)
>  {
>    uint8_t stream[CHACHA20_BLOCK_SIZE];
>
> @@ -209,3 +209,6 @@ chacha20_crypt (struct chacha20_state *state, uint8_t *dst,
>        memxorcpy (dst, src, stream, bytes);
>      }
>  }
> +
> +/* Get the arch-optimized implementation, if any.  */
> +#include <chacha20_arch.h>
> diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
> index f43b6a1180..afb4d173e8 100644
> --- a/sysdeps/x86_64/Makefile
> +++ b/sysdeps/x86_64/Makefile
> @@ -7,6 +7,7 @@ endif
>
>  ifeq ($(subdir),stdlib)
>  sysdep_routines += \
> +  chacha20-avx2 \
>    chacha20-ssse3 \
>    # sysdep_routines
>  endif
> diff --git a/sysdeps/x86_64/chacha20-avx2.S b/sysdeps/x86_64/chacha20-avx2.S
> new file mode 100644
> index 0000000000..96174c0e40
> --- /dev/null
> +++ b/sysdeps/x86_64/chacha20-avx2.S
> @@ -0,0 +1,317 @@
> +/* Optimized AVX2 implementation of ChaCha20 cipher.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +/* Based on D. J. Bernstein reference implementation at
> +   http://cr.yp.to/chacha.html:
> +
> +   chacha-regs.c version 20080118
> +   D. J. Bernstein
> +   Public domain.  */
> +
> +#ifdef PIC
> +#  define rRIP (%rip)
> +#else
> +#  define rRIP
> +#endif
> +
> +/* register macros */
> +#define INPUT %rdi
> +#define DST   %rsi
> +#define SRC   %rdx
> +#define NBLKS %rcx
> +#define ROUND %eax
> +
> +/* stack structure */
> +#define STACK_VEC_X12 (32)
> +#define STACK_VEC_X13 (32 + STACK_VEC_X12)
> +#define STACK_TMP     (32 + STACK_VEC_X13)
> +#define STACK_TMP1    (32 + STACK_TMP)
> +
> +#define STACK_MAX     (32 + STACK_TMP1)
> +
> +/* vector registers */
> +#define X0 %ymm0
> +#define X1 %ymm1
> +#define X2 %ymm2
> +#define X3 %ymm3
> +#define X4 %ymm4
> +#define X5 %ymm5
> +#define X6 %ymm6
> +#define X7 %ymm7
> +#define X8 %ymm8
> +#define X9 %ymm9
> +#define X10 %ymm10
> +#define X11 %ymm11
> +#define X12 %ymm12
> +#define X13 %ymm13
> +#define X14 %ymm14
> +#define X15 %ymm15
> +
> +#define X0h %xmm0
> +#define X1h %xmm1
> +#define X2h %xmm2
> +#define X3h %xmm3
> +#define X4h %xmm4
> +#define X5h %xmm5
> +#define X6h %xmm6
> +#define X7h %xmm7
> +#define X8h %xmm8
> +#define X9h %xmm9
> +#define X10h %xmm10
> +#define X11h %xmm11
> +#define X12h %xmm12
> +#define X13h %xmm13
> +#define X14h %xmm14
> +#define X15h %xmm15
> +
> +/**********************************************************************
> +  helper macros
> + **********************************************************************/
> +
> +/* 4x4 32-bit integer matrix transpose */
> +#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
> +       vpunpckhdq x1, x0, t2; \
> +       vpunpckldq x1, x0, x0; \
> +       \
> +       vpunpckldq x3, x2, t1; \
> +       vpunpckhdq x3, x2, x2; \
> +       \
> +       vpunpckhqdq t1, x0, x1; \
> +       vpunpcklqdq t1, x0, x0; \
> +       \
> +       vpunpckhqdq x2, t2, x3; \
> +       vpunpcklqdq x2, t2, x2;
> +
> +/* 2x2 128-bit matrix transpose */
> +#define transpose_16byte_2x2(x0,x1,t1) \
> +       vmovdqa    x0, t1; \
> +       vperm2i128 $0x20, x1, x0, x0; \
> +       vperm2i128 $0x31, x1, t1, x1;
> +
> +/* xor register with unaligned src and save to unaligned dst */
> +#define xor_src_dst(dst, src, offset, xreg) \
> +       vpxor offset(src), xreg, xreg; \
> +       vmovdqu xreg, offset(dst);
> +
> +/**********************************************************************
> +  8-way chacha20
> + **********************************************************************/
> +
> +#define ROTATE2(v1,v2,c,tmp)   \
> +       vpsrld $(32 - (c)), v1, tmp;    \
> +       vpslld $(c), v1, v1;            \
> +       vpaddb tmp, v1, v1;             \
> +       vpsrld $(32 - (c)), v2, tmp;    \
> +       vpslld $(c), v2, v2;            \
> +       vpaddb tmp, v2, v2;
> +
> +#define ROTATE_SHUF_2(v1,v2,shuf)      \
> +       vpshufb shuf, v1, v1;           \
> +       vpshufb shuf, v2, v2;
> +
> +#define XOR(ds,s) \
> +       vpxor s, ds, ds;
> +
> +#define PLUS(ds,s) \
> +       vpaddd s, ds, ds;
> +
> +#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,\
> +                     interleave_op1,interleave_op2,\
> +                     interleave_op3,interleave_op4)            \
> +       vbroadcasti128 .Lshuf_rol16 rRIP, tmp1;                 \
> +               interleave_op1;                                 \
> +       PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);       \
> +           ROTATE_SHUF_2(d1, d2, tmp1);                        \
> +               interleave_op2;                                 \
> +       PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);       \
> +           ROTATE2(b1, b2, 12, tmp1);                          \
> +       vbroadcasti128 .Lshuf_rol8 rRIP, tmp1;                  \
> +               interleave_op3;                                 \
> +       PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);       \
> +           ROTATE_SHUF_2(d1, d2, tmp1);                        \
> +               interleave_op4;                                 \
> +       PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);       \
> +           ROTATE2(b1, b2,  7, tmp1);
> +
> +       .text

section avx2

> +       .align 32
> +chacha20_data:
> +L(shuf_rol16):
> +       .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
> +L(shuf_rol8):
> +       .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
> +L(inc_counter):
> +       .byte 0,1,2,3,4,5,6,7
> +L(unsigned_cmp):
> +       .long 0x80000000
> +
> +ENTRY (__chacha20_avx2_blocks8)
> +       /* input:
> +        *      %rdi: input
> +        *      %rsi: dst
> +        *      %rdx: src
> +        *      %rcx: nblks (multiple of 8)
> +        */
> +       vzeroupper;

vzeroupper needs to be replaced with VZEROUPPER_RETURN
and we need a transaction safe version unless this can never
be called during a transaction.
> +
> +       pushq %rbp;
> +       cfi_adjust_cfa_offset(8);
> +       cfi_rel_offset(rbp, 0)
> +       movq %rsp, %rbp;
> +       cfi_def_cfa_register(rbp);
> +
> +       subq $STACK_MAX, %rsp;
> +       andq $~31, %rsp;
> +
> +L(loop8):
> +       mov $20, ROUND;
> +
> +       /* Construct counter vectors X12 and X13 */
> +       vpmovzxbd L(inc_counter) rRIP, X0;
> +       vpbroadcastd L(unsigned_cmp) rRIP, X2;
> +       vpbroadcastd (12 * 4)(INPUT), X12;
> +       vpbroadcastd (13 * 4)(INPUT), X13;
> +       vpaddd X0, X12, X12;
> +       vpxor X2, X0, X0;
> +       vpxor X2, X12, X1;
> +       vpcmpgtd X1, X0, X0;
> +       vpsubd X0, X13, X13;
> +       vmovdqa X12, (STACK_VEC_X12)(%rsp);
> +       vmovdqa X13, (STACK_VEC_X13)(%rsp);
> +
> +       /* Load vectors */
> +       vpbroadcastd (0 * 4)(INPUT), X0;
> +       vpbroadcastd (1 * 4)(INPUT), X1;
> +       vpbroadcastd (2 * 4)(INPUT), X2;
> +       vpbroadcastd (3 * 4)(INPUT), X3;
> +       vpbroadcastd (4 * 4)(INPUT), X4;
> +       vpbroadcastd (5 * 4)(INPUT), X5;
> +       vpbroadcastd (6 * 4)(INPUT), X6;
> +       vpbroadcastd (7 * 4)(INPUT), X7;
> +       vpbroadcastd (8 * 4)(INPUT), X8;
> +       vpbroadcastd (9 * 4)(INPUT), X9;
> +       vpbroadcastd (10 * 4)(INPUT), X10;
> +       vpbroadcastd (11 * 4)(INPUT), X11;
> +       vpbroadcastd (14 * 4)(INPUT), X14;
> +       vpbroadcastd (15 * 4)(INPUT), X15;
> +       vmovdqa X15, (STACK_TMP)(%rsp);
> +
> +L(round2):
> +       QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,,,,)
> +       vmovdqa (STACK_TMP)(%rsp), X15;
> +       vmovdqa X8, (STACK_TMP)(%rsp);
> +       QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,,,,)
> +       QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,,,,)
> +       vmovdqa (STACK_TMP)(%rsp), X8;
> +       vmovdqa X15, (STACK_TMP)(%rsp);
> +       QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,,,,)
> +       sub $2, ROUND;
> +       jnz L(round2);
> +
> +       vmovdqa X8, (STACK_TMP1)(%rsp);
> +
> +       /* tmp := X15 */
> +       vpbroadcastd (0 * 4)(INPUT), X15;
> +       PLUS(X0, X15);
> +       vpbroadcastd (1 * 4)(INPUT), X15;
> +       PLUS(X1, X15);
> +       vpbroadcastd (2 * 4)(INPUT), X15;
> +       PLUS(X2, X15);
> +       vpbroadcastd (3 * 4)(INPUT), X15;
> +       PLUS(X3, X15);
> +       vpbroadcastd (4 * 4)(INPUT), X15;
> +       PLUS(X4, X15);
> +       vpbroadcastd (5 * 4)(INPUT), X15;
> +       PLUS(X5, X15);
> +       vpbroadcastd (6 * 4)(INPUT), X15;
> +       PLUS(X6, X15);
> +       vpbroadcastd (7 * 4)(INPUT), X15;
> +       PLUS(X7, X15);
> +       transpose_4x4(X0, X1, X2, X3, X8, X15);
> +       transpose_4x4(X4, X5, X6, X7, X8, X15);
> +       vmovdqa (STACK_TMP1)(%rsp), X8;
> +       transpose_16byte_2x2(X0, X4, X15);
> +       transpose_16byte_2x2(X1, X5, X15);
> +       transpose_16byte_2x2(X2, X6, X15);
> +       transpose_16byte_2x2(X3, X7, X15);
> +       vmovdqa (STACK_TMP)(%rsp), X15;
> +       xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0);
> +       xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1);
> +       vpbroadcastd (8 * 4)(INPUT), X0;
> +       PLUS(X8, X0);
> +       vpbroadcastd (9 * 4)(INPUT), X0;
> +       PLUS(X9, X0);
> +       vpbroadcastd (10 * 4)(INPUT), X0;
> +       PLUS(X10, X0);
> +       vpbroadcastd (11 * 4)(INPUT), X0;
> +       PLUS(X11, X0);
> +       vmovdqa (STACK_VEC_X12)(%rsp), X0;
> +       PLUS(X12, X0);
> +       vmovdqa (STACK_VEC_X13)(%rsp), X0;
> +       PLUS(X13, X0);
> +       vpbroadcastd (14 * 4)(INPUT), X0;
> +       PLUS(X14, X0);
> +       vpbroadcastd (15 * 4)(INPUT), X0;
> +       PLUS(X15, X0);
> +       xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2);
> +       xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3);
> +
> +       /* Update counter */
> +       addq $8, (12 * 4)(INPUT);
> +
> +       transpose_4x4(X8, X9, X10, X11, X0, X1);
> +       transpose_4x4(X12, X13, X14, X15, X0, X1);
> +       xor_src_dst(DST, SRC, (64 * 4 + 16 * 0), X4);
> +       xor_src_dst(DST, SRC, (64 * 5 + 16 * 0), X5);
> +       transpose_16byte_2x2(X8, X12, X0);
> +       transpose_16byte_2x2(X9, X13, X0);
> +       transpose_16byte_2x2(X10, X14, X0);
> +       transpose_16byte_2x2(X11, X15, X0);
> +       xor_src_dst(DST, SRC, (64 * 6 + 16 * 0), X6);
> +       xor_src_dst(DST, SRC, (64 * 7 + 16 * 0), X7);
> +       xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8);
> +       xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9);
> +       xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10);
> +       xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11);
> +       xor_src_dst(DST, SRC, (64 * 4 + 16 * 2), X12);
> +       xor_src_dst(DST, SRC, (64 * 5 + 16 * 2), X13);
> +       xor_src_dst(DST, SRC, (64 * 6 + 16 * 2), X14);
> +       xor_src_dst(DST, SRC, (64 * 7 + 16 * 2), X15);
> +
> +       sub $8, NBLKS;
> +       lea (8 * 64)(DST), DST;
> +       lea (8 * 64)(SRC), SRC;
> +       jnz L(loop8);
> +
> +       /* clear the used vector registers and stack */
> +       vpxor X0, X0, X0;
> +       vmovdqa X0, (STACK_VEC_X12)(%rsp);
> +       vmovdqa X0, (STACK_VEC_X13)(%rsp);
> +       vmovdqa X0, (STACK_TMP)(%rsp);
> +       vmovdqa X0, (STACK_TMP1)(%rsp);
> +       vzeroall;

Do you need vzeroall?
Why not vzeroupper? Is it a security concern to leave info in the xmm pieces?


> +
> +       /* eax zeroed by round loop. */
> +       leave;
> +       cfi_adjust_cfa_offset(-8)
> +       cfi_def_cfa_register(%rsp);
> +       ret;
> +       int3;

Why do we need int3 here?
> +END(__chacha20_avx2_blocks8)
> diff --git a/sysdeps/x86_64/chacha20_arch.h b/sysdeps/x86_64/chacha20_arch.h
> index 37a4fdfb1f..7e9e7755f3 100644
> --- a/sysdeps/x86_64/chacha20_arch.h
> +++ b/sysdeps/x86_64/chacha20_arch.h
> @@ -22,11 +22,25 @@
>
>  unsigned int __chacha20_ssse3_blocks8 (uint32_t *state, uint8_t *dst,
>                                        const uint8_t *src, size_t nblks);
> +unsigned int __chacha20_avx2_blocks8 (uint32_t *state, uint8_t *dst,
> +                                     const uint8_t *src, size_t nblks);
>
>  static inline void
>  chacha20_crypt (struct chacha20_state *state, uint8_t *dst, const uint8_t *src,
>                 size_t bytes)
>  {
> +  const struct cpu_features* cpu_features = __get_cpu_features ();

Can we do this with an ifunc and take the cpufeature check off the critical
path?
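
Something like this is what I have in mind (rough sketch only, using the
plain GCC ifunc attribute instead of whatever internal plumbing we would
actually want; the chacha20_crypt_avx2/chacha20_crypt_ssse3 wrapper names
are made up for illustration):

/* Sketch: resolve the whole chacha20_crypt variant once, so the hot path
   carries no cpu_features check.  The hypothetical avx2/ssse3 wrappers
   would run whole 8-/4-block chunks through the asm routines and hand
   the tail to chacha20_crypt_generic.  */
extern __typeof (chacha20_crypt_generic) chacha20_crypt_avx2;
extern __typeof (chacha20_crypt_generic) chacha20_crypt_ssse3;

static __typeof (chacha20_crypt_generic) *
chacha20_crypt_resolver (void)
{
  const struct cpu_features *cpu_features = __get_cpu_features ();
  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
    return chacha20_crypt_avx2;
  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
    return chacha20_crypt_ssse3;
  return chacha20_crypt_generic;
}

void chacha20_crypt (struct chacha20_state *state, uint8_t *dst,
                     const uint8_t *src, size_t bytes)
     __attribute__ ((ifunc ("chacha20_crypt_resolver")));

The inline dispatch in the patch also lets an AVX2 run hand its
sub-8-block tail to the SSSE3 path; with a resolver that mixing would
have to live in the wrappers instead.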
> +
> +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) && bytes >= CHACHA20_BLOCK_SIZE * 8)
> +    {
> +      size_t nblocks = bytes / CHACHA20_BLOCK_SIZE;
> +      nblocks -= nblocks % 8;
> +      __chacha20_avx2_blocks8 (state->ctx, dst, src, nblocks);
> +      bytes -= nblocks * CHACHA20_BLOCK_SIZE;
> +      dst += nblocks * CHACHA20_BLOCK_SIZE;
> +      src += nblocks * CHACHA20_BLOCK_SIZE;
> +    }
> +
>    if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) && bytes >= CHACHA20_BLOCK_SIZE * 4)
>      {
>        size_t nblocks = bytes / CHACHA20_BLOCK_SIZE;
> --
> 2.32.0
>

Do you want optimization comments or do that later?
  
Adhemerval Zanella April 14, 2022, 5:16 p.m. UTC | #2
On 13/04/2022 20:04, Noah Goldstein wrote:
> On Wed, Apr 13, 2022 at 1:27 PM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>> +       .text
> 
> section avx2
> 

Ack, I changed to '.section .text.avx2, "ax", @progbits'.

>> +       .align 32
>> +chacha20_data:
>> +L(shuf_rol16):
>> +       .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
>> +L(shuf_rol8):
>> +       .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
>> +L(inc_counter):
>> +       .byte 0,1,2,3,4,5,6,7
>> +L(unsigned_cmp):
>> +       .long 0x80000000
>> +
>> +ENTRY (__chacha20_avx2_blocks8)
>> +       /* input:
>> +        *      %rdi: input
>> +        *      %rsi: dst
>> +        *      %rdx: src
>> +        *      %rcx: nblks (multiple of 8)
>> +        */
>> +       vzeroupper;
> 
> vzeroupper needs to be replaced with VZEROUPPER_RETURN
> and we need a transaction safe version unless this can never
> be called during a transaction.

I think you meant VZEROUPPER here (VZEROUPPER_RETURN seems to trigger
test case failures). What do you mean by a 'transaction safe version'?
An extra __chacha20_avx2_blocks8 implementation to handle it? Or disable
it if RTM is enabled?

>> +
>> +       /* clear the used vector registers and stack */
>> +       vpxor X0, X0, X0;
>> +       vmovdqa X0, (STACK_VEC_X12)(%rsp);
>> +       vmovdqa X0, (STACK_VEC_X13)(%rsp);
>> +       vmovdqa X0, (STACK_TMP)(%rsp);
>> +       vmovdqa X0, (STACK_TMP1)(%rsp);
>> +       vzeroall;
> 
> Do you need vzeroall?
> Why not vzeroupper? Is it a security concern to leave info in the xmm pieces?

I would assume so, since it is in the original libgcrypt optimization.  As
for the ssse3 version, I am not sure if we really need that level of
hardening, but it would be good to have the initial revision as close
as possible to libgcrypt.

> 
> 
>> +
>> +       /* eax zeroed by round loop. */
>> +       leave;
>> +       cfi_adjust_cfa_offset(-8)
>> +       cfi_def_cfa_register(%rsp);
>> +       ret;
>> +       int3;
> 
> Why do we need int3 here?

I think the answer for the ssse3 version applies here as well.

>> +END(__chacha20_avx2_blocks8)
>> diff --git a/sysdeps/x86_64/chacha20_arch.h b/sysdeps/x86_64/chacha20_arch.h
>> index 37a4fdfb1f..7e9e7755f3 100644
>> --- a/sysdeps/x86_64/chacha20_arch.h
>> +++ b/sysdeps/x86_64/chacha20_arch.h
>> @@ -22,11 +22,25 @@
>>
>>  unsigned int __chacha20_ssse3_blocks8 (uint32_t *state, uint8_t *dst,
>>                                        const uint8_t *src, size_t nblks);
>> +unsigned int __chacha20_avx2_blocks8 (uint32_t *state, uint8_t *dst,
>> +                                     const uint8_t *src, size_t nblks);
>>
>>  static inline void
>>  chacha20_crypt (struct chacha20_state *state, uint8_t *dst, const uint8_t *src,
>>                 size_t bytes)
>>  {
>> +  const struct cpu_features* cpu_features = __get_cpu_features ();
> 
> Can we do this with an ifunc and take the cpufeature check off the critical
> path?

Ditto.

>> +
>> +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) && bytes >= CHACHA20_BLOCK_SIZE * 8)
>> +    {
>> +      size_t nblocks = bytes / CHACHA20_BLOCK_SIZE;
>> +      nblocks -= nblocks % 8;
>> +      __chacha20_avx2_blocks8 (state->ctx, dst, src, nblocks);
>> +      bytes -= nblocks * CHACHA20_BLOCK_SIZE;
>> +      dst += nblocks * CHACHA20_BLOCK_SIZE;
>> +      src += nblocks * CHACHA20_BLOCK_SIZE;
>> +    }
>> +
>>    if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) && bytes >= CHACHA20_BLOCK_SIZE * 4)
>>      {
>>        size_t nblocks = bytes / CHACHA20_BLOCK_SIZE;
>> --
>> 2.32.0
>>
> 
> Do you want optimization comments or do that later?

Ideally I would like to check if the proposed arc4random implementation
is what we want (with the current approach of using atfork handlers and
the key reschedule).  The cipher itself is not the most important part,
in the sense that it is transparent to the user and we can eventually
replace it if there is any issue or attack on ChaCha20.  Initially I was
not going to add any arch-specific optimization, but since libgcrypt
provides some that fit the current approach I thought it would be a nice
thing to have.

For optimization comments it would be good to sync with libgcrypt as
well; I think the project will be interested in any performance
improvement you might have for the chacha implementations.
  
Noah Goldstein April 14, 2022, 5:20 p.m. UTC | #3
On Thu, Apr 14, 2022 at 12:17 PM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 13/04/2022 20:04, Noah Goldstein wrote:
> > On Wed, Apr 13, 2022 at 1:27 PM Adhemerval Zanella via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> >>
> >> +       .text
> >
> > section avx2
> >
>
> Ack, I changed to '.section .text.avx2, "ax", @progbits'.
>
> >> +       .align 32
> >> +chacha20_data:
> >> +L(shuf_rol16):
> >> +       .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
> >> +L(shuf_rol8):
> >> +       .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
> >> +L(inc_counter):
> >> +       .byte 0,1,2,3,4,5,6,7
> >> +L(unsigned_cmp):
> >> +       .long 0x80000000
> >> +
> >> +ENTRY (__chacha20_avx2_blocks8)
> >> +       /* input:
> >> +        *      %rdi: input
> >> +        *      %rsi: dst
> >> +        *      %rdx: src
> >> +        *      %rcx: nblks (multiple of 8)
> >> +        */
> >> +       vzeroupper;
> >
> > vzeroupper needs to be replaced with VZEROUPPER_RETURN
> > and we need a transaction safe version unless this can never
> > be called during a transaction.
>
> I think you meant VZEROUPPER here (VZEROUPPER_RETURN seems to trigger
> test case failures). What do you mean by a 'transaction safe version'?
> Ax extra __chacha20_avx2_blocks8 implementation to handle it? Or disable
> it if RTM is enabled?

For now you can just update the cpufeature check to do ssse3 if RTM is enabled.
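
Roughly like this in chacha20_crypt (untested sketch; it only gates the
AVX2 branch so an RTM-capable machine falls through to the existing SSSE3
path below, assuming the RTM bit is queryable through CPU_FEATURE_USABLE_P
like the other features):

  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
      && !CPU_FEATURE_USABLE_P (cpu_features, RTM)
      && bytes >= CHACHA20_BLOCK_SIZE * 8)
    {
      size_t nblocks = bytes / CHACHA20_BLOCK_SIZE;
      nblocks -= nblocks % 8;
      __chacha20_avx2_blocks8 (state->ctx, dst, src, nblocks);
      bytes -= nblocks * CHACHA20_BLOCK_SIZE;
      dst += nblocks * CHACHA20_BLOCK_SIZE;
      src += nblocks * CHACHA20_BLOCK_SIZE;
    }

That keeps the asm untouched; a dedicated transaction-safe variant can
come later if it turns out to matter.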

>
> >> +
> >> +       /* clear the used vector registers and stack */
> >> +       vpxor X0, X0, X0;
> >> +       vmovdqa X0, (STACK_VEC_X12)(%rsp);
> >> +       vmovdqa X0, (STACK_VEC_X13)(%rsp);
> >> +       vmovdqa X0, (STACK_TMP)(%rsp);
> >> +       vmovdqa X0, (STACK_TMP1)(%rsp);
> >> +       vzeroall;
> >
> > Do you need vzeroall?
> > Why not vzeroupper? Is it a security concern to leave info in the xmm pieces?
>
> I would assume, since it is on the original libgrcypt optimization.  As
> for the ssse3 version, I am not sure if we really need that level of
> hardening, but it would be good to have the initial revision as close
> as possible from libgcrypt.

Got it.

>
> >
> >
> >> +
> >> +       /* eax zeroed by round loop. */
> >> +       leave;
> >> +       cfi_adjust_cfa_offset(-8)
> >> +       cfi_def_cfa_register(%rsp);
> >> +       ret;
> >> +       int3;
> >
> > Why do we need int3 here?
>
> I think the ssse3 applies here as well.
>
> >> +END(__chacha20_avx2_blocks8)
> >> diff --git a/sysdeps/x86_64/chacha20_arch.h b/sysdeps/x86_64/chacha20_arch.h
> >> index 37a4fdfb1f..7e9e7755f3 100644
> >> --- a/sysdeps/x86_64/chacha20_arch.h
> >> +++ b/sysdeps/x86_64/chacha20_arch.h
> >> @@ -22,11 +22,25 @@
> >>
> >>  unsigned int __chacha20_ssse3_blocks8 (uint32_t *state, uint8_t *dst,
> >>                                        const uint8_t *src, size_t nblks);
> >> +unsigned int __chacha20_avx2_blocks8 (uint32_t *state, uint8_t *dst,
> >> +                                     const uint8_t *src, size_t nblks);
> >>
> >>  static inline void
> >>  chacha20_crypt (struct chacha20_state *state, uint8_t *dst, const uint8_t *src,
> >>                 size_t bytes)
> >>  {
> >> +  const struct cpu_features* cpu_features = __get_cpu_features ();
> >
> > Can we do this with an ifunc and take the cpufeature check off the critical
> > path?
>
> Ditto.
>
> >> +
> >> +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) && bytes >= CHACHA20_BLOCK_SIZE * 8)
> >> +    {
> >> +      size_t nblocks = bytes / CHACHA20_BLOCK_SIZE;
> >> +      nblocks -= nblocks % 8;
> >> +      __chacha20_avx2_blocks8 (state->ctx, dst, src, nblocks);
> >> +      bytes -= nblocks * CHACHA20_BLOCK_SIZE;
> >> +      dst += nblocks * CHACHA20_BLOCK_SIZE;
> >> +      src += nblocks * CHACHA20_BLOCK_SIZE;
> >> +    }
> >> +
> >>    if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) && bytes >= CHACHA20_BLOCK_SIZE * 4)
> >>      {
> >>        size_t nblocks = bytes / CHACHA20_BLOCK_SIZE;
> >> --
> >> 2.32.0
> >>
> >
> > Do you want optimization comments or do that later?
>
> Ideally I would like to check if the proposed arc4random implementation
> is what we want (with current approach of using atfork handlers and the
> key reschedule).  The cipher itself it not the utmost important in the
> sense it is transparent to user and we can eventually replace it if there
> any issue or attack to ChaCha20.  Initially I won't add any arch-specific
> optimization, but since libgcrypt provides some that fits on the current
> approach I though it would be a nice thing to have.
>
> For optimization comments it would be good to sync with libgcrypt as well,
> I think the project will be interested in any performance improvement
> you might have for the chacha implementations.
Okay, I'll probably take a stab at this in the not too distant future.
  
Adhemerval Zanella April 14, 2022, 6:12 p.m. UTC | #4
On 14/04/2022 14:20, Noah Goldstein wrote:
> On Thu, Apr 14, 2022 at 12:17 PM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>>
>>
>> On 13/04/2022 20:04, Noah Goldstein wrote:
>>> On Wed, Apr 13, 2022 at 1:27 PM Adhemerval Zanella via Libc-alpha
>>> <libc-alpha@sourceware.org> wrote:
>>>>
>>>> +       .text
>>>
>>> section avx2
>>>
>>
>> Ack, I changed to '.section .text.avx2, "ax", @progbits'.
>>
>>>> +       .align 32
>>>> +chacha20_data:
>>>> +L(shuf_rol16):
>>>> +       .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
>>>> +L(shuf_rol8):
>>>> +       .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
>>>> +L(inc_counter):
>>>> +       .byte 0,1,2,3,4,5,6,7
>>>> +L(unsigned_cmp):
>>>> +       .long 0x80000000
>>>> +
>>>> +ENTRY (__chacha20_avx2_blocks8)
>>>> +       /* input:
>>>> +        *      %rdi: input
>>>> +        *      %rsi: dst
>>>> +        *      %rdx: src
>>>> +        *      %rcx: nblks (multiple of 8)
>>>> +        */
>>>> +       vzeroupper;
>>>
>>> vzeroupper needs to be replaced with VZEROUPPER_RETURN
>>> and we need a transaction safe version unless this can never
>>> be called during a transaction.
>>
>> I think you meant VZEROUPPER here (VZEROUPPER_RETURN seems to trigger
>> test case failures). What do you mean by a 'transaction safe version'?
>> Ax extra __chacha20_avx2_blocks8 implementation to handle it? Or disable
>> it if RTM is enabled?
> 
> For now you can just update the cpufeature check to do ssse3 if RTM is enabled.

Right, I will do it.

> 
>>
>>>> +
>>>> +       /* clear the used vector registers and stack */
>>>> +       vpxor X0, X0, X0;
>>>> +       vmovdqa X0, (STACK_VEC_X12)(%rsp);
>>>> +       vmovdqa X0, (STACK_VEC_X13)(%rsp);
>>>> +       vmovdqa X0, (STACK_TMP)(%rsp);
>>>> +       vmovdqa X0, (STACK_TMP1)(%rsp);
>>>> +       vzeroall;
>>>
>>> Do you need vzeroall?
>>> Why not vzeroupper? Is it a security concern to leave info in the xmm pieces?
>>
>> I would assume, since it is on the original libgrcypt optimization.  As
>> for the ssse3 version, I am not sure if we really need that level of
>> hardening, but it would be good to have the initial revision as close
>> as possible from libgcrypt.
> 
> Got it.
> 
>>
>>>
>>>
>>>> +
>>>> +       /* eax zeroed by round loop. */
>>>> +       leave;
>>>> +       cfi_adjust_cfa_offset(-8)
>>>> +       cfi_def_cfa_register(%rsp);
>>>> +       ret;
>>>> +       int3;
>>>
>>> Why do we need int3 here?
>>
>> I think the ssse3 applies here as well.
>>
>>>> +END(__chacha20_avx2_blocks8)
>>>> diff --git a/sysdeps/x86_64/chacha20_arch.h b/sysdeps/x86_64/chacha20_arch.h
>>>> index 37a4fdfb1f..7e9e7755f3 100644
>>>> --- a/sysdeps/x86_64/chacha20_arch.h
>>>> +++ b/sysdeps/x86_64/chacha20_arch.h
>>>> @@ -22,11 +22,25 @@
>>>>
>>>>  unsigned int __chacha20_ssse3_blocks8 (uint32_t *state, uint8_t *dst,
>>>>                                        const uint8_t *src, size_t nblks);
>>>> +unsigned int __chacha20_avx2_blocks8 (uint32_t *state, uint8_t *dst,
>>>> +                                     const uint8_t *src, size_t nblks);
>>>>
>>>>  static inline void
>>>>  chacha20_crypt (struct chacha20_state *state, uint8_t *dst, const uint8_t *src,
>>>>                 size_t bytes)
>>>>  {
>>>> +  const struct cpu_features* cpu_features = __get_cpu_features ();
>>>
>>> Can we do this with an ifunc and take the cpufeature check off the critical
>>> path?
>>
>> Ditto.
>>
>>>> +
>>>> +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) && bytes >= CHACHA20_BLOCK_SIZE * 8)
>>>> +    {
>>>> +      size_t nblocks = bytes / CHACHA20_BLOCK_SIZE;
>>>> +      nblocks -= nblocks % 8;
>>>> +      __chacha20_avx2_blocks8 (state->ctx, dst, src, nblocks);
>>>> +      bytes -= nblocks * CHACHA20_BLOCK_SIZE;
>>>> +      dst += nblocks * CHACHA20_BLOCK_SIZE;
>>>> +      src += nblocks * CHACHA20_BLOCK_SIZE;
>>>> +    }
>>>> +
>>>>    if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) && bytes >= CHACHA20_BLOCK_SIZE * 4)
>>>>      {
>>>>        size_t nblocks = bytes / CHACHA20_BLOCK_SIZE;
>>>> --
>>>> 2.32.0
>>>>
>>>
>>> Do you want optimization comments or do that later?
>>
>> Ideally I would like to check if the proposed arc4random implementation
>> is what we want (with current approach of using atfork handlers and the
>> key reschedule).  The cipher itself it not the utmost important in the
>> sense it is transparent to user and we can eventually replace it if there
>> any issue or attack to ChaCha20.  Initially I won't add any arch-specific
>> optimization, but since libgcrypt provides some that fits on the current
>> approach I though it would be a nice thing to have.
>>
>> For optimization comments it would be good to sync with libgcrypt as well,
>> I think the project will be interested in any performance improvement
>> you might have for the chacha implementations.
> Okay, I'll probably take a stab at this in the not too distant future.

Thanks.
  

Patch

diff --git a/LICENSES b/LICENSES
index 2563abd9e2..8ef0f023d7 100644
--- a/LICENSES
+++ b/LICENSES
@@ -390,8 +390,8 @@  Copyright 2001 by Stephen L. Moshier <moshier@na-net.ornl.gov>
  License along with this library; if not, see
  <https://www.gnu.org/licenses/>.  */
 
-sysdeps/x86_64/chacha20-ssse3.S import code from libgcrypt, with the
-following notices:
+sysdeps/x86_64/chacha20-ssse3.S and sysdeps/x86_64/chacha20-avx2.S
+import code from libgcrypt, with the following notices:
 
 Copyright (C) 2017-2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 
diff --git a/stdlib/chacha20.c b/stdlib/chacha20.c
index dbd87bd942..8569e1e78d 100644
--- a/stdlib/chacha20.c
+++ b/stdlib/chacha20.c
@@ -190,8 +190,8 @@  memxorcpy (uint8_t *dst, const uint8_t *src1, const uint8_t *src2, size_t len)
 }
 
 static void
-chacha20_crypt (struct chacha20_state *state, uint8_t *dst,
-		const uint8_t *src, size_t bytes)
+chacha20_crypt_generic (struct chacha20_state *state, uint8_t *dst,
+			const uint8_t *src, size_t bytes)
 {
   uint8_t stream[CHACHA20_BLOCK_SIZE];
 
@@ -209,3 +209,6 @@  chacha20_crypt (struct chacha20_state *state, uint8_t *dst,
       memxorcpy (dst, src, stream, bytes);
     }
 }
+
+/* Get the arch-optimized implementation, if any.  */
+#include <chacha20_arch.h>
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index f43b6a1180..afb4d173e8 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -7,6 +7,7 @@  endif
 
 ifeq ($(subdir),stdlib)
 sysdep_routines += \
+  chacha20-avx2 \
   chacha20-ssse3 \
   # sysdep_routines
 endif
diff --git a/sysdeps/x86_64/chacha20-avx2.S b/sysdeps/x86_64/chacha20-avx2.S
new file mode 100644
index 0000000000..96174c0e40
--- /dev/null
+++ b/sysdeps/x86_64/chacha20-avx2.S
@@ -0,0 +1,317 @@ 
+/* Optimized AVX2 implementation of ChaCha20 cipher.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Based on D. J. Bernstein reference implementation at
+   http://cr.yp.to/chacha.html:
+
+   chacha-regs.c version 20080118
+   D. J. Bernstein
+   Public domain.  */
+
+#ifdef PIC
+#  define rRIP (%rip)
+#else
+#  define rRIP
+#endif
+
+/* register macros */
+#define INPUT %rdi
+#define DST   %rsi
+#define SRC   %rdx
+#define NBLKS %rcx
+#define ROUND %eax
+
+/* stack structure */
+#define STACK_VEC_X12 (32)
+#define STACK_VEC_X13 (32 + STACK_VEC_X12)
+#define STACK_TMP     (32 + STACK_VEC_X13)
+#define STACK_TMP1    (32 + STACK_TMP)
+
+#define STACK_MAX     (32 + STACK_TMP1)
+
+/* vector registers */
+#define X0 %ymm0
+#define X1 %ymm1
+#define X2 %ymm2
+#define X3 %ymm3
+#define X4 %ymm4
+#define X5 %ymm5
+#define X6 %ymm6
+#define X7 %ymm7
+#define X8 %ymm8
+#define X9 %ymm9
+#define X10 %ymm10
+#define X11 %ymm11
+#define X12 %ymm12
+#define X13 %ymm13
+#define X14 %ymm14
+#define X15 %ymm15
+
+#define X0h %xmm0
+#define X1h %xmm1
+#define X2h %xmm2
+#define X3h %xmm3
+#define X4h %xmm4
+#define X5h %xmm5
+#define X6h %xmm6
+#define X7h %xmm7
+#define X8h %xmm8
+#define X9h %xmm9
+#define X10h %xmm10
+#define X11h %xmm11
+#define X12h %xmm12
+#define X13h %xmm13
+#define X14h %xmm14
+#define X15h %xmm15
+
+/**********************************************************************
+  helper macros
+ **********************************************************************/
+
+/* 4x4 32-bit integer matrix transpose */
+#define transpose_4x4(x0,x1,x2,x3,t1,t2) \
+	vpunpckhdq x1, x0, t2; \
+	vpunpckldq x1, x0, x0; \
+	\
+	vpunpckldq x3, x2, t1; \
+	vpunpckhdq x3, x2, x2; \
+	\
+	vpunpckhqdq t1, x0, x1; \
+	vpunpcklqdq t1, x0, x0; \
+	\
+	vpunpckhqdq x2, t2, x3; \
+	vpunpcklqdq x2, t2, x2;
+
+/* 2x2 128-bit matrix transpose */
+#define transpose_16byte_2x2(x0,x1,t1) \
+	vmovdqa    x0, t1; \
+	vperm2i128 $0x20, x1, x0, x0; \
+	vperm2i128 $0x31, x1, t1, x1;
+
+/* xor register with unaligned src and save to unaligned dst */
+#define xor_src_dst(dst, src, offset, xreg) \
+	vpxor offset(src), xreg, xreg; \
+	vmovdqu xreg, offset(dst);
+
+/**********************************************************************
+  8-way chacha20
+ **********************************************************************/
+
+#define ROTATE2(v1,v2,c,tmp)	\
+	vpsrld $(32 - (c)), v1, tmp;	\
+	vpslld $(c), v1, v1;		\
+	vpaddb tmp, v1, v1;		\
+	vpsrld $(32 - (c)), v2, tmp;	\
+	vpslld $(c), v2, v2;		\
+	vpaddb tmp, v2, v2;
+
+#define ROTATE_SHUF_2(v1,v2,shuf)	\
+	vpshufb shuf, v1, v1;		\
+	vpshufb shuf, v2, v2;
+
+#define XOR(ds,s) \
+	vpxor s, ds, ds;
+
+#define PLUS(ds,s) \
+	vpaddd s, ds, ds;
+
+#define QUARTERROUND2(a1,b1,c1,d1,a2,b2,c2,d2,ign,tmp1,\
+		      interleave_op1,interleave_op2,\
+		      interleave_op3,interleave_op4)		\
+	vbroadcasti128 .Lshuf_rol16 rRIP, tmp1;			\
+		interleave_op1;					\
+	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
+	    ROTATE_SHUF_2(d1, d2, tmp1);			\
+		interleave_op2;					\
+	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
+	    ROTATE2(b1, b2, 12, tmp1);				\
+	vbroadcasti128 .Lshuf_rol8 rRIP, tmp1;			\
+		interleave_op3;					\
+	PLUS(a1,b1); PLUS(a2,b2); XOR(d1,a1); XOR(d2,a2);	\
+	    ROTATE_SHUF_2(d1, d2, tmp1);			\
+		interleave_op4;					\
+	PLUS(c1,d1); PLUS(c2,d2); XOR(b1,c1); XOR(b2,c2);	\
+	    ROTATE2(b1, b2,  7, tmp1);
+
+	.text
+	.align 32
+chacha20_data:
+L(shuf_rol16):
+	.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13
+L(shuf_rol8):
+	.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14
+L(inc_counter):
+	.byte 0,1,2,3,4,5,6,7
+L(unsigned_cmp):
+	.long 0x80000000
+
+ENTRY (__chacha20_avx2_blocks8)
+	/* input:
+	 *	%rdi: input
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *	%rcx: nblks (multiple of 8)
+	 */
+	vzeroupper;
+
+	pushq %rbp;
+	cfi_adjust_cfa_offset(8);
+	cfi_rel_offset(rbp, 0)
+	movq %rsp, %rbp;
+	cfi_def_cfa_register(rbp);
+
+	subq $STACK_MAX, %rsp;
+	andq $~31, %rsp;
+
+L(loop8):
+	mov $20, ROUND;
+
+	/* Construct counter vectors X12 and X13 */
+	vpmovzxbd L(inc_counter) rRIP, X0;
+	vpbroadcastd L(unsigned_cmp) rRIP, X2;
+	vpbroadcastd (12 * 4)(INPUT), X12;
+	vpbroadcastd (13 * 4)(INPUT), X13;
+	vpaddd X0, X12, X12;
+	vpxor X2, X0, X0;
+	vpxor X2, X12, X1;
+	vpcmpgtd X1, X0, X0;
+	vpsubd X0, X13, X13;
+	vmovdqa X12, (STACK_VEC_X12)(%rsp);
+	vmovdqa X13, (STACK_VEC_X13)(%rsp);
+
+	/* Load vectors */
+	vpbroadcastd (0 * 4)(INPUT), X0;
+	vpbroadcastd (1 * 4)(INPUT), X1;
+	vpbroadcastd (2 * 4)(INPUT), X2;
+	vpbroadcastd (3 * 4)(INPUT), X3;
+	vpbroadcastd (4 * 4)(INPUT), X4;
+	vpbroadcastd (5 * 4)(INPUT), X5;
+	vpbroadcastd (6 * 4)(INPUT), X6;
+	vpbroadcastd (7 * 4)(INPUT), X7;
+	vpbroadcastd (8 * 4)(INPUT), X8;
+	vpbroadcastd (9 * 4)(INPUT), X9;
+	vpbroadcastd (10 * 4)(INPUT), X10;
+	vpbroadcastd (11 * 4)(INPUT), X11;
+	vpbroadcastd (14 * 4)(INPUT), X14;
+	vpbroadcastd (15 * 4)(INPUT), X15;
+	vmovdqa X15, (STACK_TMP)(%rsp);
+
+L(round2):
+	QUARTERROUND2(X0, X4,  X8, X12,   X1, X5,  X9, X13, tmp:=,X15,,,,)
+	vmovdqa (STACK_TMP)(%rsp), X15;
+	vmovdqa X8, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X6, X10, X14,   X3, X7, X11, X15, tmp:=,X8,,,,)
+	QUARTERROUND2(X0, X5, X10, X15,   X1, X6, X11, X12, tmp:=,X8,,,,)
+	vmovdqa (STACK_TMP)(%rsp), X8;
+	vmovdqa X15, (STACK_TMP)(%rsp);
+	QUARTERROUND2(X2, X7,  X8, X13,   X3, X4,  X9, X14, tmp:=,X15,,,,)
+	sub $2, ROUND;
+	jnz L(round2);
+
+	vmovdqa X8, (STACK_TMP1)(%rsp);
+
+	/* tmp := X15 */
+	vpbroadcastd (0 * 4)(INPUT), X15;
+	PLUS(X0, X15);
+	vpbroadcastd (1 * 4)(INPUT), X15;
+	PLUS(X1, X15);
+	vpbroadcastd (2 * 4)(INPUT), X15;
+	PLUS(X2, X15);
+	vpbroadcastd (3 * 4)(INPUT), X15;
+	PLUS(X3, X15);
+	vpbroadcastd (4 * 4)(INPUT), X15;
+	PLUS(X4, X15);
+	vpbroadcastd (5 * 4)(INPUT), X15;
+	PLUS(X5, X15);
+	vpbroadcastd (6 * 4)(INPUT), X15;
+	PLUS(X6, X15);
+	vpbroadcastd (7 * 4)(INPUT), X15;
+	PLUS(X7, X15);
+	transpose_4x4(X0, X1, X2, X3, X8, X15);
+	transpose_4x4(X4, X5, X6, X7, X8, X15);
+	vmovdqa (STACK_TMP1)(%rsp), X8;
+	transpose_16byte_2x2(X0, X4, X15);
+	transpose_16byte_2x2(X1, X5, X15);
+	transpose_16byte_2x2(X2, X6, X15);
+	transpose_16byte_2x2(X3, X7, X15);
+	vmovdqa (STACK_TMP)(%rsp), X15;
+	xor_src_dst(DST, SRC, (64 * 0 + 16 * 0), X0);
+	xor_src_dst(DST, SRC, (64 * 1 + 16 * 0), X1);
+	vpbroadcastd (8 * 4)(INPUT), X0;
+	PLUS(X8, X0);
+	vpbroadcastd (9 * 4)(INPUT), X0;
+	PLUS(X9, X0);
+	vpbroadcastd (10 * 4)(INPUT), X0;
+	PLUS(X10, X0);
+	vpbroadcastd (11 * 4)(INPUT), X0;
+	PLUS(X11, X0);
+	vmovdqa (STACK_VEC_X12)(%rsp), X0;
+	PLUS(X12, X0);
+	vmovdqa (STACK_VEC_X13)(%rsp), X0;
+	PLUS(X13, X0);
+	vpbroadcastd (14 * 4)(INPUT), X0;
+	PLUS(X14, X0);
+	vpbroadcastd (15 * 4)(INPUT), X0;
+	PLUS(X15, X0);
+	xor_src_dst(DST, SRC, (64 * 2 + 16 * 0), X2);
+	xor_src_dst(DST, SRC, (64 * 3 + 16 * 0), X3);
+
+	/* Update counter */
+	addq $8, (12 * 4)(INPUT);
+
+	transpose_4x4(X8, X9, X10, X11, X0, X1);
+	transpose_4x4(X12, X13, X14, X15, X0, X1);
+	xor_src_dst(DST, SRC, (64 * 4 + 16 * 0), X4);
+	xor_src_dst(DST, SRC, (64 * 5 + 16 * 0), X5);
+	transpose_16byte_2x2(X8, X12, X0);
+	transpose_16byte_2x2(X9, X13, X0);
+	transpose_16byte_2x2(X10, X14, X0);
+	transpose_16byte_2x2(X11, X15, X0);
+	xor_src_dst(DST, SRC, (64 * 6 + 16 * 0), X6);
+	xor_src_dst(DST, SRC, (64 * 7 + 16 * 0), X7);
+	xor_src_dst(DST, SRC, (64 * 0 + 16 * 2), X8);
+	xor_src_dst(DST, SRC, (64 * 1 + 16 * 2), X9);
+	xor_src_dst(DST, SRC, (64 * 2 + 16 * 2), X10);
+	xor_src_dst(DST, SRC, (64 * 3 + 16 * 2), X11);
+	xor_src_dst(DST, SRC, (64 * 4 + 16 * 2), X12);
+	xor_src_dst(DST, SRC, (64 * 5 + 16 * 2), X13);
+	xor_src_dst(DST, SRC, (64 * 6 + 16 * 2), X14);
+	xor_src_dst(DST, SRC, (64 * 7 + 16 * 2), X15);
+
+	sub $8, NBLKS;
+	lea (8 * 64)(DST), DST;
+	lea (8 * 64)(SRC), SRC;
+	jnz L(loop8);
+
+	/* clear the used vector registers and stack */
+	vpxor X0, X0, X0;
+	vmovdqa X0, (STACK_VEC_X12)(%rsp);
+	vmovdqa X0, (STACK_VEC_X13)(%rsp);
+	vmovdqa X0, (STACK_TMP)(%rsp);
+	vmovdqa X0, (STACK_TMP1)(%rsp);
+	vzeroall;
+
+	/* eax zeroed by round loop. */
+	leave;
+	cfi_adjust_cfa_offset(-8)
+	cfi_def_cfa_register(%rsp);
+	ret;
+	int3;
+END(__chacha20_avx2_blocks8)
diff --git a/sysdeps/x86_64/chacha20_arch.h b/sysdeps/x86_64/chacha20_arch.h
index 37a4fdfb1f..7e9e7755f3 100644
--- a/sysdeps/x86_64/chacha20_arch.h
+++ b/sysdeps/x86_64/chacha20_arch.h
@@ -22,11 +22,25 @@ 
 
 unsigned int __chacha20_ssse3_blocks8 (uint32_t *state, uint8_t *dst,
 				       const uint8_t *src, size_t nblks);
+unsigned int __chacha20_avx2_blocks8 (uint32_t *state, uint8_t *dst,
+				      const uint8_t *src, size_t nblks);
 
 static inline void
 chacha20_crypt (struct chacha20_state *state, uint8_t *dst, const uint8_t *src,
 		size_t bytes)
 {
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) && bytes >= CHACHA20_BLOCK_SIZE * 8)
+    {
+      size_t nblocks = bytes / CHACHA20_BLOCK_SIZE;
+      nblocks -= nblocks % 8;
+      __chacha20_avx2_blocks8 (state->ctx, dst, src, nblocks);
+      bytes -= nblocks * CHACHA20_BLOCK_SIZE;
+      dst += nblocks * CHACHA20_BLOCK_SIZE;
+      src += nblocks * CHACHA20_BLOCK_SIZE;
+    }
+
   if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) && bytes >= CHACHA20_BLOCK_SIZE * 4)
     {
       size_t nblocks = bytes / CHACHA20_BLOCK_SIZE;