[v5,2/4] x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions
Checks
Context |
Check |
Description |
dj/TryBot-apply_patch |
success
|
Patch applied to master at the time it was sent
|
Commit Message
Optimizations are:
1. Use more overlapping stores to avoid branches.
2. Reduce how unrolled the aligning copies are (this is more of a
code-size saving; it's a negative for some sizes in terms of
perf).
3. For st{r|p}n{cat|cpy} re-order the branches to minimize the
number that are taken.
Performance Changes:
Times are from N = 10 runs of the benchmark suite and are
reported as geometric mean of all ratios of
New Implementation / Old Implementation.
strcat-avx2 -> 0.998
strcpy-avx2 -> 0.937
stpcpy-avx2 -> 0.971
strncpy-avx2 -> 0.793
stpncpy-avx2 -> 0.775
strncat-avx2 -> 0.962
Code Size Changes:
function -> Bytes New / Bytes Old -> Ratio
strcat-avx2 -> 685 / 1639 -> 0.418
strcpy-avx2 -> 560 / 903 -> 0.620
stpcpy-avx2 -> 592 / 939 -> 0.630
strncpy-avx2 -> 1176 / 2390 -> 0.492
stpncpy-avx2 -> 1268 / 2438 -> 0.520
strncat-avx2 -> 1042 / 2563 -> 0.407
Notes:
1. Because of the significant difference between the
implementations they are split into three files.
strcpy-avx2.S -> strcpy, stpcpy, strcat
strncpy-avx2.S -> strncpy
strncat-avx2.S -> strncat
I couldn't find a way to merge them without making the
ifdefs incredibly difficult to follow.
Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
---
sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S | 6 +-
sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S | 7 +-
sysdeps/x86_64/multiarch/stpncpy-avx2.S | 5 +-
sysdeps/x86_64/multiarch/strcat-avx2-rtm.S | 13 +-
sysdeps/x86_64/multiarch/strcat-avx2.S | 268 +---
.../x86_64/multiarch/strcat-strlen-avx2.h.S | 101 ++
sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S | 13 +-
sysdeps/x86_64/multiarch/strcpy-avx2.S | 1236 +++++------------
sysdeps/x86_64/multiarch/strncat-avx2-rtm.S | 6 +-
sysdeps/x86_64/multiarch/strncat-avx2.S | 424 +++++-
sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S | 6 +-
sysdeps/x86_64/multiarch/strncpy-avx2.S | 740 +++++++++-
sysdeps/x86_64/multiarch/x86-avx-vecs.h | 3 +-
13 files changed, 1594 insertions(+), 1234 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S
Comments
On Tue, Nov 08, 2022 at 05:38:39PM -0800, Noah Goldstein wrote:
> Optimizations are:
> 1. Use more overlapping stores to avoid branches.
> 2. Reduce how unrolled the aligning copies are (this is more of a
> code-size saving; it's a negative for some sizes in terms of
> perf).
> 3. For st{r|p}n{cat|cpy} re-order the branches to minimize the
> number that are taken.
>
> Performance Changes:
>
> Times are from N = 10 runs of the benchmark suite and are
> reported as geometric mean of all ratios of
> New Implementation / Old Implementation.
>
> strcat-avx2 -> 0.998
> strcpy-avx2 -> 0.937
> stpcpy-avx2 -> 0.971
>
> strncpy-avx2 -> 0.793
> stpncpy-avx2 -> 0.775
>
> strncat-avx2 -> 0.962
>
> Code Size Changes:
> function -> Bytes New / Bytes Old -> Ratio
>
> strcat-avx2 -> 685 / 1639 -> 0.418
> strcpy-avx2 -> 560 / 903 -> 0.620
> stpcpy-avx2 -> 592 / 939 -> 0.630
>
> strncpy-avx2 -> 1176 / 2390 -> 0.492
> stpncpy-avx2 -> 1268 / 2438 -> 0.520
>
> strncat-avx2 -> 1042 / 2563 -> 0.407
>
> Notes:
> 1. Because of the significant difference between the
> implementations they are split into three files.
>
> strcpy-avx2.S -> strcpy, stpcpy, strcat
> strncpy-avx2.S -> strncpy
> strncat-avx2.S -> strncat
>
> I couldn't find a way to merge them without making the
> ifdefs incredibly difficult to follow.
>
> Full check passes on x86-64 and build succeeds for all ISA levels w/
> and w/o multiarch.
> ---
> sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S | 6 +-
> sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S | 7 +-
> sysdeps/x86_64/multiarch/stpncpy-avx2.S | 5 +-
> sysdeps/x86_64/multiarch/strcat-avx2-rtm.S | 13 +-
> sysdeps/x86_64/multiarch/strcat-avx2.S | 268 +---
> .../x86_64/multiarch/strcat-strlen-avx2.h.S | 101 ++
> sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S | 13 +-
> sysdeps/x86_64/multiarch/strcpy-avx2.S | 1236 +++++------------
> sysdeps/x86_64/multiarch/strncat-avx2-rtm.S | 6 +-
> sysdeps/x86_64/multiarch/strncat-avx2.S | 424 +++++-
> sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S | 6 +-
> sysdeps/x86_64/multiarch/strncpy-avx2.S | 740 +++++++++-
> sysdeps/x86_64/multiarch/x86-avx-vecs.h | 3 +-
> 13 files changed, 1594 insertions(+), 1234 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S
>
> diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
> index 2b9c07a59f..90e532dbe8 100644
> --- a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
> @@ -1,3 +1,3 @@
> -#define USE_AS_STPCPY
> -#define STRCPY __stpcpy_avx2_rtm
> -#include "strcpy-avx2-rtm.S"
> +#define STPCPY __stpcpy_avx2_rtm
> +#include "x86-avx-rtm-vecs.h"
> +#include "stpcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
> index 60a2ccfe53..46ee07be36 100644
> --- a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
> @@ -1,4 +1,3 @@
> -#define USE_AS_STPCPY
> -#define USE_AS_STRNCPY
> -#define STRCPY __stpncpy_avx2_rtm
> -#include "strcpy-avx2-rtm.S"
> +#define STPNCPY __stpncpy_avx2_rtm
> +#include "x86-avx-rtm-vecs.h"
> +#include "stpncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
> index b2f8c19143..a46a8edbe2 100644
> --- a/sysdeps/x86_64/multiarch/stpncpy-avx2.S
> +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
> @@ -3,6 +3,5 @@
> #endif
>
> #define USE_AS_STPCPY
> -#define USE_AS_STRNCPY
> -#define STRCPY STPNCPY
> -#include "strcpy-avx2.S"
> +#define STRNCPY STPNCPY
> +#include "strncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
> index 637fb557c4..e84f4f1fef 100644
> --- a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
> @@ -1,12 +1,3 @@
> -#ifndef STRCAT
> -# define STRCAT __strcat_avx2_rtm
> -#endif
> -
> -#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> - ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> -
> -#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
> -
> -#define SECTION(p) p##.avx.rtm
> -
> +#define STRCAT __strcat_avx2_rtm
> +#include "x86-avx-rtm-vecs.h"
> #include "strcat-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
> index d9b7fb2a43..3f914fa342 100644
> --- a/sysdeps/x86_64/multiarch/strcat-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
> @@ -16,266 +16,10 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#include <isa-level.h>
> -
> -#if ISA_SHOULD_BUILD (3)
> -
> -
> -# include <sysdep.h>
> -
> -# ifndef STRCAT
> -# define STRCAT __strcat_avx2
> -# endif
> -
> -# define USE_AS_STRCAT
> -
> -/* Number of bytes in a vector register */
> -# define VEC_SIZE 32
> -
> -# ifndef SECTION
> -# define SECTION(p) p##.avx
> -# endif
> -
> - .section SECTION(.text),"ax",@progbits
> -ENTRY (STRCAT)
> - mov %rdi, %r9
> -# ifdef USE_AS_STRNCAT
> - mov %rdx, %r8
> -# endif
> -
> - xor %eax, %eax
> - mov %edi, %ecx
> - and $((VEC_SIZE * 4) - 1), %ecx
> - vpxor %xmm6, %xmm6, %xmm6
> - cmp $(VEC_SIZE * 3), %ecx
> - ja L(fourth_vector_boundary)
> - vpcmpeqb (%rdi), %ymm6, %ymm0
> - vpmovmskb %ymm0, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_first_vector)
> - mov %rdi, %rax
> - and $-VEC_SIZE, %rax
> - jmp L(align_vec_size_start)
> -L(fourth_vector_boundary):
> - mov %rdi, %rax
> - and $-VEC_SIZE, %rax
> - vpcmpeqb (%rax), %ymm6, %ymm0
> - mov $-1, %r10d
> - sub %rax, %rcx
> - shl %cl, %r10d
> - vpmovmskb %ymm0, %edx
> - and %r10d, %edx
> - jnz L(exit)
> -
> -L(align_vec_size_start):
> - vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
> - vpmovmskb %ymm0, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_second_vector)
> -
> - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> - vpmovmskb %ymm1, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_third_vector)
> -
> - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> - vpmovmskb %ymm2, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_fourth_vector)
> -
> - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> - vpmovmskb %ymm3, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_fifth_vector)
> -
> - vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> - add $(VEC_SIZE * 4), %rax
> - vpmovmskb %ymm0, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_second_vector)
> -
> - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> - vpmovmskb %ymm1, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_third_vector)
> -
> - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> - vpmovmskb %ymm2, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_fourth_vector)
> -
> - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> - vpmovmskb %ymm3, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_fifth_vector)
> -
> - vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> - add $(VEC_SIZE * 4), %rax
> - vpmovmskb %ymm0, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_second_vector)
> -
> - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> - vpmovmskb %ymm1, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_third_vector)
> -
> - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> - vpmovmskb %ymm2, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_fourth_vector)
> -
> - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> - vpmovmskb %ymm3, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_fifth_vector)
> -
> - vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> - add $(VEC_SIZE * 4), %rax
> - vpmovmskb %ymm0, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_second_vector)
> -
> - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> - vpmovmskb %ymm1, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_third_vector)
> -
> - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> - vpmovmskb %ymm2, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_fourth_vector)
> -
> - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> - vpmovmskb %ymm3, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_fifth_vector)
> -
> - test $((VEC_SIZE * 4) - 1), %rax
> - jz L(align_four_vec_loop)
> -
> - vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
> - add $(VEC_SIZE * 5), %rax
> - vpmovmskb %ymm0, %edx
> - test %edx, %edx
> - jnz L(exit)
> -
> - test $((VEC_SIZE * 4) - 1), %rax
> - jz L(align_four_vec_loop)
> -
> - vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
> - add $VEC_SIZE, %rax
> - vpmovmskb %ymm1, %edx
> - test %edx, %edx
> - jnz L(exit)
> -
> - test $((VEC_SIZE * 4) - 1), %rax
> - jz L(align_four_vec_loop)
> -
> - vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
> - add $VEC_SIZE, %rax
> - vpmovmskb %ymm2, %edx
> - test %edx, %edx
> - jnz L(exit)
> -
> - test $((VEC_SIZE * 4) - 1), %rax
> - jz L(align_four_vec_loop)
> -
> - vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
> - add $VEC_SIZE, %rax
> - vpmovmskb %ymm3, %edx
> - test %edx, %edx
> - jnz L(exit)
> -
> - add $VEC_SIZE, %rax
> -
> - .p2align 4
> -L(align_four_vec_loop):
> - vmovaps (%rax), %ymm4
> - vpminub VEC_SIZE(%rax), %ymm4, %ymm4
> - vmovaps (VEC_SIZE * 2)(%rax), %ymm5
> - vpminub (VEC_SIZE * 3)(%rax), %ymm5, %ymm5
> - add $(VEC_SIZE * 4), %rax
> - vpminub %ymm4, %ymm5, %ymm5
> - vpcmpeqb %ymm5, %ymm6, %ymm5
> - vpmovmskb %ymm5, %edx
> - test %edx, %edx
> - jz L(align_four_vec_loop)
> -
> - vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
> - sub $(VEC_SIZE * 5), %rax
> - vpmovmskb %ymm0, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_second_vector)
> -
> - vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
> - vpmovmskb %ymm1, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_third_vector)
> -
> - vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
> - vpmovmskb %ymm2, %edx
> - test %edx, %edx
> - jnz L(exit_null_on_fourth_vector)
> -
> - vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
> - vpmovmskb %ymm3, %edx
> - sub %rdi, %rax
> - bsf %rdx, %rdx
> - add %rdx, %rax
> - add $(VEC_SIZE * 4), %rax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit):
> - sub %rdi, %rax
> -L(exit_null_on_first_vector):
> - bsf %rdx, %rdx
> - add %rdx, %rax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_null_on_second_vector):
> - sub %rdi, %rax
> - bsf %rdx, %rdx
> - add %rdx, %rax
> - add $VEC_SIZE, %rax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_null_on_third_vector):
> - sub %rdi, %rax
> - bsf %rdx, %rdx
> - add %rdx, %rax
> - add $(VEC_SIZE * 2), %rax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_null_on_fourth_vector):
> - sub %rdi, %rax
> - bsf %rdx, %rdx
> - add %rdx, %rax
> - add $(VEC_SIZE * 3), %rax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_null_on_fifth_vector):
> - sub %rdi, %rax
> - bsf %rdx, %rdx
> - add %rdx, %rax
> - add $(VEC_SIZE * 4), %rax
> -
> - .p2align 4
> -L(StartStrcpyPart):
> - lea (%r9, %rax), %rdi
> - mov %rsi, %rcx
> - mov %r9, %rax /* save result */
> -
> -# ifdef USE_AS_STRNCAT
> - test %r8, %r8
> - jz L(ExitZero)
> -# define USE_AS_STRNCPY
> -# endif
> -
> -# include "strcpy-avx2.S"
> +#ifndef STRCAT
> +# define STRCAT __strcat_avx2
> #endif
> +
> +#define USE_AS_STRCAT
> +#define STRCPY STRCAT
> +#include "strcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S
> new file mode 100644
> index 0000000000..f50514e07c
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S
> @@ -0,0 +1,101 @@
> +/* strlen used for beginning of str{n}cat using AVX2.
> + Copyright (C) 2011-2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +
> +/* NOTE: This file is meant to be included by strcat-avx2 or
> + strncat-avx2 and does not stand alone. Before including, %rdi
> + must be saved in %rax. */
> +
> +
> +/* Simple strlen implementation that ends at
> + L(strcat_strlen_done). */
> + movq %rdi, %r8
> + andq $(VEC_SIZE * -1), %r8
> + VPCMPEQ (%r8), %VZERO, %VMM(0)
> + vpmovmskb %VMM(0), %ecx
> + shrxl %edi, %ecx, %ecx
> + testl %ecx, %ecx
> + jnz L(bsf_and_done_v0)
> +
> + VPCMPEQ VEC_SIZE(%r8), %VZERO, %VMM(0)
> + vpmovmskb %VMM(0), %ecx
> + leaq (VEC_SIZE)(%r8), %rdi
> + testl %ecx, %ecx
> + jnz L(bsf_and_done_v0)
> +
> + VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %VMM(0)
> + vpmovmskb %VMM(0), %ecx
> + testl %ecx, %ecx
> + jnz L(bsf_and_done_v1)
> +
> + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %VMM(0)
> + vpmovmskb %VMM(0), %ecx
> + testl %ecx, %ecx
> + jnz L(bsf_and_done_v2)
> +
> + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %VMM(0)
> + vpmovmskb %VMM(0), %ecx
> + testl %ecx, %ecx
> + jnz L(bsf_and_done_v3)
> +
> + orq $(VEC_SIZE * 4 - 1), %rdi
> + .p2align 4,, 8
> +L(loop_2x_vec):
> + VMOVA (VEC_SIZE * 0 + 1)(%rdi), %VMM(0)
> + VPMIN (VEC_SIZE * 1 + 1)(%rdi), %VMM(0), %VMM(1)
> + VMOVA (VEC_SIZE * 2 + 1)(%rdi), %VMM(2)
> + VPMIN (VEC_SIZE * 3 + 1)(%rdi), %VMM(2), %VMM(3)
> + VPMIN %VMM(1), %VMM(3), %VMM(3)
> + VPCMPEQ %VMM(3), %VZERO, %VMM(3)
> + vpmovmskb %VMM(3), %r8d
> + subq $(VEC_SIZE * -4), %rdi
> + testl %r8d, %r8d
> + jz L(loop_2x_vec)
> +
> + addq $(VEC_SIZE * -4 + 1), %rdi
> +
> + VPCMPEQ %VMM(0), %VZERO, %VMM(0)
> + vpmovmskb %VMM(0), %ecx
> + testl %ecx, %ecx
> + jnz L(bsf_and_done_v0)
> +
> + VPCMPEQ %VMM(1), %VZERO, %VMM(1)
> + vpmovmskb %VMM(1), %ecx
> + testl %ecx, %ecx
> + jnz L(bsf_and_done_v1)
> +
> + VPCMPEQ %VMM(2), %VZERO, %VMM(2)
> + vpmovmskb %VMM(2), %ecx
> + testl %ecx, %ecx
> + jnz L(bsf_and_done_v2)
> +
> + movl %r8d, %ecx
> +L(bsf_and_done_v3):
> + addq $VEC_SIZE, %rdi
> +L(bsf_and_done_v2):
> + bsfl %ecx, %ecx
> + leaq (VEC_SIZE * 2)(%rdi, %rcx), %rdi
> + jmp L(strcat_strlen_done)
> +
> + .p2align 4,, 4
> +L(bsf_and_done_v1):
> + addq $VEC_SIZE, %rdi
> +L(bsf_and_done_v0):
> + bsfl %ecx, %ecx
> + addq %rcx, %rdi
> +L(strcat_strlen_done):
> diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
> index c2c581ecf7..3ae2de8ea9 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
> @@ -1,12 +1,3 @@
> -#ifndef STRCPY
> -# define STRCPY __strcpy_avx2_rtm
> -#endif
> -
> -#define ZERO_UPPER_VEC_REGISTERS_RETURN \
> - ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
> -
> -#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
> -
> -#define SECTION(p) p##.avx.rtm
> -
> +#define STRCPY __strcpy_avx2_rtm
> +#include "x86-avx-rtm-vecs.h"
> #include "strcpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
> index c725834929..32f86baa4c 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
> @@ -20,984 +20,378 @@
>
> #if ISA_SHOULD_BUILD (3)
>
> +# include <sysdep.h>
>
> -# ifndef USE_AS_STRCAT
> -# include <sysdep.h>
> -
> -# ifndef STRCPY
> -# define STRCPY __strcpy_avx2
> -# endif
> -
> -# endif
> -
> -/* Number of bytes in a vector register */
> # ifndef VEC_SIZE
> -# define VEC_SIZE 32
> -# endif
> -
> -# ifndef VZEROUPPER
> -# define VZEROUPPER vzeroupper
> -# endif
> -
> -# ifndef SECTION
> -# define SECTION(p) p##.avx
> -# endif
> -
> -/* zero register */
> -#define xmmZ xmm0
> -#define ymmZ ymm0
> -
> -/* mask register */
> -#define ymmM ymm1
> -
> -# ifndef USE_AS_STRCAT
> -
> - .section SECTION(.text),"ax",@progbits
> -ENTRY (STRCPY)
> -# ifdef USE_AS_STRNCPY
> - mov %RDX_LP, %R8_LP
> - test %R8_LP, %R8_LP
> - jz L(ExitZero)
> -# endif
> - mov %rsi, %rcx
> -# ifndef USE_AS_STPCPY
> - mov %rdi, %rax /* save result */
> -# endif
> -
> +# include "x86-avx-vecs.h"
> # endif
>
> - vpxor %xmmZ, %xmmZ, %xmmZ
> -
> - and $((VEC_SIZE * 4) - 1), %ecx
> - cmp $(VEC_SIZE * 2), %ecx
> - jbe L(SourceStringAlignmentLessTwoVecSize)
> -
> - and $-VEC_SIZE, %rsi
> - and $(VEC_SIZE - 1), %ecx
> -
> - vpcmpeqb (%rsi), %ymmZ, %ymmM
> - vpmovmskb %ymmM, %edx
> - shr %cl, %rdx
> -
> -# ifdef USE_AS_STRNCPY
> -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> - mov $VEC_SIZE, %r10
> - sub %rcx, %r10
> - cmp %r10, %r8
> -# else
> - mov $(VEC_SIZE + 1), %r10
> - sub %rcx, %r10
> - cmp %r10, %r8
> -# endif
> - jbe L(CopyVecSizeTailCase2OrCase3)
> +# ifndef STRCPY
> +# define STRCPY __strcpy_avx2
> # endif
> - test %edx, %edx
> - jnz L(CopyVecSizeTail)
>
> - vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
> - vpmovmskb %ymm2, %edx
> + /* Use movsb in page cross case to save code size. */
> +# define USE_MOVSB_IN_PAGE_CROSS 1
>
> -# ifdef USE_AS_STRNCPY
> - add $VEC_SIZE, %r10
> - cmp %r10, %r8
> - jbe L(CopyTwoVecSizeCase2OrCase3)
> -# endif
> - test %edx, %edx
> - jnz L(CopyTwoVecSize)
> -
> - vmovdqu (%rsi, %rcx), %ymm2 /* copy VEC_SIZE bytes */
> - vmovdqu %ymm2, (%rdi)
> -
> -/* If source address alignment != destination address alignment */
> - .p2align 4
> -L(UnalignVecSizeBoth):
> - sub %rcx, %rdi
> -# ifdef USE_AS_STRNCPY
> - add %rcx, %r8
> - sbb %rcx, %rcx
> - or %rcx, %r8
> -# endif
> - mov $VEC_SIZE, %rcx
> - vmovdqa (%rsi, %rcx), %ymm2
> - vmovdqu %ymm2, (%rdi, %rcx)
> - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
> - vpcmpeqb %ymm2, %ymmZ, %ymmM
> - vpmovmskb %ymmM, %edx
> - add $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $(VEC_SIZE * 3), %r8
> - jbe L(CopyVecSizeCase2OrCase3)
> -# endif
> - test %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - jnz L(CopyVecSizeUnalignedVec2)
> +# ifdef USE_AS_WCSCPY
> +# define VPCMPEQ vpcmpeqd
> +# define VPMIN vpminud
> +# define CHAR_SIZE 4
> # else
> - jnz L(CopyVecSize)
> +# define VPCMPEQ vpcmpeqb
> +# define VPMIN vpminub
> +# define CHAR_SIZE 1
> # endif
>
> - vmovdqu %ymm2, (%rdi, %rcx)
> - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
> - vpcmpeqb %ymm3, %ymmZ, %ymmM
> - vpmovmskb %ymmM, %edx
> - add $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $VEC_SIZE, %r8
> - jbe L(CopyVecSizeCase2OrCase3)
> -# endif
> - test %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - jnz L(CopyVecSizeUnalignedVec3)
> -# else
> - jnz L(CopyVecSize)
> -# endif
> +# define PAGE_SIZE 4096
>
> - vmovdqu %ymm3, (%rdi, %rcx)
> - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
> - vpcmpeqb %ymm4, %ymmZ, %ymmM
> - vpmovmskb %ymmM, %edx
> - add $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $VEC_SIZE, %r8
> - jbe L(CopyVecSizeCase2OrCase3)
> -# endif
> - test %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - jnz L(CopyVecSizeUnalignedVec4)
> +# ifdef USE_AS_STPCPY
> +# define END_REG rax
> # else
> - jnz L(CopyVecSize)
> +# define END_REG rdi, %rdx
> # endif
>
> - vmovdqu %ymm4, (%rdi, %rcx)
> - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
> - vpcmpeqb %ymm2, %ymmZ, %ymmM
> - vpmovmskb %ymmM, %edx
> - add $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $VEC_SIZE, %r8
> - jbe L(CopyVecSizeCase2OrCase3)
> -# endif
> - test %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - jnz L(CopyVecSizeUnalignedVec2)
> +# ifdef USE_AS_STRCAT
> +# define PAGE_ALIGN_REG ecx
> # else
> - jnz L(CopyVecSize)
> +# define PAGE_ALIGN_REG eax
> # endif
>
> - vmovdqu %ymm2, (%rdi, %rcx)
> - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
> - vpcmpeqb %ymm2, %ymmZ, %ymmM
> - vpmovmskb %ymmM, %edx
> - add $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $VEC_SIZE, %r8
> - jbe L(CopyVecSizeCase2OrCase3)
> -# endif
> - test %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - jnz L(CopyVecSizeUnalignedVec2)
> -# else
> - jnz L(CopyVecSize)
> -# endif
> +# define VZERO VMM(7)
> +# define VZERO_128 VMM_128(7)
>
> - vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
> - vmovdqu %ymm2, (%rdi, %rcx)
> - vpcmpeqb %ymm3, %ymmZ, %ymmM
> - vpmovmskb %ymmM, %edx
> - add $VEC_SIZE, %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $VEC_SIZE, %r8
> - jbe L(CopyVecSizeCase2OrCase3)
> -# endif
> - test %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - jnz L(CopyVecSizeUnalignedVec3)
> -# else
> - jnz L(CopyVecSize)
> -# endif
> + .section SECTION(.text), "ax", @progbits
> +ENTRY(STRCPY)
> + vpxor %VZERO_128, %VZERO_128, %VZERO_128
>
> - vmovdqu %ymm3, (%rdi, %rcx)
> - mov %rsi, %rdx
> - lea VEC_SIZE(%rsi, %rcx), %rsi
> - and $-(VEC_SIZE * 4), %rsi
> - sub %rsi, %rdx
> - sub %rdx, %rdi
> -# ifdef USE_AS_STRNCPY
> - lea (VEC_SIZE * 8)(%r8, %rdx), %r8
> -# endif
> -L(UnalignedFourVecSizeLoop):
> - vmovdqa (%rsi), %ymm4
> - vmovdqa VEC_SIZE(%rsi), %ymm5
> - vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
> - vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
> - vpminub %ymm5, %ymm4, %ymm2
> - vpminub %ymm7, %ymm6, %ymm3
> - vpminub %ymm2, %ymm3, %ymm3
> - vpcmpeqb %ymmM, %ymm3, %ymm3
> - vpmovmskb %ymm3, %edx
> -# ifdef USE_AS_STRNCPY
> - sub $(VEC_SIZE * 4), %r8
> - jbe L(UnalignedLeaveCase2OrCase3)
> -# endif
> - test %edx, %edx
> - jnz L(UnalignedFourVecSizeLeave)
> -
> -L(UnalignedFourVecSizeLoop_start):
> - add $(VEC_SIZE * 4), %rdi
> - add $(VEC_SIZE * 4), %rsi
> - vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
> - vmovdqa (%rsi), %ymm4
> - vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
> - vmovdqa VEC_SIZE(%rsi), %ymm5
> - vpminub %ymm5, %ymm4, %ymm2
> - vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
> - vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
> - vmovdqu %ymm7, -VEC_SIZE(%rdi)
> - vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
> - vpminub %ymm7, %ymm6, %ymm3
> - vpminub %ymm2, %ymm3, %ymm3
> - vpcmpeqb %ymmM, %ymm3, %ymm3
> - vpmovmskb %ymm3, %edx
> -# ifdef USE_AS_STRNCPY
> - sub $(VEC_SIZE * 4), %r8
> - jbe L(UnalignedLeaveCase2OrCase3)
> -# endif
> - test %edx, %edx
> - jz L(UnalignedFourVecSizeLoop_start)
> -
> -L(UnalignedFourVecSizeLeave):
> - vpcmpeqb %ymm4, %ymmZ, %ymmM
> - vpmovmskb %ymmM, %edx
> - test %edx, %edx
> - jnz L(CopyVecSizeUnaligned_0)
> -
> - vpcmpeqb %ymm5, %ymmZ, %ymmM
> - vpmovmskb %ymmM, %ecx
> - test %ecx, %ecx
> - jnz L(CopyVecSizeUnaligned_16)
> -
> - vpcmpeqb %ymm6, %ymmZ, %ymmM
> - vpmovmskb %ymmM, %edx
> - test %edx, %edx
> - jnz L(CopyVecSizeUnaligned_32)
> -
> - vpcmpeqb %ymm7, %ymmZ, %ymmM
> - vpmovmskb %ymmM, %ecx
> - bsf %ecx, %edx
> - vmovdqu %ymm4, (%rdi)
> - vmovdqu %ymm5, VEC_SIZE(%rdi)
> - vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> - lea (VEC_SIZE * 3)(%rdi, %rdx), %rax
> -# endif
> - vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
> - add $(VEC_SIZE - 1), %r8
> - sub %rdx, %r8
> - lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
> - jmp L(StrncpyFillTailWithZero)
> -# else
> - add $(VEC_SIZE * 3), %rsi
> - add $(VEC_SIZE * 3), %rdi
> - jmp L(CopyVecSizeExit)
> +# ifdef USE_AS_STRCAT
> + movq %rdi, %rax
> +# include "strcat-strlen-avx2.h.S"
> # endif
>
> -/* If source address alignment == destination address alignment */
> -
> -L(SourceStringAlignmentLessTwoVecSize):
> - vmovdqu (%rsi), %ymm3
> - vmovdqu VEC_SIZE(%rsi), %ymm2
> - vpcmpeqb %ymm3, %ymmZ, %ymmM
> - vpmovmskb %ymmM, %edx
> -
> -# ifdef USE_AS_STRNCPY
> -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> - cmp $VEC_SIZE, %r8
> -# else
> - cmp $(VEC_SIZE + 1), %r8
> -# endif
> - jbe L(CopyVecSizeTail1Case2OrCase3)
> + movl %esi, %PAGE_ALIGN_REG
> + andl $(PAGE_SIZE - 1), %PAGE_ALIGN_REG
> + cmpl $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
> + ja L(page_cross)
> +L(page_cross_continue):
> +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> + movq %rdi, %rax
> # endif
> - test %edx, %edx
> - jnz L(CopyVecSizeTail1)
> -
> - vmovdqu %ymm3, (%rdi)
> - vpcmpeqb %ymm2, %ymmZ, %ymmM
> - vpmovmskb %ymmM, %edx
> -
> -# ifdef USE_AS_STRNCPY
> -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> - cmp $(VEC_SIZE * 2), %r8
> -# else
> - cmp $((VEC_SIZE * 2) + 1), %r8
> -# endif
> - jbe L(CopyTwoVecSize1Case2OrCase3)
> -# endif
> - test %edx, %edx
> - jnz L(CopyTwoVecSize1)
> -
> - and $-VEC_SIZE, %rsi
> - and $(VEC_SIZE - 1), %ecx
> - jmp L(UnalignVecSizeBoth)
> + VMOVU (%rsi), %VMM(0)
> + VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
>
> -/*------End of main part with loops---------------------*/
> + testl %ecx, %ecx
> + jz L(more_1x_vec)
>
> -/* Case1 */
> + /* No longer need ymm registers so just vzeroupper so it doesn't
> + need to be duplicated at each return statement. */
> + COND_VZEROUPPER
>
> -# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
> - .p2align 4
> -L(CopyVecSize):
> - add %rcx, %rdi
> -# endif
> -L(CopyVecSizeTail):
> - add %rcx, %rsi
> -L(CopyVecSizeTail1):
> - bsf %edx, %edx
> -L(CopyVecSizeExit):
> - cmp $32, %edx
> - jae L(Exit32_63)
> - cmp $16, %edx
> - jae L(Exit16_31)
> - cmp $8, %edx
> - jae L(Exit8_15)
> - cmp $4, %edx
> - jae L(Exit4_7)
> - cmp $3, %edx
> - je L(Exit3)
> - cmp $1, %edx
> - ja L(Exit2)
> - je L(Exit1)
> - movb $0, (%rdi)
> + xorl %edx, %edx
> + bsfl %ecx, %edx
> # ifdef USE_AS_STPCPY
> - lea (%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $1, %r8
> - lea 1(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> -L(return_vzeroupper):
> - ZERO_UPPER_VEC_REGISTERS_RETURN
> -
> - .p2align 4
> -L(CopyTwoVecSize1):
> - add $VEC_SIZE, %rsi
> - add $VEC_SIZE, %rdi
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $VEC_SIZE, %r8
> -# endif
> - jmp L(CopyVecSizeTail1)
> -
> - .p2align 4
> -L(CopyTwoVecSize):
> - bsf %edx, %edx
> - add %rcx, %rsi
> - add $VEC_SIZE, %edx
> - sub %ecx, %edx
> - jmp L(CopyVecSizeExit)
> -
> - .p2align 4
> -L(CopyVecSizeUnaligned_0):
> - bsf %edx, %edx
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> - lea (%rdi, %rdx), %rax
> -# endif
> - vmovdqu %ymm4, (%rdi)
> - add $((VEC_SIZE * 4) - 1), %r8
> - sub %rdx, %r8
> - lea 1(%rdi, %rdx), %rdi
> - jmp L(StrncpyFillTailWithZero)
> -# else
> - jmp L(CopyVecSizeExit)
> -# endif
> -
> - .p2align 4
> -L(CopyVecSizeUnaligned_16):
> - bsf %ecx, %edx
> - vmovdqu %ymm4, (%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> - lea VEC_SIZE(%rdi, %rdx), %rax
> -# endif
> - vmovdqu %ymm5, VEC_SIZE(%rdi)
> - add $((VEC_SIZE * 3) - 1), %r8
> - sub %rdx, %r8
> - lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi
> - jmp L(StrncpyFillTailWithZero)
> + leaq (%rdi, %rdx), %rax
> +# endif
> +
> + /* Use mask bits in rcx to detect which copy we need. If the low
> + mask is zero then there must be a bit set in the upper half.
> + I.e., if ecx != 0 and cx == 0, then the match must be in the upper 16
> + bits so we use L(copy_16_31). */
> + testw %cx, %cx
> + jz L(copy_16_31)
> +
> + testb %cl, %cl
> + jz L(copy_8_15)
> +# ifdef USE_AS_WCSCPY
> + vmovd %xmm0, (%rdi)
> + movl $0, (%END_REG)
> + ret
> # else
> - add $VEC_SIZE, %rsi
> - add $VEC_SIZE, %rdi
> - jmp L(CopyVecSizeExit)
> -# endif
> -
> - .p2align 4
> -L(CopyVecSizeUnaligned_32):
> - bsf %edx, %edx
> - vmovdqu %ymm4, (%rdi)
> - vmovdqu %ymm5, VEC_SIZE(%rdi)
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> -# ifdef USE_AS_STPCPY
> - lea (VEC_SIZE * 2)(%rdi, %rdx), %rax
> -# endif
> - vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> - add $((VEC_SIZE * 2) - 1), %r8
> - sub %rdx, %r8
> - lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
> - jmp L(StrncpyFillTailWithZero)
> + testb $0x7, %cl
> + jz L(copy_4_7)
> +
> + testl %edx, %edx
> + jz L(set_null_term)
> + vmovd %xmm0, %ecx
> + movw %cx, (%rdi)
> +
> + .p2align 4,, 2
> +L(set_null_term):
> + movb $0, (%END_REG)
> + ret
> +
> + .p2align 4,, 12
> +L(copy_4_7):
> + movl -3(%rsi, %rdx), %ecx
> + vmovd %xmm0, (%rdi)
> + movl %ecx, -3(%END_REG)
> + ret
> +# endif
> +
> + .p2align 4,, 10
> +L(copy_16_31):
> + VMOVU -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
> + VMOVU %xmm0, (%rdi)
> + VMOVU %xmm1, -(16 - CHAR_SIZE)(%END_REG)
> + ret
> +
> + .p2align 4,, 10
> +L(copy_8_15):
> +# ifdef USE_AS_WCSCPY
> + movl -(8 - CHAR_SIZE)(%rsi, %rdx), %ecx
> # else
> - add $(VEC_SIZE * 2), %rsi
> - add $(VEC_SIZE * 2), %rdi
> - jmp L(CopyVecSizeExit)
> -# endif
> -
> -# ifdef USE_AS_STRNCPY
> -# ifndef USE_AS_STRCAT
> - .p2align 4
> -L(CopyVecSizeUnalignedVec6):
> - vmovdqu %ymm6, (%rdi, %rcx)
> - jmp L(CopyVecSizeVecExit)
> -
> - .p2align 4
> -L(CopyVecSizeUnalignedVec5):
> - vmovdqu %ymm5, (%rdi, %rcx)
> - jmp L(CopyVecSizeVecExit)
> -
> - .p2align 4
> -L(CopyVecSizeUnalignedVec4):
> - vmovdqu %ymm4, (%rdi, %rcx)
> - jmp L(CopyVecSizeVecExit)
> -
> - .p2align 4
> -L(CopyVecSizeUnalignedVec3):
> - vmovdqu %ymm3, (%rdi, %rcx)
> - jmp L(CopyVecSizeVecExit)
> -# endif
> -
> -/* Case2 */
> -
> - .p2align 4
> -L(CopyVecSizeCase2):
> - add $VEC_SIZE, %r8
> - add %rcx, %rdi
> - add %rcx, %rsi
> - bsf %edx, %edx
> - cmp %r8d, %edx
> - jb L(CopyVecSizeExit)
> - jmp L(StrncpyExit)
> -
> - .p2align 4
> -L(CopyTwoVecSizeCase2):
> - add %rcx, %rsi
> - bsf %edx, %edx
> - add $VEC_SIZE, %edx
> - sub %ecx, %edx
> - cmp %r8d, %edx
> - jb L(CopyVecSizeExit)
> - jmp L(StrncpyExit)
> -
> -L(CopyVecSizeTailCase2):
> - add %rcx, %rsi
> - bsf %edx, %edx
> - cmp %r8d, %edx
> - jb L(CopyVecSizeExit)
> - jmp L(StrncpyExit)
> -
> -L(CopyVecSizeTail1Case2):
> - bsf %edx, %edx
> - cmp %r8d, %edx
> - jb L(CopyVecSizeExit)
> - jmp L(StrncpyExit)
> -
> -/* Case2 or Case3, Case3 */
> -
> - .p2align 4
> -L(CopyVecSizeCase2OrCase3):
> - test %rdx, %rdx
> - jnz L(CopyVecSizeCase2)
> -L(CopyVecSizeCase3):
> - add $VEC_SIZE, %r8
> - add %rcx, %rdi
> - add %rcx, %rsi
> - jmp L(StrncpyExit)
> -
> - .p2align 4
> -L(CopyTwoVecSizeCase2OrCase3):
> - test %rdx, %rdx
> - jnz L(CopyTwoVecSizeCase2)
> - add %rcx, %rsi
> - jmp L(StrncpyExit)
> -
> - .p2align 4
> -L(CopyVecSizeTailCase2OrCase3):
> - test %rdx, %rdx
> - jnz L(CopyVecSizeTailCase2)
> - add %rcx, %rsi
> - jmp L(StrncpyExit)
> -
> - .p2align 4
> -L(CopyTwoVecSize1Case2OrCase3):
> - add $VEC_SIZE, %rdi
> - add $VEC_SIZE, %rsi
> - sub $VEC_SIZE, %r8
> -L(CopyVecSizeTail1Case2OrCase3):
> - test %rdx, %rdx
> - jnz L(CopyVecSizeTail1Case2)
> - jmp L(StrncpyExit)
> -# endif
> -
> -/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
> -
> - .p2align 4
> -L(Exit1):
> - movzwl (%rsi), %edx
> - mov %dx, (%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 1(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $2, %r8
> - lea 2(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - VZEROUPPER_RETURN
> -
> - .p2align 4
> -L(Exit2):
> - movzwl (%rsi), %ecx
> - mov %cx, (%rdi)
> - movb $0, 2(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 2(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $3, %r8
> - lea 3(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - VZEROUPPER_RETURN
> -
> - .p2align 4
> -L(Exit3):
> - mov (%rsi), %edx
> - mov %edx, (%rdi)
> + movq -(8 - CHAR_SIZE)(%rsi, %rdx), %rcx
> +# endif
> + vmovq %xmm0, (%rdi)
> + movq %rcx, -(8 - CHAR_SIZE)(%END_REG)
> + ret
> +
> +
> + .p2align 4,, 8
> +L(more_1x_vec):
> +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
> + VMOVU %VMM(0), (%rdi)
> +# endif
> + subq %rsi, %rdi
> + orq $(VEC_SIZE - 1), %rsi
> + addq %rsi, %rdi
> + VMOVA 1(%rsi), %VMM(1)
> +
> + /* Try and order stores after as many loads as is reasonable to
> + avoid potential false dependencies. */
> +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> + VMOVU %VMM(0), (%rax)
> +# endif
> + VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + testl %ecx, %ecx
> + jnz L(ret_vec_x1)
> +
> + VMOVA (VEC_SIZE + 1)(%rsi), %VMM(2)
> + VMOVU %VMM(1), 1(%rdi)
> +
> + VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + testl %ecx, %ecx
> + jnz L(ret_vec_x2)
> +
> + VMOVA (VEC_SIZE * 2 + 1)(%rsi), %VMM(3)
> + VMOVU %VMM(2), (VEC_SIZE + 1)(%rdi)
> +
> + VPCMPEQ %VMM(3), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + testl %ecx, %ecx
> + jnz L(ret_vec_x3)
> +
> + VMOVA (VEC_SIZE * 3 + 1)(%rsi), %VMM(4)
> + VMOVU %VMM(3), (VEC_SIZE * 2 + 1)(%rdi)
> + VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %edx
> + testl %edx, %edx
> + jnz L(ret_vec_x4)
> +
> + VMOVU %VMM(4), (VEC_SIZE * 3 + 1)(%rdi)
> +
> + /* Subtract rsi from rdi before aligning. Adding back rsi will
> + get proper rdi (dst) for new src. */
> + subq %rsi, %rdi
> + incq %rsi
> + orq $(VEC_SIZE * 4 - 1), %rsi
> +
> + /* Do first half of loop ahead of time so loop can just start by
> + storing. */
> + VMOVA (VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
> + VMOVA (VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
> + VMOVA (VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
> + VMOVA (VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
> +
> + VPMIN %VMM(0), %VMM(1), %VMM(4)
> + VPMIN %VMM(2), %VMM(3), %VMM(6)
> + VPMIN %VMM(4), %VMM(6), %VMM(6)
> + VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %edx
> + addq %rsi, %rdi
> +
> + testl %edx, %edx
> + jnz L(loop_4x_done)
> +
> + .p2align 4,, 11
> +L(loop_4x_vec):
> +
> + VMOVU %VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
> + VMOVU %VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
> + subq $(VEC_SIZE * -4), %rsi
> + VMOVU %VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
> + VMOVU %VMM(3), (VEC_SIZE * 3 + 1)(%rdi)
> +
> +
> + VMOVA (VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
> + VMOVA (VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
> + VMOVA (VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
> + VMOVA (VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
> +
> + VPMIN %VMM(0), %VMM(1), %VMM(4)
> + VPMIN %VMM(2), %VMM(3), %VMM(6)
> + VPMIN %VMM(4), %VMM(6), %VMM(6)
> + VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> +
> + vpmovmskb %VMM(6), %edx
> + subq $(VEC_SIZE * -4), %rdi
> + testl %edx, %edx
> + jz L(loop_4x_vec)
> +
> +L(loop_4x_done):
> + VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + testl %ecx, %ecx
> + jnz L(ret_vec_x1)
> + VMOVU %VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
> +
> + VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + testl %ecx, %ecx
> + jnz L(ret_vec_x2)
> + VMOVU %VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
> +
> + VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + testl %ecx, %ecx
> + jnz L(ret_vec_x3)
> + VMOVU %VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
> +L(ret_vec_x4):
> + bsfl %edx, %edx
> + VMOVU ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> + VMOVU %VMM(1), ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> # ifdef USE_AS_STPCPY
> - lea 3(%rdi), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub $4, %r8
> - lea 4(%rdi), %rdi
> - jnz L(StrncpyFillTailWithZero)
> + leaq (VEC_SIZE * 3 + 1)(%rdx, %rdi), %rax
> # endif
> +L(return_end):
> VZEROUPPER_RETURN
>
> - .p2align 4
> -L(Exit4_7):
> - mov (%rsi), %ecx
> - mov %ecx, (%rdi)
> - mov -3(%rsi, %rdx), %ecx
> - mov %ecx, -3(%rdi, %rdx)
> + .p2align 4,, 8
> +L(ret_vec_x1):
> + bsfl %ecx, %ecx
> + VMOVU (1 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> + VMOVU %VMM(1), (1 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
> # ifdef USE_AS_STPCPY
> - lea (%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub %rdx, %r8
> - sub $1, %r8
> - lea 1(%rdi, %rdx), %rdi
> - jnz L(StrncpyFillTailWithZero)
> + leaq 1(%rcx, %rdi), %rax
> # endif
> - VZEROUPPER_RETURN
> -
> - .p2align 4
> -L(Exit8_15):
> - mov (%rsi), %rcx
> - mov -7(%rsi, %rdx), %r9
> - mov %rcx, (%rdi)
> - mov %r9, -7(%rdi, %rdx)
> -# ifdef USE_AS_STPCPY
> - lea (%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub %rdx, %r8
> - sub $1, %r8
> - lea 1(%rdi, %rdx), %rdi
> - jnz L(StrncpyFillTailWithZero)
> -# endif
> - VZEROUPPER_RETURN
> +L(return_vzeroupper):
> + ZERO_UPPER_VEC_REGISTERS_RETURN
>
> - .p2align 4
> -L(Exit16_31):
> - vmovdqu (%rsi), %xmm2
> - vmovdqu -15(%rsi, %rdx), %xmm3
> - vmovdqu %xmm2, (%rdi)
> - vmovdqu %xmm3, -15(%rdi, %rdx)
> + .p2align 4,, 8
> +L(ret_vec_x2):
> + bsfl %ecx, %ecx
> + VMOVU ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> + VMOVU %VMM(1), ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
> # ifdef USE_AS_STPCPY
> - lea (%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub %rdx, %r8
> - sub $1, %r8
> - lea 1(%rdi, %rdx), %rdi
> - jnz L(StrncpyFillTailWithZero)
> + leaq (VEC_SIZE * 1 + 1)(%rcx, %rdi), %rax
> # endif
> VZEROUPPER_RETURN
>
> - .p2align 4
> -L(Exit32_63):
> - vmovdqu (%rsi), %ymm2
> - vmovdqu -31(%rsi, %rdx), %ymm3
> - vmovdqu %ymm2, (%rdi)
> - vmovdqu %ymm3, -31(%rdi, %rdx)
> + .p2align 4,, 8
> +L(ret_vec_x3):
> + bsfl %ecx, %ecx
> + VMOVU ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
> + VMOVU %VMM(1), ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
> # ifdef USE_AS_STPCPY
> - lea (%rdi, %rdx), %rax
> -# endif
> -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
> - sub %rdx, %r8
> - sub $1, %r8
> - lea 1(%rdi, %rdx), %rdi
> - jnz L(StrncpyFillTailWithZero)
> + leaq (VEC_SIZE * 2 + 1)(%rcx, %rdi), %rax
> # endif
> VZEROUPPER_RETURN
>
> -# ifdef USE_AS_STRNCPY
>
> - .p2align 4
> -L(StrncpyExit1):
> - movzbl (%rsi), %edx
> - mov %dl, (%rdi)
> + .p2align 4,, 4
> +L(page_cross):
> + movq %rsi, %rcx
> + andq $(VEC_SIZE * -1), %rcx
> +
> + VPCMPEQ (%rcx), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + shrxl %esi, %ecx, %ecx
> +# if USE_MOVSB_IN_PAGE_CROSS
> + /* Optimizing more aggressively for space as this is very cold
> + code. This saves 2x cache lines. */
> +
> + /* This adds one to the later result, which gives the correct
> + copy bounds. NB: this can never zero-out a non-zero RCX as
> + to be in the page cross case rsi cannot be aligned and we
> + already right-shift rcx by the misalignment. */
> + shll $CHAR_SIZE, %ecx
> + jz L(page_cross_continue)
> + bsfl %ecx, %ecx
> +# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
> + movq %rdi, %rax
> +# endif
> + rep movsb
> # ifdef USE_AS_STPCPY
> - lea 1(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - movb $0, 1(%rdi)
> + leaq -CHAR_SIZE(%rdi), %rax
> # endif
> - VZEROUPPER_RETURN
>
> - .p2align 4
> -L(StrncpyExit2):
> - movzwl (%rsi), %edx
> - mov %dx, (%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 2(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - movb $0, 2(%rdi)
> -# endif
> VZEROUPPER_RETURN
>
> - .p2align 4
> -L(StrncpyExit3_4):
> - movzwl (%rsi), %ecx
> - movzwl -2(%rsi, %r8), %edx
> - mov %cx, (%rdi)
> - mov %dx, -2(%rdi, %r8)
> -# ifdef USE_AS_STPCPY
> - lea (%rdi, %r8), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - movb $0, (%rdi, %r8)
> -# endif
> - VZEROUPPER_RETURN
> -
> - .p2align 4
> -L(StrncpyExit5_8):
> - mov (%rsi), %ecx
> - mov -4(%rsi, %r8), %edx
> - mov %ecx, (%rdi)
> - mov %edx, -4(%rdi, %r8)
> -# ifdef USE_AS_STPCPY
> - lea (%rdi, %r8), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - movb $0, (%rdi, %r8)
> -# endif
> - VZEROUPPER_RETURN
> -
> - .p2align 4
> -L(StrncpyExit9_16):
> - mov (%rsi), %rcx
> - mov -8(%rsi, %r8), %rdx
> - mov %rcx, (%rdi)
> - mov %rdx, -8(%rdi, %r8)
> -# ifdef USE_AS_STPCPY
> - lea (%rdi, %r8), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - movb $0, (%rdi, %r8)
> -# endif
> - VZEROUPPER_RETURN
> -
> - .p2align 4
> -L(StrncpyExit17_32):
> - vmovdqu (%rsi), %xmm2
> - vmovdqu -16(%rsi, %r8), %xmm3
> - vmovdqu %xmm2, (%rdi)
> - vmovdqu %xmm3, -16(%rdi, %r8)
> -# ifdef USE_AS_STPCPY
> - lea (%rdi, %r8), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - movb $0, (%rdi, %r8)
> -# endif
> - VZEROUPPER_RETURN
> -
> - .p2align 4
> -L(StrncpyExit33_64):
> - /* 0/32, 31/16 */
> - vmovdqu (%rsi), %ymm2
> - vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
> - vmovdqu %ymm2, (%rdi)
> - vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
> -# ifdef USE_AS_STPCPY
> - lea (%rdi, %r8), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - movb $0, (%rdi, %r8)
> -# endif
> - VZEROUPPER_RETURN
> -
> - .p2align 4
> -L(StrncpyExit65):
> - /* 0/32, 32/32, 64/1 */
> - vmovdqu (%rsi), %ymm2
> - vmovdqu 32(%rsi), %ymm3
> - mov 64(%rsi), %cl
> - vmovdqu %ymm2, (%rdi)
> - vmovdqu %ymm3, 32(%rdi)
> - mov %cl, 64(%rdi)
> -# ifdef USE_AS_STPCPY
> - lea 65(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - movb $0, 65(%rdi)
> -# endif
> - VZEROUPPER_RETURN
> +# else
> + testl %ecx, %ecx
> + jz L(page_cross_continue)
>
> + /* Traditional copy case, essentially same as used in non-page-
> + cross case but since we can't reuse VMM(0) we need twice as
> + many loads from rsi. */
> # ifndef USE_AS_STRCAT
> -
> - .p2align 4
> -L(Fill1):
> - mov %dl, (%rdi)
> - VZEROUPPER_RETURN
> -
> - .p2align 4
> -L(Fill2):
> - mov %dx, (%rdi)
> - VZEROUPPER_RETURN
> -
> - .p2align 4
> -L(Fill3_4):
> - mov %dx, (%rdi)
> - mov %dx, -2(%rdi, %r8)
> - VZEROUPPER_RETURN
> -
> - .p2align 4
> -L(Fill5_8):
> - mov %edx, (%rdi)
> - mov %edx, -4(%rdi, %r8)
> - VZEROUPPER_RETURN
> -
> - .p2align 4
> -L(Fill9_16):
> - mov %rdx, (%rdi)
> - mov %rdx, -8(%rdi, %r8)
> - VZEROUPPER_RETURN
> -
> - .p2align 4
> -L(Fill17_32):
> - vmovdqu %xmmZ, (%rdi)
> - vmovdqu %xmmZ, -16(%rdi, %r8)
> - VZEROUPPER_RETURN
> -
> - .p2align 4
> -L(CopyVecSizeUnalignedVec2):
> - vmovdqu %ymm2, (%rdi, %rcx)
> -
> - .p2align 4
> -L(CopyVecSizeVecExit):
> - bsf %edx, %edx
> - add $(VEC_SIZE - 1), %r8
> - add %rcx, %rdi
> -# ifdef USE_AS_STPCPY
> - lea (%rdi, %rdx), %rax
> -# endif
> - sub %rdx, %r8
> - lea 1(%rdi, %rdx), %rdi
> -
> - .p2align 4
> -L(StrncpyFillTailWithZero):
> - xor %edx, %edx
> - sub $VEC_SIZE, %r8
> - jbe L(StrncpyFillExit)
> -
> - vmovdqu %ymmZ, (%rdi)
> - add $VEC_SIZE, %rdi
> -
> - mov %rdi, %rsi
> - and $(VEC_SIZE - 1), %esi
> - sub %rsi, %rdi
> - add %rsi, %r8
> - sub $(VEC_SIZE * 4), %r8
> - jb L(StrncpyFillLessFourVecSize)
> -
> -L(StrncpyFillLoopVmovdqa):
> - vmovdqa %ymmZ, (%rdi)
> - vmovdqa %ymmZ, VEC_SIZE(%rdi)
> - vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
> - vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
> - add $(VEC_SIZE * 4), %rdi
> - sub $(VEC_SIZE * 4), %r8
> - jae L(StrncpyFillLoopVmovdqa)
> -
> -L(StrncpyFillLessFourVecSize):
> - add $(VEC_SIZE * 2), %r8
> - jl L(StrncpyFillLessTwoVecSize)
> - vmovdqa %ymmZ, (%rdi)
> - vmovdqa %ymmZ, VEC_SIZE(%rdi)
> - add $(VEC_SIZE * 2), %rdi
> - sub $VEC_SIZE, %r8
> - jl L(StrncpyFillExit)
> - vmovdqa %ymmZ, (%rdi)
> - add $VEC_SIZE, %rdi
> - jmp L(Fill)
> -
> - .p2align 4
> -L(StrncpyFillLessTwoVecSize):
> - add $VEC_SIZE, %r8
> - jl L(StrncpyFillExit)
> - vmovdqa %ymmZ, (%rdi)
> - add $VEC_SIZE, %rdi
> - jmp L(Fill)
> -
> - .p2align 4
> -L(StrncpyFillExit):
> - add $VEC_SIZE, %r8
> -L(Fill):
> - cmp $17, %r8d
> - jae L(Fill17_32)
> - cmp $9, %r8d
> - jae L(Fill9_16)
> - cmp $5, %r8d
> - jae L(Fill5_8)
> - cmp $3, %r8d
> - jae L(Fill3_4)
> - cmp $1, %r8d
> - ja L(Fill2)
> - je L(Fill1)
> - VZEROUPPER_RETURN
> -
> -/* end of ifndef USE_AS_STRCAT */
> + xorl %edx, %edx
> # endif
> -
> - .p2align 4
> -L(UnalignedLeaveCase2OrCase3):
> - test %rdx, %rdx
> - jnz L(UnalignedFourVecSizeLeaveCase2)
> -L(UnalignedFourVecSizeLeaveCase3):
> - lea (VEC_SIZE * 4)(%r8), %rcx
> - and $-VEC_SIZE, %rcx
> - add $(VEC_SIZE * 3), %r8
> - jl L(CopyVecSizeCase3)
> - vmovdqu %ymm4, (%rdi)
> - sub $VEC_SIZE, %r8
> - jb L(CopyVecSizeCase3)
> - vmovdqu %ymm5, VEC_SIZE(%rdi)
> - sub $VEC_SIZE, %r8
> - jb L(CopyVecSizeCase3)
> - vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> - sub $VEC_SIZE, %r8
> - jb L(CopyVecSizeCase3)
> - vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
> + bsfl %ecx, %edx
> # ifdef USE_AS_STPCPY
> - lea (VEC_SIZE * 4)(%rdi), %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - movb $0, (VEC_SIZE * 4)(%rdi)
> + leaq (%rdi, %rdx), %rax
> +# elif !defined USE_AS_STRCAT
> + movq %rdi, %rax
> # endif
> - VZEROUPPER_RETURN
>
> - .p2align 4
> -L(UnalignedFourVecSizeLeaveCase2):
> - xor %ecx, %ecx
> - vpcmpeqb %ymm4, %ymmZ, %ymmM
> - vpmovmskb %ymmM, %edx
> - add $(VEC_SIZE * 3), %r8
> - jle L(CopyVecSizeCase2OrCase3)
> - test %edx, %edx
> -# ifndef USE_AS_STRCAT
> - jnz L(CopyVecSizeUnalignedVec4)
> -# else
> - jnz L(CopyVecSize)
> -# endif
> - vpcmpeqb %ymm5, %ymmZ, %ymmM
> - vpmovmskb %ymmM, %edx
> - vmovdqu %ymm4, (%rdi)
> - add $VEC_SIZE, %rcx
> - sub $VEC_SIZE, %r8
> - jbe L(CopyVecSizeCase2OrCase3)
> - test %edx, %edx
> -# ifndef USE_AS_STRCAT
> - jnz L(CopyVecSizeUnalignedVec5)
> -# else
> - jnz L(CopyVecSize)
> -# endif
> + /* vzeroupper early to avoid duplicating at each return. */
> + COND_VZEROUPPER
>
> - vpcmpeqb %ymm6, %ymmZ, %ymmM
> - vpmovmskb %ymmM, %edx
> - vmovdqu %ymm5, VEC_SIZE(%rdi)
> - add $VEC_SIZE, %rcx
> - sub $VEC_SIZE, %r8
> - jbe L(CopyVecSizeCase2OrCase3)
> - test %edx, %edx
> -# ifndef USE_AS_STRCAT
> - jnz L(CopyVecSizeUnalignedVec6)
> -# else
> - jnz L(CopyVecSize)
> -# endif
> + testw %cx, %cx
> + jz L(page_cross_copy_16_31)
>
> - vpcmpeqb %ymm7, %ymmZ, %ymmM
> - vpmovmskb %ymmM, %edx
> - vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
> - lea VEC_SIZE(%rdi, %rcx), %rdi
> - lea VEC_SIZE(%rsi, %rcx), %rsi
> - bsf %edx, %edx
> - cmp %r8d, %edx
> - jb L(CopyVecSizeExit)
> -L(StrncpyExit):
> - cmp $65, %r8d
> - je L(StrncpyExit65)
> - cmp $33, %r8d
> - jae L(StrncpyExit33_64)
> - cmp $17, %r8d
> - jae L(StrncpyExit17_32)
> - cmp $9, %r8d
> - jae L(StrncpyExit9_16)
> - cmp $5, %r8d
> - jae L(StrncpyExit5_8)
> - cmp $3, %r8d
> - jae L(StrncpyExit3_4)
> - cmp $1, %r8d
> - ja L(StrncpyExit2)
> - je L(StrncpyExit1)
> -# ifdef USE_AS_STPCPY
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRCAT
> - movb $0, (%rdi)
> -# endif
> - VZEROUPPER_RETURN
> -
> - .p2align 4
> -L(ExitZero):
> -# ifndef USE_AS_STRCAT
> - mov %rdi, %rax
> -# endif
> - VZEROUPPER_RETURN
> + testb %cl, %cl
> + jz L(page_cross_copy_8_15)
>
> -# endif
> + testb $0x7, %cl
> + jz L(page_cross_copy_4_7)
>
> -# ifndef USE_AS_STRCAT
> -END (STRCPY)
> -# else
> -END (STRCAT)
> -# endif
> + testl %edx, %edx
> + jz L(page_cross_set_null_term)
> + movzwl (%rsi), %ecx
> + movw %cx, (%rdi)
> +L(page_cross_set_null_term):
> + movb $0, (%END_REG)
> + ret
> +
> + .p2align 4,, 4
> +L(page_cross_copy_4_7):
> + movl (%rsi), %ecx
> + movl -3(%rsi, %rdx), %esi
> + movl %ecx, (%rdi)
> + movl %esi, -3(%END_REG)
> + ret
> +
> + .p2align 4,, 4
> +L(page_cross_copy_8_15):
> + movq (%rsi), %rcx
> + movq -7(%rsi, %rdx), %rsi
> + movq %rcx, (%rdi)
> + movq %rsi, -7(%END_REG)
> + ret
> +
> +
> + .p2align 4,, 3
> +L(page_cross_copy_16_31):
> + VMOVU (%rsi), %xmm0
> + VMOVU -15(%rsi, %rdx), %xmm1
> + VMOVU %xmm0, (%rdi)
> + VMOVU %xmm1, -15(%END_REG)
> + ret
> +# endif
> +
> +END(STRCPY)
> #endif
> diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
> index 0dcea18dbb..7272deef2c 100644
> --- a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S
> @@ -1,3 +1,3 @@
> -#define USE_AS_STRNCAT
> -#define STRCAT __strncat_avx2_rtm
> -#include "strcat-avx2-rtm.S"
> +#define STRNCAT __strncat_avx2_rtm
> +#include "x86-avx-rtm-vecs.h"
> +#include "strncat-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strncat-avx2.S b/sysdeps/x86_64/multiarch/strncat-avx2.S
> index 52ecbca943..ffa58bd0de 100644
> --- a/sysdeps/x86_64/multiarch/strncat-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strncat-avx2.S
> @@ -1,7 +1,419 @@
> -#ifndef STRNCAT
> -# define STRNCAT __strncat_avx2
> -#endif
> +/* strncat with AVX2
> + Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (3)
> +
> +# include <sysdep.h>
> +
> +# ifndef VEC_SIZE
> +# include "x86-avx-vecs.h"
> +# endif
> +
> +# ifndef STRNCAT
> +# define STRNCAT __strncat_avx2
> +# endif
> +
> +# ifdef USE_AS_WCSCPY
> +# define MOVCHAR movl
> +# define VPCMPEQ vpcmpeqd
> +# define VPMIN vpminud
> +# define CHAR_SIZE 4
> +# else
> +# define MOVCHAR movb
> +# define VPCMPEQ vpcmpeqb
> +# define VPMIN vpminub
> +# define CHAR_SIZE 1
> +# endif
> +
> +# include "strncpy-or-cat-overflow-def.h"
> +
> +# define PAGE_SIZE 4096
> +
> +# define VZERO VMM(7)
> +# define VZERO_128 VMM_128(7)
> +
> + .section SECTION(.text), "ax", @progbits
> +ENTRY(STRNCAT)
> + /* Filter zero length strings and very long strings. Zero
> + length strings just return, very long strings are handled by
> + using the non-length variant {wcs|str}cat. */
> + movq %rdi, %rax
> +# ifdef USE_AS_WCSCPY
> + leaq -1(%rdx), %rcx
> + shr $56, %rcx
> + jnz L(zero_len)
> + salq $2, %rdx
> +# else
> + test %rdx, %rdx
> + jl L(zero_len)
> +# endif
> + vpxor %VZERO_128, %VZERO_128, %VZERO_128
> +
> +# include "strcat-strlen-avx2.h.S"
> +
> + movl %esi, %ecx
> + andl $(PAGE_SIZE - 1), %ecx
> + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
> + ja L(page_cross)
> +L(page_cross_continue):
> + VMOVU (%rsi), %VMM(0)
> + VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> +
> + tzcnt %ecx, %r8d
> + cmpq %r8, %rdx
> + jbe L(less_1x_vec)
> +
> + testl %ecx, %ecx
> + jz L(more_1x_vec)
> +
> + /* Hoist this to save code size. */
> +
> + movl %r8d, %edx
> +
> +L(less_1x_vec):
> + COND_VZEROUPPER
> +
> + cmpl $16, %edx
> + jae L(copy_16_31)
> + cmpl $8, %edx
> + jae L(copy_8_15)
> +
> +
> +# ifdef USE_AS_WCSCPY
> + vmovd %VMM_128(0), (%rdi)
> + MOVCHAR $0, (%rdi, %rdx)
> + ret
> +# else
> + cmpl $4, %edx
> + jae L(copy_4_7)
> +
> + movzbl (%rsi), %ecx
> + cmpl $1, %edx
> + jbe L(set_null_term)
> +
> + /* NB: make this `vmovw` if support for AVX512-FP16 is added.
> + */
> + movzwl 1(%rsi), %esi
> + movw %si, 1(%rdi)
> +
> + .p2align 4,, 1
> +L(set_null_term):
> + movb %cl, (%rdi)
> + MOVCHAR $0, (%rdi, %rdx)
> + ret
> +
> + .p2align 4,, 11
> +L(copy_4_7):
> + movl -(4)(%rsi, %rdx), %ecx
> + vmovd %xmm0, (%rdi)
> + movl %ecx, -(4)(%rdi, %rdx)
> + MOVCHAR $0, (%rdi, %rdx)
> + ret
> +# endif
> +
> +
> + .p2align 4,, 10
> +L(copy_16_31):
> + VMOVU -(16)(%rsi, %rdx), %xmm1
> + VMOVU %xmm0, (%rdi)
> + VMOVU %xmm1, -(16)(%rdi, %rdx)
> + MOVCHAR $0, (%rdi, %rdx)
> + ret
> +
> + .p2align 4,, 10
> +L(copy_8_15):
> + movq -(8)(%rsi, %rdx), %rcx
> + vmovq %xmm0, (%rdi)
> + movq %rcx, -(8)(%rdi, %rdx)
> + MOVCHAR $0, (%rdi, %rdx)
> + ret
> +
> + .p2align 4,, 8
> + .p2align 6,, 14
> +L(more_1x_vec):
> + VMOVU %VMM(0), (%rdi)
> +
> + /* Align rsi (src) and adjust rdx/rdi (length/dst). */
> + addq %rsi, %rdx
> + subq %rsi, %rdi
> + orq $(VEC_SIZE - 1), %rsi
> + incq %rsi
> + addq %rsi, %rdi
> +L(loop_last_4x_vec):
> + subq %rsi, %rdx
> + VMOVA 0(%rsi), %VMM(1)
> + VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + cmpq $(VEC_SIZE * 2), %rdx
> + ja L(more_2x_vec)
> +L(last_2x_vec):
> + tzcnt %ecx, %ecx
> + cmpl %ecx, %edx
> + jbe L(ret_vec_x1_len)
> +
> + cmpl $VEC_SIZE, %ecx
> + jnz L(ret_vec_x1)
> +
> + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(2)
> + VMOVU %VMM(1), (%rdi)
> + VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + addl $-VEC_SIZE, %edx
> + bzhil %edx, %ecx, %r8d
> + jz L(ret_vec_x2_len)
> +L(ret_vec_x2):
> + bsfl %ecx, %edx
> +L(ret_vec_x2_len):
> + VMOVU (%rsi, %rdx), %VMM(0)
> + MOVCHAR $0, (VEC_SIZE)(%rdi, %rdx)
> + VMOVU %VMM(0), (%rdi, %rdx)
> +L(return_vzeroupper):
> + ZERO_UPPER_VEC_REGISTERS_RETURN
> +
> +
> + .p2align 4,, 12
> +L(ret_vec_x1_len):
> + movl %edx, %ecx
> +L(ret_vec_x1):
> + VMOVU -(VEC_SIZE)(%rsi, %rcx), %VMM(1)
> + MOVCHAR $0, (%rdi, %rcx)
> + VMOVU %VMM(1), -VEC_SIZE(%rdi, %rcx)
> + VZEROUPPER_RETURN
> +
> + .p2align 4,, 8
> +L(last_4x_vec):
> + subq $-(VEC_SIZE * 4), %rsi
> + VMOVA 0(%rsi), %VMM(1)
> + VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + subq $-(VEC_SIZE * 4), %rdi
> + addl $-(VEC_SIZE * 4), %edx
> + cmpl $(VEC_SIZE * 2), %edx
> + jbe L(last_2x_vec)
> + .p2align 4,, 8
> +L(more_2x_vec):
> + /* L(ret_vec_x1) expects ecx to have position of first match so
> + test with bsf. */
> + bsfl %ecx, %ecx
> + jnz L(ret_vec_x1)
> +
> + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(2)
> + VMOVU %VMM(1), (%rdi)
> +
> + VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + testl %ecx, %ecx
> + jnz L(ret_vec_x2)
> +
>
> -#define USE_AS_STRNCAT
> -#define STRCAT STRNCAT
> -#include "strcat-avx2.S"
> + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(3)
> + VMOVU %VMM(2), (VEC_SIZE * 1)(%rdi)
> +
> + VPCMPEQ %VMM(3), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> +
> + /* Check if length is greater than 4x VEC. */
> + cmpq $(VEC_SIZE * 4), %rdx
> + ja L(more_4x_vec)
> +
> + addl $(VEC_SIZE * -2), %edx
> +
> + tzcnt %ecx, %ecx
> + cmpl %ecx, %edx
> + jbe L(ret_vec_x3_len)
> +
> + cmpl $VEC_SIZE, %ecx
> + jnz L(ret_vec_x3)
> +
> + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
> + VMOVU %VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
> + VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + addl $-VEC_SIZE, %edx
> + bzhil %edx, %ecx, %r8d
> + jz L(ret_vec_x4_len)
> +L(ret_vec_x4):
> + bsfl %ecx, %edx
> +L(ret_vec_x4_len):
> + VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(0)
> + MOVCHAR $0, (VEC_SIZE * 3)(%rdi, %rdx)
> + VMOVU %VMM(0), (VEC_SIZE * 2)(%rdi, %rdx)
> + VZEROUPPER_RETURN
> +
> + .p2align 4,, 4
> +L(ret_vec_x3_len):
> + movl %edx, %ecx
> +L(ret_vec_x3):
> + VMOVU (VEC_SIZE)(%rsi, %rcx), %VMM(0)
> + MOVCHAR $0, (VEC_SIZE * 2)(%rdi, %rcx)
> + VMOVU %VMM(0), (VEC_SIZE)(%rdi, %rcx)
> + VZEROUPPER_RETURN
> +
> +
> + .p2align 4,, 8
> +L(more_4x_vec):
> + bsfl %ecx, %ecx
> + jnz L(ret_vec_x3)
> +
> + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(4)
> + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
> + VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + testl %ecx, %ecx
> + jnz L(ret_vec_x4)
> +
> + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi)
> +
> +
> + /* Recheck length before aligning. */
> + cmpq $(VEC_SIZE * 8), %rdx
> + jbe L(last_4x_vec)
> +
> + /* Align rsi (src) and adjust rdx/rdi (length/dst). */
> + addq %rsi, %rdx
> + subq %rsi, %rdi
> + subq $-(VEC_SIZE * 4), %rsi
> + andq $(VEC_SIZE * -4), %rsi
> +
> + /* Do first half of loop ahead of time so loop can just start by
> + storing. */
> + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> + VPMIN %VMM(0), %VMM(1), %VMM(4)
> + VPMIN %VMM(2), %VMM(3), %VMM(6)
> + VPMIN %VMM(4), %VMM(6), %VMM(6)
> + VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %r8d
> + addq %rsi, %rdi
> + testl %r8d, %r8d
> + jnz L(loop_4x_done)
> +
> + /* Use r9 for end of region before handling last 4x VEC
> + specially. */
> + leaq -(VEC_SIZE * 4)(%rdx), %r9
> +
> + .p2align 4,, 11
> +L(loop_4x_vec):
> +
> + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> + subq $(VEC_SIZE * -4), %rsi
> + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> + VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
> +
> + subq $(VEC_SIZE * -4), %rdi
> + cmpq %rsi, %r9
> + jbe L(loop_last_4x_vec)
> +
> + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> + VPMIN %VMM(0), %VMM(1), %VMM(4)
> + VPMIN %VMM(2), %VMM(3), %VMM(6)
> + VPMIN %VMM(4), %VMM(6), %VMM(6)
> + VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> +
> + vpmovmskb %VMM(6), %r8d
> +
> + testl %r8d, %r8d
> + jz L(loop_4x_vec)
> +
> +L(loop_4x_done):
> + VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + /* L(ret_vec_x1) expects ecx to have position of first match so
> + test with bsf. */
> + bsfl %ecx, %ecx
> + jnz L(ret_vec_x1)
> + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> +
> + VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> +
> + testl %ecx, %ecx
> + jnz L(ret_vec_x2)
> + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> +
> + VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + bsfl %ecx, %ecx
> + jnz L(ret_vec_x3)
> +
> + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> + bsfl %r8d, %r8d
> + VMOVU (VEC_SIZE * 2 + CHAR_SIZE)(%rsi, %r8), %VMM(1)
> + VMOVU %VMM(1), (VEC_SIZE * 2 + CHAR_SIZE)(%rdi, %r8)
> + VZEROUPPER_RETURN
> +
> +
> +
> + .p2align 4,, 4
> +L(page_cross):
> + movq %rsi, %r8
> + andq $(VEC_SIZE * -1), %r8
> +
> + VPCMPEQ (%r8), %VZERO, %VMM(6)
> +
> + vpmovmskb %VMM(6), %ecx
> + shrxl %esi, %ecx, %ecx
> +
> + subl %esi, %r8d
> + andl $(VEC_SIZE - 1), %r8d
> + cmpq %r8, %rdx
> + jb L(page_cross_small)
> +
> + /* Optimizing more aggressively for space as this is very cold
> + code. This saves 2x cache lines. */
> +
> + /* This adds one to the later result, which gives the correct
> + copy bounds. NB: this can never zero-out a non-zero RCX as
> + to be in the page cross case rsi cannot be aligned and we
> + already right-shift rcx by the misalignment. */
> + shll $CHAR_SIZE, %ecx
> + jz L(page_cross_continue)
> + bsfl %ecx, %ecx
> + rep movsb
> + VZEROUPPER_RETURN
> +
> +L(page_cross_small):
> + tzcntl %ecx, %ecx
> + jz L(page_cross_setz)
> + cmpl %edx, %ecx
> + cmova %edx, %ecx
> + rep movsb
> +L(page_cross_setz):
> + MOVCHAR $0, (%rdi)
> + VZEROUPPER_RETURN
> +L(zero_len):
> +# ifdef USE_AS_WCSCPY
> + test %rdx, %rdx
> +# endif
> + jnz OVERFLOW_STRCAT
> + ret
> +
> +
> +END(STRNCAT)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
> index 79e7083299..d42ad88b3d 100644
> --- a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
> +++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S
> @@ -1,3 +1,3 @@
> -#define USE_AS_STRNCPY
> -#define STRCPY __strncpy_avx2_rtm
> -#include "strcpy-avx2-rtm.S"
> +#define STRNCPY __strncpy_avx2_rtm
> +#include "x86-avx-rtm-vecs.h"
> +#include "strncpy-avx2.S"
> diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2.S b/sysdeps/x86_64/multiarch/strncpy-avx2.S
> index ce634e94fa..e9afd8fbed 100644
> --- a/sysdeps/x86_64/multiarch/strncpy-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strncpy-avx2.S
> @@ -1,7 +1,735 @@
> -#ifndef STRNCPY
> -# define STRNCPY __strncpy_avx2
> -#endif
> +/* strncpy with AVX2
> + Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (3)
> +
> +# include <sysdep.h>
> +
> +
> +# ifndef VEC_SIZE
> +# include "x86-avx-vecs.h"
> +# endif
> +
> +# ifndef STRNCPY
> +# define STRNCPY __strncpy_avx2
> +# endif
> +
> +
> +# ifdef USE_AS_WCSCPY
> +# define VPCMPEQ vpcmpeqd
> +# define VPMIN vpminud
> +# define CHAR_SIZE 4
> +# else
> +# define VPCMPEQ vpcmpeqb
> +# define VPMIN vpminub
> +# define CHAR_SIZE 1
> +# endif
> +
> +# include "strncpy-or-cat-overflow-def.h"
> +
> +# define PAGE_SIZE 4096
> +
> +# define VZERO VMM(7)
> +# define VZERO_128 VMM_128(7)
> +
> +
> + .section SECTION(.text), "ax", @progbits
> +ENTRY(STRNCPY)
> + /* Filter zero length strings and very long strings. Zero
> + length strings just return, very long strings are handled by
> + just running rep stos{b|l} to zero set (which will almost
> + certainly segfault), if that succeeds then just calling
> + OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy). */
> +# ifdef USE_AS_WCSCPY
> + decq %rdx
> + movq %rdx, %rax
> + /* 56 is end of max supported address space. */
> + shr $56, %rax
> + jnz L(zero_len)
> + salq $2, %rdx
> +# else
> + decq %rdx
> + /* `dec` can macrofuse with `jl`. If the flag needs to become
> + `jb` replace `dec` with `sub`. */
> + jl L(zero_len)
> +# endif
> +
> + vpxor %VZERO_128, %VZERO_128, %VZERO_128
> + movl %esi, %eax
> + andl $(PAGE_SIZE - 1), %eax
> + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> + ja L(page_cross)
> +
> +L(page_cross_continue):
> + VMOVU (%rsi), %VMM(0)
> + VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> +
> + /* If not STPCPY just save the return value (dst) ahead of time. */
> +# ifndef USE_AS_STPCPY
> + movq %rdi, %rax
> +# elif defined USE_AS_WCSCPY
> + /* Clear dependency as nearly all return code for wcpncpy uses
> + `setc %al`. */
> + xorl %eax, %eax
> +# endif
> +
> + cmpq $(VEC_SIZE - CHAR_SIZE), %rdx
> + /* `jbe` because length rdx is now length - CHAR_SIZE. */
> + jbe L(less_1x_vec)
> +
> + /* This may overset but that's fine because we still need to zero
> + fill. */
> + VMOVU %VMM(0), (%rdi)
> +
> + testl %ecx, %ecx
> + jnz L(zfill)
> +
> + /* Align. */
> + addq %rsi, %rdx
> + subq %rsi, %rdi
> + orq $(VEC_SIZE - 1), %rsi
> + incq %rsi
> +L(last_4x_vec):
> + addq %rsi, %rdi
> +L(loop_last_4x_vec):
> + subq %rsi, %rdx
> +
> +
> + VMOVA 0(%rsi), %VMM(1)
> + VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> +
> + cmpq $(VEC_SIZE * 2), %rdx
> + jae L(more_2x_vec)
> +
> + cmpl $(VEC_SIZE), %edx
> + jb L(ret_vec_x1_len)
> +
> + testl %ecx, %ecx
> + jnz L(ret_vec_x1)
> +
> + VPCMPEQ VEC_SIZE(%rsi), %VZERO, %VMM(6)
> + VMOVU %VMM(1), (%rdi)
> + vpmovmskb %VMM(6), %ecx
> + shlq $VEC_SIZE, %rcx
> +L(ret_vec_x1_len):
> + tzcntq %rcx, %rcx
> + cmpl %ecx, %edx
> + jbe L(ret_vec_x1_len_no_zfill)
> + /* Fall through (expectation) is copy len < buffer len. */
> + VMOVU %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +L(ret_vec_x1_len_no_zfill_mov):
> + movl %ecx, %edx
> +# ifdef USE_AS_STPCPY
> + /* clear flags. */
> + xorl %ecx, %ecx
> +# endif
> +L(ret_vec_x1_len_no_zfill):
> + VMOVU ((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> + VMOVU %VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +# ifdef USE_AS_STPCPY
> +# ifdef USE_AS_WCSCPY
> + setc %al
> + addq %rdx, %rdi
> + leaq (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> + movl %edx, %eax
> + adcq %rdi, %rax
> +# endif
> +# endif
> +L(return_vzeroupper):
> + ZERO_UPPER_VEC_REGISTERS_RETURN
> +
> + .p2align 4,, 6
> +L(ret_vec_x1):
> + bsfl %ecx, %ecx
> + VMOVU %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> + subl %ecx, %edx
> + /* Check if we need to reload/store. */
> + cmpl $VEC_SIZE, %edx
> + jb L(ret_vec_x1_len_no_zfill_mov)
> + /* Otherwise safe to just store directly. */
> + VMOVU %VMM(1), (%rdi)
> + VMOVU %VZERO, (%rdi, %rcx)
> +# ifdef USE_AS_STPCPY
> + leaq (%rdi, %rcx), %rax
> +# endif
> + VZEROUPPER_RETURN
> +
> + .p2align 4,, 12
> +L(more_2x_vec):
> + VMOVU %VMM(1), (%rdi)
> + testl %ecx, %ecx
> + /* Must fill at least 2x VEC. */
> + jnz L(zfill_vec1)
> +
> + VMOVA VEC_SIZE(%rsi), %VMM(2)
> + VMOVU %VMM(2), VEC_SIZE(%rdi)
> + VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + testl %ecx, %ecx
> + /* Must fill at least 1x VEC. */
> + jnz L(zfill_vec2)
> +
> + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(3)
> + VPCMPEQ %VMM(3), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> +
> + /* Check if len is more than 4x VEC. -CHAR_SIZE because rdx is len -
> + CHAR_SIZE. */
> + cmpq $(VEC_SIZE * 4 - CHAR_SIZE), %rdx
> + ja L(more_4x_vec)
> +
> + subl $(VEC_SIZE * 3), %edx
> + jb L(ret_vec_x3_len)
> +
> + testl %ecx, %ecx
> + jnz L(ret_vec_x3)
> +
> + VPCMPEQ (VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6)
> + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
> + vpmovmskb %VMM(6), %ecx
> + tzcntl %ecx, %ecx
> + cmpl %ecx, %edx
> + jbe L(ret_vec_x4_len_no_zfill)
> + /* Fall through (expectation) is copy len < buffer len. */
> + VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> + movl %ecx, %edx
> +L(ret_vec_x4_len_no_zfill):
> + VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> + VMOVU %VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +# ifdef USE_AS_STPCPY
> +# ifdef USE_AS_WCSCPY
> + setc %al
> + addq %rdx, %rdi
> + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> + leal (VEC_SIZE * 3 + 0)(%edx), %eax
> + adcq %rdi, %rax
> +# endif
> +# endif
> + VZEROUPPER_RETURN
> +
> +
> +L(ret_vec_x3_len):
> + addl $(VEC_SIZE * 1), %edx
> + tzcntl %ecx, %ecx
> + cmpl %ecx, %edx
> + jbe L(ret_vec_x3_len_no_zfill)
> + /* Fall through (expectation) is copy len < buffer len. */
> + VMOVU %VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +L(ret_vec_x3_len_no_zfill_mov):
> + movl %ecx, %edx
> +# ifdef USE_AS_STPCPY
> + /* clear flags. */
> + xorl %ecx, %ecx
> +# endif
> + .p2align 4,, 4
> +L(ret_vec_x3_len_no_zfill):
> + VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
> + VMOVU %VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
> +# ifdef USE_AS_STPCPY
> +# ifdef USE_AS_WCSCPY
> + setc %al
> + addq %rdx, %rdi
> + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> + leal (VEC_SIZE * 2 + 0)(%rdx), %eax
> + adcq %rdi, %rax
> +# endif
> +# endif
> + VZEROUPPER_RETURN
> +
> +
> + .p2align 4,, 8
> +L(ret_vec_x3):
> + bsfl %ecx, %ecx
> + VMOVU %VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx)
> + subl %ecx, %edx
> + jl L(ret_vec_x3_len_no_zfill_mov)
> + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
> + VMOVU %VZERO, (VEC_SIZE * 2)(%rdi, %rcx)
> +# ifdef USE_AS_STPCPY
> + leaq (VEC_SIZE * 2)(%rdi, %rcx), %rax
> +# endif
> + VZEROUPPER_RETURN
> +
> + .p2align 4,, 8
> +L(more_4x_vec):
> +
> + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
> + testl %ecx, %ecx
> + jnz L(zfill_vec3)
> +
> + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(4)
> + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi)
> + VPCMPEQ %VMM(4), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + testl %ecx, %ecx
> + jnz L(zfill_vec4)
> +
> + movq %rdx, %rcx
> + addq %rsi, %rdx
> + subq %rsi, %rdi
> + subq $-(VEC_SIZE * 4), %rsi
> + /* Recheck length before aligning. */
> + cmpq $(VEC_SIZE * 8 - CHAR_SIZE), %rcx
> + jbe L(last_4x_vec)
> +
> + andq $(VEC_SIZE * -4), %rsi
> +
> + /* Do first half of loop ahead of time so loop can just start by
> + storing. */
> + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> + VPMIN %VMM(0), %VMM(1), %VMM(4)
> + VPMIN %VMM(2), %VMM(3), %VMM(6)
> + VPMIN %VMM(4), %VMM(6), %VMM(6)
> + VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %r8d
> + addq %rsi, %rdi
> + testl %r8d, %r8d
> + jnz L(loop_4x_done)
> +
> + /* Use r9 as end register. */
> + leaq -(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9
>
> -#define USE_AS_STRNCPY
> -#define STRCPY STRNCPY
> -#include "strcpy-avx2.S"
> + .p2align 4,, 11
> +L(loop_4x_vec):
> +
> + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> + subq $(VEC_SIZE * -4), %rsi
> + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> + VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
> +
> + subq $(VEC_SIZE * -4), %rdi
> + cmpq %rsi, %r9
> + jbe L(loop_last_4x_vec)
> +
> + VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
> + VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
> + VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
> + VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
> +
> + VPMIN %VMM(0), %VMM(1), %VMM(4)
> + VPMIN %VMM(2), %VMM(3), %VMM(6)
> + VPMIN %VMM(4), %VMM(6), %VMM(6)
> + VPCMPEQ %VMM(6), %VZERO, %VMM(6)
> +
> + vpmovmskb %VMM(6), %r8d
> +
> + testl %r8d, %r8d
> + jz L(loop_4x_vec)
> +
> +L(loop_4x_done):
> + subq %rsi, %rdx
> + VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
> + VPCMPEQ %VMM(0), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + testl %ecx, %ecx
> + jnz L(zfill_vec1)
> +
> + VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
> + VPCMPEQ %VMM(1), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + testl %ecx, %ecx
> + jnz L(zfill_vec2)
> +
> + VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
> + VPCMPEQ %VMM(2), %VZERO, %VMM(6)
> + vpmovmskb %VMM(6), %ecx
> + testl %ecx, %ecx
> + jnz L(zfill_vec3)
> +
> + VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
> + movl %r8d, %ecx
> +
> + // Zfill more....
> +
> + .p2align 4,, 4
> +L(zfill_vec4):
> + addq $(VEC_SIZE * 2), %rdi
> + subq $(VEC_SIZE * 2), %rdx
> +L(zfill_vec2):
> + shlq $VEC_SIZE, %rcx
> +L(zfill):
> + bsfq %rcx, %rcx
> + subq %rcx, %rdx
> + addq %rcx, %rdi
> +# ifdef USE_AS_STPCPY
> + movq %rdi, %rax
> +# endif
> +L(zfill_from_page_cross):
> + cmpq $VEC_SIZE, %rdx
> + jb L(zfill_less_vec_vzeroupper)
> +
> +L(zfill_more_1x_vec):
> + VMOVU %VZERO, CHAR_SIZE(%rdi)
> + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
> + cmpq $(VEC_SIZE * 2), %rdx
> + jae L(zfill_more_2x_vec)
> +L(zfill_done0):
> + VZEROUPPER_RETURN
> +
> + .p2align 4,, 8
> +L(zfill_vec3):
> + addq $(VEC_SIZE * 2), %rdi
> + subq $(VEC_SIZE * 2), %rdx
> + .p2align 4,, 2
> +L(zfill_vec1):
> + bsfl %ecx, %ecx
> + addq %rcx, %rdi
> + subq %rcx, %rdx
> +# ifdef USE_AS_STPCPY
> + movq %rdi, %rax
> +# endif
> + /* zfill from vec1/vec3 must set at least 2x VEC. */
> +
> + VMOVU %VZERO, CHAR_SIZE(%rdi)
> + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
> + cmpq $(VEC_SIZE * 2), %rdx
> + jb L(zfill_done0)
> +L(zfill_more_2x_vec):
> + VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx)
> + VMOVU %VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi)
> + subq $(VEC_SIZE * 4 - CHAR_SIZE), %rdx
> + jbe L(zfill_done)
> +
> + addq %rdi, %rdx
> + VMOVU %VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi)
> + VMOVU %VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi)
> +
> +
> + VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
> + VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
> +
> + subq $-(VEC_SIZE * 4 + CHAR_SIZE), %rdi
> + cmpq %rdi, %rdx
> + jbe L(zfill_done)
> +
> + andq $-(VEC_SIZE), %rdi
> + .p2align 4,, 12
> +L(zfill_loop_4x_vec):
> + VMOVA %VZERO, (VEC_SIZE * 0)(%rdi)
> + VMOVA %VZERO, (VEC_SIZE * 1)(%rdi)
> + VMOVA %VZERO, (VEC_SIZE * 2)(%rdi)
> + VMOVA %VZERO, (VEC_SIZE * 3)(%rdi)
> + subq $-(VEC_SIZE * 4), %rdi
> + cmpq %rdi, %rdx
> + ja L(zfill_loop_4x_vec)
> +L(zfill_done):
> + VZEROUPPER_RETURN
> +
> +
> + .p2align 4,, 8
> +L(copy_1x):
> + VMOVU %VMM(0), (%rdi)
> + testl %ecx, %ecx
> + jz L(ret_32_32)
> +L(zfill_less_vec):
> + bsfl %ecx, %ecx
> +L(zfill_less_vec_no_bsf):
> + subq %rcx, %rdx
> + addq %rcx, %rdi
> +# ifdef USE_AS_STPCPY
> + movq %rdi, %rax
> +# endif
> +L(zfill_less_vec_vzeroupper):
> + COND_VZEROUPPER
> + /* We are taking advantage of the fact that to be here we must
> + be writing null-term as (%rdi, %rcx), so we have a byte of
> + leeway for overwriting. */
> + cmpl $16, %edx
> + jb L(zfill_less_16)
> + VMOVU %VZERO_128, (%rdi)
> + VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx)
> + ret
> +# ifdef USE_AS_STPCPY
> +L(ret_32_32):
> + leaq CHAR_SIZE(%rdi, %rdx), %rax
> + VZEROUPPER_RETURN
> +# endif
> +
> + .p2align 4,, 4
> +L(copy_16_31):
> + /* Overfill to avoid branches. */
> + vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
> + vmovdqu %xmm0, (%rdi)
> + vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx)
> + cmpl %ecx, %edx
> + ja L(zfill_less_vec_no_bsf)
> +# ifndef USE_AS_STPCPY
> +L(ret_32_32):
> +# else
> +# ifdef USE_AS_WCSCPY
> + setc %al
> + addq %rdx, %rdi
> + leaq (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> + movl %edx, %eax
> + adcq %rdi, %rax
> +# endif
> +# endif
> + VZEROUPPER_RETURN
> +
> + .p2align 4,, 4
> +L(copy_8_15):
> + /* Overfill to avoid branches. */
> + movq -(8 - CHAR_SIZE)(%rsi, %rdx), %rsi
> + vmovq %xmm0, (%rdi)
> + movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx)
> + cmpl %ecx, %edx
> + jbe L(ret_8_15)
> + subq %rcx, %rdx
> + addq %rcx, %rdi
> +# ifdef USE_AS_STPCPY
> + movq %rdi, %rax
> +# endif
> + .p2align 4,, 8
> +L(zfill_less_16):
> + xorl %ecx, %ecx
> + cmpl $8, %edx
> + jb L(zfill_less_8)
> + movq %rcx, (%rdi)
> + movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx)
> +# ifndef USE_AS_STPCPY
> +L(ret_8_15):
> +# endif
> + ret
> +
> +
> + .p2align 4,, 8
> +L(less_1x_vec):
> + /* Reuse flag from `cmpq $(VEC_SIZE - CHAR_SIZE), %rdx`. The idea is
> + many buffer sizes are conventionally aligned. */
> + je L(copy_1x)
> +
> + tzcntl %ecx, %ecx
> + cmpl $16, %edx
> + jae L(copy_16_31)
> +
> + COND_VZEROUPPER
> + cmpl $8, %edx
> + jae L(copy_8_15)
> +# ifdef USE_AS_WCSCPY
> + testl %ecx, %ecx
> + jz L(zfill_less_8_set_ret)
> +
> + movl (%rsi, %rdx), %esi
> + vmovd %xmm0, (%rdi)
> + movl %esi, (%rdi, %rdx)
> +
> +# ifdef USE_AS_STPCPY
> + cmpl %ecx, %edx
> +L(ret_8_15):
> + setc %al
> + addq %rdx, %rdi
> + leaq (%rdi, %rax, CHAR_SIZE), %rax
> +# endif
> + ret
> +L(zfill_less_8_set_ret):
> + xorl %ecx, %ecx
> +# ifdef USE_AS_STPCPY
> + movq %rdi, %rax
> +# endif
> +L(zfill_less_8):
> + movl %ecx, (%rdi)
> + movl %ecx, (%rdi, %rdx)
> + ret
> +
> +# else
> + cmpl $3, %edx
> + jb L(copy_0_3)
> + /* Overfill to avoid branches. */
> + movl -3(%rsi, %rdx), %esi
> + vmovd %xmm0, (%rdi)
> + movl %esi, -3(%rdi, %rdx)
> + cmpl %ecx, %edx
> + jbe L(ret_4_7)
> + subq %rcx, %rdx
> + addq %rcx, %rdi
> +# ifdef USE_AS_STPCPY
> + movq %rdi, %rax
> +# endif
> + xorl %ecx, %ecx
> + .p2align 4,, 8
> +L(zfill_less_8):
> + cmpl $3, %edx
> + jb L(zfill_less_3)
> + movl %ecx, (%rdi)
> + movl %ecx, -3(%rdi, %rdx)
> +# ifdef USE_AS_STPCPY
> + ret
> +# endif
> +
> +L(ret_4_7):
> +# ifdef USE_AS_STPCPY
> +L(ret_8_15):
> + movl %edx, %eax
> + adcq %rdi, %rax
> +# endif
> + ret
> +
> + .p2align 4,, 4
> +L(zfill_less_3):
> + testl %edx, %edx
> + jz L(zfill_1)
> + movw %cx, (%rdi)
> +L(zfill_1):
> + movb %cl, (%rdi, %rdx)
> + ret
> +
> + .p2align 4,, 8
> +L(copy_0_3):
> + vmovd %xmm0, %r8d
> + testl %edx, %edx
> + jz L(copy_1)
> + movw %r8w, (%rdi)
> + cmpl %ecx, %edx
> + ja L(zfill_from_1)
> + movzbl (%rsi, %rdx), %r8d
> +# ifdef USE_AS_STPCPY
> + movl %edx, %eax
> + adcq %rdi, %rax
> + movb %r8b, (%rdi, %rdx)
> + ret
> +# endif
> +
> +L(copy_1):
> +# ifdef USE_AS_STPCPY
> + movl %edx, %eax
> + cmpl %ecx, %edx
> + adcq %rdi, %rax
> +# endif
> +# ifdef USE_AS_WCSCPY
> + vmovd %xmm0, (%rdi)
> +# else
> + movb %r8b, (%rdi, %rdx)
> +# endif
> + ret
> +# endif
> +
> + .p2align 4,, 2
> +L(zero_len):
> + movq %rdi, %rax
> + ret
> +# ifndef USE_AS_WCSCPY
> + .p2align 4,, 8
> +L(zfill_from_1):
> +# ifdef USE_AS_STPCPY
> + leaq (%rdi, %rcx), %rax
> +# endif
> + movw $0, -1(%rdi, %rdx)
> + ret
> +# endif
> +
> + .p2align 4,, 4
> + .p2align 6,, 8
> +L(page_cross):
> + movq %rsi, %rax
> + andq $(VEC_SIZE * -1), %rax
> +
> + VPCMPEQ (%rax), %VZERO, %VMM(6)
> +
> + vpmovmskb %VMM(6), %ecx
> + shrxl %esi, %ecx, %ecx
> +
> + subl %esi, %eax
> + andl $(VEC_SIZE - 1), %eax
> + cmpq %rax, %rdx
> + jb L(page_cross_small)
> + /* Optimizing more aggressively for space as this is very cold
> + code. This saves 2x cache lines. */
> +
> + /* If rcx is zero (no null-term found) go back to the normal path. */
> + shl $CHAR_SIZE, %ecx
> + jz L(page_cross_continue)
> + bsf %ecx, %ecx
> +
> + subq %rcx, %rdx
> +# ifdef USE_AS_STPCPY
> + leaq -CHAR_SIZE(%rdi, %rcx), %rax
> +# else
> + movq %rdi, %rax
> +# endif
> +
> + rep movsb
> +# ifdef USE_AS_WCSCPY
> + movl $0, (%rdi)
> +# else
> + movb $0, (%rdi)
> +# endif
> + jmp L(zfill_from_page_cross)
> +
> +L(page_cross_small):
> + tzcntl %ecx, %ecx
> + xorl %eax, %eax
> + cmpl %ecx, %edx
> + jbe L(page_cross_copy_only)
> +
> + /* Do a zfill of the tail before copying. */
> + movq %rdi, %r9
> + movl %ecx, %r8d
> +
> + subl %ecx, %edx
> + leaq CHAR_SIZE(%rdi, %rcx), %rdi
> + movl %edx, %ecx
> + rep stosb
> + movq %r9, %rdi
> + movl %r8d, %edx
> +L(page_cross_copy_only):
> + leal CHAR_SIZE(%rdx), %ecx
> +# ifdef USE_AS_STPCPY
> +# ifdef USE_AS_WCSCPY
> + setc %al
> + addq %rdi, %rdx
> + leaq (%rdx, %rax, CHAR_SIZE), %rax
> +# else
> + movl %edx, %eax
> + adcq %rdi, %rax
> +# endif
> +# else
> + movq %rdi, %rax
> +# endif
> + rep movsb
> + ret
> +
> +
> +L(best_effort_strncpy):
> + movq %rdx, %rcx
> + xorl %eax, %eax
> + movq %rdi, %r8
> + /* The length is >= 2^63. We very much expect to segfault at
> + rep stos. If that doesn't happen then just strcpy to finish.
> + */
> +# ifdef USE_AS_WCSCPY
> + rep stosl
> +# else
> + rep stosb
> +# endif
> + movq %r8, %rdi
> + jmp OVERFLOW_STRCPY
> +END(STRNCPY)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/x86-avx-vecs.h b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> index dca1089060..275af7560a 100644
> --- a/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> +++ b/sysdeps/x86_64/multiarch/x86-avx-vecs.h
> @@ -27,7 +27,8 @@
> #define VEC_SIZE 32
> #include "x86-vec-macros.h"
>
> -#define USE_WITH_AVX 1
> +#define USE_WITH_AVX2 1
> +
> #define SECTION(p) p##.avx
>
> /* 4-byte mov instructions with AVX2. */
> --
> 2.34.1
>
LGTM.
Thanks.
H.J.
@@ -1,3 +1,3 @@
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STPCPY __stpcpy_avx2_rtm
+#include "x86-avx-rtm-vecs.h"
+#include "stpcpy-avx2.S"
@@ -1,4 +1,3 @@
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY __stpncpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STPNCPY __stpncpy_avx2_rtm
+#include "x86-avx-rtm-vecs.h"
+#include "stpncpy-avx2.S"
@@ -3,6 +3,5 @@
#endif
#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY STPNCPY
-#include "strcpy-avx2.S"
+#define STRNCPY STPNCPY
+#include "strncpy-avx2.S"
@@ -1,12 +1,3 @@
-#ifndef STRCAT
-# define STRCAT __strcat_avx2_rtm
-#endif
-
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
- ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
-
-#define SECTION(p) p##.avx.rtm
-
+#define STRCAT __strcat_avx2_rtm
+#include "x86-avx-rtm-vecs.h"
#include "strcat-avx2.S"
@@ -16,266 +16,10 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (3)
-
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-# define STRCAT __strcat_avx2
-# endif
-
-# define USE_AS_STRCAT
-
-/* Number of bytes in a vector register */
-# define VEC_SIZE 32
-
-# ifndef SECTION
-# define SECTION(p) p##.avx
-# endif
-
- .section SECTION(.text),"ax",@progbits
-ENTRY (STRCAT)
- mov %rdi, %r9
-# ifdef USE_AS_STRNCAT
- mov %rdx, %r8
-# endif
-
- xor %eax, %eax
- mov %edi, %ecx
- and $((VEC_SIZE * 4) - 1), %ecx
- vpxor %xmm6, %xmm6, %xmm6
- cmp $(VEC_SIZE * 3), %ecx
- ja L(fourth_vector_boundary)
- vpcmpeqb (%rdi), %ymm6, %ymm0
- vpmovmskb %ymm0, %edx
- test %edx, %edx
- jnz L(exit_null_on_first_vector)
- mov %rdi, %rax
- and $-VEC_SIZE, %rax
- jmp L(align_vec_size_start)
-L(fourth_vector_boundary):
- mov %rdi, %rax
- and $-VEC_SIZE, %rax
- vpcmpeqb (%rax), %ymm6, %ymm0
- mov $-1, %r10d
- sub %rax, %rcx
- shl %cl, %r10d
- vpmovmskb %ymm0, %edx
- and %r10d, %edx
- jnz L(exit)
-
-L(align_vec_size_start):
- vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
- vpmovmskb %ymm0, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
- vpmovmskb %ymm1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
- vpmovmskb %ymm2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
- vpmovmskb %ymm3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
- add $(VEC_SIZE * 4), %rax
- vpmovmskb %ymm0, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
- vpmovmskb %ymm1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
- vpmovmskb %ymm2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
- vpmovmskb %ymm3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
- add $(VEC_SIZE * 4), %rax
- vpmovmskb %ymm0, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
- vpmovmskb %ymm1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
- vpmovmskb %ymm2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
- vpmovmskb %ymm3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
- add $(VEC_SIZE * 4), %rax
- vpmovmskb %ymm0, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
- vpmovmskb %ymm1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
- vpmovmskb %ymm2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
- vpmovmskb %ymm3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
- add $(VEC_SIZE * 5), %rax
- vpmovmskb %ymm0, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
- add $VEC_SIZE, %rax
- vpmovmskb %ymm1, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
- add $VEC_SIZE, %rax
- vpmovmskb %ymm2, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
- add $VEC_SIZE, %rax
- vpmovmskb %ymm3, %edx
- test %edx, %edx
- jnz L(exit)
-
- add $VEC_SIZE, %rax
-
- .p2align 4
-L(align_four_vec_loop):
- vmovaps (%rax), %ymm4
- vpminub VEC_SIZE(%rax), %ymm4, %ymm4
- vmovaps (VEC_SIZE * 2)(%rax), %ymm5
- vpminub (VEC_SIZE * 3)(%rax), %ymm5, %ymm5
- add $(VEC_SIZE * 4), %rax
- vpminub %ymm4, %ymm5, %ymm5
- vpcmpeqb %ymm5, %ymm6, %ymm5
- vpmovmskb %ymm5, %edx
- test %edx, %edx
- jz L(align_four_vec_loop)
-
- vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
- sub $(VEC_SIZE * 5), %rax
- vpmovmskb %ymm0, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
- vpmovmskb %ymm1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
- vpmovmskb %ymm2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
- vpmovmskb %ymm3, %edx
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 4), %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit):
- sub %rdi, %rax
-L(exit_null_on_first_vector):
- bsf %rdx, %rdx
- add %rdx, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_second_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $VEC_SIZE, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_third_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 2), %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_fourth_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 3), %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_fifth_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 4), %rax
-
- .p2align 4
-L(StartStrcpyPart):
- lea (%r9, %rax), %rdi
- mov %rsi, %rcx
- mov %r9, %rax /* save result */
-
-# ifdef USE_AS_STRNCAT
- test %r8, %r8
- jz L(ExitZero)
-# define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-avx2.S"
+#ifndef STRCAT
+# define STRCAT __strcat_avx2
#endif
+
+#define USE_AS_STRCAT
+#define STRCPY STRCAT
+#include "strcpy-avx2.S"
new file mode 100644
@@ -0,0 +1,101 @@
+/* strlen used for beginning of str{n}cat using AVX2.
+ Copyright (C) 2011-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+/* NOTE: This file is meant to be included by strcat-avx2 or
+ strncat-avx2 and does not stand alone. Before including %rdi
+ must be saved in %rax. */
+
+
+/* Simple strlen implementation: finds the first null CHAR at or after %rdi and ends at
+ L(strcat_strlen_done) with %rdi pointing at it. Uses %rcx, %r8 and %VMM(0)-%VMM(3) as scratch. */
+ movq %rdi, %r8
+ andq $(VEC_SIZE * -1), %r8 /* %r8 = %rdi rounded down to a VEC_SIZE boundary. */
+ VPCMPEQ (%r8), %VZERO, %VMM(0) /* NOTE(review): relies on the including file having zeroed %VZERO. */
+ vpmovmskb %VMM(0), %ecx
+ shrxl %edi, %ecx, %ecx /* Discard mask bits for bytes that precede %rdi. */
+ testl %ecx, %ecx
+ jnz L(bsf_and_done_v0) /* %rdi is still the original pointer here. */
+
+ VPCMPEQ VEC_SIZE(%r8), %VZERO, %VMM(0)
+ vpmovmskb %VMM(0), %ecx
+ leaq (VEC_SIZE)(%r8), %rdi /* From here on %rdi is VEC_SIZE aligned. */
+ testl %ecx, %ecx
+ jnz L(bsf_and_done_v0)
+
+ VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %VMM(0)
+ vpmovmskb %VMM(0), %ecx
+ testl %ecx, %ecx
+ jnz L(bsf_and_done_v1)
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %VMM(0)
+ vpmovmskb %VMM(0), %ecx
+ testl %ecx, %ecx
+ jnz L(bsf_and_done_v2)
+
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %VMM(0)
+ vpmovmskb %VMM(0), %ecx
+ testl %ecx, %ecx
+ jnz L(bsf_and_done_v3)
+
+ orq $(VEC_SIZE * 4 - 1), %rdi /* Make %rdi + 1 a multiple of VEC_SIZE * 4; the loop loads at offset + 1. */
+ .p2align 4,, 8
+L(loop_2x_vec): /* Despite the name, each iteration checks 4x VEC. */
+ VMOVA (VEC_SIZE * 0 + 1)(%rdi), %VMM(0)
+ VPMIN (VEC_SIZE * 1 + 1)(%rdi), %VMM(0), %VMM(1) /* VMM(1) = min of vectors 0 and 1. */
+ VMOVA (VEC_SIZE * 2 + 1)(%rdi), %VMM(2)
+ VPMIN (VEC_SIZE * 3 + 1)(%rdi), %VMM(2), %VMM(3) /* VMM(3) = min of vectors 2 and 3. */
+ VPMIN %VMM(1), %VMM(3), %VMM(3) /* A zero CHAR in any of the 4 vectors gives a zero CHAR here. */
+ VPCMPEQ %VMM(3), %VZERO, %VMM(3)
+ vpmovmskb %VMM(3), %r8d
+ subq $(VEC_SIZE * -4), %rdi /* %rdi += VEC_SIZE * 4. */
+ testl %r8d, %r8d
+ jz L(loop_2x_vec)
+
+ addq $(VEC_SIZE * -4 + 1), %rdi /* Undo the advance and the + 1 bias: %rdi = start of the 4x VEC block just checked. */
+
+ VPCMPEQ %VMM(0), %VZERO, %VMM(0)
+ vpmovmskb %VMM(0), %ecx
+ testl %ecx, %ecx
+ jnz L(bsf_and_done_v0)
+
+ VPCMPEQ %VMM(1), %VZERO, %VMM(1) /* VMM(1) is min of vectors 0 and 1; vector 0 has no zero so a match is in vector 1. */
+ vpmovmskb %VMM(1), %ecx
+ testl %ecx, %ecx
+ jnz L(bsf_and_done_v1)
+
+ VPCMPEQ %VMM(2), %VZERO, %VMM(2)
+ vpmovmskb %VMM(2), %ecx
+ testl %ecx, %ecx
+ jnz L(bsf_and_done_v2)
+
+ movl %r8d, %ecx /* No zero in vectors 0-2, so the combined mask is the mask of vector 3. */
+L(bsf_and_done_v3):
+ addq $VEC_SIZE, %rdi /* With the VEC_SIZE * 2 added below this totals VEC_SIZE * 3. */
+L(bsf_and_done_v2):
+ bsfl %ecx, %ecx
+ leaq (VEC_SIZE * 2)(%rdi, %rcx), %rdi
+ jmp L(strcat_strlen_done)
+
+ .p2align 4,, 4
+L(bsf_and_done_v1):
+ addq $VEC_SIZE, %rdi
+L(bsf_and_done_v0):
+ bsfl %ecx, %ecx /* %ecx = byte offset of the first null CHAR within the vector. */
+ addq %rcx, %rdi
+L(strcat_strlen_done):
@@ -1,12 +1,3 @@
-#ifndef STRCPY
-# define STRCPY __strcpy_avx2_rtm
-#endif
-
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
- ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
-
-#define SECTION(p) p##.avx.rtm
-
+#define STRCPY __strcpy_avx2_rtm
+#include "x86-avx-rtm-vecs.h"
#include "strcpy-avx2.S"
@@ -20,984 +20,378 @@
#if ISA_SHOULD_BUILD (3)
+# include <sysdep.h>
-# ifndef USE_AS_STRCAT
-# include <sysdep.h>
-
-# ifndef STRCPY
-# define STRCPY __strcpy_avx2
-# endif
-
-# endif
-
-/* Number of bytes in a vector register */
# ifndef VEC_SIZE
-# define VEC_SIZE 32
-# endif
-
-# ifndef VZEROUPPER
-# define VZEROUPPER vzeroupper
-# endif
-
-# ifndef SECTION
-# define SECTION(p) p##.avx
-# endif
-
-/* zero register */
-#define xmmZ xmm0
-#define ymmZ ymm0
-
-/* mask register */
-#define ymmM ymm1
-
-# ifndef USE_AS_STRCAT
-
- .section SECTION(.text),"ax",@progbits
-ENTRY (STRCPY)
-# ifdef USE_AS_STRNCPY
- mov %RDX_LP, %R8_LP
- test %R8_LP, %R8_LP
- jz L(ExitZero)
-# endif
- mov %rsi, %rcx
-# ifndef USE_AS_STPCPY
- mov %rdi, %rax /* save result */
-# endif
-
+# include "x86-avx-vecs.h"
# endif
- vpxor %xmmZ, %xmmZ, %xmmZ
-
- and $((VEC_SIZE * 4) - 1), %ecx
- cmp $(VEC_SIZE * 2), %ecx
- jbe L(SourceStringAlignmentLessTwoVecSize)
-
- and $-VEC_SIZE, %rsi
- and $(VEC_SIZE - 1), %ecx
-
- vpcmpeqb (%rsi), %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
- shr %cl, %rdx
-
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- mov $VEC_SIZE, %r10
- sub %rcx, %r10
- cmp %r10, %r8
-# else
- mov $(VEC_SIZE + 1), %r10
- sub %rcx, %r10
- cmp %r10, %r8
-# endif
- jbe L(CopyVecSizeTailCase2OrCase3)
+# ifndef STRCPY
+# define STRCPY __strcpy_avx2
# endif
- test %edx, %edx
- jnz L(CopyVecSizeTail)
- vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
- vpmovmskb %ymm2, %edx
+ /* Use movsb in page cross case to save code size. */
+# define USE_MOVSB_IN_PAGE_CROSS 1
-# ifdef USE_AS_STRNCPY
- add $VEC_SIZE, %r10
- cmp %r10, %r8
- jbe L(CopyTwoVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
- jnz L(CopyTwoVecSize)
-
- vmovdqu (%rsi, %rcx), %ymm2 /* copy VEC_SIZE bytes */
- vmovdqu %ymm2, (%rdi)
-
-/* If source address alignment != destination address alignment */
- .p2align 4
-L(UnalignVecSizeBoth):
- sub %rcx, %rdi
-# ifdef USE_AS_STRNCPY
- add %rcx, %r8
- sbb %rcx, %rcx
- or %rcx, %r8
-# endif
- mov $VEC_SIZE, %rcx
- vmovdqa (%rsi, %rcx), %ymm2
- vmovdqu %ymm2, (%rdi, %rcx)
- vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
- vpcmpeqb %ymm2, %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $(VEC_SIZE * 3), %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_WCSCPY
+# define VPCMPEQ vpcmpeqd
+# define VPMIN vpminud
+# define CHAR_SIZE 4
# else
- jnz L(CopyVecSize)
+# define VPCMPEQ vpcmpeqb
+# define VPMIN vpminub
+# define CHAR_SIZE 1
# endif
- vmovdqu %ymm2, (%rdi, %rcx)
- vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
- vpcmpeqb %ymm3, %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec3)
-# else
- jnz L(CopyVecSize)
-# endif
+# define PAGE_SIZE 4096
- vmovdqu %ymm3, (%rdi, %rcx)
- vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
- vpcmpeqb %ymm4, %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec4)
+# ifdef USE_AS_STPCPY
+# define END_REG rax
# else
- jnz L(CopyVecSize)
+# define END_REG rdi, %rdx
# endif
- vmovdqu %ymm4, (%rdi, %rcx)
- vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
- vpcmpeqb %ymm2, %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_STRCAT
+# define PAGE_ALIGN_REG ecx
# else
- jnz L(CopyVecSize)
+# define PAGE_ALIGN_REG eax
# endif
- vmovdqu %ymm2, (%rdi, %rcx)
- vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
- vpcmpeqb %ymm2, %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec2)
-# else
- jnz L(CopyVecSize)
-# endif
+# define VZERO VMM(7)
+# define VZERO_128 VMM_128(7)
- vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
- vmovdqu %ymm2, (%rdi, %rcx)
- vpcmpeqb %ymm3, %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec3)
-# else
- jnz L(CopyVecSize)
-# endif
+ .section SECTION(.text), "ax", @progbits
+ENTRY(STRCPY)
+ vpxor %VZERO_128, %VZERO_128, %VZERO_128
- vmovdqu %ymm3, (%rdi, %rcx)
- mov %rsi, %rdx
- lea VEC_SIZE(%rsi, %rcx), %rsi
- and $-(VEC_SIZE * 4), %rsi
- sub %rsi, %rdx
- sub %rdx, %rdi
-# ifdef USE_AS_STRNCPY
- lea (VEC_SIZE * 8)(%r8, %rdx), %r8
-# endif
-L(UnalignedFourVecSizeLoop):
- vmovdqa (%rsi), %ymm4
- vmovdqa VEC_SIZE(%rsi), %ymm5
- vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
- vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
- vpminub %ymm5, %ymm4, %ymm2
- vpminub %ymm7, %ymm6, %ymm3
- vpminub %ymm2, %ymm3, %ymm3
- vpcmpeqb %ymmM, %ymm3, %ymm3
- vpmovmskb %ymm3, %edx
-# ifdef USE_AS_STRNCPY
- sub $(VEC_SIZE * 4), %r8
- jbe L(UnalignedLeaveCase2OrCase3)
-# endif
- test %edx, %edx
- jnz L(UnalignedFourVecSizeLeave)
-
-L(UnalignedFourVecSizeLoop_start):
- add $(VEC_SIZE * 4), %rdi
- add $(VEC_SIZE * 4), %rsi
- vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi)
- vmovdqa (%rsi), %ymm4
- vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi)
- vmovdqa VEC_SIZE(%rsi), %ymm5
- vpminub %ymm5, %ymm4, %ymm2
- vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi)
- vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6
- vmovdqu %ymm7, -VEC_SIZE(%rdi)
- vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7
- vpminub %ymm7, %ymm6, %ymm3
- vpminub %ymm2, %ymm3, %ymm3
- vpcmpeqb %ymmM, %ymm3, %ymm3
- vpmovmskb %ymm3, %edx
-# ifdef USE_AS_STRNCPY
- sub $(VEC_SIZE * 4), %r8
- jbe L(UnalignedLeaveCase2OrCase3)
-# endif
- test %edx, %edx
- jz L(UnalignedFourVecSizeLoop_start)
-
-L(UnalignedFourVecSizeLeave):
- vpcmpeqb %ymm4, %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
- test %edx, %edx
- jnz L(CopyVecSizeUnaligned_0)
-
- vpcmpeqb %ymm5, %ymmZ, %ymmM
- vpmovmskb %ymmM, %ecx
- test %ecx, %ecx
- jnz L(CopyVecSizeUnaligned_16)
-
- vpcmpeqb %ymm6, %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
- test %edx, %edx
- jnz L(CopyVecSizeUnaligned_32)
-
- vpcmpeqb %ymm7, %ymmZ, %ymmM
- vpmovmskb %ymmM, %ecx
- bsf %ecx, %edx
- vmovdqu %ymm4, (%rdi)
- vmovdqu %ymm5, VEC_SIZE(%rdi)
- vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea (VEC_SIZE * 3)(%rdi, %rdx), %rax
-# endif
- vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
- add $(VEC_SIZE - 1), %r8
- sub %rdx, %r8
- lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- add $(VEC_SIZE * 3), %rsi
- add $(VEC_SIZE * 3), %rdi
- jmp L(CopyVecSizeExit)
+# ifdef USE_AS_STRCAT
+ movq %rdi, %rax
+# include "strcat-strlen-avx2.h.S"
# endif
-/* If source address alignment == destination address alignment */
-
-L(SourceStringAlignmentLessTwoVecSize):
- vmovdqu (%rsi), %ymm3
- vmovdqu VEC_SIZE(%rsi), %ymm2
- vpcmpeqb %ymm3, %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
-
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- cmp $VEC_SIZE, %r8
-# else
- cmp $(VEC_SIZE + 1), %r8
-# endif
- jbe L(CopyVecSizeTail1Case2OrCase3)
+ movl %esi, %PAGE_ALIGN_REG
+ andl $(PAGE_SIZE - 1), %PAGE_ALIGN_REG
+ cmpl $(PAGE_SIZE - VEC_SIZE), %PAGE_ALIGN_REG
+ ja L(page_cross)
+L(page_cross_continue):
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+ movq %rdi, %rax
# endif
- test %edx, %edx
- jnz L(CopyVecSizeTail1)
-
- vmovdqu %ymm3, (%rdi)
- vpcmpeqb %ymm2, %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
-
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- cmp $(VEC_SIZE * 2), %r8
-# else
- cmp $((VEC_SIZE * 2) + 1), %r8
-# endif
- jbe L(CopyTwoVecSize1Case2OrCase3)
-# endif
- test %edx, %edx
- jnz L(CopyTwoVecSize1)
-
- and $-VEC_SIZE, %rsi
- and $(VEC_SIZE - 1), %ecx
- jmp L(UnalignVecSizeBoth)
+ VMOVU (%rsi), %VMM(0)
+ VPCMPEQ %VMM(0), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
-/*------End of main part with loops---------------------*/
+ testl %ecx, %ecx
+ jz L(more_1x_vec)
-/* Case1 */
+ /* No longer need ymm registers so just vzeroupper so it doesn't
+ need to be duplicated at each return statement. */
+ COND_VZEROUPPER
-# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
- .p2align 4
-L(CopyVecSize):
- add %rcx, %rdi
-# endif
-L(CopyVecSizeTail):
- add %rcx, %rsi
-L(CopyVecSizeTail1):
- bsf %edx, %edx
-L(CopyVecSizeExit):
- cmp $32, %edx
- jae L(Exit32_63)
- cmp $16, %edx
- jae L(Exit16_31)
- cmp $8, %edx
- jae L(Exit8_15)
- cmp $4, %edx
- jae L(Exit4_7)
- cmp $3, %edx
- je L(Exit3)
- cmp $1, %edx
- ja L(Exit2)
- je L(Exit1)
- movb $0, (%rdi)
+ xorl %edx, %edx
+ bsfl %ecx, %edx
# ifdef USE_AS_STPCPY
- lea (%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $1, %r8
- lea 1(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN
-
- .p2align 4
-L(CopyTwoVecSize1):
- add $VEC_SIZE, %rsi
- add $VEC_SIZE, %rdi
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $VEC_SIZE, %r8
-# endif
- jmp L(CopyVecSizeTail1)
-
- .p2align 4
-L(CopyTwoVecSize):
- bsf %edx, %edx
- add %rcx, %rsi
- add $VEC_SIZE, %edx
- sub %ecx, %edx
- jmp L(CopyVecSizeExit)
-
- .p2align 4
-L(CopyVecSizeUnaligned_0):
- bsf %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
- vmovdqu %ymm4, (%rdi)
- add $((VEC_SIZE * 4) - 1), %r8
- sub %rdx, %r8
- lea 1(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
-# else
- jmp L(CopyVecSizeExit)
-# endif
-
- .p2align 4
-L(CopyVecSizeUnaligned_16):
- bsf %ecx, %edx
- vmovdqu %ymm4, (%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea VEC_SIZE(%rdi, %rdx), %rax
-# endif
- vmovdqu %ymm5, VEC_SIZE(%rdi)
- add $((VEC_SIZE * 3) - 1), %r8
- sub %rdx, %r8
- lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
+ leaq (%rdi, %rdx), %rax
+# endif
+
+ /* Use mask bits in rcx to detect which copy we need. If the low
+ mask is zero then there must be a bit set in the upper half.
+ I.e if ecx != 0 and cx == 0, then match must be upper 16
+ bits so we use L(copy_16_31). */
+ testw %cx, %cx
+ jz L(copy_16_31)
+
+ testb %cl, %cl
+ jz L(copy_8_15)
+# ifdef USE_AS_WCSCPY
+ vmovd %xmm0, (%rdi)
+ movl $0, (%END_REG)
+ ret
# else
- add $VEC_SIZE, %rsi
- add $VEC_SIZE, %rdi
- jmp L(CopyVecSizeExit)
-# endif
-
- .p2align 4
-L(CopyVecSizeUnaligned_32):
- bsf %edx, %edx
- vmovdqu %ymm4, (%rdi)
- vmovdqu %ymm5, VEC_SIZE(%rdi)
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
-# ifdef USE_AS_STPCPY
- lea (VEC_SIZE * 2)(%rdi, %rdx), %rax
-# endif
- vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
- add $((VEC_SIZE * 2) - 1), %r8
- sub %rdx, %r8
- lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
- jmp L(StrncpyFillTailWithZero)
+ testb $0x7, %cl
+ jz L(copy_4_7)
+
+ testl %edx, %edx
+ jz L(set_null_term)
+ vmovd %xmm0, %ecx
+ movw %cx, (%rdi)
+
+ .p2align 4,, 2
+L(set_null_term):
+ movb $0, (%END_REG)
+ ret
+
+ .p2align 4,, 12
+L(copy_4_7):
+ movl -3(%rsi, %rdx), %ecx
+ vmovd %xmm0, (%rdi)
+ movl %ecx, -3(%END_REG)
+ ret
+# endif
+
+ .p2align 4,, 10
+L(copy_16_31):
+ VMOVU -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
+ VMOVU %xmm0, (%rdi)
+ VMOVU %xmm1, -(16 - CHAR_SIZE)(%END_REG)
+ ret
+
+ .p2align 4,, 10
+L(copy_8_15):
+# ifdef USE_AS_WCSCPY
+ movl -(8 - CHAR_SIZE)(%rsi, %rdx), %ecx
# else
- add $(VEC_SIZE * 2), %rsi
- add $(VEC_SIZE * 2), %rdi
- jmp L(CopyVecSizeExit)
-# endif
-
-# ifdef USE_AS_STRNCPY
-# ifndef USE_AS_STRCAT
- .p2align 4
-L(CopyVecSizeUnalignedVec6):
- vmovdqu %ymm6, (%rdi, %rcx)
- jmp L(CopyVecSizeVecExit)
-
- .p2align 4
-L(CopyVecSizeUnalignedVec5):
- vmovdqu %ymm5, (%rdi, %rcx)
- jmp L(CopyVecSizeVecExit)
-
- .p2align 4
-L(CopyVecSizeUnalignedVec4):
- vmovdqu %ymm4, (%rdi, %rcx)
- jmp L(CopyVecSizeVecExit)
-
- .p2align 4
-L(CopyVecSizeUnalignedVec3):
- vmovdqu %ymm3, (%rdi, %rcx)
- jmp L(CopyVecSizeVecExit)
-# endif
-
-/* Case2 */
-
- .p2align 4
-L(CopyVecSizeCase2):
- add $VEC_SIZE, %r8
- add %rcx, %rdi
- add %rcx, %rsi
- bsf %edx, %edx
- cmp %r8d, %edx
- jb L(CopyVecSizeExit)
- jmp L(StrncpyExit)
-
- .p2align 4
-L(CopyTwoVecSizeCase2):
- add %rcx, %rsi
- bsf %edx, %edx
- add $VEC_SIZE, %edx
- sub %ecx, %edx
- cmp %r8d, %edx
- jb L(CopyVecSizeExit)
- jmp L(StrncpyExit)
-
-L(CopyVecSizeTailCase2):
- add %rcx, %rsi
- bsf %edx, %edx
- cmp %r8d, %edx
- jb L(CopyVecSizeExit)
- jmp L(StrncpyExit)
-
-L(CopyVecSizeTail1Case2):
- bsf %edx, %edx
- cmp %r8d, %edx
- jb L(CopyVecSizeExit)
- jmp L(StrncpyExit)
-
-/* Case2 or Case3, Case3 */
-
- .p2align 4
-L(CopyVecSizeCase2OrCase3):
- test %rdx, %rdx
- jnz L(CopyVecSizeCase2)
-L(CopyVecSizeCase3):
- add $VEC_SIZE, %r8
- add %rcx, %rdi
- add %rcx, %rsi
- jmp L(StrncpyExit)
-
- .p2align 4
-L(CopyTwoVecSizeCase2OrCase3):
- test %rdx, %rdx
- jnz L(CopyTwoVecSizeCase2)
- add %rcx, %rsi
- jmp L(StrncpyExit)
-
- .p2align 4
-L(CopyVecSizeTailCase2OrCase3):
- test %rdx, %rdx
- jnz L(CopyVecSizeTailCase2)
- add %rcx, %rsi
- jmp L(StrncpyExit)
-
- .p2align 4
-L(CopyTwoVecSize1Case2OrCase3):
- add $VEC_SIZE, %rdi
- add $VEC_SIZE, %rsi
- sub $VEC_SIZE, %r8
-L(CopyVecSizeTail1Case2OrCase3):
- test %rdx, %rdx
- jnz L(CopyVecSizeTail1Case2)
- jmp L(StrncpyExit)
-# endif
-
-/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/
-
- .p2align 4
-L(Exit1):
- movzwl (%rsi), %edx
- mov %dx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 1(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $2, %r8
- lea 2(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER_RETURN
-
- .p2align 4
-L(Exit2):
- movzwl (%rsi), %ecx
- mov %cx, (%rdi)
- movb $0, 2(%rdi)
-# ifdef USE_AS_STPCPY
- lea 2(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $3, %r8
- lea 3(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER_RETURN
-
- .p2align 4
-L(Exit3):
- mov (%rsi), %edx
- mov %edx, (%rdi)
+ movq -(8 - CHAR_SIZE)(%rsi, %rdx), %rcx
+# endif
+ vmovq %xmm0, (%rdi)
+ movq %rcx, -(8 - CHAR_SIZE)(%END_REG)
+ ret
+
+
+ /* No null terminator was found in the first VEC (VMM(0)). Copy it,
+ then check and copy up to four more VECs using aligned loads from
+ src before entering the 4x-unrolled loop. */
+ .p2align 4,, 8
+L(more_1x_vec):
+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+ VMOVU %VMM(0), (%rdi)
+# endif
+ /* rsi |= VEC_SIZE - 1 leaves rsi on the last byte of its VEC, so
+ `1(%rsi)` is the next VEC-aligned address. rdi is moved by the
+ same amount (sub old rsi, add new rsi) so dst - src is kept. */
+ subq %rsi, %rdi
+ orq $(VEC_SIZE - 1), %rsi
+ addq %rsi, %rdi
+ VMOVA 1(%rsi), %VMM(1)
+
+ /* Try and order stores after as many loads as is reasonable to
+ avoid potential false dependencies. */
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+ VMOVU %VMM(0), (%rax)
+# endif
+ VPCMPEQ %VMM(1), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1)
+
+ /* Each step below loads the next VEC, stores the previous one
+ (already known to hold no terminator) and tests the new one. */
+ VMOVA (VEC_SIZE + 1)(%rsi), %VMM(2)
+ VMOVU %VMM(1), 1(%rdi)
+
+ VPCMPEQ %VMM(2), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x2)
+
+ VMOVA (VEC_SIZE * 2 + 1)(%rsi), %VMM(3)
+ VMOVU %VMM(2), (VEC_SIZE + 1)(%rdi)
+
+ VPCMPEQ %VMM(3), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x3)
+
+ /* NB: the mask for the 4th VEC goes in edx, not ecx; that is the
+ register L(ret_vec_x4) runs bsf on. */
+ VMOVA (VEC_SIZE * 3 + 1)(%rsi), %VMM(4)
+ VMOVU %VMM(3), (VEC_SIZE * 2 + 1)(%rdi)
+ VPCMPEQ %VMM(4), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %edx
+ testl %edx, %edx
+ jnz L(ret_vec_x4)
+
+ VMOVU %VMM(4), (VEC_SIZE * 3 + 1)(%rdi)
+
+ /* Subtract rsi from rdi before aligning. Adding back rsi will
+ get proper rdi (dst) for new src. */
+ subq %rsi, %rdi
+ incq %rsi
+ orq $(VEC_SIZE * 4 - 1), %rsi
+
+ /* Do first half of loop ahead of time so loop can just start by
+ storing. */
+ VMOVA (VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
+ VMOVA (VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
+ VMOVA (VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
+ VMOVA (VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
+
+ /* Element-wise minimum of the four VECs is zero exactly where at
+ least one of them holds a zero character, so a single compare
+ and mask covers all four. */
+ VPMIN %VMM(0), %VMM(1), %VMM(4)
+ VPMIN %VMM(2), %VMM(3), %VMM(6)
+ VPMIN %VMM(4), %VMM(6), %VMM(6)
+ VPCMPEQ %VMM(6), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %edx
+ addq %rsi, %rdi
+
+ testl %edx, %edx
+ jnz L(loop_4x_done)
+
+ /* Main loop: on entry VMM(0)-VMM(3) hold four VECs already known
+ to contain no terminator. Store them, load the next four, and
+ repeat until the combined mask (edx) is non-zero. */
+ .p2align 4,, 11
+L(loop_4x_vec):
+
+ VMOVU %VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
+ VMOVU %VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
+ /* Subtracting -4 * VEC_SIZE advances rsi by four VECs. */
+ subq $(VEC_SIZE * -4), %rsi
+ VMOVU %VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
+ VMOVU %VMM(3), (VEC_SIZE * 3 + 1)(%rdi)
+
+
+ VMOVA (VEC_SIZE * 0 + 1)(%rsi), %VMM(0)
+ VMOVA (VEC_SIZE * 1 + 1)(%rsi), %VMM(1)
+ VMOVA (VEC_SIZE * 2 + 1)(%rsi), %VMM(2)
+ VMOVA (VEC_SIZE * 3 + 1)(%rsi), %VMM(3)
+
+ /* Minimum of all four VECs is zero wherever any of them has a
+ zero character. */
+ VPMIN %VMM(0), %VMM(1), %VMM(4)
+ VPMIN %VMM(2), %VMM(3), %VMM(6)
+ VPMIN %VMM(4), %VMM(6), %VMM(6)
+ VPCMPEQ %VMM(6), %VZERO, %VMM(6)
+
+ vpmovmskb %VMM(6), %edx
+ /* rdi is advanced only now, after the stores above used the old
+ value; it stays in step with rsi for L(loop_4x_done). */
+ subq $(VEC_SIZE * -4), %rdi
+ testl %edx, %edx
+ jz L(loop_4x_vec)
+
+ /* One of VMM(0)-VMM(3) holds the terminator (combined mask in
+ edx). Test them in order, storing each VEC that has none. */
+L(loop_4x_done):
+ VPCMPEQ %VMM(0), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1)
+ VMOVU %VMM(0), (VEC_SIZE * 0 + 1)(%rdi)
+
+ VPCMPEQ %VMM(1), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x2)
+ VMOVU %VMM(1), (VEC_SIZE * 1 + 1)(%rdi)
+
+ VPCMPEQ %VMM(2), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x3)
+ VMOVU %VMM(2), (VEC_SIZE * 2 + 1)(%rdi)
+ /* Falling through, the first three VECs had no terminator, so the
+ set bits of the combined mask in edx all come from the fourth
+ VEC. L(ret_vec_x4) is also entered directly with edx holding
+ that VEC's own mask. */
+L(ret_vec_x4):
+ bsfl %edx, %edx
+ /* Copy the VEC_SIZE bytes that end with the terminating
+ character; this overlaps data that was already stored. */
+ VMOVU ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+ VMOVU %VMM(1), ((VEC_SIZE * 3 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
# ifdef USE_AS_STPCPY
- lea 3(%rdi), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub $4, %r8
- lea 4(%rdi), %rdi
- jnz L(StrncpyFillTailWithZero)
+ leaq (VEC_SIZE * 3 + 1)(%rdx, %rdi), %rax
# endif
+L(return_end):
VZEROUPPER_RETURN
- .p2align 4
-L(Exit4_7):
- mov (%rsi), %ecx
- mov %ecx, (%rdi)
- mov -3(%rsi, %rdx), %ecx
- mov %ecx, -3(%rdi, %rdx)
+ .p2align 4,, 8
+L(ret_vec_x1):
+ bsfl %ecx, %ecx
+ VMOVU (1 -(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+ VMOVU %VMM(1), (1 -(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub %rdx, %r8
- sub $1, %r8
- lea 1(%rdi, %rdx), %rdi
- jnz L(StrncpyFillTailWithZero)
+ leaq 1(%rcx, %rdi), %rax
# endif
- VZEROUPPER_RETURN
-
- .p2align 4
-L(Exit8_15):
- mov (%rsi), %rcx
- mov -7(%rsi, %rdx), %r9
- mov %rcx, (%rdi)
- mov %r9, -7(%rdi, %rdx)
-# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub %rdx, %r8
- sub $1, %r8
- lea 1(%rdi, %rdx), %rdi
- jnz L(StrncpyFillTailWithZero)
-# endif
- VZEROUPPER_RETURN
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
- .p2align 4
-L(Exit16_31):
- vmovdqu (%rsi), %xmm2
- vmovdqu -15(%rsi, %rdx), %xmm3
- vmovdqu %xmm2, (%rdi)
- vmovdqu %xmm3, -15(%rdi, %rdx)
+ .p2align 4,, 8
+L(ret_vec_x2):
+ bsfl %ecx, %ecx
+ VMOVU ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+ VMOVU %VMM(1), ((VEC_SIZE + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub %rdx, %r8
- sub $1, %r8
- lea 1(%rdi, %rdx), %rdi
- jnz L(StrncpyFillTailWithZero)
+ leaq (VEC_SIZE * 1 + 1)(%rcx, %rdi), %rax
# endif
VZEROUPPER_RETURN
- .p2align 4
-L(Exit32_63):
- vmovdqu (%rsi), %ymm2
- vmovdqu -31(%rsi, %rdx), %ymm3
- vmovdqu %ymm2, (%rdi)
- vmovdqu %ymm3, -31(%rdi, %rdx)
+ .p2align 4,, 8
+L(ret_vec_x3):
+ bsfl %ecx, %ecx
+ VMOVU ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rcx), %VMM(1)
+ VMOVU %VMM(1), ((VEC_SIZE * 2 + 1)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rcx)
# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- sub %rdx, %r8
- sub $1, %r8
- lea 1(%rdi, %rdx), %rdi
- jnz L(StrncpyFillTailWithZero)
+ leaq (VEC_SIZE * 2 + 1)(%rcx, %rdi), %rax
# endif
VZEROUPPER_RETURN
-# ifdef USE_AS_STRNCPY
- .p2align 4
-L(StrncpyExit1):
- movzbl (%rsi), %edx
- mov %dl, (%rdi)
+ /* Reached when an unaligned VEC load at rsi could touch the next
+ page. Check for the terminator with a load from the enclosing
+ VEC-aligned address instead. */
+ .p2align 4,, 4
+L(page_cross):
+ /* rcx = rsi rounded down to a VEC_SIZE boundary. */
+ movq %rsi, %rcx
+ andq $(VEC_SIZE * -1), %rcx
+
+ VPCMPEQ (%rcx), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ /* Shift out the mask bits for bytes that precede rsi, so bit 0
+ now corresponds to the byte at rsi. */
+ shrxl %esi, %ecx, %ecx
+# if USE_MOVSB_IN_PAGE_CROSS
+ /* Optimizing more aggressively for space as this is very cold
+ code. This saves 2x cache lines. */
+
+ /* Shifting the mask left by CHAR_SIZE adds one character to the
+ later bsf result, which gives the correct copy bound (the
+ terminator is included). NB: this can never zero-out a
+ non-zero RCX as to be in the page cross case rsi cannot be
+ aligned and we already right-shift rcx by the misalignment. */
+ shll $CHAR_SIZE, %ecx
+ /* Zero mask: no terminator before the end of this VEC, so the
+ normal path is taken. */
+ jz L(page_cross_continue)
+ /* ecx = number of bytes to copy, used as the `rep movsb` count. */
+ bsfl %ecx, %ecx
+# if !defined USE_AS_STPCPY && !defined USE_AS_STRCAT
+ movq %rdi, %rax
+# endif
+ rep movsb
# ifdef USE_AS_STPCPY
- lea 1(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 1(%rdi)
+ leaq -CHAR_SIZE(%rdi), %rax
# endif
- VZEROUPPER_RETURN
- .p2align 4
-L(StrncpyExit2):
- movzwl (%rsi), %edx
- mov %dx, (%rdi)
-# ifdef USE_AS_STPCPY
- lea 2(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 2(%rdi)
-# endif
VZEROUPPER_RETURN
- .p2align 4
-L(StrncpyExit3_4):
- movzwl (%rsi), %ecx
- movzwl -2(%rsi, %r8), %edx
- mov %cx, (%rdi)
- mov %dx, -2(%rdi, %r8)
-# ifdef USE_AS_STPCPY
- lea (%rdi, %r8), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, (%rdi, %r8)
-# endif
- VZEROUPPER_RETURN
-
- .p2align 4
-L(StrncpyExit5_8):
- mov (%rsi), %ecx
- mov -4(%rsi, %r8), %edx
- mov %ecx, (%rdi)
- mov %edx, -4(%rdi, %r8)
-# ifdef USE_AS_STPCPY
- lea (%rdi, %r8), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, (%rdi, %r8)
-# endif
- VZEROUPPER_RETURN
-
- .p2align 4
-L(StrncpyExit9_16):
- mov (%rsi), %rcx
- mov -8(%rsi, %r8), %rdx
- mov %rcx, (%rdi)
- mov %rdx, -8(%rdi, %r8)
-# ifdef USE_AS_STPCPY
- lea (%rdi, %r8), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, (%rdi, %r8)
-# endif
- VZEROUPPER_RETURN
-
- .p2align 4
-L(StrncpyExit17_32):
- vmovdqu (%rsi), %xmm2
- vmovdqu -16(%rsi, %r8), %xmm3
- vmovdqu %xmm2, (%rdi)
- vmovdqu %xmm3, -16(%rdi, %r8)
-# ifdef USE_AS_STPCPY
- lea (%rdi, %r8), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, (%rdi, %r8)
-# endif
- VZEROUPPER_RETURN
-
- .p2align 4
-L(StrncpyExit33_64):
- /* 0/32, 31/16 */
- vmovdqu (%rsi), %ymm2
- vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3
- vmovdqu %ymm2, (%rdi)
- vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8)
-# ifdef USE_AS_STPCPY
- lea (%rdi, %r8), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, (%rdi, %r8)
-# endif
- VZEROUPPER_RETURN
-
- .p2align 4
-L(StrncpyExit65):
- /* 0/32, 32/32, 64/1 */
- vmovdqu (%rsi), %ymm2
- vmovdqu 32(%rsi), %ymm3
- mov 64(%rsi), %cl
- vmovdqu %ymm2, (%rdi)
- vmovdqu %ymm3, 32(%rdi)
- mov %cl, 64(%rdi)
-# ifdef USE_AS_STPCPY
- lea 65(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, 65(%rdi)
-# endif
- VZEROUPPER_RETURN
+# else
+ testl %ecx, %ecx
+ jz L(page_cross_continue)
+ /* Traditional copy case, essentially same as used in non-page-
+ cross case but since we can't reuse VMM(0) we need twice as
+ many loads from rsi. */
# ifndef USE_AS_STRCAT
-
- .p2align 4
-L(Fill1):
- mov %dl, (%rdi)
- VZEROUPPER_RETURN
-
- .p2align 4
-L(Fill2):
- mov %dx, (%rdi)
- VZEROUPPER_RETURN
-
- .p2align 4
-L(Fill3_4):
- mov %dx, (%rdi)
- mov %dx, -2(%rdi, %r8)
- VZEROUPPER_RETURN
-
- .p2align 4
-L(Fill5_8):
- mov %edx, (%rdi)
- mov %edx, -4(%rdi, %r8)
- VZEROUPPER_RETURN
-
- .p2align 4
-L(Fill9_16):
- mov %rdx, (%rdi)
- mov %rdx, -8(%rdi, %r8)
- VZEROUPPER_RETURN
-
- .p2align 4
-L(Fill17_32):
- vmovdqu %xmmZ, (%rdi)
- vmovdqu %xmmZ, -16(%rdi, %r8)
- VZEROUPPER_RETURN
-
- .p2align 4
-L(CopyVecSizeUnalignedVec2):
- vmovdqu %ymm2, (%rdi, %rcx)
-
- .p2align 4
-L(CopyVecSizeVecExit):
- bsf %edx, %edx
- add $(VEC_SIZE - 1), %r8
- add %rcx, %rdi
-# ifdef USE_AS_STPCPY
- lea (%rdi, %rdx), %rax
-# endif
- sub %rdx, %r8
- lea 1(%rdi, %rdx), %rdi
-
- .p2align 4
-L(StrncpyFillTailWithZero):
- xor %edx, %edx
- sub $VEC_SIZE, %r8
- jbe L(StrncpyFillExit)
-
- vmovdqu %ymmZ, (%rdi)
- add $VEC_SIZE, %rdi
-
- mov %rdi, %rsi
- and $(VEC_SIZE - 1), %esi
- sub %rsi, %rdi
- add %rsi, %r8
- sub $(VEC_SIZE * 4), %r8
- jb L(StrncpyFillLessFourVecSize)
-
-L(StrncpyFillLoopVmovdqa):
- vmovdqa %ymmZ, (%rdi)
- vmovdqa %ymmZ, VEC_SIZE(%rdi)
- vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi)
- vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi)
- add $(VEC_SIZE * 4), %rdi
- sub $(VEC_SIZE * 4), %r8
- jae L(StrncpyFillLoopVmovdqa)
-
-L(StrncpyFillLessFourVecSize):
- add $(VEC_SIZE * 2), %r8
- jl L(StrncpyFillLessTwoVecSize)
- vmovdqa %ymmZ, (%rdi)
- vmovdqa %ymmZ, VEC_SIZE(%rdi)
- add $(VEC_SIZE * 2), %rdi
- sub $VEC_SIZE, %r8
- jl L(StrncpyFillExit)
- vmovdqa %ymmZ, (%rdi)
- add $VEC_SIZE, %rdi
- jmp L(Fill)
-
- .p2align 4
-L(StrncpyFillLessTwoVecSize):
- add $VEC_SIZE, %r8
- jl L(StrncpyFillExit)
- vmovdqa %ymmZ, (%rdi)
- add $VEC_SIZE, %rdi
- jmp L(Fill)
-
- .p2align 4
-L(StrncpyFillExit):
- add $VEC_SIZE, %r8
-L(Fill):
- cmp $17, %r8d
- jae L(Fill17_32)
- cmp $9, %r8d
- jae L(Fill9_16)
- cmp $5, %r8d
- jae L(Fill5_8)
- cmp $3, %r8d
- jae L(Fill3_4)
- cmp $1, %r8d
- ja L(Fill2)
- je L(Fill1)
- VZEROUPPER_RETURN
-
-/* end of ifndef USE_AS_STRCAT */
+ xorl %edx, %edx
# endif
-
- .p2align 4
-L(UnalignedLeaveCase2OrCase3):
- test %rdx, %rdx
- jnz L(UnalignedFourVecSizeLeaveCase2)
-L(UnalignedFourVecSizeLeaveCase3):
- lea (VEC_SIZE * 4)(%r8), %rcx
- and $-VEC_SIZE, %rcx
- add $(VEC_SIZE * 3), %r8
- jl L(CopyVecSizeCase3)
- vmovdqu %ymm4, (%rdi)
- sub $VEC_SIZE, %r8
- jb L(CopyVecSizeCase3)
- vmovdqu %ymm5, VEC_SIZE(%rdi)
- sub $VEC_SIZE, %r8
- jb L(CopyVecSizeCase3)
- vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
- sub $VEC_SIZE, %r8
- jb L(CopyVecSizeCase3)
- vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi)
+ bsfl %ecx, %edx
# ifdef USE_AS_STPCPY
- lea (VEC_SIZE * 4)(%rdi), %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, (VEC_SIZE * 4)(%rdi)
+ leaq (%rdi, %rdx), %rax
+# elif !defined USE_AS_STRCAT
+ movq %rdi, %rax
# endif
- VZEROUPPER_RETURN
- .p2align 4
-L(UnalignedFourVecSizeLeaveCase2):
- xor %ecx, %ecx
- vpcmpeqb %ymm4, %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
- add $(VEC_SIZE * 3), %r8
- jle L(CopyVecSizeCase2OrCase3)
- test %edx, %edx
-# ifndef USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec4)
-# else
- jnz L(CopyVecSize)
-# endif
- vpcmpeqb %ymm5, %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
- vmovdqu %ymm4, (%rdi)
- add $VEC_SIZE, %rcx
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
- test %edx, %edx
-# ifndef USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec5)
-# else
- jnz L(CopyVecSize)
-# endif
+ /* vzeroupper early to avoid duplicating at each return. */
+ COND_VZEROUPPER
- vpcmpeqb %ymm6, %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
- vmovdqu %ymm5, VEC_SIZE(%rdi)
- add $VEC_SIZE, %rcx
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
- test %edx, %edx
-# ifndef USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec6)
-# else
- jnz L(CopyVecSize)
-# endif
+ testw %cx, %cx
+ jz L(page_cross_copy_16_31)
- vpcmpeqb %ymm7, %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
- vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi)
- lea VEC_SIZE(%rdi, %rcx), %rdi
- lea VEC_SIZE(%rsi, %rcx), %rsi
- bsf %edx, %edx
- cmp %r8d, %edx
- jb L(CopyVecSizeExit)
-L(StrncpyExit):
- cmp $65, %r8d
- je L(StrncpyExit65)
- cmp $33, %r8d
- jae L(StrncpyExit33_64)
- cmp $17, %r8d
- jae L(StrncpyExit17_32)
- cmp $9, %r8d
- jae L(StrncpyExit9_16)
- cmp $5, %r8d
- jae L(StrncpyExit5_8)
- cmp $3, %r8d
- jae L(StrncpyExit3_4)
- cmp $1, %r8d
- ja L(StrncpyExit2)
- je L(StrncpyExit1)
-# ifdef USE_AS_STPCPY
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRCAT
- movb $0, (%rdi)
-# endif
- VZEROUPPER_RETURN
-
- .p2align 4
-L(ExitZero):
-# ifndef USE_AS_STRCAT
- mov %rdi, %rax
-# endif
- VZEROUPPER_RETURN
+ testb %cl, %cl
+ jz L(page_cross_copy_8_15)
-# endif
+ /* cl != 0 here, so the terminator is within the first 8 bytes. If
+ the low three mask bits are clear it sits at index 3..7 and the
+ 4..7-byte overlapping copy applies. NB: this must be the
+ byte-sized `testb`: %cl is an 8-bit register, and a 32-bit
+ `testl` on it is an operand-size mismatch that fails to
+ assemble once this !USE_MOVSB_IN_PAGE_CROSS branch is enabled.
+ It now matches the test used by the non-page-cross path. */
+ testb $0x7, %cl
+ jz L(page_cross_copy_4_7)
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# else
-END (STRCAT)
-# endif
+ /* Small-copy tails for the page-cross path. edx is the bsf result
+ (index of the terminator). END_REG is defined outside this
+ view; from its use here it appears to address dst + edx, the
+ terminator's position in dst -- TODO confirm. Each case uses two
+ overlapping fixed-size moves, one from the start and one ending
+ at the terminator. */
+ testl %edx, %edx
+ jz L(page_cross_set_null_term)
+ /* edx is 1 or 2 here: copy the first two bytes, then write the
+ terminator explicitly below. */
+ movzwl (%rsi), %ecx
+ movw %cx, (%rdi)
+L(page_cross_set_null_term):
+ movb $0, (%END_REG)
+ ret
+
+ .p2align 4,, 4
+L(page_cross_copy_4_7):
+ /* Both loads are done before either store; rsi is free to be
+ reused as scratch since src is not needed afterwards. */
+ movl (%rsi), %ecx
+ movl -3(%rsi, %rdx), %esi
+ movl %ecx, (%rdi)
+ movl %esi, -3(%END_REG)
+ ret
+
+ .p2align 4,, 4
+L(page_cross_copy_8_15):
+ movq (%rsi), %rcx
+ movq -7(%rsi, %rdx), %rsi
+ movq %rcx, (%rdi)
+ movq %rsi, -7(%END_REG)
+ ret
+
+
+ .p2align 4,, 3
+L(page_cross_copy_16_31):
+ VMOVU (%rsi), %xmm0
+ VMOVU -15(%rsi, %rdx), %xmm1
+ VMOVU %xmm0, (%rdi)
+ VMOVU %xmm1, -15(%END_REG)
+ ret
+# endif
+
+END(STRCPY)
#endif
@@ -1,3 +1,3 @@
-#define USE_AS_STRNCAT
-#define STRCAT __strncat_avx2_rtm
-#include "strcat-avx2-rtm.S"
+#define STRNCAT __strncat_avx2_rtm
+#include "x86-avx-rtm-vecs.h"
+#include "strncat-avx2.S"
@@ -1,7 +1,419 @@
-#ifndef STRNCAT
-# define STRNCAT __strncat_avx2
-#endif
+/* strncat with AVX2
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (3)
+
+# include <sysdep.h>
+
+# ifndef VEC_SIZE
+# include "x86-avx-vecs.h"
+# endif
+
+# ifndef STRNCAT
+# define STRNCAT __strncat_avx2
+# endif
+
+# ifdef USE_AS_WCSCPY
+# define MOVCHAR movl
+# define VPCMPEQ vpcmpeqd
+# define VPMIN vpminud
+# define CHAR_SIZE 4
+# else
+# define MOVCHAR movb
+# define VPCMPEQ vpcmpeqb
+# define VPMIN vpminub
+# define CHAR_SIZE 1
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE 4096
+
+# define VZERO VMM(7)
+# define VZERO_128 VMM_128(7)
+
+ .section SECTION(.text), "ax", @progbits
+ENTRY(STRNCAT)
+ /* Filter zero length strings and very long strings. Zero
+ length strings just return, very long strings are handled by
+ using the non-length variant {wcs|str}cat.
+
+ In: rdi = dst, rsi = src, rdx = maximum number of characters
+ to append. Out: rax = dst. */
+ movq %rdi, %rax
+# ifdef USE_AS_WCSCPY
+ /* (n - 1) >> 56 is non-zero both for n == 0 (wraps to all ones)
+ and for lengths too large to be a real bound; L(zero_len)
+ tells the two apart. Otherwise convert n to a byte count. */
+ leaq -1(%rdx), %rcx
+ shr $56, %rcx
+ jnz L(zero_len)
+ salq $2, %rdx
+# else
+ /* `jle` (not `jl`): n == 0 must branch too, otherwise a zero
+ length call falls through and needlessly scans dst and reads
+ src. L(zero_len) uses the flags of this `test` to return for
+ n == 0 and to use the non-length variant when the sign bit is
+ set. */
+ test %rdx, %rdx
+ jle L(zero_len)
+# endif
+ /* All-zero vector used as the comparand for terminator checks. */
+ vpxor %VZERO_128, %VZERO_128, %VZERO_128
+
+# include "strcat-strlen-avx2.h.S"
+
+ /* If src's offset within its page is above PAGE_SIZE - VEC_SIZE,
+ a full unaligned VEC load from rsi could touch the next page,
+ so take the careful path. */
+ movl %esi, %ecx
+ andl $(PAGE_SIZE - 1), %ecx
+ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
+ ja L(page_cross)
+L(page_cross_continue):
+ /* ecx = mask of zero characters in the first VEC of src. */
+ VMOVU (%rsi), %VMM(0)
+ VPCMPEQ %VMM(0), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+
+ /* r8 = byte index of the first terminator (tzcnt gives the
+ operand width, 32, for a zero mask). If the length limit in
+ rdx is reached no later than that, copy exactly rdx bytes. */
+ tzcnt %ecx, %r8d
+ cmpq %r8, %rdx
+ jbe L(less_1x_vec)
+
+ /* Limit is beyond this VEC's terminator position; with no
+ terminator at all, more VECs are needed. */
+ testl %ecx, %ecx
+ jz L(more_1x_vec)
+
+ /* Hoist this to save code size. */
+
+ /* Terminator comes before the limit: the copy length becomes the
+ terminator index and the code falls into L(less_1x_vec). */
+ movl %r8d, %edx
+
+ /* Copy edx bytes (at most one VEC) from src and append the
+ terminator at dst + edx. The first VEC of src is in VMM(0); the
+ code below also reads it as xmm0, i.e. it assumes VMM(0) is
+ ymm0 -- TODO confirm against the vecs header. */
+L(less_1x_vec):
+ COND_VZEROUPPER
+
+ cmpl $16, %edx
+ jae L(copy_16_31)
+ cmpl $8, %edx
+ jae L(copy_8_15)
+
+
+# ifdef USE_AS_WCSCPY
+ /* Fewer than 8 bytes left: store the low 4 bytes of the vector,
+ then the terminating character. */
+ vmovd %VMM_128(0), (%rdi)
+ MOVCHAR $0, (%rdi, %rdx)
+ ret
+# else
+ cmpl $4, %edx
+ jae L(copy_4_7)
+
+ /* 0..3 bytes. Byte 0 is loaded now but stored at
+ L(set_null_term); for edx <= 1 nothing else is needed. */
+ movzbl (%rsi), %ecx
+ cmpl $1, %edx
+ jbe L(set_null_term)
+
+ /* NB: make this `vmovw` if support for AVX512-FP16 is added.
+ */
+ /* edx is 2 or 3: copy bytes 1..2. When edx == 2 the extra byte
+ written at dst + 2 is overwritten by the terminator below. */
+ movzwl 1(%rsi), %esi
+ movw %si, 1(%rdi)
+
+ .p2align 4,, 1
+L(set_null_term):
+ /* With edx == 0 the terminator store overwrites this byte. */
+ movb %cl, (%rdi)
+ MOVCHAR $0, (%rdi, %rdx)
+ ret
+
+ .p2align 4,, 11
+L(copy_4_7):
+ /* Two overlapping 4-byte moves: first 4 bytes and last 4 bytes. */
+ movl -(4)(%rsi, %rdx), %ecx
+ vmovd %xmm0, (%rdi)
+ movl %ecx, -(4)(%rdi, %rdx)
+ MOVCHAR $0, (%rdi, %rdx)
+ ret
+# endif
+
+
+ /* Copy rdx bytes, 16 <= rdx <= VEC_SIZE, as two overlapping
+ 16-byte moves: the first 16 bytes (xmm0, loaded earlier) and
+ the 16 bytes ending at offset rdx. Then append the terminator. */
+ .p2align 4,, 10
+L(copy_16_31):
+ VMOVU -(16)(%rsi, %rdx), %xmm1
+ VMOVU %xmm0, (%rdi)
+ VMOVU %xmm1, -(16)(%rdi, %rdx)
+ MOVCHAR $0, (%rdi, %rdx)
+ ret
+
+ /* Same scheme for 8 <= rdx < 16 using two overlapping 8-byte
+ moves; the first 8 bytes come from the low half of xmm0. */
+ .p2align 4,, 10
+L(copy_8_15):
+ movq -(8)(%rsi, %rdx), %rcx
+ vmovq %xmm0, (%rdi)
+ movq %rcx, -(8)(%rdi, %rdx)
+ MOVCHAR $0, (%rdi, %rdx)
+ ret
+
+ /* The first VEC has no terminator and the length limit lies
+ beyond it. */
+ .p2align 4,, 8
+ .p2align 6,, 14
+L(more_1x_vec):
+ VMOVU %VMM(0), (%rdi)
+
+ /* Align rsi (src) and adjust rdx/rdi (length/dst). rdx
+ temporarily becomes the end pointer (src + length) and rdi
+ becomes dst - src, so that after rsi is rounded up to the next
+ VEC boundary both can be rebuilt relative to the new rsi. */
+ addq %rsi, %rdx
+ subq %rsi, %rdi
+ orq $(VEC_SIZE - 1), %rsi
+ incq %rsi
+ addq %rsi, %rdi
+L(loop_last_4x_vec):
+ /* rdx: end pointer -> bytes remaining from the aligned rsi. */
+ subq %rsi, %rdx
+ VMOVA 0(%rsi), %VMM(1)
+ VPCMPEQ %VMM(1), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
+L(last_2x_vec):
+ /* At most 2 VECs remain; ecx = terminator mask of the VEC at
+ rsi. After tzcnt, ecx is the terminator index, or VEC_SIZE if
+ there is none. */
+ tzcnt %ecx, %ecx
+ cmpl %ecx, %edx
+ jbe L(ret_vec_x1_len)
+
+ cmpl $VEC_SIZE, %ecx
+ jnz L(ret_vec_x1)
+
+ VMOVA (VEC_SIZE * 1)(%rsi), %VMM(2)
+ VMOVU %VMM(1), (%rdi)
+ VPCMPEQ %VMM(2), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ addl $-VEC_SIZE, %edx
+ /* Drop mask bits at or beyond the remaining length; a zero
+ result (ZF) means the limit is reached before any terminator,
+ so edx (the remaining length) is used as the end offset. */
+ bzhil %edx, %ecx, %r8d
+ jz L(ret_vec_x2_len)
+L(ret_vec_x2):
+ bsfl %ecx, %edx
+L(ret_vec_x2_len):
+ /* Copy the VEC_SIZE bytes ending just before offset
+ VEC_SIZE + rdx and put the terminator at that offset. */
+ VMOVU (%rsi, %rdx), %VMM(0)
+ MOVCHAR $0, (VEC_SIZE)(%rdi, %rdx)
+ VMOVU %VMM(0), (%rdi, %rdx)
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+
+
+ /* End of the appended data is at offset rcx from the current
+ rsi/rdi: either the length limit (L(ret_vec_x1_len), taken from
+ edx) or the terminator index (L(ret_vec_x1)). */
+ .p2align 4,, 12
+L(ret_vec_x1_len):
+ movl %edx, %ecx
+L(ret_vec_x1):
+ /* Copy the VEC_SIZE bytes that end just before offset rcx (this
+ reaches back before rsi) and write the terminator at rcx. */
+ VMOVU -(VEC_SIZE)(%rsi, %rcx), %VMM(1)
+ MOVCHAR $0, (%rdi, %rcx)
+ VMOVU %VMM(1), -VEC_SIZE(%rdi, %rcx)
+ VZEROUPPER_RETURN
+
+ /* Entered with 4 VECs already copied from the current rsi and at
+ most 8 VECs of length in rdx: step rsi/rdi past those 4 VECs,
+ reduce the remaining length to match, and load the next VEC. */
+ .p2align 4,, 8
+L(last_4x_vec):
+ /* Subtracting -(VEC_SIZE * 4) adds four VECs. */
+ subq $-(VEC_SIZE * 4), %rsi
+ VMOVA 0(%rsi), %VMM(1)
+ VPCMPEQ %VMM(1), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ subq $-(VEC_SIZE * 4), %rdi
+ addl $-(VEC_SIZE * 4), %edx
+ cmpl $(VEC_SIZE * 2), %edx
+ jbe L(last_2x_vec)
+ .p2align 4,, 8
+L(more_2x_vec):
+ /* L(ret_vec_x1) expects ecx to have position of first match so
+ test with bsf. */
+ /* bsf sets ZF only for a zero source, so `jnz` is taken exactly
+ when VMM(1) contains a terminator. */
+ bsfl %ecx, %ecx
+ jnz L(ret_vec_x1)
+
+ VMOVA (VEC_SIZE * 1)(%rsi), %VMM(2)
+ VMOVU %VMM(1), (%rdi)
+
+ VPCMPEQ %VMM(2), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x2)
+
-#define USE_AS_STRNCAT
-#define STRCAT STRNCAT
-#include "strcat-avx2.S"
+ /* Third VEC: the first two had no terminator and more than
+ 2 * VEC_SIZE bytes of length remain. */
+ VMOVA (VEC_SIZE * 2)(%rsi), %VMM(3)
+ VMOVU %VMM(2), (VEC_SIZE * 1)(%rdi)
+
+ VPCMPEQ %VMM(3), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+
+ /* Check if length is greater than 4x VEC. */
+ cmpq $(VEC_SIZE * 4), %rdx
+ ja L(more_4x_vec)
+
+ /* edx = length remaining from the start of the third VEC. */
+ addl $(VEC_SIZE * -2), %edx
+
+ /* ecx = terminator index in the third VEC, or VEC_SIZE if none. */
+ tzcnt %ecx, %ecx
+ cmpl %ecx, %edx
+ jbe L(ret_vec_x3_len)
+
+ cmpl $VEC_SIZE, %ecx
+ jnz L(ret_vec_x3)
+
+ VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(4)
+ VMOVU %VMM(3), (VEC_SIZE * 2 + 0)(%rdi)
+ VPCMPEQ %VMM(4), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ addl $-VEC_SIZE, %edx
+ /* Keep only mask bits below the remaining length; ZF means the
+ length limit ends the copy before any terminator. */
+ bzhil %edx, %ecx, %r8d
+ jz L(ret_vec_x4_len)
+L(ret_vec_x4):
+ bsfl %ecx, %edx
+L(ret_vec_x4_len):
+ /* End offset is VEC_SIZE * 3 + rdx: copy the VEC_SIZE bytes
+ before it and store the terminator there. */
+ VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(0)
+ MOVCHAR $0, (VEC_SIZE * 3)(%rdi, %rdx)
+ VMOVU %VMM(0), (VEC_SIZE * 2)(%rdi, %rdx)
+ VZEROUPPER_RETURN
+
+ .p2align 4,, 4
+L(ret_vec_x3_len):
+ movl %edx, %ecx
+L(ret_vec_x3):
+ /* End offset is VEC_SIZE * 2 + rcx, handled the same way. */
+ VMOVU (VEC_SIZE)(%rsi, %rcx), %VMM(0)
+ MOVCHAR $0, (VEC_SIZE * 2)(%rdi, %rcx)
+ VMOVU %VMM(0), (VEC_SIZE)(%rdi, %rcx)
+ VZEROUPPER_RETURN
+
+
+ /* More than 4 VECs of length remain; ecx = terminator mask of the
+ third VEC (VMM(3)), which has not been stored yet. */
+ .p2align 4,, 8
+L(more_4x_vec):
+ /* bsf leaves the match index in ecx for L(ret_vec_x3) and sets
+ ZF only when the mask was zero. */
+ bsfl %ecx, %ecx
+ jnz L(ret_vec_x3)
+
+ VMOVA (VEC_SIZE * 3)(%rsi), %VMM(4)
+ VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
+ VPCMPEQ %VMM(4), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x4)
+
+ VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi)
+
+
+ /* Recheck length before aligning. */
+ cmpq $(VEC_SIZE * 8), %rdx
+ jbe L(last_4x_vec)
+
+ /* Align rsi (src) and adjust rdx/rdi (length/dst). rdx becomes
+ the end pointer and rdi becomes dst - src while rsi is advanced
+ and rounded down to a 4 * VEC_SIZE boundary. */
+ addq %rsi, %rdx
+ subq %rsi, %rdi
+ subq $-(VEC_SIZE * 4), %rsi
+ andq $(VEC_SIZE * -4), %rsi
+
+ /* Do first half of loop ahead of time so loop can just start by
+ storing. */
+ VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+ VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+ VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+ VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+ /* Minimum across the four VECs is zero wherever any of them has a
+ zero character; r8d receives the combined mask. */
+ VPMIN %VMM(0), %VMM(1), %VMM(4)
+ VPMIN %VMM(2), %VMM(3), %VMM(6)
+ VPMIN %VMM(4), %VMM(6), %VMM(6)
+ VPCMPEQ %VMM(6), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %r8d
+ /* Rebuild dst for the re-aligned src. */
+ addq %rsi, %rdi
+ testl %r8d, %r8d
+ jnz L(loop_4x_done)
+
+ /* Use r9 for end of region before handling last 4x VEC
+ specially. */
+ leaq -(VEC_SIZE * 4)(%rdx), %r9
+
+ /* Main loop. On entry VMM(0)-VMM(3) hold four terminator-free
+ VECs from rsi, rdx is the end pointer (src + length) and
+ r9 = rdx - 4 * VEC_SIZE. */
+ .p2align 4,, 11
+L(loop_4x_vec):
+
+ VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+ VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+ subq $(VEC_SIZE * -4), %rsi
+ VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+ VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+
+ subq $(VEC_SIZE * -4), %rdi
+ /* Once rsi has reached r9, at most 4 VECs of length remain:
+ leave the loop for the length-checked tail, which turns rdx
+ back into a remaining-byte count. */
+ cmpq %rsi, %r9
+ jbe L(loop_last_4x_vec)
+
+ VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+ VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+ VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+ VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+ VPMIN %VMM(0), %VMM(1), %VMM(4)
+ VPMIN %VMM(2), %VMM(3), %VMM(6)
+ VPMIN %VMM(4), %VMM(6), %VMM(6)
+ VPCMPEQ %VMM(6), %VZERO, %VMM(6)
+
+ vpmovmskb %VMM(6), %r8d
+
+ testl %r8d, %r8d
+ jz L(loop_4x_vec)
+
+ /* A terminator is somewhere in VMM(0)-VMM(3) (combined mask in
+ r8d). Check the VECs in order, storing each one that has none. */
+L(loop_4x_done):
+ VPCMPEQ %VMM(0), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ /* L(ret_vec_x1) expects ecx to have position of first match so
+ test with bsf. */
+ bsfl %ecx, %ecx
+ jnz L(ret_vec_x1)
+ VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+
+ VPCMPEQ %VMM(1), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+
+ /* L(ret_vec_x2) runs bsf itself, so a plain test is enough. */
+ testl %ecx, %ecx
+ jnz L(ret_vec_x2)
+ VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+
+ VPCMPEQ %VMM(2), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ /* L(ret_vec_x3) uses ecx as the match index, hence bsf again. */
+ bsfl %ecx, %ecx
+ jnz L(ret_vec_x3)
+
+ VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+ /* The first three VECs are clean, so the set bits of the combined
+ mask all come from the fourth. Copy the VEC_SIZE bytes ending
+ with the terminating character itself; no separate terminator
+ store is needed. */
+ bsfl %r8d, %r8d
+ VMOVU (VEC_SIZE * 2 + CHAR_SIZE)(%rsi, %r8), %VMM(1)
+ VMOVU %VMM(1), (VEC_SIZE * 2 + CHAR_SIZE)(%rdi, %r8)
+ VZEROUPPER_RETURN
+
+
+
+ /* Reached when an unaligned VEC load at rsi could touch the next
+ page. The terminator check uses a load from the enclosing
+ VEC-aligned address instead. */
+ .p2align 4,, 4
+L(page_cross):
+ /* r8 = rsi rounded down to a VEC_SIZE boundary. */
+ movq %rsi, %r8
+ andq $(VEC_SIZE * -1), %r8
+
+ VPCMPEQ (%r8), %VZERO, %VMM(6)
+
+ vpmovmskb %VMM(6), %ecx
+ /* Shift out mask bits for the bytes before rsi, so bit 0 now
+ corresponds to the byte at rsi. */
+ shrxl %esi, %ecx, %ecx
+
+ /* r8d = (aligned - rsi) mod VEC_SIZE, i.e. the number of bytes
+ from rsi up to the end of its VEC. A length limit smaller than
+ that is handled separately. */
+ subl %esi, %r8d
+ andl $(VEC_SIZE - 1), %r8d
+ cmpq %r8, %rdx
+ jb L(page_cross_small)
+
+ /* Optimizing more aggressively for space as this is very cold
+ code. This saves 2x cache lines. */
+
+ /* Shifting the mask left by CHAR_SIZE adds one character to the
+ later bsf result, which gives the correct copy bound (the
+ terminator is included). NB: this can never zero-out a
+ non-zero RCX as to be in the page cross case rsi cannot be
+ aligned and we already right-shift rcx by the misalignment. */
+ shll $CHAR_SIZE, %ecx
+ /* No terminator before the end of this VEC: resume the normal
+ path. */
+ jz L(page_cross_continue)
+ /* ecx = byte count including the terminator; `rep movsb` copies
+ it, so no separate terminator store is needed. */
+ bsfl %ecx, %ecx
+ rep movsb
+ VZEROUPPER_RETURN
+
+L(page_cross_small):
+ /* ecx = terminator index (operand width if the mask is zero).
+ tzcnt sets ZF when the result is 0, i.e. src starts with the
+ terminator and there is nothing to copy. */
+ tzcntl %ecx, %ecx
+ jz L(page_cross_setz)
+ /* Copy min(terminator index, length limit) bytes. */
+ cmpl %edx, %ecx
+ cmova %edx, %ecx
+ rep movsb
+L(page_cross_setz):
+ /* rdi has been advanced past the copied bytes by `rep movsb`. */
+ MOVCHAR $0, (%rdi)
+ VZEROUPPER_RETURN
+ /* Target of the length filter at entry; rax already holds dst.
+ In the byte build the flags still come from the entry's
+ `test %rdx, %rdx`; the wide-character build tested a shifted
+ copy there, so rdx is re-tested here. A zero length returns
+ immediately; any other length reaching this point is handed to
+ OVERFLOW_STRCAT (per the entry comment, the non-length
+ {wcs|str}cat variant). */
+L(zero_len):
+# ifdef USE_AS_WCSCPY
+ test %rdx, %rdx
+# endif
+ jnz OVERFLOW_STRCAT
+ ret
+
+
+END(STRNCAT)
+#endif
@@ -1,3 +1,3 @@
-#define USE_AS_STRNCPY
-#define STRCPY __strncpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STRNCPY __strncpy_avx2_rtm
+#include "x86-avx-rtm-vecs.h"
+#include "strncpy-avx2.S"
@@ -1,7 +1,735 @@
-#ifndef STRNCPY
-# define STRNCPY __strncpy_avx2
-#endif
+/* strncpy with AVX2
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (3)
+
+# include <sysdep.h>
+
+
+# ifndef VEC_SIZE
+# include "x86-avx-vecs.h"
+# endif
+
+# ifndef STRNCPY
+# define STRNCPY __strncpy_avx2
+# endif
+
+
+# ifdef USE_AS_WCSCPY
+# define VPCMPEQ vpcmpeqd
+# define VPMIN vpminud
+# define CHAR_SIZE 4
+# else
+# define VPCMPEQ vpcmpeqb
+# define VPMIN vpminub
+# define CHAR_SIZE 1
+# endif
+
+# include "strncpy-or-cat-overflow-def.h"
+
+# define PAGE_SIZE 4096
+
+# define VZERO VMM(7)
+# define VZERO_128 VMM_128(7)
+
+
+ .section SECTION(.text), "ax", @progbits
+ENTRY(STRNCPY)
+ /* Register roles as used by the code below: rdi = destination,
+ rsi = source, rdx = maximum number of chars to write (bytes, or
+ 4-byte chars when USE_AS_WCSCPY is defined). After the filter
+ rdx holds the length in bytes minus CHAR_SIZE. */
+ /* Filter zero length strings and very long strings. Zero
+ length strings just return, very long strings are handled by
+ just running rep stos{b|l} to zero set (which will almost
+ certainly segfault), if that succeeds then just calling
+ OVERFLOW_STRCPY (strcpy, stpcpy, wcscpy, wcpcpy). */
+# ifdef USE_AS_WCSCPY
+ /* rdx = n - 1 (in chars). n == 0 wraps to all-ones, so the
+ single top-bits test below catches both n == 0 and any n too
+ large to be scaled to a byte count. */
+ decq %rdx
+ movq %rdx, %rax
+ /* 56 is end of max supported address space. */
+ shr $56, %rax
+ jnz L(zero_len)
+ /* Scale the char count to bytes: rdx = (n - 1) * 4. */
+ salq $2, %rdx
+# else
+ /* rdx = n - 1. `jl` tests SF != OF, so it is taken for n == 0
+ (result is -1), for n > 2^63 (result still has the sign bit
+ set) and for n == 2^63 (signed overflow sets OF). */
+ decq %rdx
+ /* `dec` can macrofuse with `jl`. If the flag needs to become
+ `jb` replace `dec` with `sub`. */
+ jl L(zero_len)
+# endif
+
+ /* Zero VZERO: it is the comparand for the null-terminator
+ searches and the source register for the zero-fill stores. */
+ vpxor %VZERO_128, %VZERO_128, %VZERO_128
+ /* If rsi is within the last VEC_SIZE - 1 bytes of a page, an
+ unaligned VEC_SIZE load from it would touch the next page, so
+ take the page-cross path instead. */
+ movl %esi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(page_cross)
+
+L(page_cross_continue):
+ /* First VEC of src: VMM(0) = data, ecx = byte mask of
+ null-terminator positions in it. */
+ VMOVU (%rsi), %VMM(0)
+ VPCMPEQ %VMM(0), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+
+ /* If no STPCPY the return value is just dst, so save it ahead
+ of time. */
+# ifndef USE_AS_STPCPY
+ movq %rdi, %rax
+# elif defined USE_AS_WCSCPY
+ /* Clear dependency as nearly all return code for wcpncpy uses
+ `setc %al`. */
+ xorl %eax, %eax
+# endif
+
+ cmpq $(VEC_SIZE - CHAR_SIZE), %rdx
+ /* `jbe` (not `jb`) because rdx is length - CHAR_SIZE, so
+ rdx == VEC_SIZE - CHAR_SIZE means exactly one full VEC.
+ L(less_1x_vec) reuses the flags of this compare. */
+ jbe L(less_1x_vec)
+
+ /* This may overset (write past the null-terminator) but that's
+ fine because we still need to zero fill. */
+ VMOVU %VMM(0), (%rdi)
+
+ testl %ecx, %ecx
+ jnz L(zfill)
+
+ /* Align. rdx becomes src + remaining length and rdi becomes
+ dst - src, so both can be rebased on the new rsi by the
+ `addq %rsi, %rdi` / `subq %rsi, %rdx` that follow at
+ L(last_4x_vec). rsi is rounded up to the next VEC_SIZE
+ boundary. */
+ addq %rsi, %rdx
+ subq %rsi, %rdi
+ orq $(VEC_SIZE - 1), %rsi
+ incq %rsi
+L(last_4x_vec):
+ addq %rsi, %rdi
+L(loop_last_4x_vec):
+ subq %rsi, %rdx
+
+
+ VMOVA 0(%rsi), %VMM(1)
+ VPCMPEQ %VMM(1), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+
+ cmpq $(VEC_SIZE * 2), %rdx
+ jae L(more_2x_vec)
+
+ cmpl $(VEC_SIZE), %edx
+ jb L(ret_vec_x1_len)
+
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1)
+
+ VPCMPEQ VEC_SIZE(%rsi), %VZERO, %VMM(6)
+ VMOVU %VMM(1), (%rdi)
+ vpmovmskb %VMM(6), %ecx
+ shlq $VEC_SIZE, %rcx
+L(ret_vec_x1_len):
+ tzcntq %rcx, %rcx
+ cmpl %ecx, %edx
+ jbe L(ret_vec_x1_len_no_zfill)
+ /* Fall through (expectation) is copy len < buffer len. */
+ VMOVU %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+L(ret_vec_x1_len_no_zfill_mov):
+ movl %ecx, %edx
+# ifdef USE_AS_STPCPY
+ /* clear flags. */
+ xorl %ecx, %ecx
+# endif
+L(ret_vec_x1_len_no_zfill):
+ VMOVU ((0)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+ VMOVU %VMM(1), ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+# ifdef USE_AS_WCSCPY
+ setc %al
+ addq %rdx, %rdi
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ movl %edx, %eax
+ adcq %rdi, %rax
+# endif
+# endif
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+
+ .p2align 4,, 6
+L(ret_vec_x1):
+ bsfl %ecx, %ecx
+ VMOVU %VZERO, ((0)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+ subl %ecx, %edx
+ /* Check if we need to reload/store. */
+ cmpl $VEC_SIZE, %edx
+ jb L(ret_vec_x1_len_no_zfill_mov)
+ /* Otherwise safe to just store directly. */
+ VMOVU %VMM(1), (%rdi)
+ VMOVU %VZERO, (%rdi, %rcx)
+# ifdef USE_AS_STPCPY
+ leaq (%rdi, %rcx), %rax
+# endif
+ VZEROUPPER_RETURN
+
+ .p2align 4,, 12
+L(more_2x_vec):
+ VMOVU %VMM(1), (%rdi)
+ testl %ecx, %ecx
+ /* Must fill at least 2x VEC. */
+ jnz L(zfill_vec1)
+
+ VMOVA VEC_SIZE(%rsi), %VMM(2)
+ VMOVU %VMM(2), VEC_SIZE(%rdi)
+ VPCMPEQ %VMM(2), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ testl %ecx, %ecx
+ /* Must fill at least 1x VEC. */
+ jnz L(zfill_vec2)
+
+ VMOVA (VEC_SIZE * 2)(%rsi), %VMM(3)
+ VPCMPEQ %VMM(3), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+
+ /* Check if len is more 4x VEC. -CHAR_SIZE because rdx is len -
+ CHAR_SIZE. */
+ cmpq $(VEC_SIZE * 4 - CHAR_SIZE), %rdx
+ ja L(more_4x_vec)
+
+ subl $(VEC_SIZE * 3), %edx
+ jb L(ret_vec_x3_len)
+
+ testl %ecx, %ecx
+ jnz L(ret_vec_x3)
+
+ VPCMPEQ (VEC_SIZE * 3)(%rsi), %VZERO, %VMM(6)
+ VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
+ vpmovmskb %VMM(6), %ecx
+ tzcntl %ecx, %ecx
+ cmpl %ecx, %edx
+ jbe L(ret_vec_x4_len_no_zfill)
+ /* Fall through (expectation) is copy len < buffer len. */
+ VMOVU %VZERO, ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+ movl %ecx, %edx
+L(ret_vec_x4_len_no_zfill):
+ VMOVU ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+ VMOVU %VMM(1), ((VEC_SIZE * 3)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+# ifdef USE_AS_WCSCPY
+ setc %al
+ addq %rdx, %rdi
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+ leal (VEC_SIZE * 3 + 0)(%edx), %eax
+ adcq %rdi, %rax
+# endif
+# endif
+ VZEROUPPER_RETURN
+
+
+L(ret_vec_x3_len):
+ addl $(VEC_SIZE * 1), %edx
+ tzcntl %ecx, %ecx
+ cmpl %ecx, %edx
+ jbe L(ret_vec_x3_len_no_zfill)
+ /* Fall through (expectation) is copy len < buffer len. */
+ VMOVU %VZERO, ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+L(ret_vec_x3_len_no_zfill_mov):
+ movl %ecx, %edx
+# ifdef USE_AS_STPCPY
+ /* clear flags. */
+ xorl %ecx, %ecx
+# endif
+ .p2align 4,, 4
+L(ret_vec_x3_len_no_zfill):
+ VMOVU ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rsi, %rdx), %VMM(1)
+ VMOVU %VMM(1), ((VEC_SIZE * 2)-(VEC_SIZE - CHAR_SIZE))(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+# ifdef USE_AS_WCSCPY
+ setc %al
+ addq %rdx, %rdi
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+ leal (VEC_SIZE * 2 + 0)(%rdx), %eax
+ adcq %rdi, %rax
+# endif
+# endif
+ VZEROUPPER_RETURN
+
+
+ .p2align 4,, 8
+L(ret_vec_x3):
+ bsfl %ecx, %ecx
+ VMOVU %VZERO, (VEC_SIZE * 3 +(-(VEC_SIZE - CHAR_SIZE)))(%rdi, %rdx)
+ subl %ecx, %edx
+ jl L(ret_vec_x3_len_no_zfill_mov)
+ VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VZERO, (VEC_SIZE * 2)(%rdi, %rcx)
+# ifdef USE_AS_STPCPY
+ leaq (VEC_SIZE * 2)(%rdi, %rcx), %rax
+# endif
+ VZEROUPPER_RETURN
+
+ .p2align 4,, 8
+L(more_4x_vec):
+
+ VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi)
+ testl %ecx, %ecx
+ jnz L(zfill_vec3)
+
+ VMOVA (VEC_SIZE * 3)(%rsi), %VMM(4)
+ VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi)
+ VPCMPEQ %VMM(4), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ testl %ecx, %ecx
+ jnz L(zfill_vec4)
+
+ movq %rdx, %rcx
+ addq %rsi, %rdx
+ subq %rsi, %rdi
+ subq $-(VEC_SIZE * 4), %rsi
+ /* Recheck length before aligning. */
+ cmpq $(VEC_SIZE * 8 - CHAR_SIZE), %rcx
+ jbe L(last_4x_vec)
+
+ andq $(VEC_SIZE * -4), %rsi
+
+ /* Do first half of loop ahead of time so loop can just start by
+ storing. */
+ VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+ VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+ VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+ VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+ VPMIN %VMM(0), %VMM(1), %VMM(4)
+ VPMIN %VMM(2), %VMM(3), %VMM(6)
+ VPMIN %VMM(4), %VMM(6), %VMM(6)
+ VPCMPEQ %VMM(6), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %r8d
+ addq %rsi, %rdi
+ testl %r8d, %r8d
+ jnz L(loop_4x_done)
+
+ /* Use r9 as end register. */
+ leaq -(VEC_SIZE * 4 - CHAR_SIZE)(%rdx), %r9
-#define USE_AS_STRNCPY
-#define STRCPY STRNCPY
-#include "strcpy-avx2.S"
+ .p2align 4,, 11
+L(loop_4x_vec):
+
+ VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+ VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+ subq $(VEC_SIZE * -4), %rsi
+ VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+ VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+
+ subq $(VEC_SIZE * -4), %rdi
+ cmpq %rsi, %r9
+ jbe L(loop_last_4x_vec)
+
+ VMOVA (VEC_SIZE * 0 + 0)(%rsi), %VMM(0)
+ VMOVA (VEC_SIZE * 1 + 0)(%rsi), %VMM(1)
+ VMOVA (VEC_SIZE * 2 + 0)(%rsi), %VMM(2)
+ VMOVA (VEC_SIZE * 3 + 0)(%rsi), %VMM(3)
+
+ VPMIN %VMM(0), %VMM(1), %VMM(4)
+ VPMIN %VMM(2), %VMM(3), %VMM(6)
+ VPMIN %VMM(4), %VMM(6), %VMM(6)
+ VPCMPEQ %VMM(6), %VZERO, %VMM(6)
+
+ vpmovmskb %VMM(6), %r8d
+
+ testl %r8d, %r8d
+ jz L(loop_4x_vec)
+
+L(loop_4x_done):
+ /* A null-terminator is somewhere in VMM(0)..VMM(3) (r8d is
+ the mask of their combined minimum compared against zero).
+ rdx held src-based end; rebase it on the current rsi so it is
+ again the remaining length (minus CHAR_SIZE). Store each VEC
+ and test it in turn to find the one holding the null. */
+ subq %rsi, %rdx
+ VMOVU %VMM(0), (VEC_SIZE * 0 + 0)(%rdi)
+ VPCMPEQ %VMM(0), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ testl %ecx, %ecx
+ jnz L(zfill_vec1)
+
+ VMOVU %VMM(1), (VEC_SIZE * 1 + 0)(%rdi)
+ VPCMPEQ %VMM(1), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ testl %ecx, %ecx
+ jnz L(zfill_vec2)
+
+ VMOVU %VMM(2), (VEC_SIZE * 2 + 0)(%rdi)
+ VPCMPEQ %VMM(2), %VZERO, %VMM(6)
+ vpmovmskb %VMM(6), %ecx
+ testl %ecx, %ecx
+ jnz L(zfill_vec3)
+
+ /* VMM(0)..VMM(2) contain no null, so every set bit of the
+ combined mask r8d comes from VMM(3); reuse it rather than
+ recomputing. */
+ VMOVU %VMM(3), (VEC_SIZE * 3 + 0)(%rdi)
+ movl %r8d, %ecx
+
+ /* Zfill more.... Falls through to L(zfill_vec4) with ecx =
+ null mask of the fourth VEC. */
+
+ .p2align 4,, 4
+L(zfill_vec4):
+ addq $(VEC_SIZE * 2), %rdi
+ subq $(VEC_SIZE * 2), %rdx
+L(zfill_vec2):
+ shlq $VEC_SIZE, %rcx
+L(zfill):
+ bsfq %rcx, %rcx
+ subq %rcx, %rdx
+ addq %rcx, %rdi
+# ifdef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+L(zfill_from_page_cross):
+ cmpq $VEC_SIZE, %rdx
+ jb L(zfill_less_vec_vzeroupper)
+
+L(zfill_more_1x_vec):
+ VMOVU %VZERO, CHAR_SIZE(%rdi)
+ VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
+ cmpq $(VEC_SIZE * 2), %rdx
+ jae L(zfill_more_2x_vec)
+L(zfill_done0):
+ VZEROUPPER_RETURN
+
+ .p2align 4,, 8
+L(zfill_vec3):
+ addq $(VEC_SIZE * 2), %rdi
+ subq $(VEC_SIZE * 2), %rdx
+ .p2align 4,, 2
+L(zfill_vec1):
+ bsfl %ecx, %ecx
+ addq %rcx, %rdi
+ subq %rcx, %rdx
+# ifdef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+ /* zfill from vec1/vec3 must have to set at least 2x VECS. */
+
+ VMOVU %VZERO, CHAR_SIZE(%rdi)
+ VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE)(%rdi, %rdx)
+ cmpq $(VEC_SIZE * 2), %rdx
+ jb L(zfill_done0)
+L(zfill_more_2x_vec):
+ VMOVU %VZERO, (CHAR_SIZE - VEC_SIZE * 2)(%rdi, %rdx)
+ VMOVU %VZERO, (VEC_SIZE + CHAR_SIZE)(%rdi)
+ subq $(VEC_SIZE * 4 - CHAR_SIZE), %rdx
+ jbe L(zfill_done)
+
+ addq %rdi, %rdx
+ VMOVU %VZERO, (VEC_SIZE * 2 + CHAR_SIZE)(%rdi)
+ VMOVU %VZERO, (VEC_SIZE * 3 + CHAR_SIZE)(%rdi)
+
+
+ VMOVU %VZERO, (VEC_SIZE * 0 + 0)(%rdx)
+ VMOVU %VZERO, (VEC_SIZE * 1 + 0)(%rdx)
+
+ subq $-(VEC_SIZE * 4 + CHAR_SIZE), %rdi
+ cmpq %rdi, %rdx
+ jbe L(zfill_done)
+
+ andq $-(VEC_SIZE), %rdi
+ .p2align 4,, 12
+L(zfill_loop_4x_vec):
+ VMOVA %VZERO, (VEC_SIZE * 0)(%rdi)
+ VMOVA %VZERO, (VEC_SIZE * 1)(%rdi)
+ VMOVA %VZERO, (VEC_SIZE * 2)(%rdi)
+ VMOVA %VZERO, (VEC_SIZE * 3)(%rdi)
+ subq $-(VEC_SIZE * 4), %rdi
+ cmpq %rdi, %rdx
+ ja L(zfill_loop_4x_vec)
+L(zfill_done):
+ VZEROUPPER_RETURN
+
+
+ .p2align 4,, 8
+L(copy_1x):
+ /* Reached from L(less_1x_vec) when rdx == VEC_SIZE - CHAR_SIZE,
+ i.e. the buffer is exactly one VEC: store it whole. ecx is the
+ null mask of VMM(0); if it is zero there is nothing to zero
+ fill. (For non-STPCPY builds L(ret_32_32) is defined in
+ L(copy_16_31).) */
+ VMOVU %VMM(0), (%rdi)
+ testl %ecx, %ecx
+ jz L(ret_32_32)
+L(zfill_less_vec):
+ /* ecx = null mask -> ecx = byte offset of the first null. */
+ bsfl %ecx, %ecx
+L(zfill_less_vec_no_bsf):
+ /* Entered with ecx already a byte offset. Advance rdi to the
+ null-terminator and make rdx the number of bytes that follow
+ it in the buffer. */
+ subq %rcx, %rdx
+ addq %rcx, %rdi
+# ifdef USE_AS_STPCPY
+ /* stpncpy/wcpncpy return a pointer to the null-terminator. */
+ movq %rdi, %rax
+# endif
+L(zfill_less_vec_vzeroupper):
+ /* Zero fill fewer than VEC_SIZE bytes after (%rdi). Only xmm
+ and GPR stores are used from here, and every exit is a plain
+ `ret`. COND_VZEROUPPER is defined outside this file; presumably
+ it issues the vzeroupper needed before returning - confirm. */
+ COND_VZEROUPPER
+ /* To be here the null-terminator has already been written at
+ (%rdi), so there is one char of leeway: zeroing may start at
+ (%rdi) itself rather than at CHAR_SIZE(%rdi), and harmlessly
+ rewrites the null. */
+ cmpl $16, %edx
+ jb L(zfill_less_16)
+ /* 16 <= rdx < VEC_SIZE: two possibly overlapping 16-byte
+ stores, the second ending at rdi + rdx + CHAR_SIZE. */
+ VMOVU %VZERO_128, (%rdi)
+ VMOVU %VZERO_128, -(16 - CHAR_SIZE)(%rdi, %rdx)
+ ret
+# ifdef USE_AS_STPCPY
+L(ret_32_32):
+ /* No null in the VEC: return dst + rdx + CHAR_SIZE, one past
+ the last char written. */
+ leaq CHAR_SIZE(%rdi, %rdx), %rax
+ VZEROUPPER_RETURN
+# endif
+
+ .p2align 4,, 4
+L(copy_16_31):
+ /* Overfill to avoid branches. */
+ vmovdqu -(16 - CHAR_SIZE)(%rsi, %rdx), %xmm1
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm1, -(16 - CHAR_SIZE)(%rdi, %rdx)
+ cmpl %ecx, %edx
+ ja L(zfill_less_vec_no_bsf)
+# ifndef USE_AS_STPCPY
+L(ret_32_32):
+# else
+# ifdef USE_AS_WCSCPY
+ setc %al
+ addq %rdx, %rdi
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ movl %edx, %eax
+ adcq %rdi, %rax
+# endif
+# endif
+ VZEROUPPER_RETURN
+
+ .p2align 4,, 4
+L(copy_8_15):
+ /* Overfill to avoid branches. */
+ movq -(8 - CHAR_SIZE)(%rsi, %rdx), %rsi
+ vmovq %xmm0, (%rdi)
+ movq %rsi, -(8 - CHAR_SIZE)(%rdi, %rdx)
+ cmpl %ecx, %edx
+ jbe L(ret_8_15)
+ subq %rcx, %rdx
+ addq %rcx, %rdi
+# ifdef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+ .p2align 4,, 8
+L(zfill_less_16):
+ xorl %ecx, %ecx
+ cmpl $8, %edx
+ jb L(zfill_less_8)
+ movq %rcx, (%rdi)
+ movq %rcx, -(8 - CHAR_SIZE)(%rdi, %rdx)
+# ifndef USE_AS_STPCPY
+L(ret_8_15):
+# endif
+ ret
+
+
+ .p2align 4,, 8
+L(less_1x_vec):
+ /* Buffer is at most one VEC. VMM(0) holds the first VEC of src
+ and ecx its null mask; rdx = length - CHAR_SIZE. */
+ /* Reuse flags from `cmpq $(VEC_SIZE - CHAR_SIZE), %rdx` at
+ L(page_cross_continue). The idea is many buffer sizes are
+ aligned conventionally, so the exactly-one-VEC case is worth
+ its own branch. */
+ je L(copy_1x)
+
+ /* ecx = byte offset of the first null-terminator, or 32 (the
+ operand size) when the mask is zero. */
+ tzcntl %ecx, %ecx
+ cmpl $16, %edx
+ jae L(copy_16_31)
+
+ /* Paths below only use xmm0 and GPRs and leave through plain
+ `ret`. COND_VZEROUPPER is defined outside this file;
+ presumably it issues the required vzeroupper - confirm. */
+ COND_VZEROUPPER
+ cmpl $8, %edx
+ jae L(copy_8_15)
+# ifdef USE_AS_WCSCPY
+ testl %ecx, %ecx
+ jz L(zfill_less_8_set_ret)
+
+ movl (%rsi, %rdx), %esi
+ vmovd %xmm0, (%rdi)
+ movl %esi, (%rdi, %rdx)
+
+# ifdef USE_AS_STPCPY
+ cmpl %ecx, %edx
+L(ret_8_15):
+ setc %al
+ addq %rdx, %rdi
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# endif
+ ret
+L(zfill_less_8_set_ret):
+ xorl %ecx, %ecx
+# ifdef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+L(zfill_less_8):
+ movl %ecx, (%rdi)
+ movl %ecx, (%rdi, %rdx)
+ ret
+
+# else
+ cmpl $3, %edx
+ jb L(copy_0_3)
+ /* Overfill to avoid branches. */
+ movl -3(%rsi, %rdx), %esi
+ vmovd %xmm0, (%rdi)
+ movl %esi, -3(%rdi, %rdx)
+ cmpl %ecx, %edx
+ jbe L(ret_4_7)
+ subq %rcx, %rdx
+ addq %rcx, %rdi
+# ifdef USE_AS_STPCPY
+ movq %rdi, %rax
+# endif
+ xorl %ecx, %ecx
+ .p2align 4,, 8
+L(zfill_less_8):
+ cmpl $3, %edx
+ jb L(zfill_less_3)
+ movl %ecx, (%rdi)
+ movl %ecx, -3(%rdi, %rdx)
+# ifdef USE_AS_STPCPY
+ ret
+# endif
+
+L(ret_4_7):
+# ifdef USE_AS_STPCPY
+L(ret_8_15):
+ movl %edx, %eax
+ adcq %rdi, %rax
+# endif
+ ret
+
+ .p2align 4,, 4
+L(zfill_less_3):
+ testl %edx, %edx
+ jz L(zfill_1)
+ movw %cx, (%rdi)
+L(zfill_1):
+ movb %cl, (%rdi, %rdx)
+ ret
+
+ .p2align 4,, 8
+L(copy_0_3):
+ vmovd %xmm0, %r8d
+ testl %edx, %edx
+ jz L(copy_1)
+ movw %r8w, (%rdi)
+ cmpl %ecx, %edx
+ ja L(zfill_from_1)
+ movzbl (%rsi, %rdx), %r8d
+# ifdef USE_AS_STPCPY
+ movl %edx, %eax
+ adcq %rdi, %rax
+ movb %r8b, (%rdi, %rdx)
+ ret
+# endif
+
+L(copy_1):
+# ifdef USE_AS_STPCPY
+ movl %edx, %eax
+ cmpl %ecx, %edx
+ adcq %rdi, %rax
+# endif
+# ifdef USE_AS_WCSCPY
+ vmovd %xmm0, (%rdi)
+# else
+ movb %r8b, (%rdi, %rdx)
+# endif
+ ret
+# endif
+
+ .p2align 4,, 2
+L(zero_len):
+ /* Reached from the length filter at entry with rdx = n - 1
+ (a char count, not yet scaled to bytes) and before any vector
+ register has been touched. The filter sends both n == 0 and
+ oversized n here, so tell them apart: restoring n with `inc`
+ gives zero only for n == 0. A non-zero n is an oversized
+ length and goes to L(best_effort_strncpy), which expects rdx
+ to be the original count. */
+ incq %rdx
+ jnz L(best_effort_strncpy)
+ /* n == 0: nothing to write, return dst. */
+ movq %rdi, %rax
+ ret
+# ifndef USE_AS_WCSCPY
+ .p2align 4,, 8
+L(zfill_from_1):
+# ifdef USE_AS_STPCPY
+ leaq (%rdi, %rcx), %rax
+# endif
+ movw $0, -1(%rdi, %rdx)
+ ret
+# endif
+
+ .p2align 4,, 4
+ .p2align 6,, 8
+L(page_cross):
+ /* A VEC_SIZE load from rsi would cross a page. Load from rsi
+ rounded down to a VEC_SIZE boundary instead; that load cannot
+ cross the page. */
+ movq %rsi, %rax
+ andq $(VEC_SIZE * -1), %rax
+
+ VPCMPEQ (%rax), %VZERO, %VMM(6)
+
+ /* shrx uses the low 5 bits of esi as the count, i.e. the
+ misalignment of rsi within the VEC, so bit 0 of ecx now
+ corresponds to the byte at rsi. */
+ vpmovmskb %VMM(6), %ecx
+ shrxl %esi, %ecx, %ecx
+
+ /* eax = (aligned - rsi) mod VEC_SIZE = number of bytes from rsi
+ to the end of the aligned VEC. If the buffer ends inside those
+ bytes (rdx = length - CHAR_SIZE is below that) handle it
+ separately. */
+ subl %esi, %eax
+ andl $(VEC_SIZE - 1), %eax
+ cmpq %rax, %rdx
+ jb L(page_cross_small)
+ /* Optimizing more aggressively for space as this is very cold
+ code. This saves 2x cache lines. */
+
+ /* Shift the mask up by one char so the bsf below yields the
+ offset of the null plus CHAR_SIZE, i.e. the byte count to copy
+ including the null. If the mask is zero there is no null
+ before the end of the VEC: rejoin the normal path, the string
+ extends into the next page so the unaligned load there is
+ safe. Otherwise fall through. */
+ shl $CHAR_SIZE, %ecx
+ jz L(page_cross_continue)
+ bsf %ecx, %ecx
+
+ /* rcx = bytes to copy (string including its null). rdx =
+ bytes left to zero after them, minus CHAR_SIZE. */
+ subq %rcx, %rdx
+# ifdef USE_AS_STPCPY
+ /* Return pointer to the null-terminator in dst. */
+ leaq -CHAR_SIZE(%rdi, %rcx), %rax
+# else
+ movq %rdi, %rax
+# endif
+
+ rep movsb
+ /* rdi is now one char past the copied null, whereas
+ L(zfill_from_page_cross) starts zeroing at CHAR_SIZE(%rdi).
+ Zero the char at (%rdi) by hand to cover the gap. */
+# ifdef USE_AS_WCSCPY
+ movl $0, (%rdi)
+# else
+ movb $0, (%rdi)
+# endif
+ jmp L(zfill_from_page_cross)
+
+L(page_cross_small):
+ tzcntl %ecx, %ecx
+ xorl %eax, %eax
+ cmpl %ecx, %edx
+ jbe L(page_cross_copy_only)
+
+ /* Do a zfill of the tail before copying. */
+ movq %rdi, %r9
+ movl %ecx, %r8d
+
+ subl %ecx, %edx
+ leaq CHAR_SIZE(%rdi, %rcx), %rdi
+ movl %edx, %ecx
+ rep stosb
+ movq %r9, %rdi
+ movl %r8d, %edx
+L(page_cross_copy_only):
+ leal CHAR_SIZE(%rdx), %ecx
+# ifdef USE_AS_STPCPY
+# ifdef USE_AS_WCSCPY
+ setc %al
+ addq %rdi, %rdx
+ leaq (%rdx, %rax, CHAR_SIZE), %rax
+# else
+ movl %edx, %eax
+ adcq %rdi, %rax
+# endif
+# else
+ movq %rdi, %rax
+# endif
+ rep movsb
+ ret
+
+
+L(best_effort_strncpy):
+ movq %rdx, %rcx
+ xorl %eax, %eax
+ movq %rdi, %r8
+ /* The length is >= 2^63. We very much so expect to segfault at
+ rep stos. If that doesn't happen then just strcpy to finish.
+ */
+# ifdef USE_AS_WCSCPY
+ rep stosl
+# else
+ rep stosb
+# endif
+ movq %r8, %rdi
+ jmp OVERFLOW_STRCPY
+END(STRNCPY)
+#endif
@@ -27,7 +27,8 @@
#define VEC_SIZE 32
#include "x86-vec-macros.h"
-#define USE_WITH_AVX 1
+#define USE_WITH_AVX2 1
+
#define SECTION(p) p##.avx
/* 4-byte mov instructions with AVX2. */