x86-64: Optimize memrchr with AVX2

Message ID	20170601181331.GC28627@lucon.org
State	New, archived
Headers	Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk Sender: libc-alpha-owner@sourceware.org Date: Thu, 1 Jun 2017 11:13:31 -0700 From: "H.J. Lu" <hongjiu.lu@intel.com> To: GNU C Library <libc-alpha@sourceware.org> Subject: [PATCH] x86-64: Optimize memrchr with AVX2 Message-ID: <20170601181331.GC28627@lucon.org> Reply-To: "H.J. Lu" <hjl.tools@gmail.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline User-Agent: Mutt/1.8.0 (2017-02-23)

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 2974495..aee4820 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -7,6 +7,7 @@ ifeq ($(subdir),string) sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ strcmp-sse2-unaligned strncmp-ssse3 \ memchr-avx2 rawmemchr-avx2 \ + memrchr-avx2 \ memcmp-avx2 \ memcmp-sse4 memcpy-ssse3 \ memmove-ssse3 \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index d3fa98f..9f1ec4b 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -111,6 +111,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2_unaligned_erms)) + /* Support sysdeps/x86_64/multiarch/memrchr.S. */ + IFUNC_IMPL (i, name, memrchr, + IFUNC_IMPL_ADD (array, i, memrchr, + HAS_ARCH_FEATURE (AVX2_Usable), + __memrchr_avx2) + IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2)) + /* Support sysdeps/x86_64/multiarch/memset_chk.S. */ IFUNC_IMPL (i, name, __memset_chk, IFUNC_IMPL_ADD (array, i, __memset_chk, 1, diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S new file mode 100644 index 0000000..3ee02e1 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S @@ -0,0 +1,359 @@ +/* memrchr optimized with AVX2. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) + +# include <sysdep.h> + +# ifndef VZEROUPPER +# define VZEROUPPER vzeroupper +# endif + +# define VEC_SIZE 32 + + .section .text.avx,"ax",@progbits +ENTRY (__memrchr_avx2) + /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 + vpbroadcastb %xmm0, %ymm0 + + subq $VEC_SIZE, %rdx + jbe L(last_vec_or_less) + + addq %rdx, %rdi + + /* Check the last VEC_SIZE bytes. */ + vpcmpeqb (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x0) + + subq $(VEC_SIZE * 4), %rdi + movl %edi, %ecx + andl $(VEC_SIZE - 1), %ecx + jz L(aligned_more) + + /* Align data for aligned loads in the loop. */ + addq $VEC_SIZE, %rdi + addq $VEC_SIZE, %rdx + andq $-VEC_SIZE, %rdi + subq %rcx, %rdx + + .p2align 4 +L(aligned_more): + subq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec_or_less) + + /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ + vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 + vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 + vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(last_vec_x1) + + vpcmpeqb (%rdi), %ymm0, %ymm4 + vpmovmskb %ymm4, %eax + testl %eax, %eax + jnz L(last_vec_x0) + + /* Align data to 4 * VEC_SIZE for loop with fewer branches. + There are some overlaps with above if data isn't aligned + to 4 * VEC_SIZE. */ + movl %edi, %ecx + andl $(VEC_SIZE * 4 - 1), %ecx + jz L(loop_4x_vec) + + addq $(VEC_SIZE * 4), %rdi + addq $(VEC_SIZE * 4), %rdx + andq $-(VEC_SIZE * 4), %rdi + subq %rcx, %rdx + + .p2align 4 +L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ + subq $(VEC_SIZE * 4), %rdi + subq $(VEC_SIZE * 4), %rdx + jbe L(last_4x_vec_or_less) + + vmovdqa (%rdi), %ymm1 + vmovdqa VEC_SIZE(%rdi), %ymm2 + vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 + vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 + + vpcmpeqb %ymm1, %ymm0, %ymm1 + vpcmpeqb %ymm2, %ymm0, %ymm2 + vpcmpeqb %ymm3, %ymm0, %ymm3 + vpcmpeqb %ymm4, %ymm0, %ymm4 + + vpor %ymm1, %ymm2, %ymm5 + vpor %ymm3, %ymm4, %ymm6 + vpor %ymm5, %ymm6, %ymm5 + + vpmovmskb %ymm5, %eax + testl %eax, %eax + jz L(loop_4x_vec) + + /* There is a match. */ + vpmovmskb %ymm4, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(last_vec_x1) + + vpmovmskb %ymm1, %eax + bsrl %eax, %eax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(last_4x_vec_or_less): + addl $(VEC_SIZE * 4), %edx + cmpl $(VEC_SIZE * 2), %edx + jbe L(last_2x_vec) + + vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x3) + + vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 + vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 + vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(last_vec_x1_check) + cmpl $(VEC_SIZE * 3), %edx + jbe L(zero) + + vpcmpeqb (%rdi), %ymm0, %ymm4 + vpmovmskb %ymm4, %eax + testl %eax, %eax + jz L(zero) + bsrl %eax, %eax + subq $(VEC_SIZE * 4), %rdx + addq %rax, %rdx + jl L(zero) + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(last_2x_vec): + vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x3_check) + cmpl $VEC_SIZE, %edx + jbe L(zero) + + vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jz L(zero) + bsrl %eax, %eax + subq $(VEC_SIZE * 2), %rdx + addq %rax, %rdx + jl L(zero) + addl $(VEC_SIZE * 2), %eax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(last_vec_x0): + bsrl %eax, %eax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(last_vec_x1): + bsrl %eax, %eax + addl $VEC_SIZE, %eax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(last_vec_x2): + bsrl %eax, %eax + addl $(VEC_SIZE * 2), %eax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(last_vec_x3): + bsrl %eax, %eax + addl $(VEC_SIZE * 3), %eax + addq %rdi, %rax + ret + + .p2align 4 +L(last_vec_x1_check): + bsrl %eax, %eax + subq $(VEC_SIZE * 3), %rdx + addq %rax, %rdx + jl L(zero) + addl $VEC_SIZE, %eax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(last_vec_x3_check): + bsrl %eax, %eax + subq $VEC_SIZE, %rdx + addq %rax, %rdx + jl L(zero) + addl $(VEC_SIZE * 3), %eax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(zero): + VZEROUPPER +L(null): + xorl %eax, %eax + ret + + .p2align 4 +L(last_vec_or_less_aligned): + movl %edx, %ecx + + vpcmpeqb (%rdi), %ymm0, %ymm1 + + movl $1, %edx + /* Support rdx << 32. */ + salq %cl, %rdx + subq $1, %rdx + + vpmovmskb %ymm1, %eax + + /* Remove the trailing bytes. */ + andl %edx, %eax + testl %eax, %eax + jz L(zero) + + bsrl %eax, %eax + addq %rdi, %rax + VZEROUPPER + ret + + .p2align 4 +L(last_vec_or_less): + addl $VEC_SIZE, %edx + + /* Check for zero length. */ + testl %edx, %edx + jz L(null) + + movl %edi, %ecx + andl $(VEC_SIZE - 1), %ecx + jz L(last_vec_or_less_aligned) + + movl %ecx, %esi + movl %ecx, %r8d + addl %edx, %esi + andq $-VEC_SIZE, %rdi + + subl $VEC_SIZE, %esi + ja L(last_vec_2x_aligned) + + /* Check the last VEC. */ + vpcmpeqb (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + + /* Remove the leading and trailing bytes. */ + sarl %cl, %eax + movl %edx, %ecx + + movl $1, %edx + sall %cl, %edx + subl $1, %edx + + andl %edx, %eax + testl %eax, %eax + jz L(zero) + + bsrl %eax, %eax + addq %rdi, %rax + addq %r8, %rax + VZEROUPPER + ret + + .p2align 4 +L(last_vec_2x_aligned): + movl %esi, %ecx + + /* Check the last VEC. */ + vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 + + movl $1, %edx + sall %cl, %edx + subl $1, %edx + + vpmovmskb %ymm1, %eax + + /* Remove the trailing bytes. */ + andl %edx, %eax + + testl %eax, %eax + jnz L(last_vec_x1) + + /* Check the second last VEC. */ + vpcmpeqb (%rdi), %ymm0, %ymm1 + + movl %r8d, %ecx + + vpmovmskb %ymm1, %eax + + /* Remove the leading bytes. Must use unsigned right shift for + bsrl below. */ + shrl %cl, %eax + testl %eax, %eax + jz L(zero) + + bsrl %eax, %eax + addq %rdi, %rax + addq %r8, %rax + VZEROUPPER + ret +END (__memrchr_avx2) +#endif diff --git a/sysdeps/x86_64/multiarch/memrchr.S b/sysdeps/x86_64/multiarch/memrchr.S new file mode 100644 index 0000000..20a9e76 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memrchr.S @@ -0,0 +1,55 @@ +/* Multiple versions of memrchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include <init-arch.h> + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) + .text +ENTRY(__memrchr) + .type __memrchr, @gnu_indirect_function + LOAD_RTLD_GLOBAL_RO_RDX + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER) + jnz 1f + HAS_ARCH_FEATURE (AVX2_Usable) + jz 1f + HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) + jz 1f + leaq __memrchr_avx2(%rip), %rax + ret + +1: leaq __memrchr_sse2(%rip), %rax + ret +END(__memrchr) + +# undef ENTRY +# define ENTRY(name) \ + .type __memrchr_sse2, @function; \ + .p2align 4; \ + .globl __memrchr_sse2; \ + .hidden __memrchr_sse2; \ + __memrchr_sse2: cfi_startproc; \ + CALL_MCOUNT +# undef END +# define END(name) \ + cfi_endproc; .size __memrchr_sse2, .-__memrchr_sse2 +#endif + +#include "../memrchr.S"

x86-64: Optimize memrchr with AVX2

Commit Message

Patch