From patchwork Mon Apr  4 17:46:50 2016
X-Patchwork-Submitter: "Paul E. Murphy" <murphyp@linux.vnet.ibm.com>
X-Patchwork-Id: 11619
X-Patchwork-Delegate: tuliom@linux.vnet.ibm.com
From: "Paul E. Murphy" <murphyp@linux.vnet.ibm.com>
To: "libc-alpha@sourceware.org" <libc-alpha@sourceware.org>,
 Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
Subject: [PATCH v3] powerpc: Add optimized P8 strspn
Message-ID: <5702A88A.7060507@linux.vnet.ibm.com>
Date: Mon, 4 Apr 2016 12:46:50 -0500

No big changes, but necessary to fix a silent merge conflict:

  * Renamed r0 to 0 in lvx/lvsr operands where appropriate.
  * Fixed a couple of minor typos in comments.
  * Changed strspn-ppc64.S to strspn-ppc64.c, as recent changes
    removed the PPC64-specific assembly.

Retested on ppc64le.

----8<----

This implementation uses vectors and bitmasks.  For a small needle
and a large haystack, the performance improvement is up to 8x.  For
short strings (0-4B), the cost of computing the bitmask dominates,
and it is a tad slower.

2016-04-04  Paul E. Murphy  <murphyp@linux.vnet.ibm.com>

	* sysdeps/powerpc/powerpc64/multiarch/Makefile
	(sysdep_routines): Add new strspn targets.
	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
	(__libc_ifunc_impl_list): Add strspn.
	* sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S: New file.
	* sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c: Likewise.
	* sysdeps/powerpc/powerpc64/multiarch/strspn.c: Likewise.
	* sysdeps/powerpc/powerpc64/power8/strspn.S: Likewise.
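As a reader's aid, here is a minimal scalar sketch of the bitmask
idea that the vector code below implements.  The function name and
layout are illustrative only, not part of the patch; the real code
builds the same 256-bit accept table in r5-r8 and then tests 16
bytes per iteration with vbpermq.

/* Scalar model of the 256-bit accept mask.  Each needle byte sets
   one bit; each haystack byte is then tested with a shift and an
   AND.  The terminating NUL never has its bit set, so the scan
   stops at the end of the string automatically.  */
#include <stddef.h>
#include <stdint.h>

static size_t
strspn_bitmask_model (const char *s, const char *accept)
{
  uint64_t mask[4] = { 0, 0, 0, 0 };
  const unsigned char *a = (const unsigned char *) accept;
  const unsigned char *p = (const unsigned char *) s;

  /* One pass over the needle: set a bit per accepted byte value.  */
  for (; *a != 0; a++)
    mask[*a >> 6] |= (uint64_t) 1 << (*a & 63);

  /* Scan the haystack until a byte's bit is clear.  */
  while ((mask[*p >> 6] >> (*p & 63)) & 1)
    p++;

  return (size_t) (p - (const unsigned char *) s);
}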
---
 sysdeps/powerpc/powerpc64/multiarch/Makefile       |   3 +-
 .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |   8 +
 .../powerpc/powerpc64/multiarch/strspn-power8.S    |  40 +++++
 sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c |  25 +++
 sysdeps/powerpc/powerpc64/multiarch/strspn.c       |  35 ++++
 sysdeps/powerpc/powerpc64/power8/strspn.S          | 179 +++++++++++++++++++++
 6 files changed, 289 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strspn.c
 create mode 100644 sysdeps/powerpc/powerpc64/power8/strspn.S

diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 3b0e3a0..7ed56bf 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -19,7 +19,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   strcmp-power8 strcmp-power7 strcmp-ppc64 \
 		   strcat-power8 strcat-power7 strcat-ppc64 \
 		   memmove-power7 memmove-ppc64 wordcopy-ppc64 bcopy-ppc64 \
-		   strncpy-power8 strstr-power7 strstr-ppc64
+		   strncpy-power8 strstr-power7 strstr-ppc64 \
+		   strspn-power8 strspn-ppc64
 
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 11a8215..3e1f099 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -322,6 +322,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	     IFUNC_IMPL_ADD (array, i, strcat, 1,
 			     __strcat_ppc))
 
+  /* Support sysdeps/powerpc/powerpc64/multiarch/strspn.c.  */
+  IFUNC_IMPL (i, name, strspn,
+	      IFUNC_IMPL_ADD (array, i, strspn,
+			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __strspn_power8)
+	      IFUNC_IMPL_ADD (array, i, strspn, 1,
+			      __strspn_ppc))
+
   /* Support sysdeps/powerpc/powerpc64/multiarch/strstr.c.  */
   IFUNC_IMPL (i, name, strstr,
 	      IFUNC_IMPL_ADD (array, i, strstr,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S
new file mode 100644
index 0000000..86a4e09
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strspn-power8.S
@@ -0,0 +1,40 @@
+/* Optimized strspn implementation for POWER8.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__strspn_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__strspn_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__strspn_power8)
+
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__strspn_power8)					\
+  END_2(__strspn_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strspn.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c
new file mode 100644
index 0000000..4c63665
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c
@@ -0,0 +1,25 @@
+/* Default strspn implementation for PowerPC64.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define STRSPN __strspn_ppc
+#ifdef SHARED
+#undef libc_hidden_def
+#define libc_hidden_def(name)
+#endif
+
+#include <string/strspn.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn.c b/sysdeps/powerpc/powerpc64/multiarch/strspn.c
new file mode 100644
index 0000000..0e653f3
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strspn.c
@@ -0,0 +1,35 @@
+/* Multiple versions of strspn.  PowerPC64 version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+# include <string.h>
+# include <shlib-compat.h>
+# include "init-arch.h"
+
+#undef strspn
+extern __typeof (strspn) __libc_strspn;
+
+extern __typeof (strspn) __strspn_ppc attribute_hidden;
+extern __typeof (strspn) __strspn_power8 attribute_hidden;
+
+libc_ifunc (__libc_strspn,
+	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+	    ? __strspn_power8
+	    : __strspn_ppc);
+
+weak_alias (__libc_strspn, strspn)
+libc_hidden_builtin_def (strspn)
diff --git a/sysdeps/powerpc/powerpc64/power8/strspn.S b/sysdeps/powerpc/powerpc64/power8/strspn.S
new file mode 100644
index 0000000..35d868e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strspn.S
@@ -0,0 +1,179 @@
+/* Optimized strspn implementation for Power8.
+
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* size_t [r3] strspn (const char *string [r3],
+		       const char *needleAccept [r4])  */
+
+/* This takes a novel approach by computing a 256-bit mask whereby
+   each set bit implies the byte is "accepted".  P8 vector hardware
+   has extremely efficient hardware for selecting bits from a mask.
+
+   One might ask "why not use bpermd for short strings?".  It is
+   so slow that its performance about matches the generic PPC64
+   variant without any fancy masking, with the added expense of
+   making the mask.  That was the first variant of this.  */
+
+
+
+#include "sysdep.h"
+
+/* Simple macro to use VSX instructions in overlapping VR's.  */
+#define XXVR(insn, vrt, vra, vrb) \
+	insn 32+vrt, 32+vra, 32+vrb
+
+/* ISA 2.07B instructions are not all defined for older binutils.
+   Macros are defined below for these newer instructions in order
+   to maintain compatibility.  */
+
+/* Note, TX/SX is always set as VMX regs are the high 32 VSX regs.  */
+#define MTVRD(v,r) .long (0x7c000167 | ((v)<<(32-11)) | ((r)<<(32-16)))
+#define MFVRD(r,v) .long (0x7c000067 | ((v)<<(32-11)) | ((r)<<(32-16)))
+
+#define VBPERMQ(t,a,b) .long (0x1000054c \
+			      | ((t)<<(32-11)) \
+			      | ((a)<<(32-16)) \
+			      | ((b)<<(32-21)) )
+
+	/* This can be updated to power8 once the minimum version of
+	   binutils supports power8 and the above instructions.  */
+	.machine power7
+EALIGN(strspn, 4, 0)
+	CALL_MCOUNT 2
+
+	/* Generate useful constants for later on.  */
+	vspltisb v1, 7
+	vspltisb v2, -1
+	vslb	v1, v1, v1	/* 0x80 to swap high bit for vbpermq.  */
+	vspltisb v10, 0
+	vsldoi	v4, v10, v2, 2	/* 0xFFFF into vr4.  */
+	XXVR(xxmrgld, v4, v4, v10) /* Mask for checking matches.  */
+
+	/* Prepare to compute 256b mask.  */
+	addi	r4, r4, -1
+	li	r5, 0
+	li	r6, 0
+	li	r7, 0
+	li	r8, 0
+	li	r11, 1
+	sldi	r11, r11, 63
+
+	/* Start interleaved mask computation.
+	   This will eventually OR 1's into ignored bits from vbpermq.  */
+	lvsr	v11, 0, r3
+	vspltb	v11, v11, 0	/* Splat shift constant.  */
+
+	/* Build a 256b mask in r5-r8.  */
+	.align 4
+L(next_needle):
+	lbzu	r9, 1(r4)
+
+	cmpldi	cr0, r9, 0
+	cmpldi	cr1, r9, 128
+
+	/* This is a little tricky.  srd only uses the first 7 bits,
+	   and if bit 7 is set, value is always 0.  So, we can
+	   effectively shift 128b in this case.  */
+	xori	r12, r9, 0x40	/* Invert bit 6.  */
+	srd	r10, r11, r9	/* Mask for bits 0-63.  */
+	srd	r12, r11, r12	/* Mask for bits 64-127.  */
+
+	beq	cr0, L(start_cmp)
+
+	/* Now, or the value into the correct GPR.  */
+	bge	cr1, L(needle_gt128)
+	or	r5, r5, r10	/* 0 - 63.  */
+	or	r6, r6, r12	/* 64 - 127.  */
+	b	L(next_needle)
+
+	.align 4
+L(needle_gt128):
+	or	r7, r7, r10	/* 128 - 191.  */
+	or	r8, r8, r12	/* 192 - 255.  */
+	b	L(next_needle)
+
+
+	.align 4
+L(start_cmp):
+	/* Move and merge bitmap into 2 VRs.  bpermd is slower on P8.  */
+	mr	r0, r3		/* Save r3 for final length computation.  */
+	MTVRD (v5, r5)
+	MTVRD (v6, r6)
+	MTVRD (v7, r7)
+	MTVRD (v8, r8)
+
+	/* Continue interleaved mask generation.  */
+#ifdef __LITTLE_ENDIAN__
+	vsrw	v11, v2, v11	/* Note, shift ignores higher order bits.  */
+	vsplth	v11, v11, 0	/* Only care about the high 16 bits of v11.  */
+#else
+	vslw	v11, v2, v11	/* Note, shift ignores higher order bits.  */
+	vsplth	v11, v11, 1	/* Only care about the low 16 bits of v11.  */
+#endif
+	lvx	v0, 0, r3	/* Note, unaligned load ignores lower bits.  */
+
+	/* Do the merging of the bitmask.  */
+	XXVR(xxmrghd, v5, v5, v6)
+	XXVR(xxmrghd, v6, v7, v8)
+
+	/* Finish mask generation.  */
+	vand	v11, v11, v4	/* Throw away bits not in the mask.  */
+
+	/* Compare the first 1-16B, while masking unwanted bytes.  */
+	clrrdi	r3, r3, 4	/* Note, counts from qw boundaries.  */
+	vxor	v9, v0, v1	/* Swap high bit.  */
+	VBPERMQ (v8, v5, v0)
+	VBPERMQ (v7, v6, v9)
+	vor	v7, v7, v8
+	vor	v7, v7, v11	/* Ignore non-participating bytes.  */
+	vcmpequh. v8, v7, v4
+	bnl	cr6, L(done)
+
+	addi	r3, r3, 16
+
+	.align 4
+L(vec):
+	lvx	v0, 0, r3
+	addi	r3, r3, 16
+	vxor	v9, v0, v1	/* Swap high bit.  */
+	VBPERMQ (v8, v5, v0)
+	VBPERMQ (v7, v6, v9)
+	vor	v7, v7, v8
+	vcmpequh. v8, v7, v4
+	blt	cr6, L(vec)
+
+	addi	r3, r3, -16
+L(done):
+	subf	r3, r0, r3
+	MFVRD (r10, v7)
+
+#ifdef __LITTLE_ENDIAN__
+	addi	r0, r10, 1	/* Count the trailing 1's.  */
+	andc	r10, r10, r0
+	popcntd	r10, r10
+#else
+	xori	r10, r10, 0xffff	/* Count leading 1's by inverting.  */
+	addi	r3, r3, -48	/* Account for the extra leading zeros.  */
+	cntlzd	r10, r10
+#endif
+
+	add	r3, r3, r10
+	blr
+
+END(strspn)
+libc_hidden_builtin_def (strspn)
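
The new implementations keep the standard strspn semantics; a quick
sanity check along the following lines (hypothetical, not part of the
patch) exercises whichever variant the ifunc selects (on ARCH 2.07
hardware, __strspn_power8):

/* Hypothetical sanity check for the strspn variants.  Calls go
   through the public strspn symbol, which the ifunc resolves to
   __strspn_power8 when HWCAP2 has PPC_FEATURE2_ARCH_2_07.  */
#include <assert.h>
#include <string.h>

int
main (void)
{
  assert (strspn ("abcabc123", "abc") == 6);  /* Stops at '1'.  */
  assert (strspn ("xyz", "abc") == 0);        /* No accepted prefix.  */
  assert (strspn ("aaaa", "a") == 4);         /* Runs to the NUL.  */
  assert (strspn ("", "abc") == 0);           /* Empty string.  */
  return 0;
}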