From patchwork Fri Dec 8 19:40:20 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Tulio Magno Quites Machado Filho X-Patchwork-Id: 24830 Received: (qmail 28765 invoked by alias); 8 Dec 2017 19:52:10 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 28709 invoked by uid 89); 8 Dec 2017 19:52:10 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-24.0 required=5.0 tests=AWL, BAYES_00, GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, KAM_LAZY_DOMAIN_SECURITY, KAM_SHORT, RCVD_IN_DNSWL_LOW autolearn=ham version=3.3.2 spammy= X-HELO: mx0a-001b2d01.pphosted.com From: "Tulio Magno Quites Machado Filho" To: libc-alpha@sourceware.org Cc: Adhemerval Zanella , Rajalakshmi Srinivasaraghavan Subject: [PATCHv2] powerpc: POWER8 memcpy optimization for cached memory Date: Fri, 8 Dec 2017 17:40:20 -0200 In-Reply-To: <87vaik8uxy.fsf@linux.vnet.ibm.com> References: <87vaik8uxy.fsf@linux.vnet.ibm.com> X-TM-AS-GCONF: 00 x-cbid: 17120819-2213-0000-0000-00000247E285 X-IBM-SpamModules-Scores: X-IBM-SpamModules-Versions: BY=3.00008173; HX=3.00000241; KW=3.00000007; PH=3.00000004; SC=3.00000244; SDB=6.00957347; UDB=6.00484022; IPR=6.00737378; BA=6.00005731; NDR=6.00000001; ZLA=6.00000005; ZF=6.00000009; ZB=6.00000000; ZP=6.00000000; ZH=6.00000000; ZU=6.00000002; MB=3.00018430; XFM=3.00000015; UTC=2017-12-08 19:41:31 X-IBM-AV-DETECTION: SAVI=unused REMOTE=unused XFE=unused x-cbparentid: 17120819-2214-0000-0000-000058634764 Message-Id: <20171208194020.5005-1-tuliom@linux.vnet.ibm.com> X-Proofpoint-Virus-Version: vendor=fsecure engine=2.50.10432:, , definitions=2017-12-08_10:, , signatures=0 X-Proofpoint-Spam-Details: rule=outbound_notspam 
policy=outbound score=0 priorityscore=1501 malwarescore=0 suspectscore=4 phishscore=0 bulkscore=0 spamscore=0 clxscore=1011 lowpriorityscore=0 mlxscore=0 impostorscore=0 mlxlogscore=999 adultscore=0 classifier=spam adjust=0 reason=mlx scancount=1 engine=8.0.1-1711220000 definitions=main-1712080269 From: Adhemerval Zanella I made the changes I requested, updated copyright entries, added a manual entry and fixed a build issue on powerpc64. --- 8< --- On POWER8, unaligned memory accesses to cached memory have little impact on performance, unlike on its predecessors. This patch adds a memcpy implementation, __memcpy_power8_cached, that takes advantage of this. It is disabled by default and is only selected when the tunable glibc.tune.cached_memopt is set to 1. __memcpy_power8_cached __memcpy_power7 Reviewed-by: Rajalakshmi Srinivasaraghavan ============================================================ max-size=4096: 33325.70 ( 12.65%) 38153.00 max-size=8192: 32878.20 ( 11.17%) 37012.30 max-size=16384: 33782.20 ( 11.61%) 38219.20 max-size=32768: 33296.20 ( 11.30%) 37538.30 max-size=65536: 33765.60 ( 10.53%) 37738.40 2017-12-08 Adhemerval Zanella Tulio Magno Quites Machado Filho * manual/tunables.texi (Hardware Capability Tunables): Document glibc.tune.cached_memopt. * sysdeps/powerpc/cpu-features.c: New file. * sysdeps/powerpc/cpu-features.h: New file. * sysdeps/powerpc/dl-procinfo.c [!IS_IN(ldconfig)]: Add _dl_powerpc_cpu_features. * sysdeps/powerpc/dl-tunables.list: New file. * sysdeps/powerpc/ldsodefs.h: Include cpu-features.h. * sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h (INIT_ARCH): Initialize use_cached_memopt. * sysdeps/powerpc/powerpc64/dl-machine.h (dl_platform_init): Call init_cpu_features; define only for SHARED builds in rtld. * sysdeps/powerpc/powerpc64/multiarch/Makefile (sysdep_routines): Add memcpy-power8-cached. * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c: Add __memcpy_power8_cached. * sysdeps/powerpc/powerpc64/multiarch/memcpy.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/memcpy-power8-cached.S: New file. 
--- manual/tunables.texi | 7 + sysdeps/powerpc/cpu-features.c | 39 +++++ sysdeps/powerpc/cpu-features.h | 28 ++++ sysdeps/powerpc/dl-procinfo.c | 16 ++ sysdeps/powerpc/dl-tunables.list | 28 ++++ sysdeps/powerpc/ldsodefs.h | 1 + .../powerpc/powerpc32/power4/multiarch/init-arch.h | 2 + sysdeps/powerpc/powerpc64/dl-machine.h | 4 +- sysdeps/powerpc/powerpc64/multiarch/Makefile | 4 +- .../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 2 + .../powerpc64/multiarch/memcpy-power8-cached.S | 179 +++++++++++++++++++++ sysdeps/powerpc/powerpc64/multiarch/memcpy.c | 23 +-- 12 files changed, 320 insertions(+), 13 deletions(-) create mode 100644 sysdeps/powerpc/cpu-features.c create mode 100644 sysdeps/powerpc/cpu-features.h create mode 100644 sysdeps/powerpc/dl-tunables.list create mode 100644 sysdeps/powerpc/powerpc64/multiarch/memcpy-power8-cached.S diff --git a/manual/tunables.texi b/manual/tunables.texi index e851b95..17ceb64 100644 --- a/manual/tunables.texi +++ b/manual/tunables.texi @@ -319,6 +319,13 @@ the ones in @code{sysdeps/x86/cpu-features.h}. This tunable is specific to i386 and x86-64. @end deftp +@deftp Tunable glibc.tune.cached_memopt +The @code{glibc.tune.cached_memopt=[0|1]} tunable allows the user to enable +optimizations recommended for cacheable memory. + +This tunable is specific to powerpc, powerpc64 and powerpc64le. +@end deftp + @deftp Tunable glibc.tune.cpu The @code{glibc.tune.cpu=xxx} tunable allows the user to tell @theglibc{} to assume that the CPU is @code{xxx} where xxx may have one of these values: diff --git a/sysdeps/powerpc/cpu-features.c b/sysdeps/powerpc/cpu-features.c new file mode 100644 index 0000000..6870582 --- /dev/null +++ b/sysdeps/powerpc/cpu-features.c @@ -0,0 +1,39 @@ +/* Initialize cpu feature data. PowerPC version. + Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include + +#if HAVE_TUNABLES +# include +#endif + +static inline void +init_cpu_features (struct cpu_features *cpu_features) +{ + /* The cached-memory optimizations are disabled by default. When tunables + are supported, the user can enable them by setting + glibc.tune.cached_memopt to a value greater than zero. */ +#if HAVE_TUNABLES + int32_t cached_memfunc = TUNABLE_GET (glibc, tune, cached_memopt, int32_t, + NULL); + cpu_features->use_cached_memopt = (cached_memfunc > 0); +#else + cpu_features->use_cached_memopt = false; +#endif +} diff --git a/sysdeps/powerpc/cpu-features.h b/sysdeps/powerpc/cpu-features.h new file mode 100644 index 0000000..36a8bb4 --- /dev/null +++ b/sysdeps/powerpc/cpu-features.h @@ -0,0 +1,28 @@ +/* Initialize cpu feature data. PowerPC version. + Copyright (C) 2017 Free Software Foundation, Inc. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#ifndef __CPU_FEATURES_POWERPC_H +# define __CPU_FEATURES_POWERPC_H + +#include + +struct cpu_features +{ + bool use_cached_memopt; /* Set by init_cpu_features. */ +}; + +#endif /* __CPU_FEATURES_POWERPC_H */ diff --git a/sysdeps/powerpc/dl-procinfo.c b/sysdeps/powerpc/dl-procinfo.c index 55a6e78..c8b14454d 100644 --- a/sysdeps/powerpc/dl-procinfo.c +++ b/sysdeps/powerpc/dl-procinfo.c @@ -42,6 +42,22 @@ # define PROCINFO_CLASS #endif +#if !IS_IN (ldconfig) +# if !defined PROCINFO_DECL && defined SHARED + ._dl_powerpc_cpu_features +# else +PROCINFO_CLASS struct cpu_features _dl_powerpc_cpu_features +# endif +# ifndef PROCINFO_DECL += { } +# endif +# if !defined SHARED || defined PROCINFO_DECL +; +# else +, +# endif +#endif + #if !defined PROCINFO_DECL && defined SHARED ._dl_powerpc_cap_flags #else diff --git a/sysdeps/powerpc/dl-tunables.list b/sysdeps/powerpc/dl-tunables.list new file mode 100644 index 0000000..9e14b9a --- /dev/null +++ b/sysdeps/powerpc/dl-tunables.list @@ -0,0 +1,28 @@ +# powerpc specific tunables. +# Copyright (C) 2017 Free Software Foundation, Inc. +# This file is part of the GNU C Library. + +# The GNU C Library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. + +# The GNU C Library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. + +# You should have received a copy of the GNU Lesser General Public +# License along with the GNU C Library; if not, see +# . 
+ +glibc { + tune { + cached_memopt { + type: INT_32 + minval: 0 + maxval: 1 + default: 0 + } + } +} diff --git a/sysdeps/powerpc/ldsodefs.h b/sysdeps/powerpc/ldsodefs.h index 466de79..6f8b3a2 100644 --- a/sysdeps/powerpc/ldsodefs.h +++ b/sysdeps/powerpc/ldsodefs.h @@ -20,6 +20,7 @@ #define _POWERPC_LDSODEFS_H 1 #include +#include struct La_ppc32_regs; struct La_ppc32_retval; diff --git a/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h b/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h index f2e6a4b..6038941 100644 --- a/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h +++ b/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h @@ -37,6 +37,8 @@ #define INIT_ARCH() \ unsigned long int hwcap = __GLRO(dl_hwcap); \ unsigned long int __attribute__((unused)) hwcap2 = __GLRO(dl_hwcap2); \ + bool __attribute__((unused)) use_cached_memopt = \ + GLRO(dl_powerpc_cpu_features).use_cached_memopt; \ if (hwcap & PPC_FEATURE_ARCH_2_06) \ hwcap |= PPC_FEATURE_ARCH_2_05 | \ PPC_FEATURE_POWER5_PLUS | \ diff --git a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h index aeb91b8..76dceee 100644 --- a/sysdeps/powerpc/powerpc64/dl-machine.h +++ b/sysdeps/powerpc/powerpc64/dl-machine.h @@ -27,6 +27,7 @@ #include #include #include +#include /* Translate a processor specific dynamic tag to the index in l_info array. */ @@ -300,13 +301,14 @@ BODY_PREFIX "_dl_start_user:\n" \ /* We define an initialization function to initialize HWCAP/HWCAP2 and platform data so it can be copied into the TCB later. This is called very early in _dl_sysdep_start for dynamically linked binaries. 
*/ -#ifdef SHARED +#if defined(SHARED) && IS_IN (rtld) # define DL_PLATFORM_INIT dl_platform_init () static inline void __attribute__ ((unused)) dl_platform_init (void) { __tcb_parse_hwcap_and_convert_at_platform (); + init_cpu_features (&GLRO(dl_powerpc_cpu_features)); } #endif diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index dea49ac..4df6b45 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -1,6 +1,6 @@ ifeq ($(subdir),string) -sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \ - memcpy-power4 memcpy-ppc64 \ +sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ + memcpy-cell memcpy-power4 memcpy-ppc64 \ memcmp-power8 memcmp-power7 memcmp-power4 memcmp-ppc64 \ memset-power7 memset-power6 memset-power4 \ memset-ppc64 memset-power8 \ diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index 6a88536..77a60ea 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -51,6 +51,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, #ifdef SHARED /* Support sysdeps/powerpc/powerpc64/multiarch/memcpy.c. */ IFUNC_IMPL (i, name, memcpy, + IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07, + __memcpy_power8_cached) IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX, __memcpy_power7) IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_ARCH_2_06, diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy-power8-cached.S b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power8-cached.S new file mode 100644 index 0000000..e5b6f25 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power8-cached.S @@ -0,0 +1,179 @@ +/* Optimized memcpy implementation for cached memory on PowerPC64/POWER8. 
+ Copyright (C) 2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + + +/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); + Returns 'dst'. */ + + .machine power8 +ENTRY_TOCLESS (__memcpy_power8_cached, 5) + CALL_MCOUNT 3 + + cmpldi cr7,r5,15 + bgt cr7,L(ge_16) + andi. r9,r5,0x1 /* Copy 1 byte if bit 0 of len is set. */ + mr r9,r3 + beq cr0,1f + lbz r10,0(r4) + addi r9,r3,1 + addi r4,r4,1 + stb r10,0(r3) +1: + andi. r10,r5,0x2 /* Copy 2 bytes if bit 1 of len is set. */ + beq cr0,2f + lhz r10,0(r4) + addi r9,r9,2 + addi r4,r4,2 + sth r10,-2(r9) +2: + andi. r10,r5,0x4 /* Copy 4 bytes if bit 2 of len is set. */ + beq cr0,3f + lwz r10,0(r4) + addi r9,r9,4 + addi r4,r4,4 + stw r10,-4(r9) +3: + andi. r10,r5,0x8 /* Copy 8 bytes if bit 3 of len is set. */ + beqlr cr0 + ld r10,0(r4) + std r10,0(r9) + blr + + .align 4 +L(ge_16): + cmpldi cr7,r5,32 + ble cr7,L(ge_16_le_32) + cmpldi cr7,r5,64 + ble cr7,L(gt_32_le_64) + + /* Align dst to 16 bytes. */ + andi. r9,r3,0xf + mr r12,r3 + beq cr0,L(dst_is_align_16) + lxvd2x v0,r0,r4 + subfic r12,r9,16 + subf r5,r12,r5 + add r4,r4,r12 + add r12,r3,r12 + stxvd2x v0,r0,r3 +L(dst_is_align_16): + cmpldi cr7,r5,127 + ble cr7,L(tail_copy) + addi r8,r5,-128 + mr r9,r12 + rldicr r8,r8,0,56 + li r11,16 + srdi r10,r8,7 + addi r0,r8,128 + addi r10,r10,1 + li r6,32 + mtctr r10 + li r7,48 + + /* Main loop, copy 128 bytes each time. 
*/ + .align 4 +L(copy_128): + lxvd2x v10,r0,r4 + lxvd2x v11,r4,r11 + addi r8,r4,64 + addi r10,r9,64 + lxvd2x v12,r4,r6 + lxvd2x v0,r4,r7 + addi r4,r4,128 + stxvd2x v10,r0,r9 + stxvd2x v11,r9,r11 + stxvd2x v12,r9,r6 + stxvd2x v0,r9,r7 + addi r9,r9,128 + lxvd2x v10,r0,r8 + lxvd2x v11,r8,r11 + lxvd2x v12,r8,r6 + lxvd2x v0,r8,r7 + stxvd2x v10,r0,r10 + stxvd2x v11,r10,r11 + stxvd2x v12,r10,r6 + stxvd2x v0,r10,r7 + bdnz L(copy_128) + + add r12,r12,r0 /* Advance dst past the bytes copied by the loop. */ + rldicl r5,r5,0,57 /* Keep the remaining length (len mod 128). */ +L(tail_copy): + cmpldi cr7,r5,63 + ble cr7,L(tail_le_64) + li r8,16 + li r10,32 + lxvd2x v10,r0,r4 + li r9,48 + addi r5,r5,-64 + lxvd2x v11,r4,r8 + lxvd2x v12,r4,r10 + lxvd2x v0,r4,r9 + addi r4,r4,64 + stxvd2x v10,r0,r12 + stxvd2x v11,r12,r8 + stxvd2x v12,r12,r10 + stxvd2x v0,r12,r9 + addi r12,r12,64 + +L(tail_le_64): + cmpldi cr7,r5,32 + bgt cr7,L(tail_gt_32_le_64) + cmpdi cr7,r5,0 + beqlr cr7 + addi r5,r5,-32 /* Offset (<= 0) of the last 32 bytes; may overlap bytes already copied. */ + li r9,16 + add r8,r4,r5 + add r10,r12,r5 + lxvd2x v12,r4,r5 + lxvd2x v0,r8,r9 + stxvd2x v12,r12,r5 + stxvd2x v0,r10,r9 + blr + + .align 4 +L(ge_16_le_32): + addi r5,r5,-16 /* Offset of the last 16 bytes; may overlap the first 16. */ + lxvd2x v0,r0,r4 + lxvd2x v1,r4,r5 + stxvd2x v0,r0,r3 + stxvd2x v1,r3,r5 + blr + + .align 4 +L(gt_32_le_64): + mr r12,r3 + + .align 4 +L(tail_gt_32_le_64): + li r9,16 + lxvd2x v0,r0,r4 + addi r5,r5,-32 /* Offset of the last 32 bytes; may overlap the first 32. */ + lxvd2x v1,r4,r9 + add r8,r4,r5 + lxvd2x v2,r4,r5 + add r10,r12,r5 + lxvd2x v3,r8,r9 + stxvd2x v0,r0,r12 + stxvd2x v1,r12,r9 + stxvd2x v2,r12,r5 + stxvd2x v3,r10,r9 + blr + +END_GEN_TB (__memcpy_power8_cached,TB_TOCLESS) diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c index 9f4286c..fb49fe1 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c +++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c @@ -35,18 +35,21 @@ extern __typeof (__redirect_memcpy) __memcpy_cell attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_power6 attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_a2 attribute_hidden; extern __typeof (__redirect_memcpy) __memcpy_power7 
attribute_hidden; +extern __typeof (__redirect_memcpy) __memcpy_power8_cached attribute_hidden; libc_ifunc (__libc_memcpy, - (hwcap & PPC_FEATURE_HAS_VSX) - ? __memcpy_power7 : - (hwcap & PPC_FEATURE_ARCH_2_06) - ? __memcpy_a2 : - (hwcap & PPC_FEATURE_ARCH_2_05) - ? __memcpy_power6 : - (hwcap & PPC_FEATURE_CELL_BE) - ? __memcpy_cell : - (hwcap & PPC_FEATURE_POWER4) - ? __memcpy_power4 + ((hwcap2 & PPC_FEATURE2_ARCH_2_07) && use_cached_memopt) + ? __memcpy_power8_cached : + (hwcap & PPC_FEATURE_HAS_VSX) + ? __memcpy_power7 : + (hwcap & PPC_FEATURE_ARCH_2_06) + ? __memcpy_a2 : + (hwcap & PPC_FEATURE_ARCH_2_05) + ? __memcpy_power6 : + (hwcap & PPC_FEATURE_CELL_BE) + ? __memcpy_cell : + (hwcap & PPC_FEATURE_POWER4) + ? __memcpy_power4 : __memcpy_ppc); #undef memcpy