From patchwork Thu Dec  7 07:53:51 2017
X-Patchwork-Submitter: Rajalakshmi S
X-Patchwork-Id: 24771
X-Patchwork-Delegate: tuliom@linux.vnet.ibm.com
From: Rajalakshmi Srinivasaraghavan
To: libc-alpha@sourceware.org
Cc: Rajalakshmi Srinivasaraghavan
Subject: [PATCH] powerpc: st{r,p}cpy optimization for aligned strings
Date: Thu, 7 Dec 2017 13:23:51 +0530
Message-Id: <1512633231-3626-1-git-send-email-raji@linux.vnet.ibm.com>

This patch makes use of vectors for aligned inputs.  Improvements of up
to 30% are seen for larger aligned inputs.

2017-12-05  Rajalakshmi Srinivasaraghavan

	* sysdeps/powerpc/powerpc64/power8/strcpy.S: Use vectors for
	aligned inputs.

Reviewed-by: Tulio Magno Quites Machado Filho
---
 sysdeps/powerpc/powerpc64/power8/strcpy.S | 149 +++++++++++++++++++++++++++++-
 1 file changed, 146 insertions(+), 3 deletions(-)

diff --git a/sysdeps/powerpc/powerpc64/power8/strcpy.S b/sysdeps/powerpc/powerpc64/power8/strcpy.S
index 13e7a0fcbc..a1683f9dfe 100644
--- a/sysdeps/powerpc/powerpc64/power8/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/power8/strcpy.S
@@ -47,7 +47,7 @@
    64K as default, the page cross handling assumes minimum page size of
    4k.  */
 
-	.machine  power7
+	.machine  power8
 ENTRY_TOCLESS (FUNC_NAME, 4)
 	li	r0,0	/* Doubleword with null chars to use with cmpb.  */
 
@@ -120,7 +120,7 @@ L(pagecross):
 	ldu	r8, 8(r7)
 
 L(loop_before):
-	/* Save the two doublewords readed from source and align the source
+	/* Save the two doublewords read from source and align the source
 	   to 16 bytes for the loop.  */
 	mr	r11,r3
 	std	r12,0(r11)
@@ -129,7 +129,150 @@ L(loop_before):
 	rldicl	r9,r4,0,60
 	subf	r7,r9,r7
 	subf	r11,r9,r11
-	b	L(loop_start)
+	/* Source is adjusted to 16B alignment and destination r11 is
+	   also moved based on that adjustment.  Now check if r11 is
+	   also 16B aligned to move to the vectorized loop.  */
+	andi.	r6, r11, 0xF
+	bne	L(loop_start)
+
+	/* Prepare for the loop.  */
+	subf	r4, r9, r4	/* Adjust r4 based on alignment.  */
+	li	r7, 16		/* Load required offsets.  */
+	li	r8, 32
+	li	r9, 48
+	vspltisb	v0, 0
+	addi	r4, r4, 16
+	/* Are we 64-byte aligned?  If so, jump to the vectorized loop.
+	   Else copy 16B till r4 is 64B aligned.  */
+	andi.	r6, r4, 63
+	beq	L(qw_loop)
+
+	lvx	v6, 0, r4	/* Load 16 bytes from memory.  */
+	vcmpequb.	v5, v0, v6	/* Check for null.  */
+	bne	cr6, L(qw_done)
+	stvx	v6, 0, r11	/* Store 16 bytes.  */
+	addi	r4, r4, 16	/* Increment the address.  */
+	addi	r11, r11, 16
+	andi.	r6, r4, 63
+	beq	L(qw_loop)
+
+	lvx	v6, 0, r4
+	vcmpequb.	v5, v0, v6
+	bne	cr6, L(qw_done)
+	stvx	v6, 0, r11
+	addi	r4, r4, 16
+	addi	r11, r11, 16
+	andi.	r6, r4, 63
+	beq	L(qw_loop)
+
+	lvx	v6, 0, r4
+	vcmpequb.	v5, v0, v6
+	bne	cr6, L(qw_done)
+	stvx	v6, 0, r11
+	addi	r4, r4, 16
+	addi	r11, r11, 16
+
+	.align	4
+L(qw_loop):
+	lvx	v1, r4, r0	/* Load 4 quadwords.  */
+	lvx	v2, r4, r7
+	lvx	v3, r4, r8
+	lvx	v4, r4, r9
+	vminub	v5, v1, v2	/* Compare and merge into one VR for speed.  */
+	vminub	v8, v3, v4
+	vminub	v7, v5, v8
+	vcmpequb.	v7, v7, v0	/* Check for NULLs.  */
+	bne	cr6, L(qw_loop_done)
+	stvx	v1, r11, r0	/* Store 4 quadwords.  */
+	stvx	v2, r11, r7
+	stvx	v3, r11, r8
+	stvx	v4, r11, r9
+	addi	r4, r4, 64	/* Adjust address for the next iteration.  */
+	addi	r11, r11, 64	/* Adjust address for the next iteration.  */
+
+	lvx	v1, r4, r0	/* Load 4 quadwords.  */
+	lvx	v2, r4, r7
+	lvx	v3, r4, r8
+	lvx	v4, r4, r9
+	vminub	v5, v1, v2	/* Compare and merge into one VR for speed.  */
+	vminub	v8, v3, v4
+	vminub	v7, v5, v8
+	vcmpequb.	v7, v7, v0	/* Check for NULLs.  */
+	bne	cr6, L(qw_loop_done)
+	stvx	v1, r11, r0	/* Store 4 quadwords.  */
+	stvx	v2, r11, r7
+	stvx	v3, r11, r8
+	stvx	v4, r11, r9
+	addi	r4, r4, 64	/* Adjust address for the next iteration.  */
+	addi	r11, r11, 64	/* Adjust address for the next iteration.  */
+
+	lvx	v1, r4, r0	/* Load 4 quadwords.  */
+	lvx	v2, r4, r7
+	lvx	v3, r4, r8
+	lvx	v4, r4, r9
+	vminub	v5, v1, v2	/* Compare and merge into one VR for speed.  */
+	vminub	v8, v3, v4
+	vminub	v7, v5, v8
+	vcmpequb.	v7, v7, v0	/* Check for NULLs.  */
+	bne	cr6, L(qw_loop_done)
+	stvx	v1, r11, r0	/* Store 4 quadwords.  */
+	stvx	v2, r11, r7
+	stvx	v3, r11, r8
+	stvx	v4, r11, r9
+	addi	r4, r4, 64	/* Adjust address for the next iteration.  */
+	addi	r11, r11, 64	/* Adjust address for the next iteration.  */
+	b	L(qw_loop)
+
+	.align	4
+L(qw_loop_done):
+	/* Null found in one of the 4 loads.  */
+	vcmpequb.	v7, v1, v0
+	vor	v6, v1, v1
+	bne	cr6, L(qw_done)
+	/* Not on the first 16B, so store it.  */
+	stvx	v1, r11, r0
+	addi	r4, r4, 16
+	addi	r11, r11, 16
+	vcmpequb.	v7, v2, v0
+	vor	v6, v2, v2
+	bne	cr6, L(qw_done)
+	/* Not on the second 16B, so store it.  */
+	stvx	v2, r11, r0
+	addi	r4, r4, 16
+	addi	r11, r11, 16
+	vcmpequb.	v7, v3, v0
+	vor	v6, v3, v3
+	bne	cr6, L(qw_done)
+	/* Not on the third 16B, so store it.  */
+	stvx	v6, r11, r0
+	addi	r4, r4, 16
+	addi	r11, r11, 16
+	vor	v6, v4, v4
+
+	.align	4
+L(qw_done):
+	mr	r7, r4
+	/* Move the result to GPR.  */
+#ifdef __LITTLE_ENDIAN__
+	vsldoi	v4, v6, v0, 8
+	mfvrd	r12, v4
+#else
+	mfvrd	r12, v6
+#endif
+	/* Check for null in the first 8 bytes.  */
+	cmpb	r10, r12, r0
+	cmpdi	cr6, r10, 0
+	bne	cr6, L(done2)
+	/* Null found in second doubleword.  */
+#ifdef __LITTLE_ENDIAN__
+	mfvrd	r6, v6
+#else
+	vsldoi	v6, v6, v0, 8
+	mfvrd	r6, v6
+#endif
+	cmpb	r10, r6, r0
+	addi	r7, r7, 8
+	b	L(done2)
 
 	.align	5
 L(loop):