From patchwork Tue Jul 12 19:29:02 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Noah Goldstein X-Patchwork-Id: 55986 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 9D94F3836F90 for ; Tue, 12 Jul 2022 19:30:21 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 9D94F3836F90 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1657654221; bh=q+GQdVtQ9WHQNIzwduGnd1ns9fbYes0xvcvAssd2f0k=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=UHJ1lkxbTcIRmFEFz1fvbTeBYKVsZxMZE6BA4bINraPe8aQAQtD9zkoZLlMy5j4h0 AXRTYLZGIGtGqMYGcMepng9om6oX+R5H/IlOgdGyoehh6GPvKc6G8ZFpXTan3vj5ir 1ODlyq2E8Q5PO6nQY4auv/wmF3w4Rx/DrtHIdhwc= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-pf1-x42a.google.com (mail-pf1-x42a.google.com [IPv6:2607:f8b0:4864:20::42a]) by sourceware.org (Postfix) with ESMTPS id 5043A383A37C for ; Tue, 12 Jul 2022 19:29:15 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 5043A383A37C Received: by mail-pf1-x42a.google.com with SMTP id x184so8320568pfx.2 for ; Tue, 12 Jul 2022 12:29:15 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=q+GQdVtQ9WHQNIzwduGnd1ns9fbYes0xvcvAssd2f0k=; b=VD2B3MQWUj6quMUIs+WbM3cXJSlSRRmhAs4vc+sF2wCCLLrG77a9MnOQZfLMcSwaHv VbW7ktcIcHrzuz9/WrwcNtamIQlH5H/aMv53n/9ts4KGC6mJuVPJ8a/g182Ta12n5WtZ B2+j8nobOWd/+xAGKH0PM5F61+nOrjOgJ//CcL9w9PGhoAPydwkCCikTVIeZY8jAFKCd pm0DdUCCXWruVCExrHKh6PINJaD+bbUO0HM0xhGX/6fYX1hZdi6MA7AxAaZ5N+sNkkyd 
BpkzsOX1+lLFctKZCjye4zKS6Rs3AXJwrU6tBeQbgFzOp/6FD3JaWzGxNfpe1OpxNa4o SNNg== X-Gm-Message-State: AJIora+hggcYv5xM9rFvc0jihhlJxSPPFNp/QTjhbY2m/L6+at7Q1WDV 7YjIrE/DWXxaTgQmfRTZHuMj+rn3zOA= X-Google-Smtp-Source: AGRyM1vLDvntivOdxQB+HX1KFcCFD5IwLL2TFlPmMIYkgIF12BFag4EaXMyOfW+tYcXBC4CGupBItw== X-Received: by 2002:aa7:8b47:0:b0:525:4214:c195 with SMTP id i7-20020aa78b47000000b005254214c195mr25419854pfd.55.1657654153918; Tue, 12 Jul 2022 12:29:13 -0700 (PDT) Received: from noah-tgl.. ([192.55.60.37]) by smtp.gmail.com with ESMTPSA id w7-20020a170902e88700b0016c28fbd7e5sm7274704plg.268.2022.07.12.12.29.12 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Tue, 12 Jul 2022 12:29:13 -0700 (PDT) To: libc-alpha@sourceware.org Subject: [PATCH v1] x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S Date: Tue, 12 Jul 2022 12:29:02 -0700 Message-Id: <20220712192910.351121-2-goldstein.w.n@gmail.com> X-Mailer: git-send-email 2.34.1 In-Reply-To: <20220712192910.351121-1-goldstein.w.n@gmail.com> References: <20220712192910.351121-1-goldstein.w.n@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-12.1 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_SHORT, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: Noah Goldstein via Libc-alpha From: Noah Goldstein Reply-To: Noah Goldstein Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org Sender: "Libc-alpha" This commit doesn't affect libc.so.6, it's just housekeeping to prepare for adding explicit ISA level support. Tested build on x86_64 and x86_32 with/without multiarch. 
--- sysdeps/x86_64/multiarch/rtld-stpcpy.S | 18 ++++ sysdeps/x86_64/multiarch/stpcpy-sse2.S | 15 +-- sysdeps/x86_64/multiarch/strcpy-sse2.S | 137 ++++++++++++++++++++++-- sysdeps/x86_64/stpcpy.S | 3 +- sysdeps/x86_64/strcpy.S | 138 +------------------------ 5 files changed, 156 insertions(+), 155 deletions(-) create mode 100644 sysdeps/x86_64/multiarch/rtld-stpcpy.S diff --git a/sysdeps/x86_64/multiarch/rtld-stpcpy.S b/sysdeps/x86_64/multiarch/rtld-stpcpy.S new file mode 100644 index 0000000000..914141f07f --- /dev/null +++ b/sysdeps/x86_64/multiarch/rtld-stpcpy.S @@ -0,0 +1,18 @@ +/* Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include "../stpcpy.S" diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2.S b/sysdeps/x86_64/multiarch/stpcpy-sse2.S index 078504a44e..ea9f973af3 100644 --- a/sysdeps/x86_64/multiarch/stpcpy-sse2.S +++ b/sysdeps/x86_64/multiarch/stpcpy-sse2.S @@ -17,17 +17,10 @@ . 
*/ #if IS_IN (libc) - -# include -# define __stpcpy __stpcpy_sse2 - -# undef weak_alias -# define weak_alias(ignored1, ignored2) -# undef libc_hidden_def -# define libc_hidden_def(__stpcpy) -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(stpcpy) +# ifndef STRCPY +# define STRCPY __stpcpy_sse2 +# endif #endif #define USE_AS_STPCPY -#include +#include "strcpy-sse2.S" diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2.S b/sysdeps/x86_64/multiarch/strcpy-sse2.S index f37967c441..8b5db8b13d 100644 --- a/sysdeps/x86_64/multiarch/strcpy-sse2.S +++ b/sysdeps/x86_64/multiarch/strcpy-sse2.S @@ -17,12 +17,137 @@ . */ #if IS_IN (libc) +# ifndef STRCPY +# define STRCPY __strcpy_sse2 +# endif +#endif -# include -# define strcpy __strcpy_sse2 +#include -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(strcpy) -#endif + .text +ENTRY (STRCPY) + movq %rsi, %rcx /* Source register. */ + andl $7, %ecx /* mask alignment bits */ + movq %rdi, %rdx /* Duplicate destination pointer. */ + + jz 5f /* aligned => start loop */ + + neg %ecx /* We need to align to 8 bytes. */ + addl $8,%ecx + /* Search the first bytes directly. */ +0: + movb (%rsi), %al /* Fetch a byte */ + testb %al, %al /* Is it NUL? */ + movb %al, (%rdx) /* Store it */ + jz 4f /* If it was NUL, done! */ + incq %rsi + incq %rdx + decl %ecx + jnz 0b + +5: + movq $0xfefefefefefefeff,%r8 + + /* Now the sources is aligned. Unfortunatly we cannot force + to have both source and destination aligned, so ignore the + alignment of the destination. */ + .p2align 4 +1: + /* 1st unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. 
We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 3f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 2nd unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 3f /* found NUL => return pointer */ -#include + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 3rd unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. */ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 3f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + + /* 4th unroll. */ + movq (%rsi), %rax /* Read double word (8 bytes). */ + addq $8, %rsi /* Adjust pointer for next word. */ + movq %rax, %r9 /* Save a copy for NUL finding. 
*/ + addq %r8, %r9 /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc 3f /* highest byte is NUL => return pointer */ + xorq %rax, %r9 /* (word+magic)^word */ + orq %r8, %r9 /* set all non-carry bits */ + incq %r9 /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + jnz 3f /* found NUL => return pointer */ + + movq %rax, (%rdx) /* Write value to destination. */ + addq $8, %rdx /* Adjust pointer. */ + jmp 1b /* Next iteration. */ + + /* Do the last few bytes. %rax contains the value to write. + The loop is unrolled twice. */ + .p2align 4 +3: + /* Note that stpcpy needs to return with the value of the NUL + byte. */ + movb %al, (%rdx) /* 1st byte. */ + testb %al, %al /* Is it NUL. */ + jz 4f /* yes, finish. */ + incq %rdx /* Increment destination. */ + movb %ah, (%rdx) /* 2nd byte. */ + testb %ah, %ah /* Is it NUL?. */ + jz 4f /* yes, finish. */ + incq %rdx /* Increment destination. */ + shrq $16, %rax /* Shift... */ + jmp 3b /* and look at next two bytes in %rax. */ + +4: +#ifdef USE_AS_STPCPY + movq %rdx, %rax /* Destination is return value. */ +#else + movq %rdi, %rax /* Source is return value. */ +#endif + retq +END (STRCPY) diff --git a/sysdeps/x86_64/stpcpy.S b/sysdeps/x86_64/stpcpy.S index ec23de1416..b097c203dd 100644 --- a/sysdeps/x86_64/stpcpy.S +++ b/sysdeps/x86_64/stpcpy.S @@ -1,7 +1,6 @@ -#define USE_AS_STPCPY #define STRCPY __stpcpy -#include +#include "multiarch/stpcpy-sse2.S" weak_alias (__stpcpy, stpcpy) libc_hidden_def (__stpcpy) diff --git a/sysdeps/x86_64/strcpy.S b/sysdeps/x86_64/strcpy.S index 17e8073550..05f19e6e94 100644 --- a/sysdeps/x86_64/strcpy.S +++ b/sysdeps/x86_64/strcpy.S @@ -16,140 +16,6 @@ License along with the GNU C Library; if not, see . */ -#include -#include "asm-syntax.h" - -#ifndef USE_AS_STPCPY -# define STRCPY strcpy -#endif - - .text -ENTRY (STRCPY) - movq %rsi, %rcx /* Source register. 
*/ - andl $7, %ecx /* mask alignment bits */ - movq %rdi, %rdx /* Duplicate destination pointer. */ - - jz 5f /* aligned => start loop */ - - neg %ecx /* We need to align to 8 bytes. */ - addl $8,%ecx - /* Search the first bytes directly. */ -0: - movb (%rsi), %al /* Fetch a byte */ - testb %al, %al /* Is it NUL? */ - movb %al, (%rdx) /* Store it */ - jz 4f /* If it was NUL, done! */ - incq %rsi - incq %rdx - decl %ecx - jnz 0b - -5: - movq $0xfefefefefefefeff,%r8 - - /* Now the sources is aligned. Unfortunatly we cannot force - to have both source and destination aligned, so ignore the - alignment of the destination. */ - .p2align 4 -1: - /* 1st unroll. */ - movq (%rsi), %rax /* Read double word (8 bytes). */ - addq $8, %rsi /* Adjust pointer for next word. */ - movq %rax, %r9 /* Save a copy for NUL finding. */ - addq %r8, %r9 /* add the magic value to the word. We get - carry bits reported for each byte which - is *not* 0 */ - jnc 3f /* highest byte is NUL => return pointer */ - xorq %rax, %r9 /* (word+magic)^word */ - orq %r8, %r9 /* set all non-carry bits */ - incq %r9 /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - - jnz 3f /* found NUL => return pointer */ - - movq %rax, (%rdx) /* Write value to destination. */ - addq $8, %rdx /* Adjust pointer. */ - - /* 2nd unroll. */ - movq (%rsi), %rax /* Read double word (8 bytes). */ - addq $8, %rsi /* Adjust pointer for next word. */ - movq %rax, %r9 /* Save a copy for NUL finding. */ - addq %r8, %r9 /* add the magic value to the word. We get - carry bits reported for each byte which - is *not* 0 */ - jnc 3f /* highest byte is NUL => return pointer */ - xorq %rax, %r9 /* (word+magic)^word */ - orq %r8, %r9 /* set all non-carry bits */ - incq %r9 /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - - jnz 3f /* found NUL => return pointer */ - - movq %rax, (%rdx) /* Write value to destination. */ - addq $8, %rdx /* Adjust pointer. */ - - /* 3rd unroll. 
*/ - movq (%rsi), %rax /* Read double word (8 bytes). */ - addq $8, %rsi /* Adjust pointer for next word. */ - movq %rax, %r9 /* Save a copy for NUL finding. */ - addq %r8, %r9 /* add the magic value to the word. We get - carry bits reported for each byte which - is *not* 0 */ - jnc 3f /* highest byte is NUL => return pointer */ - xorq %rax, %r9 /* (word+magic)^word */ - orq %r8, %r9 /* set all non-carry bits */ - incq %r9 /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - - jnz 3f /* found NUL => return pointer */ - - movq %rax, (%rdx) /* Write value to destination. */ - addq $8, %rdx /* Adjust pointer. */ - - /* 4th unroll. */ - movq (%rsi), %rax /* Read double word (8 bytes). */ - addq $8, %rsi /* Adjust pointer for next word. */ - movq %rax, %r9 /* Save a copy for NUL finding. */ - addq %r8, %r9 /* add the magic value to the word. We get - carry bits reported for each byte which - is *not* 0 */ - jnc 3f /* highest byte is NUL => return pointer */ - xorq %rax, %r9 /* (word+magic)^word */ - orq %r8, %r9 /* set all non-carry bits */ - incq %r9 /* add 1: if one carry bit was *not* set - the addition will not result in 0. */ - - jnz 3f /* found NUL => return pointer */ - - movq %rax, (%rdx) /* Write value to destination. */ - addq $8, %rdx /* Adjust pointer. */ - jmp 1b /* Next iteration. */ - - /* Do the last few bytes. %rax contains the value to write. - The loop is unrolled twice. */ - .p2align 4 -3: - /* Note that stpcpy needs to return with the value of the NUL - byte. */ - movb %al, (%rdx) /* 1st byte. */ - testb %al, %al /* Is it NUL. */ - jz 4f /* yes, finish. */ - incq %rdx /* Increment destination. */ - movb %ah, (%rdx) /* 2nd byte. */ - testb %ah, %ah /* Is it NUL?. */ - jz 4f /* yes, finish. */ - incq %rdx /* Increment destination. */ - shrq $16, %rax /* Shift... */ - jmp 3b /* and look at next two bytes in %rax. */ - -4: -#ifdef USE_AS_STPCPY - movq %rdx, %rax /* Destination is return value. 
*/ -#else - movq %rdi, %rax /* Source is return value. */ -#endif - retq -END (STRCPY) -#ifndef USE_AS_STPCPY +#define STRCPY strcpy +#include "multiarch/strcpy-sse2.S" libc_hidden_builtin_def (strcpy) -#endif