From patchwork Fri Mar 5 16:53:15 2021 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "H.J. Lu" X-Patchwork-Id: 42278 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 58E403AAA0E0; Fri, 5 Mar 2021 16:53:34 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 58E403AAA0E0 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1614963214; bh=raWDXQdDkiFF5JBrg2bHCRCyYZVPGKOjXGR11vzSFXk=; h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe: List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To: From; b=ZF1NGDa2vkK5XYy0x38KtscWrdW/vVkNCHY0IPKooFPa8fx3Q/56TFMzRRS++VjB1 NMh9ENhH/+VktWkuQvuL75AlsxHsQkNAtabOCiT6ZhGD7D9DCz8+p5CJ+M5+r+Iae6 KzovJjHgFjhrKEPqqMaUjUL6Bd28jFY/Ky9s878g= X-Original-To: libc-alpha@sourceware.org Delivered-To: libc-alpha@sourceware.org Received: from mail-pf1-x429.google.com (mail-pf1-x429.google.com [IPv6:2607:f8b0:4864:20::429]) by sourceware.org (Postfix) with ESMTPS id 2B0113AAA0DC for ; Fri, 5 Mar 2021 16:53:26 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.3.2 sourceware.org 2B0113AAA0DC Received: by mail-pf1-x429.google.com with SMTP id j12so2410072pfj.12 for ; Fri, 05 Mar 2021 08:53:26 -0800 (PST) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:from:to:subject:date:message-id:in-reply-to :references:mime-version:content-transfer-encoding; bh=raWDXQdDkiFF5JBrg2bHCRCyYZVPGKOjXGR11vzSFXk=; b=ju+3SLoCwE8QF2Y0IuD8SMjxTvzmINs3T/pLunIMuhsV6PKZjQHlIdmNpAo9gCoRPR vs34KBVzQXu78rT58QxwxhTSHNdXqmBilfBF5udVrezoR2vHvn8uWEUvdW7ZTG4QOsCv P1WfE4Rg1vz+ci2uNiZlgQSZUdfuzca7g64z0GpFWKytmZ7gcGqJ38xnPQoa+bSpidUa 6tOjkyCeUrblurGXsP3ihBzGqrqPkDx3afWqdzN+cGfwTblTZ12s/fV6lCQjJwjT9SbG oh6eDZNchEPenJls3rXMTSYlbMrilD1VXy9fTgE7baBzTZac3DahFkFc5KxblGN3/mws wAPA== X-Gm-Message-State: AOAM531OZ4YQW+Y5aUC/+W4N/4bl+44XxM4e58W/hum9I4cUcsv9bOEp 90i8rXt/fh/pj9ri+ee5pLFC2BDawhM= X-Google-Smtp-Source: ABdhPJyCT99Yb9a2ugQMrJ7MAZxxb3t/HT6SMr0vRMEP2j1hpfSYQ6klANYRReDcx4rH3tWwrIBUVw== X-Received: by 2002:a65:64ce:: with SMTP id t14mr9436185pgv.36.1614963203312; Fri, 05 Mar 2021 08:53:23 -0800 (PST) Received: from gnu-cfl-2.localdomain ([172.56.38.48]) by smtp.gmail.com with ESMTPSA id t5sm2912271pgl.89.2021.03.05.08.53.20 for (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Fri, 05 Mar 2021 08:53:21 -0800 (PST) Received: from gnu-tgl-2.localdomain (gnu-tgl-2 [192.168.1.34]) by gnu-cfl-2.localdomain (Postfix) with ESMTPS id 2D6711A09BC for ; Fri, 5 Mar 2021 08:53:18 -0800 (PST) Received: from gnu-tgl-2.?040none?041 (localhost [IPv6:::1]) by gnu-tgl-2.localdomain (Postfix) with ESMTP id 698B23003A5 for ; Fri, 5 Mar 2021 08:53:16 -0800 (PST) To: libc-alpha@sourceware.org Subject: [PATCH 7/8] x86-64: Add AVX optimized string/memory functions for RTM Date: Fri, 5 Mar 2021 08:53:15 -0800 Message-Id: <20210305165316.323467-8-hjl.tools@gmail.com> X-Mailer: git-send-email 2.29.2 In-Reply-To: <20210305165316.323467-1-hjl.tools@gmail.com> References: <20210305165316.323467-1-hjl.tools@gmail.com> MIME-Version: 1.0 X-Spam-Status: No, score=-3034.8 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_BARRACUDACENTRAL, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.2 X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-Patchwork-Original-From: "H.J. Lu via Libc-alpha" From: "H.J. Lu" Reply-To: "H.J. Lu" Errors-To: libc-alpha-bounces@sourceware.org Sender: "Libc-alpha" Since VZEROUPPER triggers RTM abort while VZEROALL won't, select AVX optimized string/memory functions with xtest jz 1f vzeroall ret 1: vzeroupper ret at function exit on processors with usable RTM, but without 256-bit EVEX instructions to avoid VZEROUPPER inside a transactionally executing RTM region. --- sysdeps/x86_64/multiarch/Makefile | 27 +++ sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 170 ++++++++++++++++++ sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 + sysdeps/x86_64/multiarch/ifunc-memmove.h | 12 ++ sysdeps/x86_64/multiarch/ifunc-memset.h | 12 ++ sysdeps/x86_64/multiarch/ifunc-strcpy.h | 4 + sysdeps/x86_64/multiarch/ifunc-wmemset.h | 5 + sysdeps/x86_64/multiarch/memchr-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/memchr-avx2.S | 45 +++-- .../x86_64/multiarch/memcmp-avx2-movbe-rtm.S | 12 ++ sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 28 ++- .../memmove-avx-unaligned-erms-rtm.S | 17 ++ .../multiarch/memmove-vec-unaligned-erms.S | 33 ++-- sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/memrchr-avx2.S | 53 +++--- .../memset-avx2-unaligned-erms-rtm.S | 10 ++ .../multiarch/memset-avx2-unaligned-erms.S | 12 +- .../multiarch/memset-vec-unaligned-erms.S | 41 ++--- sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S | 4 + sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S | 4 + sysdeps/x86_64/multiarch/strcat-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/strcat-avx2.S | 6 +- sysdeps/x86_64/multiarch/strchr-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/strchr-avx2.S | 28 ++- sysdeps/x86_64/multiarch/strchr.c | 4 + sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/strcmp-avx2.S | 55 +++--- sysdeps/x86_64/multiarch/strcmp.c | 4 + sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/strcpy-avx2.S | 85 ++++----- sysdeps/x86_64/multiarch/strlen-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/strlen-avx2.S | 43 ++--- sysdeps/x86_64/multiarch/strncat-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/strncmp.c | 4 + sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S | 4 + sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S | 12 ++ sysdeps/x86_64/multiarch/strrchr-avx2.S | 19 +- sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S | 4 + sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S | 4 + sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S | 5 + sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S | 5 + sysdeps/x86_64/multiarch/wcsnlen.c | 4 + sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S | 3 + sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S | 4 + .../x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S | 4 + sysdeps/x86_64/sysdep.h | 22 +++ 52 files changed, 670 insertions(+), 248 deletions(-) create mode 100644 sysdeps/x86_64/multiarch/memchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S create mode 100644 sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S create mode 100644 sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strcat-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strlen-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strncat-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 9d79b138e9..491c7698dc 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -40,6 +40,25 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ memset-sse2-unaligned-erms \ memset-avx2-unaligned-erms \ memset-avx512-unaligned-erms \ + memchr-avx2-rtm \ + memcmp-avx2-movbe-rtm \ + memmove-avx-unaligned-erms-rtm \ + memrchr-avx2-rtm \ + memset-avx2-unaligned-erms-rtm \ + rawmemchr-avx2-rtm \ + strchr-avx2-rtm \ + strcmp-avx2-rtm \ + strchrnul-avx2-rtm \ + stpcpy-avx2-rtm \ + stpncpy-avx2-rtm \ + strcat-avx2-rtm \ + strcpy-avx2-rtm \ + strlen-avx2-rtm \ + strncat-avx2-rtm \ + strncmp-avx2-rtm \ + strncpy-avx2-rtm \ + strnlen-avx2-rtm \ + strrchr-avx2-rtm \ memchr-evex \ memcmp-evex-movbe \ memmove-evex-unaligned-erms \ @@ -76,6 +95,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ wcsrchr-sse2 wcsrchr-avx2 \ wcsnlen-sse4_1 wcsnlen-c \ wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \ + wcschr-avx2-rtm \ + wcscmp-avx2-rtm \ + wcslen-avx2-rtm \ + wcsncmp-avx2-rtm \ + wcsnlen-avx2-rtm \ + wcsrchr-avx2-rtm \ + wmemchr-avx2-rtm \ + wmemcmp-avx2-movbe-rtm \ wcschr-evex \ wcscmp-evex \ wcslen-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h index 634c3c3c91..e3ec62ca5e 100644 --- a/sysdeps/x86_64/multiarch/ifunc-avx2.h +++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h @@ -21,6 +21,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * @@ -36,6 +37,9 @@ IFUNC_SELECTOR (void) && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) return OPTIMIZE (evex); + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 3d89773ee5..ec3df70d50 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -43,6 +43,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memchr, CPU_FEATURE_USABLE (AVX2), __memchr_avx2) + IFUNC_IMPL_ADD (array, i, memchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memchr_avx2_rtm) IFUNC_IMPL_ADD (array, i, memchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -56,6 +60,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX2) && CPU_FEATURE_USABLE (MOVBE)), __memcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, memcmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (MOVBE) + && CPU_FEATURE_USABLE (RTM)), + __memcmp_avx2_movbe_rtm) IFUNC_IMPL_ADD (array, i, memcmp, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -85,6 +94,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memmove_chk, CPU_FEATURE_USABLE (AVX), __memmove_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memmove_chk_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memmove_chk_avx_unaligned_erms_rtm) IFUNC_IMPL_ADD (array, i, __memmove_chk, CPU_FEATURE_USABLE (AVX512VL), __memmove_chk_evex_unaligned) @@ -113,6 +130,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (AVX), __memmove_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memmove, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memmove_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, memmove, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memmove_avx_unaligned_erms_rtm) IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (AVX512VL), __memmove_evex_unaligned) @@ -143,6 +168,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memrchr, CPU_FEATURE_USABLE (AVX2), __memrchr_avx2) + IFUNC_IMPL_ADD (array, i, memrchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memrchr_avx2_rtm) IFUNC_IMPL_ADD (array, i, memrchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), @@ -165,6 +194,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memset_chk, CPU_FEATURE_USABLE (AVX2), __memset_chk_avx2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memset_chk_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memset_chk_avx2_unaligned_erms_rtm) IFUNC_IMPL_ADD (array, i, __memset_chk, CPU_FEATURE_USABLE (AVX512VL), __memset_chk_evex_unaligned) @@ -196,6 +233,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memset, CPU_FEATURE_USABLE (AVX2), __memset_avx2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memset_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __memset_avx2_unaligned_erms_rtm) IFUNC_IMPL_ADD (array, i, memset, CPU_FEATURE_USABLE (AVX512VL), __memset_evex_unaligned) @@ -218,6 +263,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, rawmemchr, CPU_FEATURE_USABLE (AVX2), __rawmemchr_avx2) + IFUNC_IMPL_ADD (array, i, rawmemchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __rawmemchr_avx2_rtm) IFUNC_IMPL_ADD (array, i, rawmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -230,6 +279,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strlen, CPU_FEATURE_USABLE (AVX2), __strlen_avx2) + IFUNC_IMPL_ADD (array, i, strlen, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strlen_avx2_rtm) IFUNC_IMPL_ADD (array, i, strlen, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), @@ -241,6 +294,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strnlen, CPU_FEATURE_USABLE (AVX2), __strnlen_avx2) + IFUNC_IMPL_ADD (array, i, strnlen, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strnlen_avx2_rtm) IFUNC_IMPL_ADD (array, i, strnlen, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), @@ -253,6 +310,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __stpncpy_ssse3) IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2), __stpncpy_avx2) + IFUNC_IMPL_ADD (array, i, stpncpy, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __stpncpy_avx2_rtm) IFUNC_IMPL_ADD (array, i, stpncpy, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), @@ -267,6 +328,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __stpcpy_ssse3) IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), __stpcpy_avx2) + IFUNC_IMPL_ADD (array, i, stpcpy, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __stpcpy_avx2_rtm) IFUNC_IMPL_ADD (array, i, stpcpy, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), @@ -305,6 +370,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strcat, IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (AVX2), __strcat_avx2) + IFUNC_IMPL_ADD (array, i, strcat, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcat_avx2_rtm) IFUNC_IMPL_ADD (array, i, strcat, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), @@ -319,6 +388,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strchr, CPU_FEATURE_USABLE (AVX2), __strchr_avx2) + IFUNC_IMPL_ADD (array, i, strchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strchr_avx2_rtm) IFUNC_IMPL_ADD (array, i, strchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -332,6 +405,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strchrnul, CPU_FEATURE_USABLE (AVX2), __strchrnul_avx2) + IFUNC_IMPL_ADD (array, i, strchrnul, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strchrnul_avx2_rtm) IFUNC_IMPL_ADD (array, i, strchrnul, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -344,6 +421,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strrchr, CPU_FEATURE_USABLE (AVX2), __strrchr_avx2) + IFUNC_IMPL_ADD (array, i, strrchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strrchr_avx2_rtm) IFUNC_IMPL_ADD (array, i, strrchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), @@ -355,6 +436,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (AVX2), __strcmp_avx2) + IFUNC_IMPL_ADD (array, i, strcmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcmp_avx2_rtm) IFUNC_IMPL_ADD (array, i, strcmp, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -371,6 +456,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strcpy, IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (AVX2), __strcpy_avx2) + IFUNC_IMPL_ADD (array, i, strcpy, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcpy_avx2_rtm) IFUNC_IMPL_ADD (array, i, strcpy, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), @@ -418,6 +507,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strncat, IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (AVX2), __strncat_avx2) + IFUNC_IMPL_ADD (array, i, strncat, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncat_avx2_rtm) IFUNC_IMPL_ADD (array, i, strncat, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), @@ -432,6 +525,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL (i, name, strncpy, IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (AVX2), __strncpy_avx2) + IFUNC_IMPL_ADD (array, i, strncpy, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncpy_avx2_rtm) IFUNC_IMPL_ADD (array, i, strncpy, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), @@ -465,6 +562,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcschr, CPU_FEATURE_USABLE (AVX2), __wcschr_avx2) + IFUNC_IMPL_ADD (array, i, wcschr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wcschr_avx2_rtm) IFUNC_IMPL_ADD (array, i, wcschr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -477,6 +578,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcsrchr, CPU_FEATURE_USABLE (AVX2), __wcsrchr_avx2) + IFUNC_IMPL_ADD (array, i, wcsrchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wcsrchr_avx2_rtm) IFUNC_IMPL_ADD (array, i, wcsrchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -489,6 +594,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcscmp, CPU_FEATURE_USABLE (AVX2), __wcscmp_avx2) + IFUNC_IMPL_ADD (array, i, wcscmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wcscmp_avx2_rtm) IFUNC_IMPL_ADD (array, i, wcscmp, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -501,6 +610,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcsncmp, CPU_FEATURE_USABLE (AVX2), __wcsncmp_avx2) + IFUNC_IMPL_ADD (array, i, wcsncmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wcsncmp_avx2_rtm) IFUNC_IMPL_ADD (array, i, wcsncmp, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -519,6 +632,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcslen, CPU_FEATURE_USABLE (AVX2), __wcslen_avx2) + IFUNC_IMPL_ADD (array, i, wcslen, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wcslen_avx2_rtm) IFUNC_IMPL_ADD (array, i, wcslen, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -531,6 +648,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcsnlen, CPU_FEATURE_USABLE (AVX2), __wcsnlen_avx2) + IFUNC_IMPL_ADD (array, i, wcsnlen, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wcsnlen_avx2_rtm) IFUNC_IMPL_ADD (array, i, wcsnlen, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -546,6 +667,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wmemchr, CPU_FEATURE_USABLE (AVX2), __wmemchr_avx2) + IFUNC_IMPL_ADD (array, i, wmemchr, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wmemchr_avx2_rtm) IFUNC_IMPL_ADD (array, i, wmemchr, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -559,6 +684,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX2) && CPU_FEATURE_USABLE (MOVBE)), __wmemcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, wmemcmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (MOVBE) + && CPU_FEATURE_USABLE (RTM)), + __wmemcmp_avx2_movbe_rtm) IFUNC_IMPL_ADD (array, i, wmemcmp, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW) @@ -577,6 +707,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wmemset, CPU_FEATURE_USABLE (AVX2), __wmemset_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __wmemset_avx2_unaligned_rtm) IFUNC_IMPL_ADD (array, i, wmemset, CPU_FEATURE_USABLE (AVX512VL), __wmemset_evex_unaligned) @@ -602,6 +736,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memcpy_chk, CPU_FEATURE_USABLE (AVX), __memcpy_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memcpy_chk_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memcpy_chk_avx_unaligned_erms_rtm) IFUNC_IMPL_ADD (array, i, __memcpy_chk, CPU_FEATURE_USABLE (AVX512VL), __memcpy_chk_evex_unaligned) @@ -630,6 +772,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (AVX), __memcpy_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memcpy_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, memcpy, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __memcpy_avx_unaligned_erms_rtm) IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (AVX512VL), __memcpy_evex_unaligned) @@ -672,6 +822,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __mempcpy_chk, CPU_FEATURE_USABLE (AVX), __mempcpy_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __mempcpy_chk_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __mempcpy_chk_avx_unaligned_erms_rtm) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, CPU_FEATURE_USABLE (AVX512VL), __mempcpy_chk_evex_unaligned) @@ -709,6 +867,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (AVX), __mempcpy_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __mempcpy_avx_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, mempcpy, + (CPU_FEATURE_USABLE (AVX) + && CPU_FEATURE_USABLE (RTM)), + __mempcpy_avx_unaligned_erms_rtm) IFUNC_IMPL_ADD (array, i, __mempcpy, CPU_FEATURE_USABLE (AVX512VL), __mempcpy_evex_unaligned) @@ -730,6 +896,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (AVX2), __strncmp_avx2) + IFUNC_IMPL_ADD (array, i, strncmp, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncmp_avx2_rtm) IFUNC_IMPL_ADD (array, i, strncmp, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h index 5ac41a19b8..8bee1aff75 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h @@ -23,6 +23,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; static inline void * @@ -38,6 +39,9 @@ IFUNC_SELECTOR (void) && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) return OPTIMIZE (evex_movbe); + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_movbe_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2_movbe); } diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h index 517b332bfc..4eba926eca 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h @@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm) + attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) @@ -71,6 +75,14 @@ IFUNC_SELECTOR (void) return OPTIMIZE (evex_unaligned); } + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (avx_unaligned_erms_rtm); + + return OPTIMIZE (avx_unaligned_rtm); + } + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) { if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h index 3a7a9b7e22..9e2bf491bd 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memset.h +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h @@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm) + attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms_rtm) + attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) @@ -68,6 +72,14 @@ IFUNC_SELECTOR (void) return OPTIMIZE (evex_unaligned); } + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (avx2_unaligned_erms_rtm); + + return OPTIMIZE (avx2_unaligned_rtm); + } + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) { if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h index f31f436adf..39568f480f 100644 --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h @@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * @@ -39,6 +40,9 @@ IFUNC_SELECTOR (void) && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) return OPTIMIZE (evex); + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h index 7e947c56b4..8d952eff99 100644 --- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h +++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h @@ -20,6 +20,8 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm) + attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; @@ -39,6 +41,9 @@ IFUNC_SELECTOR (void) if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) return OPTIMIZE (evex_unaligned); + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_unaligned_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2_unaligned); } diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S new file mode 100644 index 0000000000..87b076c7c4 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef MEMCHR +# define MEMCHR __memchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "memchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S index 77a9523168..1fcb1c350f 100644 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S @@ -34,9 +34,13 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (MEMCHR) # ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ @@ -107,8 +111,8 @@ L(cros_page_boundary): # endif addq %rdi, %rax addq %rcx, %rax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(aligned_more): @@ -224,8 +228,7 @@ L(last_4x_vec_or_less): jnz L(first_vec_x3_check) xorl %eax, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_2x_vec): @@ -243,8 +246,7 @@ L(last_2x_vec): testl %eax, %eax jnz L(first_vec_x1_check) xorl %eax, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x0_check): @@ -253,8 +255,7 @@ L(first_vec_x0_check): cmpq %rax, %rdx jbe L(zero) addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1_check): @@ -264,8 +265,7 @@ L(first_vec_x1_check): jbe L(zero) addq $VEC_SIZE, %rax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2_check): @@ -275,8 +275,7 @@ L(first_vec_x2_check): jbe L(zero) addq $(VEC_SIZE * 2), %rax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x3_check): @@ -286,12 +285,14 @@ L(first_vec_x3_check): jbe L(zero) addq $(VEC_SIZE * 3), %rax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(zero): - VZEROUPPER + xorl %eax, %eax + jmp L(return_vzeroupper) + + .p2align 4 L(null): xorl %eax, %eax ret @@ -301,24 +302,21 @@ L(null): L(first_vec_x0): tzcntl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1): tzcntl %eax, %eax addq $VEC_SIZE, %rax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): tzcntl %eax, %eax addq $(VEC_SIZE * 2), %rax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(4x_vec_end): @@ -337,8 +335,7 @@ L(first_vec_x3): tzcntl %eax, %eax addq $(VEC_SIZE * 3), %rax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN END (MEMCHR) #endif diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S new file mode 100644 index 0000000000..cf4eff5d4a --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S @@ -0,0 +1,12 @@ +#ifndef MEMCMP +# define MEMCMP __memcmp_avx2_movbe_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "memcmp-avx2-movbe.S" diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S index cf9c9b8c1f..ad0fa962a1 100644 --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S @@ -47,6 +47,10 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 # define VEC_MASK ((1 << VEC_SIZE) - 1) @@ -55,7 +59,7 @@ memcmp has to use UNSIGNED comparison for elemnts. */ - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (MEMCMP) # ifdef USE_AS_WMEMCMP shl $2, %RDX_LP @@ -123,8 +127,8 @@ ENTRY (MEMCMP) vptest %ymm0, %ymm5 jnc L(4x_vec_end) xorl %eax, %eax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(last_2x_vec): @@ -144,8 +148,7 @@ L(last_vec): vpmovmskb %ymm2, %eax subl $VEC_MASK, %eax jnz L(first_vec) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec): @@ -164,8 +167,7 @@ L(wmemcmp_return): movzbl (%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_WMEMCMP .p2align 4 @@ -367,8 +369,7 @@ L(last_4x_vec): vpmovmskb %ymm2, %eax subl $VEC_MASK, %eax jnz L(first_vec) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(4x_vec_end): @@ -394,8 +395,7 @@ L(4x_vec_end): movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1): @@ -410,8 +410,7 @@ L(first_vec_x1): movzbl VEC_SIZE(%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): @@ -426,7 +425,6 @@ L(first_vec_x2): movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx sub %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN END (MEMCMP) #endif diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S new file mode 100644 index 0000000000..1ec1962e86 --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S @@ -0,0 +1,17 @@ +#if IS_IN (libc) +# define VEC_SIZE 32 +# define VEC(i) ymm##i +# define VMOVNT vmovntdq +# define VMOVU vmovdqu +# define VMOVA vmovdqa + +# define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +# define VZEROUPPER_RETURN jmp L(return) + +# define SECTION(p) p##.avx.rtm +# define MEMMOVE_SYMBOL(p,s) p##_avx_##s##_rtm + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S index d713d7d679..897a3d9762 100644 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -150,11 +150,12 @@ L(last_2x_vec): VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) VMOVU %VEC(0), (%rdi) VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) - VZEROUPPER #if !defined USE_MULTIARCH || !IS_IN (libc) L(nop): -#endif ret +#else + VZEROUPPER_RETURN +#endif #if defined USE_MULTIARCH && IS_IN (libc) END (MEMMOVE_SYMBOL (__memmove, unaligned)) @@ -247,8 +248,11 @@ L(last_2x_vec): VMOVU %VEC(0), (%rdi) VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) L(return): - VZEROUPPER +#if VEC_SIZE > 16 + ZERO_UPPER_VEC_REGISTERS_RETURN +#else ret +#endif L(movsb): cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP @@ -313,8 +317,7 @@ L(between_32_63): VMOVU -32(%rsi,%rdx), %YMM1 VMOVU %YMM0, (%rdi) VMOVU %YMM1, -32(%rdi,%rdx) - VZEROUPPER - ret + VZEROUPPER_RETURN #endif #if VEC_SIZE > 16 /* From 16 to 31. No branch when size == 16. */ @@ -323,7 +326,7 @@ L(between_16_31): VMOVU -16(%rsi,%rdx), %XMM1 VMOVU %XMM0, (%rdi) VMOVU %XMM1, -16(%rdi,%rdx) - ret + VZEROUPPER_RETURN #endif L(between_8_15): /* From 8 to 15. No branch when size == 8. */ @@ -376,8 +379,7 @@ L(more_2x_vec): VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) - VZEROUPPER - ret + VZEROUPPER_RETURN L(last_4x_vec): /* Copy from 2 * VEC to 4 * VEC. */ VMOVU (%rsi), %VEC(0) @@ -388,8 +390,7 @@ L(last_4x_vec): VMOVU %VEC(1), VEC_SIZE(%rdi) VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) - VZEROUPPER - ret + VZEROUPPER_RETURN L(more_8x_vec): cmpq %rsi, %rdi @@ -445,8 +446,7 @@ L(loop_4x_vec_forward): VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) /* Store the first VEC. */ VMOVU %VEC(4), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN L(more_8x_vec_backward): /* Load the first 4 * VEC and last VEC to support overlapping @@ -497,8 +497,7 @@ L(loop_4x_vec_backward): VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) /* Store the last VEC. */ VMOVU %VEC(8), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) L(large_forward): @@ -533,8 +532,7 @@ L(loop_large_forward): VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) /* Store the first VEC. */ VMOVU %VEC(4), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN L(large_backward): /* Don't use non-temporal store if there is overlap between @@ -568,8 +566,7 @@ L(loop_large_backward): VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) /* Store the last VEC. */ VMOVU %VEC(8), (%r11) - VZEROUPPER - ret + VZEROUPPER_RETURN #endif END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S new file mode 100644 index 0000000000..cea2d2a72d --- /dev/null +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef MEMRCHR +# define MEMRCHR __memrchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "memrchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S index eddede45be..ac7370cb06 100644 --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S @@ -20,14 +20,22 @@ # include +# ifndef MEMRCHR +# define MEMRCHR __memrchr_avx2 +# endif + # ifndef VZEROUPPER # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits -ENTRY (__memrchr_avx2) + .section SECTION(.text),"ax",@progbits +ENTRY (MEMRCHR) /* Broadcast CHAR to YMM0. */ vmovd %esi, %xmm0 vpbroadcastb %xmm0, %ymm0 @@ -134,8 +142,8 @@ L(loop_4x_vec): vpmovmskb %ymm1, %eax bsrl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(last_4x_vec_or_less): @@ -169,8 +177,7 @@ L(last_4x_vec_or_less): addq %rax, %rdx jl L(zero) addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_2x_vec): @@ -191,31 +198,27 @@ L(last_2x_vec): jl L(zero) addl $(VEC_SIZE * 2), %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x0): bsrl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x1): bsrl %eax, %eax addl $VEC_SIZE, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x2): bsrl %eax, %eax addl $(VEC_SIZE * 2), %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x3): @@ -232,8 +235,7 @@ L(last_vec_x1_check): jl L(zero) addl $VEC_SIZE, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_x3_check): @@ -243,12 +245,14 @@ L(last_vec_x3_check): jl L(zero) addl $(VEC_SIZE * 3), %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(zero): - VZEROUPPER + xorl %eax, %eax + VZEROUPPER_RETURN + + .p2align 4 L(null): xorl %eax, %eax ret @@ -273,8 +277,7 @@ L(last_vec_or_less_aligned): bsrl %eax, %eax addq %rdi, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_or_less): @@ -315,8 +318,7 @@ L(last_vec_or_less): bsrl %eax, %eax addq %rdi, %rax addq %r8, %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_vec_2x_aligned): @@ -353,7 +355,6 @@ L(last_vec_2x_aligned): bsrl %eax, %eax addq %rdi, %rax addq %r8, %rax - VZEROUPPER - ret -END (__memrchr_avx2) + VZEROUPPER_RETURN +END (MEMRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S new file mode 100644 index 0000000000..8ac3e479bb --- /dev/null +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S @@ -0,0 +1,10 @@ +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return) + +#define SECTION(p) p##.avx.rtm +#define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm +#define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm + +#include "memset-avx2-unaligned-erms.S" diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S index 7ab3d89849..ae0860f36a 100644 --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S @@ -14,9 +14,15 @@ movq r, %rax; \ vpbroadcastd %xmm0, %ymm0 -# define SECTION(p) p##.avx -# define MEMSET_SYMBOL(p,s) p##_avx2_##s -# define WMEMSET_SYMBOL(p,s) p##_avx2_##s +# ifndef SECTION +# define SECTION(p) p##.avx +# endif +# ifndef MEMSET_SYMBOL +# define MEMSET_SYMBOL(p,s) p##_avx2_##s +# endif +# ifndef WMEMSET_SYMBOL +# define WMEMSET_SYMBOL(p,s) p##_avx2_##s +# endif # include "memset-vec-unaligned-erms.S" #endif diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S index 358ee4be12..584747f1a1 100644 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S @@ -45,17 +45,14 @@ #ifndef VZEROUPPER # if VEC_SIZE > 16 # define VZEROUPPER vzeroupper +# define VZEROUPPER_SHORT_RETURN vzeroupper; ret # else # define VZEROUPPER # endif #endif #ifndef VZEROUPPER_SHORT_RETURN -# if VEC_SIZE > 16 -# define VZEROUPPER_SHORT_RETURN vzeroupper -# else -# define VZEROUPPER_SHORT_RETURN rep -# endif +# define VZEROUPPER_SHORT_RETURN rep; ret #endif #ifndef MOVQ @@ -117,8 +114,7 @@ L(entry_from_bzero): /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN #if defined USE_MULTIARCH && IS_IN (libc) END (MEMSET_SYMBOL (__memset, unaligned)) @@ -141,14 +137,12 @@ ENTRY (__memset_erms) ENTRY (MEMSET_SYMBOL (__memset, erms)) # endif L(stosb): - /* Issue vzeroupper before rep stosb. */ - VZEROUPPER mov %RDX_LP, %RCX_LP movzbl %sil, %eax mov %RDI_LP, %RDX_LP rep stosb mov %RDX_LP, %RAX_LP - ret + VZEROUPPER_RETURN # if VEC_SIZE == 16 END (__memset_erms) # else @@ -175,8 +169,7 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN L(stosb_more_2x_vec): cmp __x86_rep_stosb_threshold(%rip), %RDX_LP @@ -190,8 +183,11 @@ L(more_2x_vec): VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) L(return): - VZEROUPPER +#if VEC_SIZE > 16 + ZERO_UPPER_VEC_REGISTERS_RETURN +#else ret +#endif L(loop_start): leaq (VEC_SIZE * 4)(%rdi), %rcx @@ -217,7 +213,6 @@ L(loop): cmpq %rcx, %rdx jne L(loop) VZEROUPPER_SHORT_RETURN - ret L(less_vec): /* Less than 1 VEC. */ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 @@ -241,40 +236,34 @@ L(less_vec): jb 1f movb %cl, (%rdi) 1: - VZEROUPPER - ret + VZEROUPPER_RETURN # if VEC_SIZE > 32 /* From 32 to 63. No branch when size == 32. */ L(between_32_63): VMOVU %YMM0, -32(%rdi,%rdx) VMOVU %YMM0, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN # endif # if VEC_SIZE > 16 /* From 16 to 31. No branch when size == 16. */ L(between_16_31): VMOVU %XMM0, -16(%rdi,%rdx) VMOVU %XMM0, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN # endif /* From 8 to 15. No branch when size == 8. */ L(between_8_15): movq %rcx, -8(%rdi,%rdx) movq %rcx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN L(between_4_7): /* From 4 to 7. No branch when size == 4. */ movl %ecx, -4(%rdi,%rdx) movl %ecx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN L(between_2_3): /* From 2 to 3. No branch when size == 2. */ movw %cx, -2(%rdi,%rdx) movw %cx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN END (MEMSET_SYMBOL (__memset, unaligned_erms)) diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S new file mode 100644 index 0000000000..acc5f6e2fb --- /dev/null +++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S @@ -0,0 +1,4 @@ +#define MEMCHR __rawmemchr_avx2_rtm +#define USE_AS_RAWMEMCHR 1 + +#include "memchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S new file mode 100644 index 0000000000..2b9c07a59f --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_avx2_rtm +#include "strcpy-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S new file mode 100644 index 0000000000..60a2ccfe53 --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_avx2_rtm +#include "strcpy-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S new file mode 100644 index 0000000000..637fb557c4 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCAT +# define STRCAT __strcat_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strcat-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S index 41de8b2b64..4356fa7330 100644 --- a/sysdeps/x86_64/multiarch/strcat-avx2.S +++ b/sysdeps/x86_64/multiarch/strcat-avx2.S @@ -30,7 +30,11 @@ /* Number of bytes in a vector register */ # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + + .section SECTION(.text),"ax",@progbits ENTRY (STRCAT) mov %rdi, %r9 # ifdef USE_AS_STRNCAT diff --git a/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S new file mode 100644 index 0000000000..81f20d1d8e --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCHR +# define STRCHR __strchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S index 476c20c289..25bec38b5d 100644 --- a/sysdeps/x86_64/multiarch/strchr-avx2.S +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S @@ -40,10 +40,14 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 # define PAGE_SIZE 4096 - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRCHR) movl %edi, %ecx # ifndef USE_AS_STRCHRNUL @@ -76,8 +80,8 @@ ENTRY (STRCHR) cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(more_vecs): @@ -126,8 +130,7 @@ L(aligned_more): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x0): @@ -138,8 +141,7 @@ L(first_vec_x0): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1): @@ -149,8 +151,7 @@ L(first_vec_x1): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): @@ -161,8 +162,7 @@ L(first_vec_x2): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN L(prep_loop_4x): /* Align data to 4 * VEC_SIZE. */ @@ -221,8 +221,7 @@ L(loop_4x_vec): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN /* Cold case for crossing page with first load. */ .p2align 4 @@ -246,8 +245,7 @@ L(cross_page_boundary): cmp (%rax), %CHAR_REG cmovne %rdx, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRCHR) # endif diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c index 2c0a3e78fa..691770f335 100644 --- a/sysdeps/x86_64/multiarch/strchr.c +++ b/sysdeps/x86_64/multiarch/strchr.c @@ -29,6 +29,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * @@ -44,6 +45,9 @@ IFUNC_SELECTOR (void) && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) return OPTIMIZE (evex); + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S new file mode 100644 index 0000000000..cdcf818b91 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRCHR __strchrnul_avx2_rtm +#define USE_AS_STRCHRNUL 1 +#include "strchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S new file mode 100644 index 0000000000..aecd30d97f --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCMP +# define STRCMP __strcmp_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strcmp-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S index 53cb7a6696..40333010a6 100644 --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S @@ -55,6 +55,10 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + /* Warning! wcscmp/wcsncmp have to use SIGNED comparison for elements. strcmp/strncmp have to use UNSIGNED comparison for elements. @@ -75,7 +79,7 @@ the maximum offset is reached before a difference is found, zero is returned. */ - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRCMP) # ifdef USE_AS_STRNCMP /* Check for simple cases (0 or 1) in offset. */ @@ -127,8 +131,8 @@ L(return): movzbl (%rsi, %rdx), %edx subl %edx, %eax # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(return_vec_size): @@ -161,8 +165,7 @@ L(return_vec_size): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(return_2_vec_size): @@ -195,8 +198,7 @@ L(return_2_vec_size): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(return_3_vec_size): @@ -229,8 +231,7 @@ L(return_3_vec_size): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(next_3_vectors): @@ -356,8 +357,7 @@ L(back_to_loop): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(test_vec): @@ -400,8 +400,7 @@ L(test_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(test_2_vec): @@ -444,8 +443,7 @@ L(test_2_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(test_3_vec): @@ -486,8 +484,7 @@ L(test_3_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(loop_cross_page): @@ -556,8 +553,7 @@ L(loop_cross_page): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(loop_cross_page_2_vec): @@ -631,8 +627,7 @@ L(loop_cross_page_2_vec): subl %edx, %eax # endif # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_STRNCMP L(string_nbyte_offset_check): @@ -674,8 +669,7 @@ L(cross_page_loop): # ifndef USE_AS_WCSCMP L(different): # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_WCSCMP .p2align 4 @@ -685,16 +679,14 @@ L(different): setl %al negl %eax orl $1, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN # endif # ifdef USE_AS_STRNCMP .p2align 4 L(zero): xorl %eax, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(char0): @@ -708,8 +700,7 @@ L(char0): movzbl (%rdi), %eax subl %ecx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # endif .p2align 4 @@ -734,8 +725,7 @@ L(last_vector): movzbl (%rsi, %rdx), %edx subl %edx, %eax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN /* Comparing on page boundary region requires special treatment: It must done one vector at the time, starting with the wider @@ -856,7 +846,6 @@ L(cross_page_4bytes): testl %eax, %eax jne L(cross_page_loop) subl %ecx, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRCMP) #endif diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c index 1df75690d0..62b7abeeee 100644 --- a/sysdeps/x86_64/multiarch/strcmp.c +++ b/sysdeps/x86_64/multiarch/strcmp.c @@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * @@ -46,6 +47,9 @@ IFUNC_SELECTOR (void) && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) return OPTIMIZE (evex); + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S new file mode 100644 index 0000000000..c2c581ecf7 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRCPY +# define STRCPY __strcpy_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strcpy-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S index b7629eaf15..5b6506d58f 100644 --- a/sysdeps/x86_64/multiarch/strcpy-avx2.S +++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S @@ -37,6 +37,10 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + /* zero register */ #define xmmZ xmm0 #define ymmZ ymm0 @@ -46,7 +50,7 @@ # ifndef USE_AS_STRCAT - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRCPY) # ifdef USE_AS_STRNCPY mov %RDX_LP, %R8_LP @@ -369,8 +373,8 @@ L(CopyVecSizeExit): lea 1(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(CopyTwoVecSize1): @@ -553,8 +557,7 @@ L(Exit1): lea 2(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit2): @@ -569,8 +572,7 @@ L(Exit2): lea 3(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit3): @@ -584,8 +586,7 @@ L(Exit3): lea 4(%rdi), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit4_7): @@ -602,8 +603,7 @@ L(Exit4_7): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit8_15): @@ -620,8 +620,7 @@ L(Exit8_15): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit16_31): @@ -638,8 +637,7 @@ L(Exit16_31): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Exit32_63): @@ -656,8 +654,7 @@ L(Exit32_63): lea 1(%rdi, %rdx), %rdi jnz L(StrncpyFillTailWithZero) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifdef USE_AS_STRNCPY @@ -671,8 +668,7 @@ L(StrncpyExit1): # ifdef USE_AS_STRCAT movb $0, 1(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit2): @@ -684,8 +680,7 @@ L(StrncpyExit2): # ifdef USE_AS_STRCAT movb $0, 2(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit3_4): @@ -699,8 +694,7 @@ L(StrncpyExit3_4): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit5_8): @@ -714,8 +708,7 @@ L(StrncpyExit5_8): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit9_16): @@ -729,8 +722,7 @@ L(StrncpyExit9_16): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit17_32): @@ -744,8 +736,7 @@ L(StrncpyExit17_32): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit33_64): @@ -760,8 +751,7 @@ L(StrncpyExit33_64): # ifdef USE_AS_STRCAT movb $0, (%rdi, %r8) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(StrncpyExit65): @@ -778,50 +768,43 @@ L(StrncpyExit65): # ifdef USE_AS_STRCAT movb $0, 65(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # ifndef USE_AS_STRCAT .p2align 4 L(Fill1): mov %dl, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill2): mov %dx, (%rdi) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill3_4): mov %dx, (%rdi) mov %dx, -2(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill5_8): mov %edx, (%rdi) mov %edx, -4(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill9_16): mov %rdx, (%rdi) mov %rdx, -8(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(Fill17_32): vmovdqu %xmmZ, (%rdi) vmovdqu %xmmZ, -16(%rdi, %r8) - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(CopyVecSizeUnalignedVec2): @@ -898,8 +881,7 @@ L(Fill): cmp $1, %r8d ja L(Fill2) je L(Fill1) - VZEROUPPER - ret + VZEROUPPER_RETURN /* end of ifndef USE_AS_STRCAT */ # endif @@ -929,8 +911,7 @@ L(UnalignedFourVecSizeLeaveCase3): # ifdef USE_AS_STRCAT movb $0, (VEC_SIZE * 4)(%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(UnalignedFourVecSizeLeaveCase2): @@ -1001,16 +982,14 @@ L(StrncpyExit): # ifdef USE_AS_STRCAT movb $0, (%rdi) # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(ExitZero): # ifndef USE_AS_STRCAT mov %rdi, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN # endif diff --git a/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S new file mode 100644 index 0000000000..75b4b7612c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRLEN +# define STRLEN __strlen_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strlen-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S index caa615970c..1caae9e6bc 100644 --- a/sysdeps/x86_64/multiarch/strlen-avx2.S +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S @@ -36,9 +36,13 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRLEN) # ifdef USE_AS_STRNLEN /* Check for zero length. */ @@ -111,8 +115,8 @@ L(cros_page_boundary): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(aligned_more): @@ -231,8 +235,7 @@ L(last_4x_vec_or_less): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(last_2x_vec): @@ -253,8 +256,7 @@ L(last_2x_vec): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x0_check): @@ -267,8 +269,7 @@ L(first_vec_x0_check): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1_check): @@ -282,8 +283,7 @@ L(first_vec_x1_check): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2_check): @@ -297,8 +297,7 @@ L(first_vec_x2_check): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x3_check): @@ -312,8 +311,7 @@ L(first_vec_x3_check): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(max): @@ -321,8 +319,7 @@ L(max): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(zero): @@ -338,8 +335,7 @@ L(first_vec_x0): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x1): @@ -350,8 +346,7 @@ L(first_vec_x1): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(first_vec_x2): @@ -362,8 +357,7 @@ L(first_vec_x2): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(4x_vec_end): @@ -389,8 +383,7 @@ L(first_vec_x3): # ifdef USE_AS_WCSLEN shrq $2, %rax # endif - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRLEN) #endif diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S new file mode 100644 index 0000000000..0dcea18dbb --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCAT +#define STRCAT __strncat_avx2_rtm +#include "strcat-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S new file mode 100644 index 0000000000..37d1224bb9 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRCMP __strncmp_avx2_rtm +#define USE_AS_STRNCMP 1 +#include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c index 617c4e1065..60ba0fe356 100644 --- a/sysdeps/x86_64/multiarch/strncmp.c +++ b/sysdeps/x86_64/multiarch/strncmp.c @@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * @@ -46,6 +47,9 @@ IFUNC_SELECTOR (void) && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) return OPTIMIZE (evex); + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S new file mode 100644 index 0000000000..79e7083299 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S @@ -0,0 +1,3 @@ +#define USE_AS_STRNCPY +#define STRCPY __strncpy_avx2_rtm +#include "strcpy-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S new file mode 100644 index 0000000000..04f1626a5c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S @@ -0,0 +1,4 @@ +#define STRLEN __strnlen_avx2_rtm +#define USE_AS_STRNLEN 1 + +#include "strlen-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S new file mode 100644 index 0000000000..5def14ec1c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S @@ -0,0 +1,12 @@ +#ifndef STRRCHR +# define STRRCHR __strrchr_avx2_rtm +#endif + +#define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +#define VZEROUPPER_RETURN jmp L(return_vzeroupper) + +#define SECTION(p) p##.avx.rtm + +#include "strrchr-avx2.S" diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S index 53ea445305..0deba97114 100644 --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S @@ -36,9 +36,13 @@ # define VZEROUPPER vzeroupper # endif +# ifndef SECTION +# define SECTION(p) p##.avx +# endif + # define VEC_SIZE 32 - .section .text.avx,"ax",@progbits + .section SECTION(.text),"ax",@progbits ENTRY (STRRCHR) movd %esi, %xmm4 movl %edi, %ecx @@ -166,8 +170,8 @@ L(return_value): # endif bsrl %eax, %eax leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER - ret +L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN .p2align 4 L(match): @@ -198,8 +202,7 @@ L(find_nul): jz L(return_value) bsrl %eax, %eax leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(char_and_nul): @@ -222,14 +225,12 @@ L(char_and_nul_in_first_vec): jz L(return_null) bsrl %eax, %eax leaq -VEC_SIZE(%rdi, %rax), %rax - VZEROUPPER - ret + VZEROUPPER_RETURN .p2align 4 L(return_null): xorl %eax, %eax - VZEROUPPER - ret + VZEROUPPER_RETURN END (STRRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S new file mode 100644 index 0000000000..d49dbbf0b4 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRCHR __wcschr_avx2_rtm +#define USE_AS_WCSCHR 1 +#include "strchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S new file mode 100644 index 0000000000..d6ca2b8064 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S @@ -0,0 +1,4 @@ +#define STRCMP __wcscmp_avx2_rtm +#define USE_AS_WCSCMP 1 + +#include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S new file mode 100644 index 0000000000..35658d7365 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S @@ -0,0 +1,4 @@ +#define STRLEN __wcslen_avx2_rtm +#define USE_AS_WCSLEN 1 + +#include "strlen-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S new file mode 100644 index 0000000000..4e88c70cc6 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S @@ -0,0 +1,5 @@ +#define STRCMP __wcsncmp_avx2_rtm +#define USE_AS_STRNCMP 1 +#define USE_AS_WCSCMP 1 + +#include "strcmp-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S new file mode 100644 index 0000000000..7437ebee2d --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S @@ -0,0 +1,5 @@ +#define STRLEN __wcsnlen_avx2_rtm +#define USE_AS_WCSLEN 1 +#define USE_AS_STRNLEN 1 + +#include "strlen-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c index 19bc6fd938..4983f1b222 100644 --- a/sysdeps/x86_64/multiarch/wcsnlen.c +++ b/sysdeps/x86_64/multiarch/wcsnlen.c @@ -29,6 +29,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; static inline void * @@ -44,6 +45,9 @@ IFUNC_SELECTOR (void) && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) return OPTIMIZE (evex); + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) return OPTIMIZE (avx2); } diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S new file mode 100644 index 0000000000..9bf760833f --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S @@ -0,0 +1,3 @@ +#define STRRCHR __wcsrchr_avx2_rtm +#define USE_AS_WCSRCHR 1 +#include "strrchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S new file mode 100644 index 0000000000..58ed21db01 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S @@ -0,0 +1,4 @@ +#define MEMCHR __wmemchr_avx2_rtm +#define USE_AS_WMEMCHR 1 + +#include "memchr-avx2-rtm.S" diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S new file mode 100644 index 0000000000..31104d1215 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S @@ -0,0 +1,4 @@ +#define MEMCMP __wmemcmp_avx2_movbe_rtm +#define USE_AS_WMEMCMP 1 + +#include "memcmp-avx2-movbe-rtm.S" diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h index d07b8f0aaf..7bebdeb210 100644 --- a/sysdeps/x86_64/sysdep.h +++ b/sysdeps/x86_64/sysdep.h @@ -95,6 +95,28 @@ lose: \ #define R14_LP r14 #define R15_LP r15 +/* Zero upper vector registers and return with xtest. NB: Use VZEROALL + to avoid RTM abort triggered by VZEROUPPER inside transactionally. */ +#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \ + xtest; \ + jz 1f; \ + vzeroall; \ + ret; \ +1: \ + vzeroupper; \ + ret + +/* Zero upper vector registers and return. */ +#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN +# define ZERO_UPPER_VEC_REGISTERS_RETURN \ + VZEROUPPER; \ + ret +#endif + +#ifndef VZEROUPPER_RETURN +# define VZEROUPPER_RETURN VZEROUPPER; ret +#endif + #else /* __ASSEMBLER__ */ /* Long and pointer size in bytes. */