From patchwork Tue Aug 6 16:18:48 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Alexander Monakov X-Patchwork-Id: 95390 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id 73ABD3858429 for ; Tue, 6 Aug 2024 16:19:37 +0000 (GMT) X-Original-To: gcc-patches@gcc.gnu.org Delivered-To: gcc-patches@gcc.gnu.org Received: from mail.ispras.ru (mail.ispras.ru [83.149.199.84]) by sourceware.org (Postfix) with ESMTPS id A7F6D385841E for ; Tue, 6 Aug 2024 16:19:00 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org A7F6D385841E Authentication-Results: sourceware.org; dmarc=pass (p=none dis=none) header.from=ispras.ru Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=ispras.ru ARC-Filter: OpenARC Filter v1.0.0 sourceware.org A7F6D385841E Authentication-Results: server2.sourceware.org; arc=none smtp.remote-ip=83.149.199.84 ARC-Seal: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1722961142; cv=none; b=ljlKfr/16TpenBMbq6VgatVgKz/cKGkkpNqV5ZkiNTjPOWq6MEK4k1mdYV5lQT4fBio/erk7ycDakKbO+U/4o28utucGRZrB4Mh30aD44FcWGHBcmxlWk8lC6MiF8bwzRg4FjFIgLoAGp8JuhiV6jm35gKMcHyIfFwoaUk4Brgc= ARC-Message-Signature: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1722961142; c=relaxed/simple; bh=gBDJNEdEthdyBVR6aOwJfsv7cYJz+hTwqiHo7UvYaC8=; h=DKIM-Signature:From:To:Subject:Date:Message-Id:MIME-Version; b=sdPwhf+3q/DYAXG+yPV1Gz/zvadIy/qO/O2X5Fg0fX3yE5a6A32p+mwNJrm9Sf4N6iJCHOOvIZ6Ovabyq4YuoX00PDrfJG+V1n5WwQK9DueihFarHjA9PlFLsVo0dtssLqO/vjCbRrPgBkGbPFHCUlUKroSCc8fxQcyZhSe92ZQ= ARC-Authentication-Results: i=1; server2.sourceware.org Received: from localhost.intra.ispras.ru (unknown [10.10.3.121]) by mail.ispras.ru (Postfix) with ESMTP id 7F18F40737C3; Tue, 6 Aug 2024 16:18:59 +0000 (UTC) DKIM-Filter: OpenDKIM Filter v2.11.0 mail.ispras.ru 7F18F40737C3 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=ispras.ru; s=default; t=1722961139; bh=ppv4tLQ4NGTf3fke+TfYNek594D8nnLTTObIrUJQ3qA=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=bngoc1gkTCEPDJnBrxHxNCDrqVtOXOm8Z3D9SvQGgy+Tcd93AdI+T8l9+t6aR0ElB x+8vWPyWd+SQW8voOSFwd0QrO3s9xnxoo3JCkx/XpytxJ5fo6udNWHCZLL8i/8mQoi 34CRxU50QcnkXVVV26IZ9fEafPhpF5mrhoUYa2r8= From: Alexander Monakov To: gcc-patches@gcc.gnu.org Cc: Andi Kleen , Alexander Monakov Subject: [PATCH 1/3] libcpp: configure: check for AVX2 instead of SSE4 Date: Tue, 6 Aug 2024 19:18:48 +0300 Message-Id: <20240806161850.18839-1-amonakov@ispras.ru> X-Mailer: git-send-email 2.32.0 In-Reply-To: References: MIME-Version: 1.0 X-Spam-Status: No, score=-8.8 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, GIT_PATCH_0, KAM_NUMSUBJECT, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: gcc-patches@gcc.gnu.org X-Mailman-Version: 2.1.30 Precedence: list List-Id: Gcc-patches mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: gcc-patches-bounces~patchwork=sourceware.org@gcc.gnu.org Upcoming patches first drop Binutils ISA support from SSE4.2 to SSSE3, then bump it to AVX2. Instead of fiddling with detection, just bump our configure check to AVX2 immediately: if by some accident somebody builds GCC without AVX2 support in the assembler, they will get SSE2 vectorized lexer, which is not too slow. libcpp/ChangeLog: * config.in: Regenerate. * configure: Regenerate. * configure.ac: Check for AVX2 instead of SSE4.2. * lex.cc: Adjust for changed config macro. --- libcpp/config.in | 6 +++--- libcpp/configure | 4 ++-- libcpp/configure.ac | 6 +++--- libcpp/lex.cc | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/libcpp/config.in b/libcpp/config.in index 253ef03a3d..a0ca9e4df4 100644 --- a/libcpp/config.in +++ b/libcpp/config.in @@ -35,6 +35,9 @@ */ #undef HAVE_ALLOCA_H +/* Define to 1 if you can assemble AVX2 insns. */ +#undef HAVE_AVX2 + /* Define to 1 if you have the Mac OS X function CFLocaleCopyPreferredLanguages in the CoreFoundation framework. */ #undef HAVE_CFLOCALECOPYPREFERREDLANGUAGES @@ -210,9 +213,6 @@ /* Define to 1 if you have the `putc_unlocked' function. */ #undef HAVE_PUTC_UNLOCKED -/* Define to 1 if you can assemble SSE4 insns. */ -#undef HAVE_SSE4 - /* Define to 1 if you have the header file. */ #undef HAVE_STDDEF_H diff --git a/libcpp/configure b/libcpp/configure index 32d6aaa306..74af097620 100755 --- a/libcpp/configure +++ b/libcpp/configure @@ -9140,14 +9140,14 @@ case $target in int main () { -asm ("pcmpestri %0, %%xmm0, %%xmm1" : : "i"(0)) +asm ("vpshufb %ymm0, %ymm1, %ymm2") ; return 0; } _ACEOF if ac_fn_c_try_compile "$LINENO"; then : -$as_echo "#define HAVE_SSE4 1" >>confdefs.h +$as_echo "#define HAVE_AVX2 1" >>confdefs.h fi rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext diff --git a/libcpp/configure.ac b/libcpp/configure.ac index b883fec776..cfefb63552 100644 --- a/libcpp/configure.ac +++ b/libcpp/configure.ac @@ -197,9 +197,9 @@ fi case $target in i?86-* | x86_64-*) - AC_TRY_COMPILE([], [asm ("pcmpestri %0, %%xmm0, %%xmm1" : : "i"(0))], - [AC_DEFINE([HAVE_SSE4], [1], - [Define to 1 if you can assemble SSE4 insns.])]) + AC_TRY_COMPILE([], [asm ("vpshufb %ymm0, %ymm1, %ymm2")], + [AC_DEFINE([HAVE_AVX2], [1], + [Define to 1 if you can assemble AVX2 insns.])]) esac # Enable --enable-host-shared. diff --git a/libcpp/lex.cc b/libcpp/lex.cc index 1591dcdf15..fa9c03614c 100644 --- a/libcpp/lex.cc +++ b/libcpp/lex.cc @@ -344,7 +344,7 @@ search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) return (const uchar *)p + found; } -#ifdef HAVE_SSE4 +#ifdef HAVE_AVX2 /* A version of the fast scanner using SSE 4.2 vectorized string insns. */ static const uchar * From patchwork Tue Aug 6 16:18:49 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Alexander Monakov X-Patchwork-Id: 95391 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id BABD3385841D for ; Tue, 6 Aug 2024 16:20:33 +0000 (GMT) X-Original-To: gcc-patches@gcc.gnu.org Delivered-To: gcc-patches@gcc.gnu.org Received: from mail.ispras.ru (mail.ispras.ru [83.149.199.84]) by sourceware.org (Postfix) with ESMTPS id F07D1385842C for ; Tue, 6 Aug 2024 16:19:00 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org F07D1385842C Authentication-Results: sourceware.org; dmarc=pass (p=none dis=none) header.from=ispras.ru Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=ispras.ru ARC-Filter: OpenARC Filter v1.0.0 sourceware.org F07D1385842C Authentication-Results: server2.sourceware.org; arc=none smtp.remote-ip=83.149.199.84 ARC-Seal: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1722961142; cv=none; b=o0tJxfYTPiPQKFIhPk8BybMxtNTGSHaHtAfHylzgMir6iij4X5ZxCmaVsY28emIAcxYsciGiq/xS7LnYo9/mFGljFYvIYqi1XCBUYUc4V95Z0P0yZrRppIewGIN2XdYD3n7YBUkacjlzrVoIBsFkHvIWAi/A3hVxkekTC+ufhko= ARC-Message-Signature: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1722961142; c=relaxed/simple; bh=RxtTTCTxAbXMcpNXPBcC623cH+01LIio13f203y56aE=; h=DKIM-Signature:From:To:Subject:Date:Message-Id:MIME-Version; b=UQNL4H7L4MUm4dBr+Gie1loa1B6Lhtp/qB3T3a4CnsdsnwcKUM5yWqV4ew0kMMVqfPljWKWfmbhTGs4f48rTKuDsgJRmi2LtZRiskdML1LVs9Vz7B5bt+fi5pp98KTChGL7h/MbO0SHLOZYrm5CzaFuGfuyWg1zy9925eCDv4Jk= ARC-Authentication-Results: i=1; server2.sourceware.org Received: from localhost.intra.ispras.ru (unknown [10.10.3.121]) by mail.ispras.ru (Postfix) with ESMTP id 9E82B4076738; Tue, 6 Aug 2024 16:18:59 +0000 (UTC) DKIM-Filter: OpenDKIM Filter v2.11.0 mail.ispras.ru 9E82B4076738 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=ispras.ru; s=default; t=1722961139; bh=NanUxrzhCYQxP/tm3A6lw9+S6NU38MBP0Yahd38VXPE=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=AvoSLHEFDsQR501r2okLDjYebL8B08wODLnlLWUYdfu+h9adq1z7R+Jngnst2Idds IUZsePmdvThTUd2KMlAaab1a5krgyc+OmcN829mr4ggUfmlnjP3kuJe9WwCOcSaK/l Z6XO+V4Jon/jd9KNRun9DtUqCP4QZeKDkJRANRLQ= From: Alexander Monakov To: gcc-patches@gcc.gnu.org Cc: Andi Kleen , Alexander Monakov Subject: [PATCH 2/3] libcpp: replace SSE4.2 helper with an SSSE3 one Date: Tue, 6 Aug 2024 19:18:49 +0300 Message-Id: <20240806161850.18839-2-amonakov@ispras.ru> X-Mailer: git-send-email 2.32.0 In-Reply-To: <20240806161850.18839-1-amonakov@ispras.ru> References: <20240806161850.18839-1-amonakov@ispras.ru> MIME-Version: 1.0 X-Spam-Status: No, score=-9.1 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, GIT_PATCH_0, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: gcc-patches@gcc.gnu.org X-Mailman-Version: 2.1.30 Precedence: list List-Id: Gcc-patches mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: gcc-patches-bounces~patchwork=sourceware.org@gcc.gnu.org Since the characters we are searching for (CR, LF, '\', '?') all have distinct ASCII codes mod 16, PSHUFB can help match them all at once. libcpp/ChangeLog: * lex.cc (search_line_sse42): Replace with... (search_line_ssse3): ... this new function. Adjust the use... (init_vectorized_lexer): ... here. --- libcpp/lex.cc | 118 ++++++++++++++++++++------------------------------ 1 file changed, 46 insertions(+), 72 deletions(-) diff --git a/libcpp/lex.cc b/libcpp/lex.cc index fa9c03614c..815b8abd29 100644 --- a/libcpp/lex.cc +++ b/libcpp/lex.cc @@ -345,84 +345,58 @@ search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) } #ifdef HAVE_AVX2 -/* A version of the fast scanner using SSE 4.2 vectorized string insns. */ +/* A version of the fast scanner using SSSE3 shuffle (PSHUFB) insns. */ static const uchar * -#ifndef __SSE4_2__ -__attribute__((__target__("sse4.2"))) +#ifndef __SSSE3__ +__attribute__((__target__("ssse3"))) #endif -search_line_sse42 (const uchar *s, const uchar *end) +search_line_ssse3 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) { typedef char v16qi __attribute__ ((__vector_size__ (16))); - static const v16qi search = { '\n', '\r', '?', '\\' }; - - uintptr_t si = (uintptr_t)s; - uintptr_t index; - - /* Check for unaligned input. */ - if (si & 15) - { - v16qi sv; - - if (__builtin_expect (end - s < 16, 0) - && __builtin_expect ((si & 0xfff) > 0xff0, 0)) - { - /* There are less than 16 bytes left in the buffer, and less - than 16 bytes left on the page. Reading 16 bytes at this - point might generate a spurious page fault. Defer to the - SSE2 implementation, which already handles alignment. */ - return search_line_sse2 (s, end); - } - - /* ??? The builtin doesn't understand that the PCMPESTRI read from - memory need not be aligned. */ - sv = __builtin_ia32_loaddqu ((const char *) s); - index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0); - - if (__builtin_expect (index < 16, 0)) - goto found; - - /* Advance the pointer to an aligned address. We will re-scan a - few bytes, but we no longer need care for reading past the - end of a page, since we're guaranteed a match. */ - s = (const uchar *)((si + 15) & -16); - } - - /* Main loop, processing 16 bytes at a time. */ -#ifdef __GCC_ASM_FLAG_OUTPUTS__ - while (1) + typedef v16qi v16qi_u __attribute__ ((__aligned__ (1))); + /* Helper vector for pshufb-based matching: + each character C we're searching for is at position (C % 16). */ + v16qi lut = { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?' }; + static_assert ('\n' == 10 && '\r' == 13 && '\\' == 92 && '?' == 63); + + int found; + /* Process three 16-byte chunks per iteration. */ + for (; ; s += 48) { - char f; - - /* By using inline assembly instead of the builtin, - we can use the result, as well as the flags set. */ - __asm ("%vpcmpestri\t$0, %2, %3" - : "=c"(index), "=@ccc"(f) - : "m"(*s), "x"(search), "a"(4), "d"(16)); - if (f) - break; - - s += 16; + v16qi data, t; + /* Unaligned load. Reading beyond the final newline is safe, since + files.cc:read_file_guts pads the allocation. */ + data = *(const v16qi_u *)s; + /* Prevent propagation into pshufb and pcmp as memory operand. */ + __asm__ ("" : "+x" (data)); + t = __builtin_ia32_pshufb128 (lut, data); + if ((found = __builtin_ia32_pmovmskb128 (t == data))) + goto done; + /* Second chunk. */ + data = *(const v16qi_u *)(s + 16); + __asm__ ("" : "+x" (data)); + t = __builtin_ia32_pshufb128 (lut, data); + if ((found = __builtin_ia32_pmovmskb128 (t == data))) + goto add_16; + /* Third chunk. */ + data = *(const v16qi_u *)(s + 32); + __asm__ ("" : "+x" (data)); + t = __builtin_ia32_pshufb128 (lut, data); + if ((found = __builtin_ia32_pmovmskb128 (t == data))) + goto add_32; } -#else - s -= 16; - /* By doing the whole loop in inline assembly, - we can make proper use of the flags set. */ - __asm ( ".balign 16\n" - "0: add $16, %1\n" - " %vpcmpestri\t$0, (%1), %2\n" - " jnc 0b" - : "=&c"(index), "+r"(s) - : "x"(search), "a"(4), "d"(16)); -#endif - - found: - return s + index; +add_32: + s += 16; +add_16: + s += 16; +done: + return s + __builtin_ctz (found); } #else -/* Work around out-dated assemblers without sse4 support. */ -#define search_line_sse42 search_line_sse2 +/* Work around out-dated assemblers without SSSE3 support. */ +#define search_line_ssse3 search_line_sse2 #endif /* Check the CPU capabilities. */ @@ -440,18 +414,18 @@ init_vectorized_lexer (void) search_line_fast_type impl = search_line_acc_char; int minimum = 0; -#if defined(__SSE4_2__) +#if defined(__SSSE3__) minimum = 3; #elif defined(__SSE2__) minimum = 2; #endif if (minimum == 3) - impl = search_line_sse42; + impl = search_line_ssse3; else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2) { - if (minimum == 3 || (ecx & bit_SSE4_2)) - impl = search_line_sse42; + if (minimum == 3 || (ecx & bit_SSSE3)) + impl = search_line_ssse3; else if (minimum == 2 || (edx & bit_SSE2)) impl = search_line_sse2; } From patchwork Tue Aug 6 16:18:50 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Alexander Monakov X-Patchwork-Id: 95392 Return-Path: X-Original-To: patchwork@sourceware.org Delivered-To: patchwork@sourceware.org Received: from server2.sourceware.org (localhost [IPv6:::1]) by sourceware.org (Postfix) with ESMTP id CC773385843B for ; Tue, 6 Aug 2024 16:21:28 +0000 (GMT) X-Original-To: gcc-patches@gcc.gnu.org Delivered-To: gcc-patches@gcc.gnu.org Received: from mail.ispras.ru (mail.ispras.ru [83.149.199.84]) by sourceware.org (Postfix) with ESMTPS id 1CD2A385842D for ; Tue, 6 Aug 2024 16:19:01 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 1CD2A385842D Authentication-Results: sourceware.org; dmarc=pass (p=none dis=none) header.from=ispras.ru Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=ispras.ru ARC-Filter: OpenARC Filter v1.0.0 sourceware.org 1CD2A385842D Authentication-Results: server2.sourceware.org; arc=none smtp.remote-ip=83.149.199.84 ARC-Seal: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1722961144; cv=none; b=DiErywO7JSq8WTkAzieOpYvVTJUtRy97SNXhrip7/UyFmzvQqJ2IkZywUVXr2PHTdlnlMJJHnbq4lsJ2TqX3mzyUL45iyaSNdTEN+yGDduehLRRwj+n8hqDMKNrRxN6sFXrDhkl10g5HkhvGwL+v/2WHEmvudzrS2LCz8yZbrnM= ARC-Message-Signature: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1722961144; c=relaxed/simple; bh=ANFmSV6vbxSRubvvFDnke5o0ySGzejt9MFJLkEwzMv4=; h=DKIM-Signature:From:To:Subject:Date:Message-Id:MIME-Version; b=lzffEsSzrWKbo6F631AUwBud2UNyUcuG8zGfGcCIhBY7FCk7OZYQW9AWGkPc/I291qNdO1mlHAHo2we5yVApsjvanjnEPX2+L5noDGPyEL7mPRPZHPJLZ24H2xqvUVRgpnDVmISFOtLROoMVPOT7KrN4XoLcqPEM73HQ3ks+5mc= ARC-Authentication-Results: i=1; server2.sourceware.org Received: from localhost.intra.ispras.ru (unknown [10.10.3.121]) by mail.ispras.ru (Postfix) with ESMTP id C31D14078508; Tue, 6 Aug 2024 16:18:59 +0000 (UTC) DKIM-Filter: OpenDKIM Filter v2.11.0 mail.ispras.ru C31D14078508 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=ispras.ru; s=default; t=1722961139; bh=40PXGGULWJSv2s03x28V0jpn8ugdh9qNKQy8n4JJ8io=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=Z3+ZgMzTpbG2q6uuLgAUFK3NjAZztDLpyKa0+zsS0AH3GozmACPKEPSPFVfa3U0v+ b4f5bKBrbaJlTVMjQ/LmKWXziRmyR/5z+cx5snpc6bS42DxdS5lYqBUqq4WLMsqGzA AejRAjz0YWZ0dFSP87dlUNNgzZ0JQMFe+wUZsYe8= From: Alexander Monakov To: gcc-patches@gcc.gnu.org Cc: Andi Kleen , Alexander Monakov Subject: [PATCH 3/3] libcpp: add AVX2 helper Date: Tue, 6 Aug 2024 19:18:50 +0300 Message-Id: <20240806161850.18839-3-amonakov@ispras.ru> X-Mailer: git-send-email 2.32.0 In-Reply-To: <20240806161850.18839-1-amonakov@ispras.ru> References: <20240806161850.18839-1-amonakov@ispras.ru> MIME-Version: 1.0 X-Spam-Status: No, score=-9.2 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, GIT_PATCH_0, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: gcc-patches@gcc.gnu.org X-Mailman-Version: 2.1.30 Precedence: list List-Id: Gcc-patches mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: gcc-patches-bounces~patchwork=sourceware.org@gcc.gnu.org Use the same PSHUFB-based matching as in the SSSE3 helper, just 2x wider. Directly use the new helper if __AVX2__ is defined. It makes the other helpers unused, so mark them inline to prevent warnings. Rewrite and simplify init_vectorized_lexer. libcpp/ChangeLog: * files.cc (read_file_guts): Bump padding to 32 if HAVE_AVX2. * lex.cc (search_line_acc_char): Mark inline, not "unused". (search_line_sse2): Mark inline. (search_line_ssse3): Ditto. (search_line_avx2): New function. (init_vectorized_lexer): Reimplement. --- libcpp/files.cc | 15 +++---- libcpp/lex.cc | 111 ++++++++++++++++++++++++++++++++++++------------ 2 files changed, 92 insertions(+), 34 deletions(-) diff --git a/libcpp/files.cc b/libcpp/files.cc index 78f56e30bd..3df070d035 100644 --- a/libcpp/files.cc +++ b/libcpp/files.cc @@ -693,7 +693,7 @@ static bool read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, const char *input_charset) { - ssize_t size, total, count; + ssize_t size, pad, total, count; uchar *buf; bool regular; @@ -732,11 +732,10 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, the majority of C source files. */ size = 8 * 1024; - /* The + 16 here is space for the final '\n' and 15 bytes of padding, - used to quiet warnings from valgrind or Address Sanitizer, when the - optimized lexer accesses aligned 16-byte memory chunks, including - the bytes after the malloced, area, and stops lexing on '\n'. */ - buf = XNEWVEC (uchar, size + 16); + pad = HAVE_AVX2 ? 32 : 16; + /* The '+ PAD' here is space for the final '\n' and PAD-1 bytes of padding, + allowing search_line_fast to use (possibly misaligned) vector loads. */ + buf = XNEWVEC (uchar, size + pad); total = 0; while ((count = read (file->fd, buf + total, size - total)) > 0) { @@ -747,7 +746,7 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, if (regular) break; size *= 2; - buf = XRESIZEVEC (uchar, buf, size + 16); + buf = XRESIZEVEC (uchar, buf, size + pad); } } @@ -765,7 +764,7 @@ read_file_guts (cpp_reader *pfile, _cpp_file *file, location_t loc, file->buffer = _cpp_convert_input (pfile, input_charset, - buf, size + 16, total, + buf, size + pad, total, &file->buffer_start, &file->st.st_size); file->buffer_valid = file->buffer; diff --git a/libcpp/lex.cc b/libcpp/lex.cc index 815b8abd29..c336281658 100644 --- a/libcpp/lex.cc +++ b/libcpp/lex.cc @@ -225,10 +225,7 @@ acc_char_index (word_type cmp ATTRIBUTE_UNUSED, and branches without increasing the number of arithmetic operations. It's almost certainly going to be a win with 64-bit word size. */ -static const uchar * search_line_acc_char (const uchar *, const uchar *) - ATTRIBUTE_UNUSED; - -static const uchar * +static inline const uchar * search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) { const word_type repl_nl = acc_char_replicate ('\n'); @@ -293,7 +290,7 @@ static const char repl_chars[4][16] __attribute__((aligned(16))) = { /* A version of the fast scanner using SSE2 vectorized byte compare insns. */ -static const uchar * +static inline const uchar * #ifndef __SSE2__ __attribute__((__target__("sse2"))) #endif @@ -345,9 +342,9 @@ search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) } #ifdef HAVE_AVX2 -/* A version of the fast scanner using SSSE3 shuffle (PSHUFB) insns. */ +/* Variants of the fast scanner using SSSE3 shuffle (PSHUFB) insns. */ -static const uchar * +static inline const uchar * #ifndef __SSSE3__ __attribute__((__target__("ssse3"))) #endif @@ -394,44 +391,106 @@ done: return s + __builtin_ctz (found); } +static inline const uchar * +#ifndef __AVX2__ +__attribute__((__target__("avx2"))) +#endif +search_line_avx2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) +{ + typedef char v32qi __attribute__ ((__vector_size__ (32))); + typedef v32qi v32qi_u __attribute__ ((__aligned__ (1))); + v32qi lut = { + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?', + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?' + }; + + int found; + /* Process three 32-byte chunks per iteration. */ + for (; ; s += 96) + { + v32qi data, t; + data = *(const v32qi_u *)s; + __asm__ ("" : "+x" (data)); + t = __builtin_ia32_pshufb256 (lut, data); + if ((found = __builtin_ia32_pmovmskb256 (t == data))) + goto done; + /* Second chunk. */ + data = *(const v32qi_u *)(s + 32); + __asm__ ("" : "+x" (data)); + t = __builtin_ia32_pshufb256 (lut, data); + if ((found = __builtin_ia32_pmovmskb256 (t == data))) + goto add_32; + /* Third chunk. */ + data = *(const v32qi_u *)(s + 64); + __asm__ ("" : "+x" (data)); + t = __builtin_ia32_pshufb256 (lut, data); + if ((found = __builtin_ia32_pmovmskb256 (t == data))) + goto add_64; + } +add_64: + s += 32; +add_32: + s += 32; +done: + return s + __builtin_ctz (found); +} + #else -/* Work around out-dated assemblers without SSSE3 support. */ +/* Work around out-dated assemblers without AVX2 support. */ #define search_line_ssse3 search_line_sse2 +#define search_line_avx2 search_line_sse2 #endif +#ifdef __AVX2__ +/* No need for CPU probing, just use the best available variant. */ +#define search_line_fast search_line_avx2 +#else /* Check the CPU capabilities. */ #include "../gcc/config/i386/cpuid.h" typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *); -static search_line_fast_type search_line_fast; +static search_line_fast_type search_line_fast +#if defined(__SSE2__) + = search_line_sse2; +#else + = search_line_acc_char; +#endif #define HAVE_init_vectorized_lexer 1 static inline void init_vectorized_lexer (void) { - unsigned dummy, ecx = 0, edx = 0; - search_line_fast_type impl = search_line_acc_char; - int minimum = 0; - -#if defined(__SSSE3__) - minimum = 3; -#elif defined(__SSE2__) - minimum = 2; -#endif + unsigned a1, b1, c1, d1; + + if (!__get_cpuid (1, &a1, &b1, &c1, &d1)) + return; - if (minimum == 3) - impl = search_line_ssse3; - else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2) + if (c1 & bit_OSXSAVE) { - if (minimum == 3 || (ecx & bit_SSSE3)) - impl = search_line_ssse3; - else if (minimum == 2 || (edx & bit_SSE2)) - impl = search_line_sse2; + /* Check leaf 7 subleaf 0 for AVX2 ISA support. */ + unsigned a7, b7, c7, d7; + if (__get_cpuid_count (7, 0, &a7, &b7, &c7, &d7) + && (b7 & bit_AVX2)) + { + /* Check XCR0 for YMM state support in the OS. */ + unsigned xcr0h, xcr0l; + __asm__ volatile (".byte 0x0f, 0x01, 0xd0" // xgetbv + : "=d" (xcr0h), "=a" (xcr0l) : "c" (0)); + if ((xcr0l & 6) == 6) + { + search_line_fast = search_line_avx2; + return; + } + } } - search_line_fast = impl; + if (c1 & bit_SSSE3) + search_line_fast = search_line_ssse3; + else if (d1 & bit_SSE2) + search_line_fast = search_line_sse2; } +#endif #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)