[0/7] AArch64 Optimize truncation, shifts and bitmask comparisons

Message ID patch-14899-tamar@arm.com

Tamar Christina Sept. 29, 2021, 4:19 p.m. UTC
  Hi All,

This patch series optimizes AArch64 codegen for narrowing operations,
shift-and-narrow combinations, and some comparisons with bitmasks.
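As a rough illustration (these loops are not the actual testcases from the
series), the kinds of scalar patterns being targeted look something like:

#include <stdint.h>

/* Illustrative only: a shift-and-narrow that can use a narrowing shift
   such as shrn, and a top-bit test that can be done without a separate
   and + compare.  */
void
shift_narrow (uint8_t *restrict out, uint16_t *restrict in, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = in[i] >> 8;
}

void
top_bit_set (uint8_t *restrict out, uint8_t *restrict in, int n)
{
  for (int i = 0; i < n; i++)
    out[i] = (in[i] & 0x80) != 0;
}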

There are more to come but this is the first batch.

This series shows a 2% performance gain and a 0.05% code size reduction on
x264 in SPECCPU2017, and a 5-10% performance gain on various real-world
libraries that are optimized with intrinsics.

One part that is missing and needs additional work is being able to combine
stores into sequential locations.  Consider:

#include <arm_neon.h>
#define SIZE 8
#define SIZE2 8 * 8 * 8

extern void pop (uint8_t*);

void foo (int16x8_t row0, int16x8_t row1, int16x8_t row2, int16x8_t row3,
          int16x8_t row4, int16x8_t row5, int16x8_t row6, int16x8_t row7) {
    uint8_t block_nbits[SIZE2];

    uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
                                   vmovn_u16(vreinterpretq_u16_s16(row0)));
    uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
                                   vmovn_u16(vreinterpretq_u16_s16(row1)));
    uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
                                   vmovn_u16(vreinterpretq_u16_s16(row2)));
    uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
                                   vmovn_u16(vreinterpretq_u16_s16(row3)));
    uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
                                   vmovn_u16(vreinterpretq_u16_s16(row4)));
    uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
                                   vmovn_u16(vreinterpretq_u16_s16(row5)));
    uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
                                   vmovn_u16(vreinterpretq_u16_s16(row6)));
    uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
                                   vmovn_u16(vreinterpretq_u16_s16(row7)));

    vst1_u8(block_nbits + 0 * SIZE, row0_nbits);
    vst1_u8(block_nbits + 1 * SIZE, row1_nbits);
    vst1_u8(block_nbits + 2 * SIZE, row2_nbits);
    vst1_u8(block_nbits + 3 * SIZE, row3_nbits);
    vst1_u8(block_nbits + 4 * SIZE, row4_nbits);
    vst1_u8(block_nbits + 5 * SIZE, row5_nbits);
    vst1_u8(block_nbits + 6 * SIZE, row6_nbits);
    vst1_u8(block_nbits + 7 * SIZE, row7_nbits);

    pop (block_nbits);
}

This currently generates:

movi v1.8b, #0x10

xtn v17.8b, v17.8h
xtn v23.8b, v23.8h
xtn v22.8b, v22.8h
xtn v4.8b, v21.8h
xtn v20.8b, v20.8h
xtn v19.8b, v19.8h
xtn v18.8b, v18.8h
xtn v24.8b, v24.8h

sub v17.8b, v1.8b, v17.8b
sub v23.8b, v1.8b, v23.8b
sub v22.8b, v1.8b, v22.8b
sub v16.8b, v1.8b, v4.8b
sub v8.8b, v1.8b, v20.8b
sub v4.8b, v1.8b, v19.8b
sub v2.8b, v1.8b, v18.8b
sub v1.8b, v1.8b, v24.8b

stp d17, d23, [sp, #224]
stp d22, d16, [sp, #240]
stp d8, d4, [sp, #256]
stp d2, d1, [sp, #272]

whereas the optimized codegen for this is:

movi v1.16b, #0x10

uzp1 v17.16b, v17.16b, v23.16b
uzp1 v22.16b, v22.16b, v4.16b
uzp1 v20.16b, v20.16b, v19.16b
uzp1 v24.16b, v18.16b, v24.16b

sub v17.16b, v1.16b, v17.16b
sub v18.16b, v1.16b, v22.16b
sub v19.16b, v1.16b, v20.16b
sub v20.16b, v1.16b, v24.16b

stp q17, q18, [sp, #224]
stp q19, q20, [sp, #256]

which requires us to recognize stores to sequential locations (the multiple
8-byte stp d stores in the current example) and merge them into wider ones.
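At the source level, the merged form for the first two rows of foo above
would correspond to something like the following sketch (illustrative only,
assuming little endian; uzp1 on the byte views keeps the low byte of each
16-bit lane, i.e. it performs both xtn's at once):

    uint8x16_t row01_nbits = vsubq_u8(vdupq_n_u8(16),
                                      vuzp1q_u8(vreinterpretq_u8_s16(row0),
                                                vreinterpretq_u8_s16(row1)));
    vst1q_u8(block_nbits + 0 * SIZE, row01_nbits);

The remaining rows pair up the same way, giving four q-register values that
can then be stored with two stp q instructions.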

This pattern happens reasonably often, but I am not sure how to handle it
yet.  For one, it requires st1 and friends to not be unspecs, which is
currently the focus of

https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579582.html

Thanks,
Tamar

--- inline copy of patch -- 

--