From patchwork Tue Aug 8 16:03:09 2017
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Patchwork-Submitter: Wilco Dijkstra
X-Patchwork-Id: 21981
From: Wilco Dijkstra
To: "libc-alpha@sourceware.org"
CC: nd, Szabolcs Nagy
Subject: Re: [PATCH][AArch64] Optimized memcmp
Date: Tue, 8 Aug 2017 16:03:09 +0000
ping

From: Wilco Dijkstra
Sent: 07 July 2017 16:11
To: libc-alpha@sourceware.org
Cc: nd; Szabolcs Nagy
Subject: [PATCH][AArch64] Optimized memcmp

This is an optimized memcmp for AArch64.  It is a complete rewrite using a
different algorithm.  The previous version split into separate cases for
inputs that were both aligned, mutually aligned, or unaligned (the last
handled with a byte loop).  The new version combines all these cases, while
small inputs of less than 8 bytes are handled separately.  This allows the
main code to be sped up using unaligned loads since there are now at least
8 bytes to be compared.  After the first 8 bytes, align the first input.
This ensures each iteration does at most one unaligned access and that
mutually aligned inputs behave as aligned.  After the main loop, process
the last 8 bytes using unaligned accesses.

This improves performance of (mutually) aligned cases by 25% and unaligned
cases by >500% (yes, >6 times faster) on large inputs.

ChangeLog:
2017-07-07  Wilco Dijkstra

	* sysdeps/aarch64/memcmp.S (memcmp):
	Rewrite of optimized memcmp.
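
For illustration only, here is a rough C model of the algorithm described
above.  It is not part of the patch; memcmp_sketch, load64 and diff64 are
made-up names, and it assumes a little-endian target where unaligned 64-bit
loads are permitted, i.e. the #ifndef __AARCH64EB__ path of the assembly:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Unaligned 8-byte load; memcpy compiles to a single load on AArch64.  */
static uint64_t load64 (const void *p)
{
  uint64_t v;
  memcpy (&v, p, sizeof v);
  return v;
}

/* Given two unequal 8-byte blocks, return -1 or 1 according to the first
   differing byte.  Byte-reversing makes the lowest-addressed byte the most
   significant, so an unsigned compare gives the sign (little-endian only;
   __builtin_bswap64 is a GCC/Clang builtin).  */
static int diff64 (uint64_t a, uint64_t b)
{
  return __builtin_bswap64 (a) < __builtin_bswap64 (b) ? -1 : 1;
}

int memcmp_sketch (const void *s1, const void *s2, size_t n)
{
  const unsigned char *p1 = s1, *p2 = s2;

  /* Small inputs of less than 8 bytes: plain byte loop.  */
  if (n < 8)
    {
      for (size_t i = 0; i < n; i++)
        if (p1[i] != p2[i])
          return p1[i] - p2[i];
      return 0;
    }

  /* Check the first 8 bytes using unaligned loads.  */
  uint64_t a = load64 (p1), b = load64 (p2);
  if (a != b)
    return diff64 (a, b);

  /* Skip the bytes already compared, but only up to the next 8-byte
     boundary of p1, so p1 is aligned from here on.  Mutually aligned
     inputs then behave exactly like aligned ones.  */
  size_t step = 8 - ((uintptr_t) p1 & 7);   /* 1..8 bytes done.  */
  p1 += step;
  p2 += step;
  n -= step;

  /* Main loop: one aligned load from p1 and at most one unaligned load
     from p2 per 8 bytes.  Stop while more than 8 bytes remain.  */
  while (n > 8)
    {
      a = load64 (p1);
      b = load64 (p2);
      if (a != b)
        return diff64 (a, b);
      p1 += 8;
      p2 += 8;
      n -= 8;
    }

  /* Last 0-8 bytes: reload the final 8 bytes of each input with unaligned
     loads.  Any bytes overlapping the previous iteration are already known
     to be equal, so no masking is needed.  */
  a = load64 (p1 + n - 8);
  b = load64 (p2 + n - 8);
  return a == b ? 0 : diff64 (a, b);
}

The assembly follows the same flow, but keeps limit as "remaining - 8" so
the tail can be loaded directly via [src1, limit] and [src2, limit].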

GLIBC benchtests/bench-memcmp.c performance comparison for Cortex-A53:

Length    1, alignment  1/ 1:           153%
Length    1, alignment  1/ 1:           119%
Length    1, alignment  1/ 1:           154%
Length    2, alignment  2/ 2:           121%
Length    2, alignment  2/ 2:           140%
Length    2, alignment  2/ 2:           121%
Length    3, alignment  3/ 3:           105%
Length    3, alignment  3/ 3:           105%
Length    3, alignment  3/ 3:           105%
Length    4, alignment  4/ 4:           155%
Length    4, alignment  4/ 4:           154%
Length    4, alignment  4/ 4:           161%
Length    5, alignment  5/ 5:           173%
Length    5, alignment  5/ 5:           173%
Length    5, alignment  5/ 5:           173%
Length    6, alignment  6/ 6:           145%
Length    6, alignment  6/ 6:           145%
Length    6, alignment  6/ 6:           145%
Length    7, alignment  7/ 7:           125%
Length    7, alignment  7/ 7:           125%
Length    7, alignment  7/ 7:           125%
Length    8, alignment  8/ 8:           111%
Length    8, alignment  8/ 8:           130%
Length    8, alignment  8/ 8:           124%
Length    9, alignment  9/ 9:           160%
Length    9, alignment  9/ 9:           160%
Length    9, alignment  9/ 9:           150%
Length   10, alignment 10/10:           170%
Length   10, alignment 10/10:           137%
Length   10, alignment 10/10:           150%
Length   11, alignment 11/11:           160%
Length   11, alignment 11/11:           160%
Length   11, alignment 11/11:           160%
Length   12, alignment 12/12:           146%
Length   12, alignment 12/12:           168%
Length   12, alignment 12/12:           156%
Length   13, alignment 13/13:           167%
Length   13, alignment 13/13:           167%
Length   13, alignment 13/13:           173%
Length   14, alignment 14/14:           167%
Length   14, alignment 14/14:           168%
Length   14, alignment 14/14:           168%
Length   15, alignment 15/15:           168%
Length   15, alignment 15/15:           173%
Length   15, alignment 15/15:           173%
Length    1, alignment  0/ 0:           134%
Length    1, alignment  0/ 0:           127%
Length    1, alignment  0/ 0:           119%
Length    2, alignment  0/ 0:           94%
Length    2, alignment  0/ 0:           94%
Length    2, alignment  0/ 0:           106%
Length    3, alignment  0/ 0:           82%
Length    3, alignment  0/ 0:           87%
Length    3, alignment  0/ 0:           82%
Length    4, alignment  0/ 0:           115%
Length    4, alignment  0/ 0:           115%
Length    4, alignment  0/ 0:           122%
Length    5, alignment  0/ 0:           127%
Length    5, alignment  0/ 0:           119%
Length    5, alignment  0/ 0:           127%
Length    6, alignment  0/ 0:           103%
Length    6, alignment  0/ 0:           100%
Length    6, alignment  0/ 0:           100%
Length    7, alignment  0/ 0:           82%
Length    7, alignment  0/ 0:           91%
Length    7, alignment  0/ 0:           87%
Length    8, alignment  0/ 0:           111%
Length    8, alignment  0/ 0:           124%
Length    8, alignment  0/ 0:           124%
Length    9, alignment  0/ 0:           136%
Length    9, alignment  0/ 0:           136%
Length    9, alignment  0/ 0:           136%
Length   10, alignment  0/ 0:           136%
Length   10, alignment  0/ 0:           135%
Length   10, alignment  0/ 0:           136%
Length   11, alignment  0/ 0:           136%
Length   11, alignment  0/ 0:           136%
Length   11, alignment  0/ 0:           135%
Length   12, alignment  0/ 0:           136%
Length   12, alignment  0/ 0:           136%
Length   12, alignment  0/ 0:           136%
Length   13, alignment  0/ 0:           135%
Length   13, alignment  0/ 0:           136%
Length   13, alignment  0/ 0:           136%
Length   14, alignment  0/ 0:           136%
Length   14, alignment  0/ 0:           136%
Length   14, alignment  0/ 0:           136%
Length   15, alignment  0/ 0:           136%
Length   15, alignment  0/ 0:           136%
Length   15, alignment  0/ 0:           136%
Length    4, alignment  0/ 0:           115%
Length    4, alignment  0/ 0:           115%
Length    4, alignment  0/ 0:           115%
Length   32, alignment  0/ 0:           127%
Length   32, alignment  7/ 2:           395%
Length   32, alignment  0/ 0:           127%
Length   32, alignment  0/ 0:           127%
Length    8, alignment  0/ 0:           111%
Length    8, alignment  0/ 0:           124%
Length    8, alignment  0/ 0:           124%
Length   64, alignment  0/ 0:           128%
Length   64, alignment  6/ 4:           475%
Length   64, alignment  0/ 0:           131%
Length   64, alignment  0/ 0:           134%
Length   16, alignment  0/ 0:           128%
Length   16, alignment  0/ 0:           119%
Length   16, alignment  0/ 0:           128%
Length  128, alignment  0/ 0:           129%
Length  128, alignment  5/ 6:           475%
Length  128, alignment  0/ 0:           130%
Length  128, alignment  0/ 0:           129%
Length   32, alignment  0/ 0:           126%
Length   32, alignment  0/ 0:           126%
Length   32, alignment  0/ 0:           126%
Length  256, alignment  0/ 0:           127%
Length  256, alignment  4/ 8:           545%
Length  256, alignment  0/ 0:           126%
Length  256, alignment  0/ 0:           128%
Length   64, alignment  0/ 0:           171%
Length   64, alignment  0/ 0:           171%
Length   64, alignment  0/ 0:           174%
Length  512, alignment  0/ 0:           126%
Length  512, alignment  3/10:           585%
Length  512, alignment  0/ 0:           126%
Length  512, alignment  0/ 0:           127%
Length  128, alignment  0/ 0:           129%
Length  128, alignment  0/ 0:           128%
Length  128, alignment  0/ 0:           129%
Length 1024, alignment  0/ 0:           125%
Length 1024, alignment  2/12:           611%
Length 1024, alignment  0/ 0:           126%
Length 1024, alignment  0/ 0:           126%
Length  256, alignment  0/ 0:           128%
Length  256, alignment  0/ 0:           127%
Length  256, alignment  0/ 0:           128%
Length 2048, alignment  0/ 0:           125%
Length 2048, alignment  1/14:           625%
Length 2048, alignment  0/ 0:           125%
Length 2048, alignment  0/ 0:           125%
Length  512, alignment  0/ 0:           126%
Length  512, alignment  0/ 0:           127%
Length  512, alignment  0/ 0:           127%
Length 4096, alignment  0/ 0:           125%
Length 4096, alignment  0/16:           125%
Length 4096, alignment  0/ 0:           125%
Length 4096, alignment  0/ 0:           125%
Length 1024, alignment  0/ 0:           126%
Length 1024, alignment  0/ 0:           126%
Length 1024, alignment  0/ 0:           126%
Length 8192, alignment  0/ 0:           125%
Length 8192, alignment 63/18:           636%
Length 8192, alignment  0/ 0:           125%
Length 8192, alignment  0/ 0:           125%
Length   16, alignment  1/ 2:           317%
Length   16, alignment  1/ 2:           317%
Length   16, alignment  1/ 2:           317%
Length   32, alignment  2/ 4:           395%
Length   32, alignment  2/ 4:           395%
Length   32, alignment  2/ 4:           398%
Length   64, alignment  3/ 6:           475%
Length   64, alignment  3/ 6:           475%
Length   64, alignment  3/ 6:           477%
Length  128, alignment  4/ 8:           479%
Length  128, alignment  4/ 8:           479%
Length  128, alignment  4/ 8:           479%
Length  256, alignment  5/10:           543%
Length  256, alignment  5/10:           539%
Length  256, alignment  5/10:           543%
Length  512, alignment  6/12:           585%
Length  512, alignment  6/12:           585%
Length  512, alignment  6/12:           585%
Length 1024, alignment  7/14:           611%
Length 1024, alignment  7/14:           611%
Length 1024, alignment  7/14:           611%

diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index 4cfcb89297551a07fcfa055827564b833e86f8fb..b99c081bba2c7f26b5b53315d6b806ae22eaaafc 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -22,132 +22,98 @@
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, unaligned accesses.
  */
 
 /* Parameters and result.  */
 #define src1		x0
 #define src2		x1
 #define limit		x2
-#define result		x0
+#define result		w0
 
 /* Internal variables.  */
 #define data1		x3
 #define data1w		w3
 #define data2		x4
 #define data2w		w4
-#define has_nul		x5
-#define diff		x6
-#define endloop		x7
-#define tmp1		x8
-#define tmp2		x9
-#define tmp3		x10
-#define pos		x11
-#define limit_wd	x12
-#define mask		x13
+#define tmp1		x5
 
 ENTRY_ALIGN (memcmp, 6)
 	DELOUSE (0)
 	DELOUSE (1)
 	DELOUSE (2)
-	cbz	limit, L(ret0)
-	eor	tmp1, src1, src2
-	tst	tmp1, #7
-	b.ne	L(misaligned8)
-	ands	tmp1, src1, #7
-	b.ne	L(mutual_align)
-	add	limit_wd, limit, #7
-	lsr	limit_wd, limit_wd, #3
-	/* Start of performance-critical section  -- one 64B cache line.  */
-L(loop_aligned):
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-L(start_realigned):
-	subs	limit_wd, limit_wd, #1
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	csinv	endloop, diff, xzr, ne	/* Last Dword or differences.  */
-	cbz	endloop, L(loop_aligned)
-	/* End of performance-critical section  -- one 64B cache line.  */
-
-	/* Not reached the limit, must have found a diff.  */
-	cbnz	limit_wd, L(not_limit)
-
-	/* Limit % 8 == 0 => all bytes significant.  */
-	ands	limit, limit, #7
-	b.eq	L(not_limit)
-
-	lsl	limit, limit, #3	/* Bits -> bytes.  */
-	mov	mask, #~0
-#ifdef __AARCH64EB__
-	lsr	mask, mask, limit
-#else
-	lsl	mask, mask, limit
-#endif
-	bic	data1, data1, mask
-	bic	data2, data2, mask
-
-	orr	diff, diff, mask
-L(not_limit):
-#ifndef	__AARCH64EB__
-	rev	diff, diff
+	subs	limit, limit, 8
+	b.lo	.Lless8
+
+	/* Limit >= 8, so check first 8 bytes using unaligned loads.  */
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	and	tmp1, src1, 7
+	add	limit, limit, tmp1
+	cmp	data1, data2
+	bne	.Lreturn
+
+	/* Align src1 and adjust src2 with bytes not yet done.  */
+	sub	src1, src1, tmp1
+	sub	src2, src2, tmp1
+
+	subs	limit, limit, 8
+	b.ls	.Llast_bytes
+
+	/* Loop performing 8 bytes per iteration using aligned src1.
+	   Limit is pre-decremented by 8 and must be larger than zero.
+	   Exit if <= 8 bytes left to do or if the data is not equal.  */
+	.p2align 4
+.Lloop8:
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	subs	limit, limit, 8
+	ccmp	data1, data2, 0, hi  /* NZCV = 0b0000.  */
+	b.eq	.Lloop8
+
+	cmp	data1, data2
+	bne	.Lreturn
+
+	/* Compare last 1-8 bytes using unaligned access.  */
+.Llast_bytes:
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+
+	/* Compare data bytes and set return value to 0, -1 or 1.  */
+.Lreturn:
+#ifndef __AARCH64EB__
 	rev	data1, data1
 	rev	data2, data2
 #endif
-	/* The MS-non-zero bit of DIFF marks either the first bit
-	   that is different, or the end of the significant data.
-	   Shifting left now will bring the critical information into the
-	   top bits.  */
-	clz	pos, diff
-	lsl	data1, data1, pos
-	lsl	data2, data2, pos
-	/* But we need to zero-extend (char is unsigned) the value and then
-	   perform a signed 32-bit subtraction.  */
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
-	RET
-
-L(mutual_align):
-	/* Sources are mutually aligned, but are not currently at an
-	   alignment boundary.  Round down the addresses and then mask off
-	   the bytes that precede the start point.  */
-	bic	src1, src1, #7
-	bic	src2, src2, #7
-	add	limit, limit, tmp1	/* Adjust the limit for the extra.  */
-	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
-	ldr	data1, [src1], #8
-	neg	tmp1, tmp1		/* Bits to alignment -64.  */
-	ldr	data2, [src2], #8
-	mov	tmp2, #~0
-#ifdef __AARCH64EB__
-	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#else
-	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#endif
-	add	limit_wd, limit, #7
-	orr	data1, data1, tmp2
-	orr	data2, data2, tmp2
-	lsr	limit_wd, limit_wd, #3
-	b	L(start_realigned)
-
-L(ret0):
-	mov	result, #0
-	RET
-
-	.p2align 6
-L(misaligned8):
-	sub	limit, limit, #1
-1:
-	/* Perhaps we can do better than this.  */
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	subs	limit, limit, #1
-	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.eq	1b
-	sub	result, data1, data2
-	RET
+	cmp	data1, data2
+.Lret_eq:
+	cset	result, ne
+	cneg	result, result, lo
+	ret
+
+	.p2align 4
+	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
+.Lless8:
+	adds	limit, limit, 4
+	b.lo	.Lless4
+	ldr	data1w, [src1], 4
+	ldr	data2w, [src2], 4
+	cmp	data1w, data2w
+	b.ne	.Lreturn
+	sub	limit, limit, 4
+.Lless4:
+	adds	limit, limit, 4
+	beq	.Lret_eq
+.Lbyte_loop:
+	ldrb	data1w, [src1], 1
+	ldrb	data2w, [src2], 1
+	subs	limit, limit, 1
+	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
+	b.eq	.Lbyte_loop
+	sub	result, data1w, data2w
+	ret
+
 END (memcmp)
 #undef bcmp
 weak_alias (memcmp, bcmp)