From patchwork Fri Feb 22 19:27:01 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Adhemerval Zanella Netto X-Patchwork-Id: 31559 Received: (qmail 104382 invoked by alias); 22 Feb 2019 19:27:15 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 104254 invoked by uid 89); 22 Feb 2019 19:27:15 -0000 Authentication-Results: sourceware.org; auth=none X-Spam-SWARE-Status: No, score=-26.9 required=5.0 tests=BAYES_00, GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, KAM_SHORT, RCVD_IN_DNSWL_NONE, SPF_PASS autolearn=ham version=3.3.2 spammy=0x48, Small, 0x58, 0x28 X-HELO: mail-qk1-f173.google.com DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linaro.org; s=google; h=from:to:subject:date:message-id:in-reply-to:references; bh=S7BlCjhdv4DtbR4F1QTfiCNpajUpygrWUXDByD8zu/4=; b=F2CPD0qs5ejHEUKcuWQhL0xZGV/vdseOP01/fXiSJvxY7vLSNcW/AfqowewvsDDPmQ XjBhPTccU4k+vr6HcWfADYYf682uEVd3ZoKz8golKzSfRsN81AHf9zw8F5ad+uWUL3mj Xszh0ajgxNrdfVUw6PlPi4uObWVF6lr2slVOvXJGwCKG3mp1N9oXUXPuwpmQAVOvEycR TsA6cG00fVJq/+uSqYPW+asFcaZAGhGicTEvC7sEjAX6+iAq6qK2upOD/N4M8os15jrM 1Xya0ELseDmke6mYSmejy3RQMKTWoFMkXWx78bO1h47cV9/iG5fNTiwmxYt5Ps3XR+F/ lwGA== Return-Path: From: Adhemerval Zanella To: libc-alpha@sourceware.org Subject: [PATCH 2/4] Small optimization for lowlevellock Date: Fri, 22 Feb 2019 16:27:01 -0300 Message-Id: <20190222192703.18177-2-adhemerval.zanella@linaro.org> In-Reply-To: <20190222192703.18177-1-adhemerval.zanella@linaro.org> References: <20190222192703.18177-1-adhemerval.zanella@linaro.org> This patch optimizes both __lll_lock_wait_private and __lll_lock_wait by issuing only one lll_futex_wait. Since it is defined as an inlined syscall and inlined syscalls are defined using inlined assembly the compiler usually can not see both calls are equal and optimize accordingly. On aarch64 the resulting binary is change from: 0000000000000060 <__lll_lock_wait>: 60: 2a0103e5 mov w5, w1 64: b9400001 ldr w1, [x0] 68: aa0003e4 mov x4, x0 6c: 7100083f cmp w1, #0x2 70: 540000e1 b.ne 8c <__lll_lock_wait+0x2c> // b.any 74: 521900a1 eor w1, w5, #0x80 78: d2800042 mov x2, #0x2 // #2 7c: 93407c21 sxtw x1, w1 80: d2800003 mov x3, #0x0 // #0 84: d2800c48 mov x8, #0x62 // #98 88: d4000001 svc #0x0 8c: 521900a5 eor w5, w5, #0x80 90: 52800046 mov w6, #0x2 // #2 94: 93407ca5 sxtw x5, w5 98: 14000008 b b8 <__lll_lock_wait+0x58> 9c: d503201f nop a0: aa0403e0 mov x0, x4 a4: aa0503e1 mov x1, x5 a8: d2800042 mov x2, #0x2 // #2 ac: d2800003 mov x3, #0x0 // #0 b0: d2800c48 mov x8, #0x62 // #98 b4: d4000001 svc #0x0 b8: 885ffc80 ldaxr w0, [x4] bc: 88017c86 stxr w1, w6, [x4] c0: 35ffffc1 cbnz w1, b8 <__lll_lock_wait+0x58> c4: 35fffee0 cbnz w0, a0 <__lll_lock_wait+0x40> c8: d65f03c0 ret To: 0000000000000048 <__lll_lock_wait>: 48: aa0003e4 mov x4, x0 4c: 2a0103e5 mov w5, w1 50: b9400000 ldr w0, [x0] 54: 7100081f cmp w0, #0x2 58: 540000c0 b.eq 70 <__lll_lock_wait+0x28> // b.none 5c: 52800041 mov w1, #0x2 // #2 60: 885ffc80 ldaxr w0, [x4] 64: 88027c81 stxr w2, w1, [x4] 68: 35ffffc2 cbnz w2, 60 <__lll_lock_wait+0x18> 6c: 34000120 cbz w0, 90 <__lll_lock_wait+0x48> 70: 521900a1 eor w1, w5, #0x80 74: aa0403e0 mov x0, x4 78: 93407c21 sxtw x1, w1 7c: d2800042 mov x2, #0x2 // #2 80: d2800003 mov x3, #0x0 // #0 84: d2800c48 mov x8, #0x62 // #98 88: d4000001 svc #0x0 8c: 17fffff4 b 5c <__lll_lock_wait+0x14> 90: d65f03c0 ret I see similar changes on powerpc and other architectures. It also aligns with x86_64 implementation by adding the systemtap probes. Checker on aarch64-linux-gnu. * nptl/lowlevellock.c (__lll_lock_wait, __lll_lock_wait_private): Optimize futex call and add systemtap probe. Reviewed-by: Carlos O'Donell --- nptl/lowlevellock.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/nptl/lowlevellock.c b/nptl/lowlevellock.c index 5eaa3807ea..47548ff121 100644 --- a/nptl/lowlevellock.c +++ b/nptl/lowlevellock.c @@ -17,20 +17,23 @@ License along with the GNU C Library; if not, see . */ -#include #include #include -#include #include +#include void __lll_lock_wait_private (int *futex) { - if (*futex == 2) - lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */ - - while (atomic_exchange_acq (futex, 2) != 0) - lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */ + if (atomic_load_relaxed (futex) == 2) + goto futex; + + while (atomic_exchange_acquire (futex, 2) != 0) + { + futex: + LIBC_PROBE (lll_lock_wait_private, 1, futex); + lll_futex_wait (futex, 2, LLL_PRIVATE); /* Wait if *futex == 2. */ + } } @@ -39,10 +42,14 @@ __lll_lock_wait_private (int *futex) void __lll_lock_wait (int *futex, int private) { - if (*futex == 2) - lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */ - - while (atomic_exchange_acq (futex, 2) != 0) - lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */ + if (atomic_load_relaxed (futex) == 2) + goto futex; + + while (atomic_exchange_acquire (futex, 2) != 0) + { + futex: + LIBC_PROBE (lll_lock_wait, 1, futex); + lll_futex_wait (futex, 2, private); /* Wait if *futex == 2. */ + } } #endif