[RFC] pthread_once: Use unified variant instead of custom x86_64/i386

Message ID 1413751577.8483.27.camel@triegel.csb
State Superseded
Headers

Commit Message

Torvald Riegel Oct. 19, 2014, 8:46 p.m. UTC
  On Mon, 2014-04-07 at 15:47 +0200, Torvald Riegel wrote:
> On Fri, 2013-10-11 at 23:28 +0300, Torvald Riegel wrote:
> > Assuming the pthread_once unification I sent recently is applied, we
> > still have custom x86_64 and i386 variants of pthread_once.  The
> > algorithm they use is the same as the unified variant, so we would be
> > able to remove the custom variants if this doesn't affect performance.
> > 
> > The common case when pthread_once is executed is that the initialization
> > has already been performed; thus, this is the fast path that we can
> > focus on.  (I haven't looked specifically at the generated code for the
> > slow path, but the algorithm is the same and I assume that the overhead
> > of the synchronizing instructions and futex syscalls determines the
> > performance of it, not any differences between compiler-generated code
> > and the custom code.)
> > 
> > The fast path of the custom assembler version:
> > 	testl	$2, (%rdi)
> > 	jz	1f
> > 	xorl	%eax, %eax
> > 	retq
> > 
> > The fast path of the generic pthread_once C code, as it is after the
> > pthread_once unification patch:
> >   20:   48 89 5c 24 e8          mov    %rbx,-0x18(%rsp)
> >   25:   48 89 6c 24 f0          mov    %rbp,-0x10(%rsp)
> >   2a:   48 89 fb                mov    %rdi,%rbx
> >   2d:   4c 89 64 24 f8          mov    %r12,-0x8(%rsp)
> >   32:   48 89 f5                mov    %rsi,%rbp
> >   35:   48 83 ec 38             sub    $0x38,%rsp
> >   39:   41 b8 ca 00 00 00       mov    $0xca,%r8d
> >   3f:   8b 13                   mov    (%rbx),%edx
> >   41:   f6 c2 02                test   $0x2,%dl
> >   44:   74 16                   je     5c <__pthread_once+0x3c>
> >   46:   31 c0                   xor    %eax,%eax
> >   48:   48 8b 5c 24 20          mov    0x20(%rsp),%rbx
> >   4d:   48 8b 6c 24 28          mov    0x28(%rsp),%rbp
> >   52:   4c 8b 64 24 30          mov    0x30(%rsp),%r12
> >   57:   48 83 c4 38             add    $0x38,%rsp
> >   5b:   c3                      retq   
> > 
> > The only difference is more stack save/restore.  However, a quick run of
> > benchtests/pthread_once (see the patch I sent for review) on my laptop
> > doesn't show any noticeable differences between both (averages of 8 runs
> > of the microbenchmark differ by 0.2%).
> > 
> > When splitting out the slow path like this:
> > 
> > static int
> > __attribute__((noinline))
> > __pthread_once_slow (once_control, init_routine)
> > /* ... */
> > 
> > int
> > __pthread_once (once_control, init_routine)
> >      pthread_once_t *once_control;
> >      void (*init_routine) (void);
> > {
> >   int val;
> >   val = *once_control;
> >   atomic_read_barrier();
> >   if (__builtin_expect ((val & __PTHREAD_ONCE_DONE) != 0, 1))
> >     return 0;
> >   else
> >     return __pthread_once_slow(once_control, init_routine);
> > }
> > 
> > we get this for the C variants fast path:
> > 
> > 00000000000000e0 <__pthread_once>:
> >   e0:   8b 07                   mov    (%rdi),%eax
> >   e2:   a8 02                   test   $0x2,%al
> >   e4:   74 03                   je     e9 <__pthread_once+0x9>
> >   e6:   31 c0                   xor    %eax,%eax
> >   e8:   c3                      retq   
> >   e9:   31 c0                   xor    %eax,%eax
> >   eb:   e9 30 ff ff ff          jmpq   20 <__pthread_once_slow>
> > 
> > This is very close to the fast path of the custom assembler code.
> > 
> > I haven't looked further at i386, but the custom code is pretty similar
> > to the x86_64 variant.
> > 
> > 
> > What do you all prefer?:
> > 1) Keep the x86-specific assembler versions?
> > 2) Remove the x86-specific assembler versions and split out the slow
> > path?
> > 3) Just remove the x86-specific assembler versions?
> > 
> 
> Here is an updated patch for 2).
> 
> Without the fast path, I get the following without and with assembly (on
> my x86_64 laptop):
> 
> "pthread_once": {
> "": {
> "duration": 2.42853e+10, "iterations": 2.16569e+09, "max": 2217.22,
> "min": 11.024, "mean": 11.2137
> }
> }
> "pthread_once": {
> "": {
> "duration": 2.40695e+10, "iterations": 2.65473e+09, "max": 2185.57,
> "min": 9.016, "mean": 9.06665
> }
> }
> 
> With the fast path as split out, I get:
> "pthread_once": {
> "": {
> "duration": 2.40632e+10, "iterations": 2.6526e+09, "max": 2336.96,
> "min": 9.016, "mean": 9.07154
> }
> }
> 
> Okay to commit the fast path (after the pthread_once unification has
> been committed) and remove the x86 assembler variants?

This slipped through and this was never committed.  I took Richard's
advice into account, and the x86_64 code generated for the generic
version is now:
00000000000000d0 <__pthread_once>:
  d0:   8b 07                   mov    (%rdi),%eax
  d2:   a8 02                   test   $0x2,%al
  d4:   74 03                   je     d9 <__pthread_once+0x9>
  d6:   31 c0                   xor    %eax,%eax
  d8:   c3                      retq   
  d9:   e9 42 ff ff ff          jmpq   20 <__pthread_once_slow>


Attached is an updated patch, which I'll commit after two days if nobody
objects.

2014-10-20  Torvald Riegel  <triegel@redhat.com>

	[BZ #15215]
	* nptl/pthread_once.c (__pthread_once): Split out fast path to ...
	(__pthread_once_slow): ... here.
	* sysdeps/unix/sysv/linux/i386/pthread_once.S: Remove file.
	* sysdeps/unix/sysv/linux/x86_64/pthread_once.S: Remove file.
  

Comments

Roland McGrath Oct. 20, 2014, 1:37 a.m. UTC | #1
> +static int
> +__attribute__((noinline))

Space before paren.

> +__pthread_once_slow (pthread_once_t *once_control, void (*init_routine) (void))

This needs a comment explaining why it's separate.  I would have expected
the compiler to generate essentially the same code for the early bailout
case marked with __glibc_likely.

> +  atomic_read_barrier();
[...]
> +    return __pthread_once_slow(once_control, init_routine);

Space before paren.
  
Rich Felker Oct. 20, 2014, 3:41 a.m. UTC | #2
On Sun, Oct 19, 2014 at 06:37:16PM -0700, Roland McGrath wrote:
> > +static int
> > +__attribute__((noinline))
> 
> Space before paren.
> 
> > +__pthread_once_slow (pthread_once_t *once_control, void (*init_routine) (void))
> 
> This needs a comment explaining why it's separate.  I would have expected
> the compiler to generate essentially the same code for the early bailout
> case marked with __glibc_likely.

Sadly gcc never does that, as far as I know. This optimization
shouldn't even be dependent on whether the code path is "likely" or
not; compilers should "shrink-wrap" code paths that don't need heavy
stack frame setup whenever they can do so without pessimizing the
other code paths, but they don't.

The method in this patch is the standard fix that I've used in various
places to "manually shrink-wrap" before and it seems appropriate here.
I just tried the same approach in musl for pthread_once (already using
it for mutexes) and it doubled performance in the fast path, so I
think it's a good idea.

Rich
  

Patch

commit bc0ee21b1af4d8def1f3b6860816bf51f981c28e
Author: Torvald Riegel <triegel@redhat.com>
Date:   Sun Oct 19 21:59:26 2014 +0200

    pthread_once: Add fast path and remove x86 variants

diff --git a/nptl/pthread_once.c b/nptl/pthread_once.c
index 595bd7e..27cf06f 100644
--- a/nptl/pthread_once.c
+++ b/nptl/pthread_once.c
@@ -59,10 +59,9 @@  clear_once_control (void *arg)
    once_control are used for the fork generation), and try to initialize
    again, we can deadlock because we can't distinguish the in-progress and
    interrupted cases anymore.  */
-int
-__pthread_once (once_control, init_routine)
-     pthread_once_t *once_control;
-     void (*init_routine) (void);
+static int
+__attribute__((noinline))
+__pthread_once_slow (pthread_once_t *once_control, void (*init_routine) (void))
 {
   while (1)
     {
@@ -130,5 +129,18 @@  __pthread_once (once_control, init_routine)
 
   return 0;
 }
+
+int
+__pthread_once (pthread_once_t *once_control, void (*init_routine) (void))
+{
+  /* Fast path.  See __pthread_once_slow.  */
+  int val;
+  val = *once_control;
+  atomic_read_barrier();
+  if (__glibc_likely ((val & __PTHREAD_ONCE_DONE) != 0))
+    return 0;
+  else
+    return __pthread_once_slow(once_control, init_routine);
+}
 weak_alias (__pthread_once, pthread_once)
 hidden_def (__pthread_once)
diff --git a/sysdeps/unix/sysv/linux/i386/pthread_once.S b/sysdeps/unix/sysv/linux/i386/pthread_once.S
deleted file mode 100644
index dacd724..0000000
--- a/sysdeps/unix/sysv/linux/i386/pthread_once.S
+++ /dev/null
@@ -1,178 +0,0 @@ 
-/* Copyright (C) 2002-2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <unwindbuf.h>
-#include <sysdep.h>
-#include <kernel-features.h>
-#include <lowlevellock.h>
-
-
-	.comm	__fork_generation, 4, 4
-
-	.text
-
-
-	.globl	__pthread_once
-	.type	__pthread_once,@function
-	.align	16
-	cfi_startproc
-__pthread_once:
-	movl	4(%esp), %ecx
-	testl	$2, (%ecx)
-	jz	1f
-	xorl	%eax, %eax
-	ret
-
-1:	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (3, 0)
-	pushl	%esi
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (6, 0)
-	movl	%ecx, %ebx
-	xorl	%esi, %esi
-
-	/* Not yet initialized or initialization in progress.
-	   Get the fork generation counter now.  */
-6:	movl	(%ebx), %eax
-#ifdef PIC
-	LOAD_PIC_REG(cx)
-#endif
-
-5:	movl	%eax, %edx
-
-	testl	$2, %eax
-	jnz	4f
-
-	andl	$3, %edx
-#ifdef PIC
-	orl	__fork_generation@GOTOFF(%ecx), %edx
-#else
-	orl	__fork_generation, %edx
-#endif
-	orl	$1, %edx
-
-	LOCK
-	cmpxchgl %edx, (%ebx)
-	jnz	5b
-
-	/* Check whether another thread already runs the initializer.  */
-	testl	$1, %eax
-	jz	3f	/* No -> do it.  */
-
-	/* Check whether the initializer execution was interrupted
-	   by a fork.  */
-	xorl	%edx, %eax
-	testl	$0xfffffffc, %eax
-	jnz	3f	/* Different for generation -> run initializer.  */
-
-	/* Somebody else got here first.  Wait.  */
-#ifdef __ASSUME_PRIVATE_FUTEX
-	movl	$FUTEX_WAIT|FUTEX_PRIVATE_FLAG, %ecx
-#else
-# if FUTEX_WAIT == 0
-	movl	%gs:PRIVATE_FUTEX, %ecx
-# else
-	movl	$FUTEX_WAIT, %ecx
-	orl	%gs:PRIVATE_FUTEX, %ecx
-# endif
-#endif
-	movl	$SYS_futex, %eax
-	ENTER_KERNEL
-	jmp	6b
-
-3:	/* Call the initializer function after setting up the
-	   cancellation handler.  Note that it is not possible here
-	   to use the unwind-based cleanup handling.  This would require
-	   that the user-provided function and all the code it calls
-	   is compiled with exceptions.  Unfortunately this cannot be
-	   guaranteed.  */
-	subl	$UNWINDBUFSIZE+8, %esp
-	cfi_adjust_cfa_offset (UNWINDBUFSIZE+8)
-	movl	%ecx, %ebx		/* PIC register value.  */
-
-	leal	8+UWJMPBUF(%esp), %eax
-	movl	$0, 4(%esp)
-	movl	%eax, (%esp)
-	call	__sigsetjmp@PLT
-	testl	%eax, %eax
-	jne	7f
-
-	leal	8(%esp), %eax
-	call	HIDDEN_JUMPTARGET(__pthread_register_cancel)
-
-	/* Call the user-provided initialization function.  */
-	call	*24+UNWINDBUFSIZE(%esp)
-
-	/* Pop the cleanup handler.  */
-	leal	8(%esp), %eax
-	call	HIDDEN_JUMPTARGET(__pthread_unregister_cancel)
-	addl	$UNWINDBUFSIZE+8, %esp
-	cfi_adjust_cfa_offset (-UNWINDBUFSIZE-8)
-
-	/* Sucessful run of the initializer.  Signal that we are done.  */
-	movl	12(%esp), %ebx
-	LOCK
-	addl	$1, (%ebx)
-
-	/* Wake up all other threads.  */
-	movl	$0x7fffffff, %edx
-#ifdef __ASSUME_PRIVATE_FUTEX
-	movl	$FUTEX_WAKE|FUTEX_PRIVATE_FLAG, %ecx
-#else
-	movl	$FUTEX_WAKE, %ecx
-	orl	%gs:PRIVATE_FUTEX, %ecx
-#endif
-	movl	$SYS_futex, %eax
-	ENTER_KERNEL
-
-4:	popl	%esi
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (6)
-	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (3)
-	xorl	%eax, %eax
-	ret
-
-7:	/* __sigsetjmp returned for the second time.  */
-	movl	20+UNWINDBUFSIZE(%esp), %ebx
-	cfi_adjust_cfa_offset (UNWINDBUFSIZE+16)
-	cfi_offset (3, -8)
-	cfi_offset (6, -12)
-	movl	$0, (%ebx)
-
-	movl	$0x7fffffff, %edx
-#ifdef __ASSUME_PRIVATE_FUTEX
-	movl	$FUTEX_WAKE|FUTEX_PRIVATE_FLAG, %ecx
-#else
-	movl	$FUTEX_WAKE, %ecx
-	orl	%gs:PRIVATE_FUTEX, %ecx
-#endif
-	movl	$SYS_futex, %eax
-	ENTER_KERNEL
-
-	leal	8(%esp), %eax
-	call	HIDDEN_JUMPTARGET (__pthread_unwind_next)
-	/* NOTREACHED */
-	hlt
-	cfi_endproc
-	.size	__pthread_once,.-__pthread_once
-
-hidden_def (__pthread_once)
-strong_alias (__pthread_once, pthread_once)
diff --git a/sysdeps/unix/sysv/linux/x86_64/pthread_once.S b/sysdeps/unix/sysv/linux/x86_64/pthread_once.S
deleted file mode 100644
index 2cbe2fa..0000000
--- a/sysdeps/unix/sysv/linux/x86_64/pthread_once.S
+++ /dev/null
@@ -1,193 +0,0 @@ 
-/* Copyright (C) 2002-2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include <kernel-features.h>
-#include <tcb-offsets.h>
-#include <lowlevellock.h>
-
-
-	.comm	__fork_generation, 4, 4
-
-	.text
-
-
-	.globl	__pthread_once
-	.type	__pthread_once,@function
-	.align	16
-__pthread_once:
-.LSTARTCODE:
-	cfi_startproc
-#ifdef SHARED
-	cfi_personality(DW_EH_PE_pcrel | DW_EH_PE_sdata4 | DW_EH_PE_indirect,
-			DW.ref.__gcc_personality_v0)
-	cfi_lsda(DW_EH_PE_pcrel | DW_EH_PE_sdata4, .LexceptSTART)
-#else
-	cfi_personality(DW_EH_PE_udata4, __gcc_personality_v0)
-	cfi_lsda(DW_EH_PE_udata4, .LexceptSTART)
-#endif
-	testl	$2, (%rdi)
-	jz	1f
-	xorl	%eax, %eax
-	retq
-
-	/* Preserve the function pointer.  */
-1:	pushq	%rsi
-	cfi_adjust_cfa_offset(8)
-	xorq	%r10, %r10
-
-	/* Not yet initialized or initialization in progress.
-	   Get the fork generation counter now.  */
-6:	movl	(%rdi), %eax
-
-5:	movl	%eax, %edx
-
-	testl	$2, %eax
-	jnz	4f
-
-	andl	$3, %edx
-	orl	__fork_generation(%rip), %edx
-	orl	$1, %edx
-
-	LOCK
-	cmpxchgl %edx, (%rdi)
-	jnz	5b
-
-	/* Check whether another thread already runs the initializer.  */
-	testl	$1, %eax
-	jz	3f	/* No -> do it.  */
-
-	/* Check whether the initializer execution was interrupted
-	   by a fork.  */
-	xorl	%edx, %eax
-	testl	$0xfffffffc, %eax
-	jnz	3f	/* Different for generation -> run initializer.  */
-
-	/* Somebody else got here first.  Wait.  */
-#ifdef __ASSUME_PRIVATE_FUTEX
-	movl	$FUTEX_WAIT|FUTEX_PRIVATE_FLAG, %esi
-#else
-# if FUTEX_WAIT == 0
-	movl	%fs:PRIVATE_FUTEX, %esi
-# else
-	movl	$FUTEX_WAIT, %esi
-	orl	%fs:PRIVATE_FUTEX, %esi
-# endif
-#endif
-	movl	$SYS_futex, %eax
-	syscall
-	jmp	6b
-
-	/* Preserve the pointer to the control variable.  */
-3:	pushq	%rdi
-	cfi_adjust_cfa_offset(8)
-	pushq	%rdi
-	cfi_adjust_cfa_offset(8)
-
-.LcleanupSTART:
-	callq	*16(%rsp)
-.LcleanupEND:
-
-	/* Get the control variable address back.  */
-	popq	%rdi
-	cfi_adjust_cfa_offset(-8)
-
-	/* Sucessful run of the initializer.  Signal that we are done.  */
-	LOCK
-	incl	(%rdi)
-
-	addq	$8, %rsp
-	cfi_adjust_cfa_offset(-8)
-
-	/* Wake up all other threads.  */
-	movl	$0x7fffffff, %edx
-#ifdef __ASSUME_PRIVATE_FUTEX
-	movl	$FUTEX_WAKE|FUTEX_PRIVATE_FLAG, %esi
-#else
-	movl	$FUTEX_WAKE, %esi
-	orl	%fs:PRIVATE_FUTEX, %esi
-#endif
-	movl	$SYS_futex, %eax
-	syscall
-
-4:	addq	$8, %rsp
-	cfi_adjust_cfa_offset(-8)
-	xorl	%eax, %eax
-	retq
-	.size	__pthread_once,.-__pthread_once
-
-
-hidden_def (__pthread_once)
-strong_alias (__pthread_once, pthread_once)
-
-
-	.type	clear_once_control,@function
-	.align	16
-clear_once_control:
-	cfi_adjust_cfa_offset(3 * 8)
-	movq	(%rsp), %rdi
-	movq	%rax, %r8
-	movl	$0, (%rdi)
-
-	movl	$0x7fffffff, %edx
-#ifdef __ASSUME_PRIVATE_FUTEX
-	movl	$FUTEX_WAKE|FUTEX_PRIVATE_FLAG, %esi
-#else
-	movl	$FUTEX_WAKE, %esi
-	orl	%fs:PRIVATE_FUTEX, %esi
-#endif
-	movl	$SYS_futex, %eax
-	syscall
-
-	movq	%r8, %rdi
-.LcallUR:
-	call	_Unwind_Resume@PLT
-	hlt
-.LENDCODE:
-	cfi_endproc
-	.size	clear_once_control,.-clear_once_control
-
-
-	.section .gcc_except_table,"a",@progbits
-.LexceptSTART:
-	.byte	DW_EH_PE_omit			# @LPStart format
-	.byte	DW_EH_PE_omit			# @TType format
-	.byte	DW_EH_PE_uleb128		# call-site format
-	.uleb128 .Lcstend-.Lcstbegin
-.Lcstbegin:
-	.uleb128 .LcleanupSTART-.LSTARTCODE
-	.uleb128 .LcleanupEND-.LcleanupSTART
-	.uleb128 clear_once_control-.LSTARTCODE
-	.uleb128  0
-	.uleb128 .LcallUR-.LSTARTCODE
-	.uleb128 .LENDCODE-.LcallUR
-	.uleb128 0
-	.uleb128  0
-.Lcstend:
-
-
-#ifdef SHARED
-	.hidden	DW.ref.__gcc_personality_v0
-	.weak	DW.ref.__gcc_personality_v0
-	.section .gnu.linkonce.d.DW.ref.__gcc_personality_v0,"aw",@progbits
-	.align	LP_SIZE
-	.type	DW.ref.__gcc_personality_v0, @object
-	.size	DW.ref.__gcc_personality_v0, LP_SIZE
-DW.ref.__gcc_personality_v0:
-	ASM_ADDR __gcc_personality_v0
-#endif