Use generic lowlevellock-futex.h in x86_64 lowlevellock.h.

Message ID: 1418858157.20194.30.camel@triegel.csb
State: Dropped

Commit Message

Torvald Riegel Dec. 17, 2014, 11:15 p.m. UTC
This patch replaces the custom futex operations for x86_64 with the
generic ones from lowlevellock-futex.h.  It also adds a few
#ifndef __ASSEMBLER__ guards to the generic Linux header to make this
work.
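
For context, the generic wrappers use INTERNAL_SYSCALL instead of
hand-written inline asm.  Paraphrased (a sketch, not a verbatim excerpt
from lowlevellock-futex.h), they look like this:

/* Generic C-level futex wrappers (abridged sketch).  */
#define lll_futex_wait(futexp, val, private) \
  lll_futex_timed_wait (futexp, val, NULL, private)

#define lll_futex_timed_wait(futexp, val, timeout, private)             \
  ({                                                                    \
    INTERNAL_SYSCALL_DECL (__err);                                      \
    long int __ret;                                                     \
    __ret = INTERNAL_SYSCALL (futex, __err, 4, (futexp),                \
                              __lll_private_flag (FUTEX_WAIT, private), \
                              (val), (timeout));                        \
    __ret;                                                              \
  })

On x86_64, INTERNAL_SYSCALL expands to the same syscall instruction the
removed macros emitted by hand, so the difference is mostly whether the
compiler or we write the register setup.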

In the long term, we'd want to use the fully generic low-level lock
implementation on x86_64 too, but this would require more code
inspection and/or measurements to assess the performance impact of
such a change.
Furthermore, there is still x86_64 assembly code that uses futexes, and
we need some of the futex macros (e.g., SYS_futex) to be defined
somewhere for those assembly files.  Thus, it seems easier to remove
the lowlevellock.h definitions once the other assembly uses of futexes
have been replaced with C code.
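
Concretely, the __ASSEMBLER__ guards leave the generic header shaped
roughly like this (abridged; see the full diff below):

#ifndef _LOWLEVELLOCK_FUTEX_H
#define _LOWLEVELLOCK_FUTEX_H	1

#ifndef __ASSEMBLER__
# include <sysdep.h>		/* C-only support headers.  */
# include <tls.h>
# include <kernel-features.h>
#endif

/* Plain constants, also usable from .S files.  */
#define FUTEX_WAIT		0
#define FUTEX_WAKE		1
/* ... further FUTEX_* constants ... */
#define LLL_PRIVATE	0
#define LLL_SHARED	FUTEX_PRIVATE_FLAG

#ifndef __ASSEMBLER__
/* __lll_private_flag and the lll_futex_* wrapper macros (C only).  */
#endif  /* !__ASSEMBLER__  */

#endif  /* lowlevellock-futex.h */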

OK?

2014-12-17  Torvald Riegel  <triegel@redhat.com>

	* sysdeps/unix/sysv/linux/x86_64/lowlevellock.h: Include
	<lowlevellock-futex.h>.  Remove FUTEX_* constants defined there.
	(__lll_private_flag): Remove.
	(lll_futex_wait): Likewise.
	(lll_futex_timed_wait): Likewise.
	(lll_futex_wake): Likewise.
	(lll_futex_requeue): Likewise.
	(lll_wait_tid): Use lll_futex_wait instead of assembly code.
	(__lll_timedwait_tid): Spell out argument names.
	(lll_timedwait_tid): Add comments and parentheses around macro
	arguments.
	* sysdeps/unix/sysv/linux/lowlevellock-futex.h: Make FUTEX_* constants,
	LLL_SHARED and LLL_PRIVATE usable from assembly code.
  

Comments

Roland McGrath Dec. 17, 2014, 11:37 p.m. UTC | #1
You didn't describe the testing you did.  For this, it seems like you should
not have too much trouble examining the generated code just by eyeball and
characterizing the differences to give us confidence.
  
Torvald Riegel Dec. 17, 2014, 11:54 p.m. UTC | #2
On Wed, 2014-12-17 at 15:37 -0800, Roland McGrath wrote:
> You didn't describe the testing you did.  For this, it seems like you should
> not have too much trouble examining the generated code just by eyeball and
> characterizing the differences to give us confidence.

I tested on x86_64 and there are no regressions.  If I hadn't done any
testing on the only arch this affects, I would have mentioned this in
the description of the patch (or not submitted the patch if I actually
encountered regressions).

I did not inspect the generated code because on x86_64, the futex calls
used from C code are on the slow paths.  Looking now at pthread_once.o,
the generated code seems reasonable -- although I don't claim to have
checked whether it's optimal.
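
To make the call sites concrete, the C-level uses follow the once-style
slow-path pattern sketched below.  This is a deliberately simplified,
hypothetical example (not the actual glibc pthread_once code), using the
glibc-internal atomic helpers loosely:

#include <limits.h>		/* INT_MAX */
#include <atomic.h>		/* glibc-internal atomic helpers */
#include <lowlevellock-futex.h>

/* State values: 0 = uninitialized, 1 = in progress, 2 = done.  */
static void
once_slow (int *state, void (*init) (void))
{
  /* Try to claim the initialization (0 -> 1).  */
  while (atomic_compare_and_exchange_bool_acq (state, 1, 0) != 0)
    {
      if (atomic_load_acquire (state) == 2)
	return;		/* Somebody else already finished it.  */
      /* Block until the initializer wakes us.  This futex call is the
	 slow path; the code the compiler emits for it is not
	 performance-critical.  */
      lll_futex_wait (state, 1, LLL_PRIVATE);
    }
  init ();
  atomic_store_release (state, 2);
  /* Wake all waiters blocked in lll_futex_wait above.  */
  lll_futex_wake (state, INT_MAX, LLL_PRIVATE);
}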
  
Roland McGrath Dec. 18, 2014, 12:08 a.m. UTC | #3
> I tested on x86_64 and there are no regressions.  If I hadn't done any
> testing on the only arch this affects, I would have mentioned this in
> the description of the patch (or not submitted the patch if I actually
> encountered regressions).

It's usual form to say in the original patch posting what testing you did.
Just "Tested x86_64-linux-gnu" communicates what you've just said.

> I did not inspect the generated code because on x86_64, the futex calls
> used from C code are on the slow paths.  Looking now at pthread_once.o,
> the generated code seems reasonable -- although I don't claim to have
> checked whether it's optimal.

It's not that it's optimal that should be checked, but that it's not worse
than the status quo ante.  But given the caveat about how microoptimization
should not actually matter here, the change is fine with me even without
you having done that.


Thanks,
Roland
  

Patch

commit dd87a779e2b9c80e24352f503a982d18969f4a90
Author: Torvald Riegel <triegel@redhat.com>
Date:   Wed Dec 17 18:55:19 2014 +0100

    Use generic lowlevellock-futex.h in x86_64 lowlevellock.h.

diff --git a/sysdeps/unix/sysv/linux/lowlevellock-futex.h b/sysdeps/unix/sysv/linux/lowlevellock-futex.h
index 8927661..907b4c7 100644
--- a/sysdeps/unix/sysv/linux/lowlevellock-futex.h
+++ b/sysdeps/unix/sysv/linux/lowlevellock-futex.h
@@ -19,10 +19,11 @@ 
 #ifndef _LOWLEVELLOCK_FUTEX_H
 #define _LOWLEVELLOCK_FUTEX_H	1
 
+#ifndef __ASSEMBLER__
 #include <sysdep.h>
 #include <tls.h>
 #include <kernel-features.h>
-
+#endif
 
 #define FUTEX_WAIT		0
 #define FUTEX_WAKE		1
@@ -48,6 +49,7 @@ 
 #define LLL_PRIVATE	0
 #define LLL_SHARED	FUTEX_PRIVATE_FLAG
 
+#ifndef __ASSEMBLER__
 
 #if IS_IN (libc) || IS_IN (rtld)
 /* In libc.so or ld.so all futexes are private.  */
@@ -133,5 +135,6 @@ 
 					 private),                      \
 		     nr_wake, nr_move, mutex, val)
 
+#endif  /* !__ASSEMBLER__  */
 
 #endif  /* lowlevellock-futex.h */
diff --git a/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h b/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h
index 2f0cf5c..56570ee 100644
--- a/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h
+++ b/sysdeps/unix/sysv/linux/x86_64/lowlevellock.h
@@ -45,59 +45,13 @@ 
 # endif
 #endif
 
+#include <lowlevellock-futex.h>
+
+/* XXX Remove when no assembler code uses futexes anymore.  */
 #define SYS_futex		__NR_futex
-#define FUTEX_WAIT		0
-#define FUTEX_WAKE		1
-#define FUTEX_CMP_REQUEUE	4
-#define FUTEX_WAKE_OP		5
-#define FUTEX_LOCK_PI		6
-#define FUTEX_UNLOCK_PI		7
-#define FUTEX_TRYLOCK_PI	8
-#define FUTEX_WAIT_BITSET	9
-#define FUTEX_WAKE_BITSET	10
-#define FUTEX_WAIT_REQUEUE_PI	11
-#define FUTEX_CMP_REQUEUE_PI	12
-#define FUTEX_PRIVATE_FLAG	128
-#define FUTEX_CLOCK_REALTIME	256
-
-#define FUTEX_BITSET_MATCH_ANY	0xffffffff
-
-#define FUTEX_OP_CLEAR_WAKE_IF_GT_ONE	((4 << 24) | 1)
-
-/* Values for 'private' parameter of locking macros.  Yes, the
-   definition seems to be backwards.  But it is not.  The bit will be
-   reversed before passing to the system call.  */
-#define LLL_PRIVATE	0
-#define LLL_SHARED	FUTEX_PRIVATE_FLAG
 
 #ifndef __ASSEMBLER__
 
-#if IS_IN (libc) || IS_IN (rtld)
-/* In libc.so or ld.so all futexes are private.  */
-# ifdef __ASSUME_PRIVATE_FUTEX
-#  define __lll_private_flag(fl, private) \
-  ((fl) | FUTEX_PRIVATE_FLAG)
-# else
-#  define __lll_private_flag(fl, private) \
-  ((fl) | THREAD_GETMEM (THREAD_SELF, header.private_futex))
-# endif
-#else
-# ifdef __ASSUME_PRIVATE_FUTEX
-#  define __lll_private_flag(fl, private) \
-  (((fl) | FUTEX_PRIVATE_FLAG) ^ (private))
-# else
-#  define __lll_private_flag(fl, private) \
-  (__builtin_constant_p (private)					      \
-   ? ((private) == 0							      \
-      ? ((fl) | THREAD_GETMEM (THREAD_SELF, header.private_futex))	      \
-      : (fl))								      \
-   : ({ unsigned int __fl = ((private) ^ FUTEX_PRIVATE_FLAG);		      \
-	asm ("andl %%fs:%P1, %0" : "+r" (__fl)				      \
-	     : "i" (offsetof (struct pthread, header.private_futex)));	      \
-	__fl | (fl); }))
-# endif
-#endif
-
 /* Initializer for lock.  */
 #define LLL_LOCK_INITIALIZER		(0)
 #define LLL_LOCK_INITIALIZER_LOCKED	(1)
@@ -106,39 +60,6 @@ 
 /* Delay in spinlock loop.  */
 #define BUSY_WAIT_NOP	  asm ("rep; nop")
 
-#define lll_futex_wait(futex, val, private) \
-  lll_futex_timed_wait(futex, val, NULL, private)
-
-
-#define lll_futex_timed_wait(futex, val, timeout, private) \
-  ({									      \
-    register const struct timespec *__to __asm ("r10") = timeout;	      \
-    int __status;							      \
-    register __typeof (val) _val __asm ("edx") = (val);			      \
-    __asm __volatile ("syscall"						      \
-		      : "=a" (__status)					      \
-		      : "0" (SYS_futex), "D" (futex),			      \
-			"S" (__lll_private_flag (FUTEX_WAIT, private)),	      \
-			"d" (_val), "r" (__to)				      \
-		      : "memory", "cc", "r11", "cx");			      \
-    __status;								      \
-  })
-
-
-#define lll_futex_wake(futex, nr, private) \
-  ({									      \
-    int __status;							      \
-    register __typeof (nr) _nr __asm ("edx") = (nr);			      \
-    LIBC_PROBE (lll_futex_wake, 3, futex, nr, private);                       \
-    __asm __volatile ("syscall"						      \
-		      : "=a" (__status)					      \
-		      : "0" (SYS_futex), "D" (futex),			      \
-			"S" (__lll_private_flag (FUTEX_WAKE, private)),	      \
-			"d" (_nr)					      \
-		      : "memory", "cc", "r10", "r11", "cx");		      \
-    __status;								      \
-  })
-
 
 /* NB: in the lll_trylock macro we simply return the value in %eax
    after the cmpxchg instruction.  In case the operation succeded this
@@ -378,58 +299,38 @@  extern int __lll_timedlock_elision (int *futex, short *adapt_count,
     }									      \
   while (0)
 
-/* Returns non-zero if error happened, zero if success.  */
-#define lll_futex_requeue(ftx, nr_wake, nr_move, mutex, val, private) \
-  ({ int __res;								      \
-     register int __nr_move __asm ("r10") = nr_move;			      \
-     register void *__mutex __asm ("r8") = mutex;			      \
-     register int __val __asm ("r9") = val;				      \
-     __asm __volatile ("syscall"					      \
-		       : "=a" (__res)					      \
-		       : "0" (__NR_futex), "D" ((void *) ftx),		      \
-			 "S" (__lll_private_flag (FUTEX_CMP_REQUEUE,	      \
-						  private)), "d" (nr_wake),   \
-			 "r" (__nr_move), "r" (__mutex), "r" (__val)	      \
-		       : "cx", "r11", "cc", "memory");			      \
-     __res < 0; })
-
 #define lll_islocked(futex) \
   (futex != LLL_LOCK_INITIALIZER)
 
 
 /* The kernel notifies a process which uses CLONE_CHILD_CLEARTID via futex
-   wakeup when the clone terminates.  The memory location contains the
-   thread ID while the clone is running and is reset to zero
-   afterwards.
-
-   The macro parameter must not have any side effect.  */
+   wake-up when the clone terminates.  The memory location contains the
+   thread ID while the clone is running and is reset to zero by the kernel
+   afterwards.  The kernel up to version 3.16.3 does not use the private futex
+   operations for futex wake-up when the clone terminates.  */
 #define lll_wait_tid(tid) \
-  do {									      \
-    int __ignore;							      \
-    register __typeof (tid) _tid asm ("edx") = (tid);			      \
-    if (_tid != 0)							      \
-      __asm __volatile ("xorq %%r10, %%r10\n\t"				      \
-			"1:\tmovq %2, %%rax\n\t"			      \
-			"syscall\n\t"					      \
-			"cmpl $0, (%%rdi)\n\t"				      \
-			"jne 1b"					      \
-			: "=&a" (__ignore)				      \
-			: "S" (FUTEX_WAIT), "i" (SYS_futex), "D" (&tid),      \
-			  "d" (_tid)					      \
-			: "memory", "cc", "r10", "r11", "cx");		      \
+  do {					\
+    __typeof (tid) __tid;		\
+    while ((__tid = (tid)) != 0)	\
+      lll_futex_wait (&(tid), __tid, LLL_SHARED);\
   } while (0)
 
-extern int __lll_timedwait_tid (int *tid, const struct timespec *abstime)
+extern int __lll_timedwait_tid (int *, const struct timespec *)
      attribute_hidden;
+
+/* As lll_wait_tid, but with a timeout.  If the timeout occurs then return
+   ETIMEDOUT.  If ABSTIME is invalid, return EINVAL.
+   XXX Note that this differs from the generic version in that we do the
+   error checking here and not in __lll_timedwait_tid.  */
 #define lll_timedwait_tid(tid, abstime) \
   ({									      \
     int __result = 0;							      \
-    if (tid != 0)							      \
+    if ((tid) != 0)							      \
       {									      \
-	if (abstime->tv_nsec < 0 || abstime->tv_nsec >= 1000000000)	      \
+	if ((abstime)->tv_nsec < 0 || (abstime)->tv_nsec >= 1000000000)	      \
 	  __result = EINVAL;						      \
 	else								      \
-	  __result = __lll_timedwait_tid (&tid, abstime);		      \
+	  __result = __lll_timedwait_tid (&(tid), (abstime));		      \
       }									      \
     __result; })