amdgcn: Implement proper locks

Message ID 20240322145330.761851-1-ams@baylibre.com
State New
Headers
Series amdgcn: Implement proper locks |

Commit Message

Andrew Stubbs March 22, 2024, 2:53 p.m. UTC
  This should prevent printf output from multiple threads getting garbled.

I don't know why IO ever worked properly -- probably it was always a bit
broken -- but the GFX11 devices have a different cache architecture and
trying to print from many threads at once corrupted the FILE buffers.
---
 newlib/Makefile.in                        |  24 ++-
 newlib/libc/sys/amdgcn/Makefile.inc       |   3 +-
 newlib/libc/sys/amdgcn/include/sys/lock.h |  39 +++++
 newlib/libc/sys/amdgcn/lock.c             | 187 ++++++++++++++++++++++
 4 files changed, 250 insertions(+), 3 deletions(-)
 create mode 100644 newlib/libc/sys/amdgcn/include/sys/lock.h
 create mode 100644 newlib/libc/sys/amdgcn/lock.c
  

Comments

Corinna Vinschen March 25, 2024, 10:35 a.m. UTC | #1
On Mar 22 14:53, Andrew Stubbs wrote:
> This should prevent printf output from multiple threads getting garbled.
> 
> I don't know why IO ever worked properly -- probably it was always a bit
> broken -- but the GFX11 devices have a different cache architecture and
> trying to print from many threads at once corrupted the FILE buffers.
> ---
>  newlib/Makefile.in                        |  24 ++-
>  newlib/libc/sys/amdgcn/Makefile.inc       |   3 +-
>  newlib/libc/sys/amdgcn/include/sys/lock.h |  39 +++++
>  newlib/libc/sys/amdgcn/lock.c             | 187 ++++++++++++++++++++++
>  4 files changed, 250 insertions(+), 3 deletions(-)
>  create mode 100644 newlib/libc/sys/amdgcn/include/sys/lock.h
>  create mode 100644 newlib/libc/sys/amdgcn/lock.c

Pushed.

Thanks,
Corinna
  

Patch

diff --git a/newlib/Makefile.in b/newlib/Makefile.in
index 61a89ff65..2bd1431f9 100644
--- a/newlib/Makefile.in
+++ b/newlib/Makefile.in
@@ -530,7 +530,8 @@  check_PROGRAMS =
 
 @HAVE_LIBC_SYS_AMDGCN_DIR_TRUE@am__append_43 = \
 @HAVE_LIBC_SYS_AMDGCN_DIR_TRUE@	libc/sys/amdgcn/close.c libc/sys/amdgcn/fstat.c libc/sys/amdgcn/isatty.c libc/sys/amdgcn/lseek.c libc/sys/amdgcn/read.c libc/sys/amdgcn/write.c \
-@HAVE_LIBC_SYS_AMDGCN_DIR_TRUE@	libc/sys/amdgcn/fcntl.c libc/sys/amdgcn/getpid.c libc/sys/amdgcn/kill.c libc/sys/amdgcn/open.c libc/sys/amdgcn/raise.c libc/sys/amdgcn/stat.c libc/sys/amdgcn/unlink.c
+@HAVE_LIBC_SYS_AMDGCN_DIR_TRUE@	libc/sys/amdgcn/fcntl.c libc/sys/amdgcn/getpid.c libc/sys/amdgcn/kill.c libc/sys/amdgcn/open.c libc/sys/amdgcn/raise.c libc/sys/amdgcn/stat.c \
+@HAVE_LIBC_SYS_AMDGCN_DIR_TRUE@	libc/sys/amdgcn/unlink.c libc/sys/amdgcn/lock.c
 
 @HAVE_LIBC_SYS_ARM_DIR_TRUE@am__append_44 = libc/sys/arm/access.c libc/sys/arm/aeabi_atexit.c libc/sys/arm/sysconf.c
 @HAVE_LIBC_SYS_ARM_DIR_TRUE@@MAY_SUPPLY_SYSCALLS_TRUE@am__append_45 = libc/sys/arm/libcfunc.c libc/sys/arm/trap.S libc/sys/arm/syscalls.c
@@ -1626,7 +1627,8 @@  am__objects_51 = libc/ssp/libc_a-chk_fail.$(OBJEXT) \
 @HAVE_LIBC_SYS_AMDGCN_DIR_TRUE@	libc/sys/amdgcn/libc_a-open.$(OBJEXT) \
 @HAVE_LIBC_SYS_AMDGCN_DIR_TRUE@	libc/sys/amdgcn/libc_a-raise.$(OBJEXT) \
 @HAVE_LIBC_SYS_AMDGCN_DIR_TRUE@	libc/sys/amdgcn/libc_a-stat.$(OBJEXT) \
-@HAVE_LIBC_SYS_AMDGCN_DIR_TRUE@	libc/sys/amdgcn/libc_a-unlink.$(OBJEXT)
+@HAVE_LIBC_SYS_AMDGCN_DIR_TRUE@	libc/sys/amdgcn/libc_a-unlink.$(OBJEXT) \
+@HAVE_LIBC_SYS_AMDGCN_DIR_TRUE@	libc/sys/amdgcn/libc_a-lock.$(OBJEXT)
 @HAVE_LIBC_SYS_ARM_DIR_TRUE@am__objects_55 = libc/sys/arm/libc_a-access.$(OBJEXT) \
 @HAVE_LIBC_SYS_ARM_DIR_TRUE@	libc/sys/arm/libc_a-aeabi_atexit.$(OBJEXT) \
 @HAVE_LIBC_SYS_ARM_DIR_TRUE@	libc/sys/arm/libc_a-sysconf.$(OBJEXT)
@@ -7268,6 +7270,9 @@  libc/sys/amdgcn/libc_a-stat.$(OBJEXT):  \
 libc/sys/amdgcn/libc_a-unlink.$(OBJEXT):  \
 	libc/sys/amdgcn/$(am__dirstamp) \
 	libc/sys/amdgcn/$(DEPDIR)/$(am__dirstamp)
+libc/sys/amdgcn/libc_a-lock.$(OBJEXT):  \
+	libc/sys/amdgcn/$(am__dirstamp) \
+	libc/sys/amdgcn/$(DEPDIR)/$(am__dirstamp)
 libc/sys/arm/$(am__dirstamp):
 	@$(MKDIR_P) libc/sys/arm
 	@: > libc/sys/arm/$(am__dirstamp)
@@ -13831,6 +13836,7 @@  distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@libc/sys/amdgcn/$(DEPDIR)/libc_a-getpid.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@libc/sys/amdgcn/$(DEPDIR)/libc_a-isatty.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@libc/sys/amdgcn/$(DEPDIR)/libc_a-kill.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@libc/sys/amdgcn/$(DEPDIR)/libc_a-lock.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@libc/sys/amdgcn/$(DEPDIR)/libc_a-lseek.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@libc/sys/amdgcn/$(DEPDIR)/libc_a-open.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@libc/sys/amdgcn/$(DEPDIR)/libc_a-raise.Po@am__quote@
@@ -30960,6 +30966,20 @@  libc/sys/amdgcn/libc_a-unlink.obj: libc/sys/amdgcn/unlink.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CFLAGS) $(CFLAGS) -c -o libc/sys/amdgcn/libc_a-unlink.obj `if test -f 'libc/sys/amdgcn/unlink.c'; then $(CYGPATH_W) 'libc/sys/amdgcn/unlink.c'; else $(CYGPATH_W) '$(srcdir)/libc/sys/amdgcn/unlink.c'; fi`
 
+libc/sys/amdgcn/libc_a-lock.o: libc/sys/amdgcn/lock.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CFLAGS) $(CFLAGS) -MT libc/sys/amdgcn/libc_a-lock.o -MD -MP -MF libc/sys/amdgcn/$(DEPDIR)/libc_a-lock.Tpo -c -o libc/sys/amdgcn/libc_a-lock.o `test -f 'libc/sys/amdgcn/lock.c' || echo '$(srcdir)/'`libc/sys/amdgcn/lock.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) libc/sys/amdgcn/$(DEPDIR)/libc_a-lock.Tpo libc/sys/amdgcn/$(DEPDIR)/libc_a-lock.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='libc/sys/amdgcn/lock.c' object='libc/sys/amdgcn/libc_a-lock.o' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CFLAGS) $(CFLAGS) -c -o libc/sys/amdgcn/libc_a-lock.o `test -f 'libc/sys/amdgcn/lock.c' || echo '$(srcdir)/'`libc/sys/amdgcn/lock.c
+
+libc/sys/amdgcn/libc_a-lock.obj: libc/sys/amdgcn/lock.c
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CFLAGS) $(CFLAGS) -MT libc/sys/amdgcn/libc_a-lock.obj -MD -MP -MF libc/sys/amdgcn/$(DEPDIR)/libc_a-lock.Tpo -c -o libc/sys/amdgcn/libc_a-lock.obj `if test -f 'libc/sys/amdgcn/lock.c'; then $(CYGPATH_W) 'libc/sys/amdgcn/lock.c'; else $(CYGPATH_W) '$(srcdir)/libc/sys/amdgcn/lock.c'; fi`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) libc/sys/amdgcn/$(DEPDIR)/libc_a-lock.Tpo libc/sys/amdgcn/$(DEPDIR)/libc_a-lock.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='libc/sys/amdgcn/lock.c' object='libc/sys/amdgcn/libc_a-lock.obj' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CFLAGS) $(CFLAGS) -c -o libc/sys/amdgcn/libc_a-lock.obj `if test -f 'libc/sys/amdgcn/lock.c'; then $(CYGPATH_W) 'libc/sys/amdgcn/lock.c'; else $(CYGPATH_W) '$(srcdir)/libc/sys/amdgcn/lock.c'; fi`
+
 libc/sys/arm/libc_a-access.o: libc/sys/arm/access.c
 @am__fastdepCC_TRUE@	$(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libc_a_CPPFLAGS) $(CPPFLAGS) $(libc_a_CFLAGS) $(CFLAGS) -MT libc/sys/arm/libc_a-access.o -MD -MP -MF libc/sys/arm/$(DEPDIR)/libc_a-access.Tpo -c -o libc/sys/arm/libc_a-access.o `test -f 'libc/sys/arm/access.c' || echo '$(srcdir)/'`libc/sys/arm/access.c
 @am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) libc/sys/arm/$(DEPDIR)/libc_a-access.Tpo libc/sys/arm/$(DEPDIR)/libc_a-access.Po
diff --git a/newlib/libc/sys/amdgcn/Makefile.inc b/newlib/libc/sys/amdgcn/Makefile.inc
index c1570b2ad..4e540fc24 100644
--- a/newlib/libc/sys/amdgcn/Makefile.inc
+++ b/newlib/libc/sys/amdgcn/Makefile.inc
@@ -1,3 +1,4 @@ 
 libc_a_SOURCES += \
 	%D%/close.c %D%/fstat.c %D%/isatty.c %D%/lseek.c %D%/read.c %D%/write.c \
-	%D%/fcntl.c %D%/getpid.c %D%/kill.c %D%/open.c %D%/raise.c %D%/stat.c %D%/unlink.c
+	%D%/fcntl.c %D%/getpid.c %D%/kill.c %D%/open.c %D%/raise.c %D%/stat.c \
+	%D%/unlink.c %D%/lock.c
diff --git a/newlib/libc/sys/amdgcn/include/sys/lock.h b/newlib/libc/sys/amdgcn/include/sys/lock.h
new file mode 100644
index 000000000..0e0e667e5
--- /dev/null
+++ b/newlib/libc/sys/amdgcn/include/sys/lock.h
@@ -0,0 +1,39 @@ 
+#ifndef __SYS_LOCK_H__
+#define __SYS_LOCK_H__
+
+#include <newlib.h>
+#include <_ansi.h>
+
+typedef unsigned int _LOCK_T;
+typedef unsigned int _LOCK_RECURSIVE_T;
+
+#define __LOCK_INIT(CLASS,LOCK) CLASS _LOCK_T LOCK = 0;
+#define __LOCK_INIT_RECURSIVE(CLASS,LOCK) __LOCK_INIT(CLASS,LOCK)
+
+#define __lock_init(LOCK) LOCK = 0
+#define __lock_init_recursive(LOCK) LOCK = 0
+#define __lock_close(LOCK) ((void)0)
+#define __lock_close_recursive(LOCK) ((void) 0)
+#define __lock_acquire(LOCK) __gcn_lock_acquire (&LOCK)
+#define __lock_acquire_recursive(LOCK) \
+   __gcn_lock_acquire_recursive (&LOCK)
+#define __lock_try_acquire(LOCK) __gcn_try_lock_acquire (&LOCK)
+#define __lock_try_acquire_recursive(LOCK) \
+  __gcn_lock_try_acquire_recursive (&LOCK)
+#define __lock_release(LOCK) __gcn_lock_release (&LOCK)
+#define __lock_release_recursive(LOCK) \
+  __gcn_lock_release_recursive (&LOCK)
+
+
+int __gcn_try_lock_acquire (_LOCK_T *lock_ptr);
+void __gcn_lock_acquire (_LOCK_T *lock_ptr);
+void __gcn_lock_release (_LOCK_T *lock_ptr);
+int __gcn_lock_try_acquire_recursive (_LOCK_T *lock_ptr);
+void __gcn_lock_acquire_recursive (_LOCK_T *lock_ptr);
+void __gcn_lock_release_recursive (_LOCK_T *lock_ptr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __SYS_LOCK_H__ */
diff --git a/newlib/libc/sys/amdgcn/lock.c b/newlib/libc/sys/amdgcn/lock.c
new file mode 100644
index 000000000..c4c94720b
--- /dev/null
+++ b/newlib/libc/sys/amdgcn/lock.c
@@ -0,0 +1,187 @@ 
+/*
+ * Support file for amdgcn in newlib.
+ * Copyright (c) 2024 BayLibre.
+ *
+ * The authors hereby grant permission to use, copy, modify, distribute,
+ * and license this software and its documentation for any purpose, provided
+ * that existing copyright notices are retained in all copies and that this
+ * notice is included verbatim in any distributions. No written agreement,
+ * license, or royalty fee is required for any of the authorized uses.
+ * Modifications to this software may be copyrighted by their authors
+ * and need not follow the licensing terms described here, provided that
+ * the new terms are clearly indicated on the first page of each file where
+ * they apply.
+ */
+
+/* Lock routines for AMD GPU devices.
+ 
+   The lock is a 32-bit int:
+     - bits 0-3: wavefront id
+     - bits 4-23: workgroup id (+1, so never zero)
+     - bits 24-31: recursive lock count.
+   
+   The purpose of the "relaxed" loads and stores being "atomic" here is
+   mostly just to ensure we punch through the caches consistently.
+
+   Non-recursive locks may be unlocked by any thread.  It's an error to
+   attempt to unlock a recursive lock from the wrong thread.
+
+   The DEBUG statements here use sprintf and write to avoid taking locks
+   themselves.  */
+
+#include <sys/lock.h>
+#include <assert.h>
+
+#define DEBUG 0
+
+#if DEBUG
+extern void write(int, char *, int);
+#endif
+
+static unsigned
+__gcn_thread_id ()
+{
+  /* Dim(0) is the workgroup ID; range 0 to maybe thousands.
+     Dim(1) is the wavefront ID; range 0 to 15.  */
+  return (((__builtin_gcn_dim_pos (0) + 1) << 4)
+	  + __builtin_gcn_dim_pos (1));
+}
+
+static int
+__gcn_lock_acquire_int (_LOCK_T *lock_ptr, int _try)
+{
+  int id = __gcn_thread_id ();
+
+#if DEBUG
+  char buf[1000];
+  __builtin_sprintf (buf,"acquire:%p(%d) lock_value:0x%x  id:0x%x", lock_ptr,
+		     _try, *lock_ptr, id);
+  write (1, buf, __builtin_strlen(buf));
+#endif
+
+  int expected = 0;
+  while (!__atomic_compare_exchange_n (lock_ptr, &expected, id, 0,
+				       __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
+    {
+      /* Lock *not* acquired.  */
+      if (_try)
+	return 0;
+      else
+	{
+	  asm ("s_sleep 64");
+	  expected = 0;
+	}
+    }
+
+#if DEBUG
+  __builtin_sprintf (buf,"acquired:%p(%d) lock_value:0x%x  id:0x%x", lock_ptr,
+		     _try, *lock_ptr, id);
+  write (1, buf, __builtin_strlen(buf));
+#endif
+
+  return 1;
+}
+
+int
+__gcn_try_lock_acquire (_LOCK_T *lock_ptr)
+{
+  return __gcn_lock_acquire_int (lock_ptr, 1);
+}
+
+void
+__gcn_lock_acquire (_LOCK_T *lock_ptr)
+{
+  __gcn_lock_acquire_int (lock_ptr, 0);
+}
+
+static int
+__gcn_lock_acquire_recursive_int (_LOCK_T *lock_ptr, int _try)
+{
+  int id = __gcn_thread_id ();
+
+#if DEBUG
+  char buf[1000];
+  __builtin_sprintf (buf,"acquire recursive:%p(%d) lock_value:0x%x  id:0x%x",
+		     lock_ptr, _try, *lock_ptr, id);
+  write (1, buf, __builtin_strlen(buf));
+#endif
+
+  unsigned int lock_value = __atomic_load_n (lock_ptr, __ATOMIC_RELAXED);
+  if ((lock_value & 0xffffff) == id)
+    {
+      /* This thread already holds the lock.
+	 Increment the recursion counter and update the lock.  */
+      int count = lock_value >> 24;
+      lock_value = ((count + 1) << 24) | id;
+      __atomic_store_n (lock_ptr, lock_value, __ATOMIC_RELAXED);
+
+#if DEBUG
+      __builtin_sprintf (buf,
+			 "increment recursive:%p(%d) lock_value:0x%x  id:0x%x",
+			 lock_ptr, _try, *lock_ptr, id);
+      write (1, buf, __builtin_strlen(buf));
+#endif
+
+      return 1;
+    }
+  else
+    return __gcn_lock_acquire_int (lock_ptr, _try);
+}
+
+int
+__gcn_lock_try_acquire_recursive (_LOCK_T *lock_ptr)
+{
+  return __gcn_lock_acquire_recursive_int (lock_ptr, 1);
+}
+
+void
+__gcn_lock_acquire_recursive (_LOCK_T *lock_ptr)
+{
+  __gcn_lock_acquire_recursive_int (lock_ptr, 0);
+}
+
+void
+__gcn_lock_release (_LOCK_T *lock_ptr)
+{
+#if DEBUG
+  char buf[1000];
+  __builtin_sprintf (buf,"release:%p lock_value:0x%x  id:0x%x", lock_ptr,
+		     *lock_ptr, __gcn_thread_id());
+  write (1, buf, __builtin_strlen(buf));
+#endif
+
+  __atomic_store_n (lock_ptr, 0, __ATOMIC_RELEASE);
+}
+
+void
+__gcn_lock_release_recursive (_LOCK_T *lock_ptr)
+{
+  int id = __gcn_thread_id ();
+  unsigned int lock_value = __atomic_load_n (lock_ptr, __ATOMIC_RELAXED);
+
+#if DEBUG
+  char buf[1000];
+  __builtin_sprintf (buf, "release recursive:%p lock_value:0x%x  id:0x%x",
+		     lock_ptr, lock_value, id);
+  write (1, buf, __builtin_strlen(buf));
+#endif
+
+  /* It is an error to call this function from the wrong thread.  */
+  assert ((lock_value & 0xffffff) == id);
+
+  /* Decrement or release the lock.  */
+  int count = lock_value >> 24;
+  if (count > 0)
+    {
+      lock_value = ((count - 1) << 24) | id;
+      __atomic_store_n (lock_ptr, lock_value, __ATOMIC_RELAXED);
+
+#if DEBUG
+      __builtin_sprintf (buf, "decrement recursive:%p lock_value:0x%x  id:0x%x",
+			 lock_ptr, *lock_ptr, id);
+      write (1, buf, __builtin_strlen(buf));
+#endif
+    }
+  else
+    __gcn_lock_release (lock_ptr);
+}