nvptx: reimplement libgomp barriers [PR99555]

Instead of trying to have the GPU do CPU-with-OS-like things, this new barrier
implementation for NVPTX uses simple bar.* synchronization instructions.
Tasks are processed after threads have joined, and only if team->task_count != 0.

It is noted that some performance may be forfeited in cases where
earlier-arriving threads could have been used to process tasks ahead of other
threads, but supporting that would require implementing complex
futex-wait/wake-like behavior, which is exactly what this patch tries to avoid.
Task processing is, in any case, deemed not to be what GPU target offloading is
usually used for.

Implementation highlights:
1. gomp_team_barrier_wake() is now an empty function (threads never "wake" in
   the usual manner).
2. gomp_team_barrier_cancel() now uses the "exit" PTX instruction.
3. gomp_barrier_wait_last() is now implemented using "bar.arrive".

4. gomp_team_barrier_wait_end()/gomp_team_barrier_wait_cancel_end():
   The main synchronization is done using a 'bar.red' instruction, which
   OR-reduces the condition (team->task_count != 0) across all threads, to
   enable the task processing that follows if any thread created a task.
   (This bar.red usage means that this patch depends on the prior NVPTX
   bar.red GCC patch; a short sketch of the pattern follows below.)
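   As an illustration, here is a condensed sketch of that pattern, extracted
   from the new gomp_team_barrier_wait_end in the diff below (all names are
   those of the surrounding libgomp code, not new API):

     bool run_tasks = (team->task_count != 0);
     if (bar->total > 1)
       /* bar.red.or: synchronize the team and OR-reduce the task_count
          condition across all of its threads.  */
       run_tasks = __builtin_nvptx_bar_red_or (1, 32 * bar->total, true,
                                               (team->task_count != 0));
     if (run_tasks)
       {
         /* Only now, after every thread has arrived, drain the task queue,
            then synchronize once more before continuing.  */
         while (__atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE)
                & BAR_TASK_PENDING)
           gomp_barrier_handle_tasks (state);
         if (bar->total > 1)
           asm volatile ("bar.sync 1, %0;" : : "r" (32 * bar->total));
       }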

	PR target/99555

libgomp/ChangeLog:

	* config/nvptx/bar.c (generation_to_barrier): Remove.
	(futex_wait,futex_wake,do_spin,do_wait): Remove.
	(GOMP_WAIT_H): Remove.
	(#include "../linux/bar.c"): Remove.
	(gomp_barrier_wait_end): New function.
	(gomp_barrier_wait): Likewise.
	(gomp_barrier_wait_last): Likewise.
	(gomp_team_barrier_wait_end): Likewise.
	(gomp_team_barrier_wait): Likewise.
	(gomp_team_barrier_wait_final): Likewise.
	(gomp_team_barrier_wait_cancel_end): Likewise.
	(gomp_team_barrier_wait_cancel): Likewise.
	(gomp_team_barrier_cancel): Likewise.
	* config/nvptx/bar.h (gomp_barrier_t): Remove waiters, lock fields.
	(gomp_barrier_init): Remove init of waiters, lock fields.
	(gomp_team_barrier_wake): Remove prototype, add new static inline
	function.
Author: Chung-Lin Tang
Date:   2022-12-21 05:57:45 -08:00
Commit: fdc7469cf5 (parent 623daaf8a2)

2 changed files with 148 additions and 127 deletions

--- a/libgomp/config/nvptx/bar.c
+++ b/libgomp/config/nvptx/bar.c
@@ -30,137 +30,156 @@
 #include <limits.h>
 #include "libgomp.h"
 
-/* For cpu_relax.  */
-#include "doacross.h"
-
-/* Assuming ADDR is &bar->generation, return bar.  Copied from
-   rtems/bar.c.  */
-static gomp_barrier_t *
-generation_to_barrier (int *addr)
-{
-  char *bar
-    = (char *) addr - __builtin_offsetof (gomp_barrier_t, generation);
-  return (gomp_barrier_t *)bar;
-}
-
-/* Implement futex_wait-like behaviour to plug into the linux/bar.c
-   implementation.  Assumes ADDR is &bar->generation.  */
-static inline void
-futex_wait (int *addr, int val)
-{
-  gomp_barrier_t *bar = generation_to_barrier (addr);
-
-  if (bar->total < 2)
-    /* A barrier with less than two threads, nop.  */
-    return;
-
-  gomp_mutex_lock (&bar->lock);
-
-  /* Futex semantics: only go to sleep if *addr == val.  */
-  if (__builtin_expect (__atomic_load_n (addr, MEMMODEL_ACQUIRE) != val, 0))
-    {
-      gomp_mutex_unlock (&bar->lock);
-      return;
-    }
-
-  /* Register as waiter.  */
-  unsigned int waiters
-    = __atomic_add_fetch (&bar->waiters, 1, MEMMODEL_ACQ_REL);
-  if (waiters == 0)
-    __builtin_abort ();
-  unsigned int waiter_id = waiters;
-
-  if (waiters > 1)
-    {
-      /* Wake other threads in bar.sync.  */
-      asm volatile ("bar.sync 1, %0;" : : "r" (32 * waiters));
-
-      /* Ensure that they have updated waiters.  */
-      asm volatile ("bar.sync 1, %0;" : : "r" (32 * waiters));
-    }
-
-  gomp_mutex_unlock (&bar->lock);
-
-  while (1)
-    {
-      /* Wait for next thread in barrier.  */
-      asm volatile ("bar.sync 1, %0;" : : "r" (32 * (waiters + 1)));
-
-      /* Get updated waiters.  */
-      unsigned int updated_waiters
-        = __atomic_load_n (&bar->waiters, MEMMODEL_ACQUIRE);
-
-      /* Notify that we have updated waiters.  */
-      asm volatile ("bar.sync 1, %0;" : : "r" (32 * (waiters + 1)));
-
-      waiters = updated_waiters;
-
-      if (waiter_id > waiters)
-        /* A wake happened, and we're in the group of woken threads.  */
-        break;
-
-      /* Continue waiting.  */
-    }
-}
-
-/* Implement futex_wake-like behaviour to plug into the linux/bar.c
-   implementation.  Assumes ADDR is &bar->generation.  */
-static inline void
-futex_wake (int *addr, int count)
-{
-  gomp_barrier_t *bar = generation_to_barrier (addr);
-
-  if (bar->total < 2)
-    /* A barrier with less than two threads, nop.  */
-    return;
-
-  gomp_mutex_lock (&bar->lock);
-  unsigned int waiters = __atomic_load_n (&bar->waiters, MEMMODEL_ACQUIRE);
-  if (waiters == 0)
-    {
-      /* No threads to wake.  */
-      gomp_mutex_unlock (&bar->lock);
-      return;
-    }
-
-  if (count == INT_MAX)
-    /* Release all threads.  */
-    __atomic_store_n (&bar->waiters, 0, MEMMODEL_RELEASE);
-  else if (count < bar->total)
-    /* Release count threads.  */
-    __atomic_add_fetch (&bar->waiters, -count, MEMMODEL_ACQ_REL);
-  else
-    /* Count has an illegal value.  */
-    __builtin_abort ();
-
-  /* Wake other threads in bar.sync.  */
-  asm volatile ("bar.sync 1, %0;" : : "r" (32 * (waiters + 1)));
-
-  /* Let them get the updated waiters.  */
-  asm volatile ("bar.sync 1, %0;" : : "r" (32 * (waiters + 1)));
-
-  gomp_mutex_unlock (&bar->lock);
-}
-
-/* Copied from linux/wait.h.  */
-static inline int do_spin (int *addr, int val)
-{
-  /* The current implementation doesn't spin.  */
-  return 1;
-}
-
-/* Copied from linux/wait.h.  */
-static inline void do_wait (int *addr, int val)
-{
-  if (do_spin (addr, val))
-    futex_wait (addr, val);
-}
-
-/* Reuse the linux implementation.  */
-#define GOMP_WAIT_H 1
-#include "../linux/bar.c"
+void
+gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      bar->awaited = bar->total;
+      __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
+                        MEMMODEL_RELEASE);
+    }
+  if (bar->total > 1)
+    asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+}
+
+void
+gomp_barrier_wait (gomp_barrier_t *bar)
+{
+  gomp_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+}
+
+/* Like gomp_barrier_wait, except that if the encountering thread
+   is not the last one to hit the barrier, it returns immediately.
+   The intended usage is that a thread which intends to gomp_barrier_destroy
+   this barrier calls gomp_barrier_wait, while all other threads
+   call gomp_barrier_wait_last.  When gomp_barrier_wait returns,
+   the barrier can be safely destroyed.  */
+void
+gomp_barrier_wait_last (gomp_barrier_t *bar)
+{
+  /* The above described behavior matches 'bar.arrive' perfectly.  */
+  if (bar->total > 1)
+    asm ("bar.arrive 1, %0;" : : "r" (32 * bar->total));
+}
+
+/* Barriers are implemented mainly using 'bar.red.or', which combines a bar.sync
+   operation with a OR-reduction of "team->task_count != 0" across all threads.
+   Task processing is done only after synchronization and verifying that
+   task_count was non-zero in at least one of the team threads.
+
+   This use of simple-barriers, and queueing of tasks till the end, is deemed
+   more efficient performance-wise for GPUs in the common offloading case, as
+   opposed to implementing futex-wait/wake operations to simultaneously process
+   tasks in a CPU-thread manner (which is not easy to implement efficiently
+   on GPUs).  */
+void
+gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+
+  bool run_tasks = (team->task_count != 0);
+  if (bar->total > 1)
+    run_tasks = __builtin_nvptx_bar_red_or (1, 32 * bar->total, true,
+                                            (team->task_count != 0));
+
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
+    }
+
+  if (__builtin_expect (run_tasks == true, 0))
+    {
+      while (__atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE)
+             & BAR_TASK_PENDING)
+        gomp_barrier_handle_tasks (state);
+
+      if (bar->total > 1)
+        asm volatile ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+    }
+}
+
+void
+gomp_team_barrier_wait (gomp_barrier_t *bar)
+{
+  gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_wait_final (gomp_barrier_t *bar)
+{
+  gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar);
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    bar->awaited_final = bar->total;
+  gomp_team_barrier_wait_end (bar, state);
+}
+
+/* See also comments for gomp_team_barrier_wait_end.  */
+bool
+gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
+                                   gomp_barrier_state_t state)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+
+  bool run_tasks = (team->task_count != 0);
+  if (bar->total > 1)
+    run_tasks = __builtin_nvptx_bar_red_or (1, 32 * bar->total, true,
+                                            (team->task_count != 0));
+  if (state & BAR_CANCELLED)
+    return true;
+
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Note: BAR_CANCELLED should never be set in state here, because
+         cancellation means that at least one of the threads has been
+         cancelled, thus on a cancellable barrier we should never see
+         all threads to arrive.  */
+
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
+    }
+
+  if (__builtin_expect (run_tasks == true, 0))
+    {
+      while (__atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE)
+             & BAR_TASK_PENDING)
+        gomp_barrier_handle_tasks (state);
+
+      if (bar->total > 1)
+        asm volatile ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+    }
+
+  return false;
+}
+
+bool
+gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
+{
+  return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_cancel (struct gomp_team *team)
+{
+  gomp_mutex_lock (&team->task_lock);
+  if (team->barrier.generation & BAR_CANCELLED)
+    {
+      gomp_mutex_unlock (&team->task_lock);
+      return;
+    }
+  team->barrier.generation |= BAR_CANCELLED;
+  gomp_mutex_unlock (&team->task_lock);
+
+  /* The 'exit' instruction cancels this thread and also fullfills any other
+     CTA threads waiting on barriers.  */
+  asm volatile ("exit;");
+}

--- a/libgomp/config/nvptx/bar.h
+++ b/libgomp/config/nvptx/bar.h
@@ -38,8 +38,6 @@ typedef struct
   unsigned generation;
   unsigned awaited;
   unsigned awaited_final;
-  unsigned waiters;
-  gomp_mutex_t lock;
 } gomp_barrier_t;
 
 typedef unsigned int gomp_barrier_state_t;
@@ -59,8 +57,6 @@ static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count)
   bar->awaited = count;
   bar->awaited_final = count;
   bar->generation = 0;
-  bar->waiters = 0;
-  gomp_mutex_init (&bar->lock);
 }
 
 static inline void gomp_barrier_reinit (gomp_barrier_t *bar, unsigned count)
@@ -83,10 +79,16 @@ extern void gomp_team_barrier_wait_end (gomp_barrier_t *,
 extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *);
 extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *,
                                                gomp_barrier_state_t);
-extern void gomp_team_barrier_wake (gomp_barrier_t *, int);
 struct gomp_team;
 extern void gomp_team_barrier_cancel (struct gomp_team *);
 
+static inline void
+gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
+{
+  /* We never "wake up" threads on nvptx.  Threads wait at barrier
+     instructions till barrier fullfilled.  Do nothing here.  */
+}
+
 static inline gomp_barrier_state_t
 gomp_barrier_wait_start (gomp_barrier_t *bar)
 {