nvptx: reimplement libgomp barriers [PR99555]

Instead of trying to have the GPU do CPU-with-OS-like things, this new barrier
implementation for NVPTX uses simple bar.* synchronization instructions.
Tasks are processed after threads have joined, and only if team->task_count != 0.

It is noted that some performance may be forfeited in cases where
earlier-arriving threads could have been used to process tasks ahead of other
threads, but supporting that would require implementing complex
futex-wait/wake-like behavior, which is exactly what this patch tries to avoid.
Task processing is, in any case, deemed not to be what GPU target offloading is
usually used for.

Implementation highlights:
1. gomp_team_barrier_wake() is now an empty function (threads never "wake" in
   the usual manner).
2. gomp_team_barrier_cancel() now uses the "exit" PTX instruction.
3. gomp_barrier_wait_last() is now implemented using "bar.arrive".

4. gomp_team_barrier_wait_end()/gomp_team_barrier_wait_cancel_end():
   The main synchronization is done using a 'bar.red' instruction, which
   OR-reduces the condition (team->task_count != 0) across all threads, to
   enable the task processing that follows if any thread created a task.
   (This bar.red usage means that this patch depends on the prior NVPTX
   bar.red GCC patch; a short sketch of the pattern follows below.)
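   As an illustration, here is a condensed sketch of that pattern, extracted
   from the new gomp_team_barrier_wait_end in the diff below (all names are
   those of the surrounding libgomp code, not new API):

     bool run_tasks = (team->task_count != 0);
     if (bar->total > 1)
       /* bar.red.or: synchronize the team and OR-reduce the task_count
          condition across all of its threads.  */
       run_tasks = __builtin_nvptx_bar_red_or (1, 32 * bar->total, true,
                                               (team->task_count != 0));
     if (run_tasks)
       {
         /* Only now, after every thread has arrived, drain the task queue,
            then synchronize once more before continuing.  */
         while (__atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE)
                & BAR_TASK_PENDING)
           gomp_barrier_handle_tasks (state);
         if (bar->total > 1)
           asm volatile ("bar.sync 1, %0;" : : "r" (32 * bar->total));
       }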

	PR target/99555

libgomp/ChangeLog:

	* config/nvptx/bar.c (generation_to_barrier): Remove.
	(futex_wait,futex_wake,do_spin,do_wait): Remove.
	(GOMP_WAIT_H): Remove.
	(#include "../linux/bar.c"): Remove.
	(gomp_barrier_wait_end): New function.
	(gomp_barrier_wait): Likewise.
	(gomp_barrier_wait_last): Likewise.
	(gomp_team_barrier_wait_end): Likewise.
	(gomp_team_barrier_wait): Likewise.
	(gomp_team_barrier_wait_final): Likewise.
	(gomp_team_barrier_wait_cancel_end): Likewise.
	(gomp_team_barrier_wait_cancel): Likewise.
	(gomp_team_barrier_cancel): Likewise.
	* config/nvptx/bar.h (gomp_barrier_t): Remove waiters, lock fields.
	(gomp_barrier_init): Remove init of waiters, lock fields.
	(gomp_team_barrier_wake): Remove prototype, add new static inline
	function.
Author: Chung-Lin Tang
Date:   2022-12-21 05:57:45 -08:00
Commit: fdc7469cf5 (parent 623daaf8a2)

2 changed files with 148 additions and 127 deletions

--- a/libgomp/config/nvptx/bar.c
+++ b/libgomp/config/nvptx/bar.c
@@ -30,137 +30,156 @@
 #include <limits.h>
 #include "libgomp.h"
 
-/* For cpu_relax.  */
-#include "doacross.h"
-
-/* Assuming ADDR is &bar->generation, return bar.  Copied from
-   rtems/bar.c.  */
-static gomp_barrier_t *
-generation_to_barrier (int *addr)
-{
-  char *bar
-    = (char *) addr - __builtin_offsetof (gomp_barrier_t, generation);
-  return (gomp_barrier_t *)bar;
-}
-
-/* Implement futex_wait-like behaviour to plug into the linux/bar.c
-   implementation.  Assumes ADDR is &bar->generation.  */
-static inline void
-futex_wait (int *addr, int val)
-{
-  gomp_barrier_t *bar = generation_to_barrier (addr);
-
-  if (bar->total < 2)
-    /* A barrier with less than two threads, nop.  */
-    return;
-
-  gomp_mutex_lock (&bar->lock);
-
-  /* Futex semantics: only go to sleep if *addr == val.  */
-  if (__builtin_expect (__atomic_load_n (addr, MEMMODEL_ACQUIRE) != val, 0))
-    {
-      gomp_mutex_unlock (&bar->lock);
-      return;
-    }
-
-  /* Register as waiter.  */
-  unsigned int waiters
-    = __atomic_add_fetch (&bar->waiters, 1, MEMMODEL_ACQ_REL);
-  if (waiters == 0)
-    __builtin_abort ();
-  unsigned int waiter_id = waiters;
-
-  if (waiters > 1)
-    {
-      /* Wake other threads in bar.sync.  */
-      asm volatile ("bar.sync 1, %0;" : : "r" (32 * waiters));
-
-      /* Ensure that they have updated waiters.  */
-      asm volatile ("bar.sync 1, %0;" : : "r" (32 * waiters));
-    }
-
-  gomp_mutex_unlock (&bar->lock);
-
-  while (1)
-    {
-      /* Wait for next thread in barrier.  */
-      asm volatile ("bar.sync 1, %0;" : : "r" (32 * (waiters + 1)));
-
-      /* Get updated waiters.  */
-      unsigned int updated_waiters
-        = __atomic_load_n (&bar->waiters, MEMMODEL_ACQUIRE);
-
-      /* Notify that we have updated waiters.  */
-      asm volatile ("bar.sync 1, %0;" : : "r" (32 * (waiters + 1)));
-
-      waiters = updated_waiters;
-
-      if (waiter_id > waiters)
-        /* A wake happened, and we're in the group of woken threads.  */
-        break;
-
-      /* Continue waiting.  */
-    }
-}
-
-/* Implement futex_wake-like behaviour to plug into the linux/bar.c
-   implementation.  Assumes ADDR is &bar->generation.  */
-static inline void
-futex_wake (int *addr, int count)
-{
-  gomp_barrier_t *bar = generation_to_barrier (addr);
-
-  if (bar->total < 2)
-    /* A barrier with less than two threads, nop.  */
-    return;
-
-  gomp_mutex_lock (&bar->lock);
-  unsigned int waiters = __atomic_load_n (&bar->waiters, MEMMODEL_ACQUIRE);
-  if (waiters == 0)
-    {
-      /* No threads to wake.  */
-      gomp_mutex_unlock (&bar->lock);
-      return;
-    }
-
-  if (count == INT_MAX)
-    /* Release all threads.  */
-    __atomic_store_n (&bar->waiters, 0, MEMMODEL_RELEASE);
-  else if (count < bar->total)
-    /* Release count threads.  */
-    __atomic_add_fetch (&bar->waiters, -count, MEMMODEL_ACQ_REL);
-  else
-    /* Count has an illegal value.  */
-    __builtin_abort ();
-
-  /* Wake other threads in bar.sync.  */
-  asm volatile ("bar.sync 1, %0;" : : "r" (32 * (waiters + 1)));
-
-  /* Let them get the updated waiters.  */
-  asm volatile ("bar.sync 1, %0;" : : "r" (32 * (waiters + 1)));
-
-  gomp_mutex_unlock (&bar->lock);
-}
-
-/* Copied from linux/wait.h.  */
-static inline int do_spin (int *addr, int val)
-{
-  /* The current implementation doesn't spin.  */
-  return 1;
-}
-
-/* Copied from linux/wait.h.  */
-static inline void do_wait (int *addr, int val)
-{
-  if (do_spin (addr, val))
-    futex_wait (addr, val);
-}
-
-/* Reuse the linux implementation.  */
-#define GOMP_WAIT_H 1
-#include "../linux/bar.c"
+void
+gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      bar->awaited = bar->total;
+      __atomic_store_n (&bar->generation, bar->generation + BAR_INCR,
+                        MEMMODEL_RELEASE);
+    }
+  if (bar->total > 1)
+    asm ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+}
+
+void
+gomp_barrier_wait (gomp_barrier_t *bar)
+{
+  gomp_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+}
+
+/* Like gomp_barrier_wait, except that if the encountering thread
+   is not the last one to hit the barrier, it returns immediately.
+   The intended usage is that a thread which intends to gomp_barrier_destroy
+   this barrier calls gomp_barrier_wait, while all other threads
+   call gomp_barrier_wait_last.  When gomp_barrier_wait returns,
+   the barrier can be safely destroyed.  */
+void
+gomp_barrier_wait_last (gomp_barrier_t *bar)
+{
+  /* The above described behavior matches 'bar.arrive' perfectly.  */
+  if (bar->total > 1)
+    asm ("bar.arrive 1, %0;" : : "r" (32 * bar->total));
+}
+
+/* Barriers are implemented mainly using 'bar.red.or', which combines a bar.sync
+   operation with a OR-reduction of "team->task_count != 0" across all threads.
+   Task processing is done only after synchronization and verifying that
+   task_count was non-zero in at least one of the team threads.
+
+   This use of simple-barriers, and queueing of tasks till the end, is deemed
+   more efficient performance-wise for GPUs in the common offloading case, as
+   opposed to implementing futex-wait/wake operations to simultaneously process
+   tasks in a CPU-thread manner (which is not easy to implement efficiently
+   on GPUs).  */
+void
+gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+
+  bool run_tasks = (team->task_count != 0);
+  if (bar->total > 1)
+    run_tasks = __builtin_nvptx_bar_red_or (1, 32 * bar->total, true,
+                                            (team->task_count != 0));
+
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
+    }
+
+  if (__builtin_expect (run_tasks == true, 0))
+    {
+      while (__atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE)
+             & BAR_TASK_PENDING)
+        gomp_barrier_handle_tasks (state);
+
+      if (bar->total > 1)
+        asm volatile ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+    }
+}
+
+void
+gomp_team_barrier_wait (gomp_barrier_t *bar)
+{
+  gomp_team_barrier_wait_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_wait_final (gomp_barrier_t *bar)
+{
+  gomp_barrier_state_t state = gomp_barrier_wait_final_start (bar);
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    bar->awaited_final = bar->total;
+  gomp_team_barrier_wait_end (bar, state);
+}
+
+/* See also comments for gomp_team_barrier_wait_end.  */
+bool
+gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar,
+                                   gomp_barrier_state_t state)
+{
+  struct gomp_thread *thr = gomp_thread ();
+  struct gomp_team *team = thr->ts.team;
+
+  bool run_tasks = (team->task_count != 0);
+  if (bar->total > 1)
+    run_tasks = __builtin_nvptx_bar_red_or (1, 32 * bar->total, true,
+                                            (team->task_count != 0));
+  if (state & BAR_CANCELLED)
+    return true;
+
+  if (__builtin_expect (state & BAR_WAS_LAST, 0))
+    {
+      /* Note: BAR_CANCELLED should never be set in state here, because
+         cancellation means that at least one of the threads has been
+         cancelled, thus on a cancellable barrier we should never see
+         all threads to arrive.  */
+
+      /* Next time we'll be awaiting TOTAL threads again.  */
+      bar->awaited = bar->total;
+      team->work_share_cancelled = 0;
+    }
+
+  if (__builtin_expect (run_tasks == true, 0))
+    {
+      while (__atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE)
+             & BAR_TASK_PENDING)
+        gomp_barrier_handle_tasks (state);
+
+      if (bar->total > 1)
+        asm volatile ("bar.sync 1, %0;" : : "r" (32 * bar->total));
+    }
+
+  return false;
+}
+
+bool
+gomp_team_barrier_wait_cancel (gomp_barrier_t *bar)
+{
+  return gomp_team_barrier_wait_cancel_end (bar, gomp_barrier_wait_start (bar));
+}
+
+void
+gomp_team_barrier_cancel (struct gomp_team *team)
+{
+  gomp_mutex_lock (&team->task_lock);
+  if (team->barrier.generation & BAR_CANCELLED)
+    {
+      gomp_mutex_unlock (&team->task_lock);
+      return;
+    }
+  team->barrier.generation |= BAR_CANCELLED;
+  gomp_mutex_unlock (&team->task_lock);
+
+  /* The 'exit' instruction cancels this thread and also fullfills any other
+     CTA threads waiting on barriers.  */
+  asm volatile ("exit;");
+}

--- a/libgomp/config/nvptx/bar.h
+++ b/libgomp/config/nvptx/bar.h
@@ -38,8 +38,6 @@ typedef struct
   unsigned generation;
   unsigned awaited;
   unsigned awaited_final;
-  unsigned waiters;
-  gomp_mutex_t lock;
 } gomp_barrier_t;
 
 typedef unsigned int gomp_barrier_state_t;
@@ -59,8 +57,6 @@ static inline void gomp_barrier_init (gomp_barrier_t *bar, unsigned count)
   bar->awaited = count;
   bar->awaited_final = count;
   bar->generation = 0;
-  bar->waiters = 0;
-  gomp_mutex_init (&bar->lock);
 }
 
 static inline void gomp_barrier_reinit (gomp_barrier_t *bar, unsigned count)
@@ -83,10 +79,16 @@ extern void gomp_team_barrier_wait_end (gomp_barrier_t *,
 extern bool gomp_team_barrier_wait_cancel (gomp_barrier_t *);
 extern bool gomp_team_barrier_wait_cancel_end (gomp_barrier_t *,
                                                gomp_barrier_state_t);
-extern void gomp_team_barrier_wake (gomp_barrier_t *, int);
 struct gomp_team;
 extern void gomp_team_barrier_cancel (struct gomp_team *);
 
+static inline void
+gomp_team_barrier_wake (gomp_barrier_t *bar, int count)
+{
+  /* We never "wake up" threads on nvptx.  Threads wait at barrier
+     instructions till barrier fullfilled.  Do nothing here.  */
+}
+
 static inline gomp_barrier_state_t
 gomp_barrier_wait_start (gomp_barrier_t *bar)
 {