libgomp: parallel reverse offload

Extend OpenMP reverse offload support to allow running the host kernels
on multiple threads.  The device plugin API for reverse offload is now
non-blocking, so running the host kernel in the wrong device context is
no longer a problem.  The NVPTX message-passing interface now uses a ring
buffer approximately matching GCN.
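
As an illustration only (not code from this patch), the handshake can be
modelled entirely on the host: the sketch below uses C11 atomics and two
pthreads standing in for the device (producer) and host (consumer) sides,
with made-up names such as QUEUE_SIZE, struct req and device_side; the real
implementation lives in config/gcn/target.c, config/nvptx/target.c and the
plugins.

/* Illustrative sketch only -- not libgomp code.  It models the ring-buffer
   handshake with C11 atomics and two pthreads standing in for the device
   (producer) and the host (consumer).  */
#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define QUEUE_SIZE 8        /* REV_OFFLOAD_QUEUE_SIZE is 1024 in the patch.  */
#define NUM_REQUESTS 32

struct req
{
  _Atomic (volatile int *) signal;  /* Written last; doubles as "slot ready".  */
  uint64_t fn;                      /* Stands in for the request payload.  */
};

static struct req queue[QUEUE_SIZE];
static atomic_uint next_slot;       /* Index: fetch-add to claim a slot.  */
static atomic_uint consumed;        /* Counter: slots released for reuse.  */

static void *
device_side (void *arg)             /* Producer: one request per iteration.  */
{
  (void) arg;
  for (uint64_t i = 1; i <= NUM_REQUESTS; i++)
    {
      unsigned int index = atomic_fetch_add (&next_slot, 1);
      while (index - atomic_load (&consumed) >= QUEUE_SIZE)
	; /* Buffer full: wait for the host to catch up.  */
      unsigned int slot = index % QUEUE_SIZE;
      volatile int signal = 0;
      queue[slot].fn = i;
      /* Publishing the signal address marks the record complete.  */
      atomic_store_explicit (&queue[slot].signal, &signal,
			     memory_order_release);
      while (signal == 0)
	; /* Wait for the host to run the kernel.  */
    }
  return NULL;
}

static void *
host_side (void *arg)               /* Consumer: drain and acknowledge.  */
{
  (void) arg;
  for (unsigned int index = 0; index < NUM_REQUESTS; index++)
    {
      unsigned int slot = index % QUEUE_SIZE;
      volatile int *signal;
      while ((signal = atomic_load_explicit (&queue[slot].signal,
					     memory_order_acquire)) == NULL)
	; /* Wait for the device to finish writing the record.  */
      printf ("request %llu\n", (unsigned long long) queue[slot].fn);
      atomic_store_explicit (&queue[slot].signal, NULL, memory_order_relaxed);
      atomic_fetch_add_explicit (&consumed, 1, memory_order_release);
      *signal = 1;                  /* Tell the device the kernel has run.  */
    }
  return NULL;
}

int
main (void)
{
  pthread_t dev, host;
  pthread_create (&host, NULL, host_side, NULL);
  pthread_create (&dev, NULL, device_side, NULL);
  pthread_join (dev, NULL);
  pthread_join (host, NULL);
  return 0;
}

Build with something like "gcc -std=c11 -pthread": the device side claims a
slot with an atomic fetch-add on next_slot, publishes the per-request signal
address last with release semantics, and then spins on that signal until the
host flips it after running the kernel, which mirrors the ordering used in
the patch.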

libgomp/ChangeLog:

	* config/gcn/target.c (GOMP_target_ext): Add "signal" field.
	Fix atomics race condition.
	* config/nvptx/libgomp-nvptx.h (REV_OFFLOAD_QUEUE_SIZE): New define.
	(struct rev_offload): Implement ring buffer.
	* config/nvptx/target.c (GOMP_target_ext): Likewise.
	* env.c (initialize_env): Read GOMP_REVERSE_OFFLOAD_THREADS.
	* libgomp-plugin.c (GOMP_PLUGIN_target_rev): Replace "aq" parameter
	with "signal" and "use_aq".
	* libgomp-plugin.h (GOMP_PLUGIN_target_rev): Likewise.
	* libgomp.h (gomp_target_rev): Likewise.
	* plugin/plugin-gcn.c (process_reverse_offload): Add "signal".
	(console_output): Pass signal value through.
	* plugin/plugin-nvptx.c (GOMP_OFFLOAD_openacc_async_construct):
	Attach new threads to the numbered device.
	Change the flag to CU_STREAM_NON_BLOCKING.
	(GOMP_OFFLOAD_run): Implement ring-buffer and remove signalling.
	* target.c (gomp_target_rev): Rename to ...
	(gomp_target_rev_internal): ... this, and change "dev_num" to
	"devicep".
	(gomp_target_rev_worker_thread): New function.
	(gomp_target_rev): New function (old name).
	* libgomp.texi: Document GOMP_REVERSE_OFFLOAD_THREADS.
	* testsuite/libgomp.c/reverse-offload-threads-1.c: New test.
	* testsuite/libgomp.c/reverse-offload-threads-2.c: New test.
Author:    Andrew Stubbs
Date:      2023-09-01 16:49:58 +01:00
Committed: Sandra Loosemore
Parent:    6ffe14f319
Commit:    fc4bf7b190

13 changed files with 374 additions and 57 deletions

libgomp/config/gcn/target.c

@@ -122,19 +122,38 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
<= (index - 1024))
asm ("s_sleep 64");
unsigned int slot = index % 1024;
data->queue[slot].value_u64[0] = (uint64_t) fn;
data->queue[slot].value_u64[1] = (uint64_t) mapnum;
data->queue[slot].value_u64[2] = (uint64_t) hostaddrs;
data->queue[slot].value_u64[3] = (uint64_t) sizes;
data->queue[slot].value_u64[4] = (uint64_t) kinds;
data->queue[slot].value_u64[5] = (uint64_t) GOMP_ADDITIONAL_ICVS.device_num;
/* In theory, it should be enough to write "written" with __ATOMIC_RELEASE,
and have the rest of the data flushed to memory automatically, but some
devices (gfx908) seem to have a race condition where the flushed data
arrives after the atomic data, and the host does the wrong thing.
If we just write everything atomically in the correct order then we're
safe. */
data->queue[slot].type = 4; /* Reverse offload. */
unsigned int slot = index % 1024;
__atomic_store_n (&data->queue[slot].value_u64[0], (uint64_t) fn,
__ATOMIC_RELAXED);
__atomic_store_n (&data->queue[slot].value_u64[1], (uint64_t) mapnum,
__ATOMIC_RELAXED);
__atomic_store_n (&data->queue[slot].value_u64[2], (uint64_t) hostaddrs,
__ATOMIC_RELAXED);
__atomic_store_n (&data->queue[slot].value_u64[3], (uint64_t) sizes,
__ATOMIC_RELAXED);
__atomic_store_n (&data->queue[slot].value_u64[4], (uint64_t) kinds,
__ATOMIC_RELAXED);
__atomic_store_n (&data->queue[slot].value_u64[5],
(uint64_t) GOMP_ADDITIONAL_ICVS.device_num,
__ATOMIC_RELAXED);
volatile int signal = 0;
__atomic_store_n (&data->queue[slot].value_u64[6], (uint64_t) &signal,
__ATOMIC_RELAXED);
__atomic_store_n (&data->queue[slot].type, 4 /* Reverse offload. */,
__ATOMIC_RELAXED);
__atomic_store_n (&data->queue[slot].written, 1, __ATOMIC_RELEASE);
/* Spinlock while the host catches up. */
while (__atomic_load_n (&data->queue[slot].written, __ATOMIC_ACQUIRE) != 0)
/* Spinlock while the host runs the kernel. */
while (__atomic_load_n (&signal, __ATOMIC_ACQUIRE) == 0)
asm ("s_sleep 64");
}

libgomp/config/nvptx/libgomp-nvptx.h

@@ -25,20 +25,41 @@
/* This file contains defines and type definitions shared between the
nvptx target's libgomp.a and the plugin-nvptx.c, but that is only
needef for this target. */
needed for this target. */
#ifndef LIBGOMP_NVPTX_H
#define LIBGOMP_NVPTX_H 1
#define GOMP_REV_OFFLOAD_VAR __gomp_rev_offload_var
#define REV_OFFLOAD_QUEUE_SIZE 1024
struct rev_offload {
uint64_t fn;
uint64_t mapnum;
uint64_t addrs;
uint64_t sizes;
uint64_t kinds;
int32_t dev_num;
/* The target can grab a slot by incrementing "next_slot".
Each host thread may claim some slots for processing.
When the host processing is completed "consumed" indicates that the
corresponding slots in the ring-buffer "queue" are available for reuse.
Note that "next_slot" is an index, and "consumed"/"claimed" are counters,
so beware of the fence-posts. */
unsigned int next_slot;
unsigned int consumed;
unsigned int claimed;
struct rev_req {
/* The target writes an address to "signal" as the last item, which
indicates to the host that the record is completely written. The target
must not assume that it still owns the slot, after that. The signal
address is then used by the host to communicate that the reverse-offload
kernel has completed execution. */
volatile int *signal;
uint64_t fn;
uint64_t mapnum;
uint64_t addrs;
uint64_t sizes;
uint64_t kinds;
int32_t dev_num;
} queue[REV_OFFLOAD_QUEUE_SIZE];
};
#if (__SIZEOF_SHORT__ != 2 \

libgomp/config/nvptx/target.c

@@ -101,7 +101,6 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
void **hostaddrs, size_t *sizes, unsigned short *kinds,
unsigned int flags, void **depend, void **args)
{
static int lock = 0; /* == gomp_mutex_t lock; gomp_mutex_init (&lock); */
(void) flags;
(void) depend;
(void) args;
@@ -111,43 +110,57 @@ GOMP_target_ext (int device, void (*fn) (void *), size_t mapnum,
|| GOMP_REV_OFFLOAD_VAR == NULL)
return;
gomp_mutex_lock (&lock);
/* Reserve one slot. */
unsigned int index = __atomic_fetch_add (&GOMP_REV_OFFLOAD_VAR->next_slot,
1, __ATOMIC_ACQUIRE);
GOMP_REV_OFFLOAD_VAR->mapnum = mapnum;
GOMP_REV_OFFLOAD_VAR->addrs = (uint64_t) hostaddrs;
GOMP_REV_OFFLOAD_VAR->sizes = (uint64_t) sizes;
GOMP_REV_OFFLOAD_VAR->kinds = (uint64_t) kinds;
GOMP_REV_OFFLOAD_VAR->dev_num = GOMP_ADDITIONAL_ICVS.device_num;
if ((unsigned int) (index + 1) < GOMP_REV_OFFLOAD_VAR->consumed)
abort (); /* Overflow. */
/* Set 'fn' to trigger processing on the host; wait for completion,
which is flagged by setting 'fn' back to 0 on the host. */
uint64_t addr_struct_fn = (uint64_t) &GOMP_REV_OFFLOAD_VAR->fn;
/* Spinlock while the host catches up. */
if (index >= REV_OFFLOAD_QUEUE_SIZE)
while (__atomic_load_n (&GOMP_REV_OFFLOAD_VAR->consumed, __ATOMIC_ACQUIRE)
<= (index - REV_OFFLOAD_QUEUE_SIZE))
; /* spin */
unsigned int slot = index % REV_OFFLOAD_QUEUE_SIZE;
GOMP_REV_OFFLOAD_VAR->queue[slot].fn = (uint64_t) fn;
GOMP_REV_OFFLOAD_VAR->queue[slot].mapnum = mapnum;
GOMP_REV_OFFLOAD_VAR->queue[slot].addrs = (uint64_t) hostaddrs;
GOMP_REV_OFFLOAD_VAR->queue[slot].sizes = (uint64_t) sizes;
GOMP_REV_OFFLOAD_VAR->queue[slot].kinds = (uint64_t) kinds;
GOMP_REV_OFFLOAD_VAR->queue[slot].dev_num = GOMP_ADDITIONAL_ICVS.device_num;
/* Set 'signal' to trigger processing on the host; the slot is now consumed
by the host, so we should not touch it again. */
volatile int signal = 0;
uint64_t addr_struct_signal = (uint64_t) &GOMP_REV_OFFLOAD_VAR->queue[slot].signal;
#if __PTX_SM__ >= 700
asm volatile ("st.global.release.sys.u64 [%0], %1;"
: : "r"(addr_struct_fn), "r" (fn) : "memory");
: : "r"(addr_struct_signal), "r" (&signal) : "memory");
#else
__sync_synchronize (); /* membar.sys */
asm volatile ("st.volatile.global.u64 [%0], %1;"
: : "r"(addr_struct_fn), "r" (fn) : "memory");
: : "r"(addr_struct_signal), "r" (&signal) : "memory");
#endif
/* The host signals completion by writing a non-zero value to the 'signal'
variable. */
#if __PTX_SM__ >= 700
uint64_t fn2;
uint64_t signal2;
do
{
asm volatile ("ld.acquire.sys.global.u64 %0, [%1];"
: "=r" (fn2) : "r" (addr_struct_fn) : "memory");
: "=r" (signal2) : "r" (&signal) : "memory");
}
while (fn2 != 0);
while (signal2 == 0);
#else
/* ld.global.u64 %r64,[__gomp_rev_offload_var];
ld.u64 %r36,[%r64];
membar.sys; */
while (__atomic_load_n (&GOMP_REV_OFFLOAD_VAR->fn, __ATOMIC_ACQUIRE) != 0)
while (__atomic_load_n (&signal, __ATOMIC_ACQUIRE) == 0)
; /* spin */
#endif
gomp_mutex_unlock (&lock);
}
void

libgomp/env.c

@@ -124,6 +124,7 @@ size_t gomp_affinity_format_len;
char *goacc_device_type;
int goacc_device_num;
int goacc_default_dims[GOMP_DIM_MAX];
int gomp_reverse_offload_threads = 8; /* Reasonable default. */
#ifndef LIBGOMP_OFFLOADED_ONLY
@@ -2489,6 +2490,11 @@ initialize_env (void)
handle_omp_display_env ();
/* Control the number of background threads reverse offload is permitted
to use. */
parse_int_secure ("GOMP_REVERSE_OFFLOAD_THREADS",
&gomp_reverse_offload_threads, false);
/* OpenACC. */
if (!parse_int ("ACC_DEVICE_NUM", getenv ("ACC_DEVICE_NUM"),

libgomp/libgomp-plugin.c

@@ -82,8 +82,8 @@ GOMP_PLUGIN_fatal (const char *msg, ...)
void
GOMP_PLUGIN_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
struct goacc_asyncqueue *aq)
volatile int *signal, bool use_aq)
{
gomp_target_rev (fn_ptr, mapnum, devaddrs_ptr, sizes_ptr, kinds_ptr, dev_num,
aq);
signal, use_aq);
}

libgomp/libgomp-plugin.h

@@ -150,7 +150,7 @@ extern void GOMP_PLUGIN_fatal (const char *, ...)
__attribute__ ((noreturn, format (printf, 1, 2)));
extern void GOMP_PLUGIN_target_rev (uint64_t, uint64_t, uint64_t, uint64_t,
uint64_t, int, struct goacc_asyncqueue *);
uint64_t, int, volatile int *, bool);
/* Prototypes for functions implemented by libgomp plugins. */
extern const char *GOMP_OFFLOAD_get_name (void);

libgomp/libgomp.h

@@ -620,6 +620,7 @@ extern struct gomp_offload_icv_list *gomp_offload_icv_list;
extern int goacc_device_num;
extern char *goacc_device_type;
extern int goacc_default_dims[GOMP_DIM_MAX];
extern int gomp_reverse_offload_threads;
enum gomp_task_kind
{
@@ -1134,7 +1135,7 @@ extern void gomp_init_targets_once (void);
extern int gomp_get_num_devices (void);
extern bool gomp_target_task_fn (void *);
extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
int, struct goacc_asyncqueue *);
int, volatile int *, bool);
extern bool gomp_page_locked_host_alloc (void **, size_t);
extern void gomp_page_locked_host_free (void *);

libgomp/libgomp.texi

@@ -3912,6 +3912,7 @@ variable is not set.
* GOMP_STACKSIZE:: Set default thread stack size
* GOMP_SPINCOUNT:: Set the busy-wait spin count
* GOMP_RTEMS_THREAD_POOLS:: Set the RTEMS specific thread pools
* GOMP_REVERSE_OFFLOAD_THREADS:: Set the maximum number of host threads
@end menu
@@ -4677,6 +4678,22 @@ pools available and their worker threads run at priority four.
@node GOMP_REVERSE_OFFLOAD_THREADS
@section @env{GOMP_REVERSE_OFFLOAD_THREADS} -- Set the maximum number of host threads
@cindex Environment Variable
@table @asis
@item @emph{Description}
Set the maximum number of threads that may be used to run reverse offload
code sections (host code nested within offload regions, declared using
@code{#pragma omp target device(ancestor:1)}). The value should be a non-zero
positive integer. The default is 8 threads.
The threads are created on demand, up to the maximum number given, and are
destroyed when no reverse offload requests remain.
@end table
@c ---------------------------------------------------------------------
@c Enabling OpenACC
@c ---------------------------------------------------------------------
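
For illustration only (this example is not part of the patch and simply
mirrors the new testcases below), here is the kind of program the thread pool
is intended to speed up: many nested reverse-offload regions issued from one
target region.  It would typically be run with the new variable set in the
environment, e.g. GOMP_REVERSE_OFFLOAD_THREADS=4.

/* Hypothetical example, not taken from the patch: run with, e.g.,
   GOMP_REVERSE_OFFLOAD_THREADS=4 ./a.out to cap the host-side thread pool.  */
#pragma omp requires reverse_offload

int
main (void)
{
  #pragma omp target teams distribute parallel for
  for (int i = 0; i < 256; i++)
    {
      int val = 0;
      /* Each iteration queues one reverse-offload request; at most
	 GOMP_REVERSE_OFFLOAD_THREADS host threads service the queue.  */
      #pragma omp target device(ancestor: 1) firstprivate(i) map(from: val)
      {
	val = 2 * i;
      }
      if (val != 2 * i)
	__builtin_abort ();
    }
  return 0;
}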

libgomp/plugin/plugin-gcn.c

@@ -2026,11 +2026,12 @@ create_kernel_dispatch (struct kernel_info *kernel, int num_teams,
static void
process_reverse_offload (uint64_t fn, uint64_t mapnum, uint64_t hostaddrs,
uint64_t sizes, uint64_t kinds, uint64_t dev_num64)
uint64_t sizes, uint64_t kinds, uint64_t dev_num64,
uint64_t signal)
{
int dev_num = dev_num64;
GOMP_PLUGIN_target_rev (fn, mapnum, hostaddrs, sizes, kinds, dev_num,
NULL);
(volatile int *) signal, false);
}
/* Output any data written to console output from the kernel. It is expected
@@ -2080,7 +2081,8 @@ console_output (struct kernel_info *kernel, struct kernargs *kernargs,
case 4:
process_reverse_offload (data->value_u64[0], data->value_u64[1],
data->value_u64[2], data->value_u64[3],
data->value_u64[4], data->value_u64[5]);
data->value_u64[4], data->value_u64[5],
data->value_u64[6]);
break;
default: printf ("GCN print buffer error!\n"); break;
}

libgomp/plugin/plugin-nvptx.c

@@ -1979,9 +1979,10 @@ nvptx_goacc_asyncqueue_construct (unsigned int flags)
}
struct goacc_asyncqueue *
GOMP_OFFLOAD_openacc_async_construct (int device __attribute__((unused)))
GOMP_OFFLOAD_openacc_async_construct (int device)
{
return nvptx_goacc_asyncqueue_construct (CU_STREAM_DEFAULT);
nvptx_attach_host_thread_to_device (device);
return nvptx_goacc_asyncqueue_construct (CU_STREAM_NON_BLOCKING);
}
static bool
@@ -2855,17 +2856,68 @@ GOMP_OFFLOAD_run (int ord, void *tgt_fn, void *tgt_vars, void **args)
else if (r != CUDA_ERROR_NOT_READY)
GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r));
if (__atomic_load_n (&ptx_dev->rev_data->fn, __ATOMIC_ACQUIRE) != 0)
struct rev_offload *rev_metadata = ptx_dev->rev_data;
/* Claim a portion of the ring buffer to process on this iteration.
Don't mark them as consumed until all the data has been read out. */
unsigned int consumed = __atomic_load_n (&rev_metadata->consumed,
__ATOMIC_ACQUIRE);
unsigned int from = __atomic_load_n (&rev_metadata->claimed,
__ATOMIC_RELAXED);
unsigned int to = __atomic_load_n (&rev_metadata->next_slot,
__ATOMIC_RELAXED);
if (consumed > to)
{
struct rev_offload *rev_data = ptx_dev->rev_data;
/* Overflow happens when we exceed UINTMAX requests. */
GOMP_PLUGIN_fatal ("NVPTX reverse offload buffer overflowed.\n");
}
to = MIN(to, consumed + REV_OFFLOAD_QUEUE_SIZE / 2);
if (to <= from)
/* Nothing to do; poll again. */
goto poll_again;
if (!__atomic_compare_exchange_n (&rev_metadata->claimed, &from, to,
false,
__ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
/* Collision with another thread ... go around again. */
goto poll_again;
unsigned int index;
for (index = from; index < to; index++)
{
int slot = index % REV_OFFLOAD_QUEUE_SIZE;
/* Wait while the target finishes filling in the slot. */
while (__atomic_load_n (&ptx_dev->rev_data->queue[slot].signal,
__ATOMIC_ACQUIRE) == 0)
; /* spin */
/* Pass the request to libgomp; this will queue the request and
return right away, without waiting for the kernel to run. */
struct rev_req *rev_data = &ptx_dev->rev_data->queue[slot];
GOMP_PLUGIN_target_rev (rev_data->fn, rev_data->mapnum,
rev_data->addrs, rev_data->sizes,
rev_data->kinds, rev_data->dev_num,
reverse_offload_aq);
if (!nvptx_goacc_asyncqueue_synchronize (reverse_offload_aq))
exit (EXIT_FAILURE);
__atomic_store_n (&rev_data->fn, 0, __ATOMIC_RELEASE);
rev_data->signal, true);
/* Ensure that the slot doesn't trigger early, when reused. */
__atomic_store_n (&rev_data->signal, 0, __ATOMIC_RELEASE);
}
/* The data is now consumed so release the slots for reuse. */
unsigned int consumed_so_far = from;
while (!__atomic_compare_exchange_n (&rev_metadata->consumed,
&consumed_so_far, to, false,
__ATOMIC_RELEASE, __ATOMIC_RELAXED))
{
/* Another thread didn't consume all it claimed yet.... */
consumed_so_far = from;
usleep (1);
}
poll_again:
usleep (1);
}
else

libgomp/target.c

@@ -3671,16 +3671,18 @@ gomp_map_cdata_lookup (struct cpy_data *d, uint64_t *devaddrs,
tgt_start, tgt_end);
}
/* Handle reverse offload. This is called by the device plugins for a
reverse offload; it is not called if the outer target runs on the host.
/* Handle reverse offload. This is called by the host worker thread to
execute a single reverse offload request; it is not called if the outer
target runs on the host.
The mapping is simplified device-affecting constructs (except for target
with device(ancestor:1)) must not be encountered; in particular not
target (enter/exit) data. */
void
gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
struct goacc_asyncqueue *aq)
static void
gomp_target_rev_internal (uint64_t fn_ptr, uint64_t mapnum,
uint64_t devaddrs_ptr, uint64_t sizes_ptr,
uint64_t kinds_ptr, struct gomp_device_descr *devicep,
struct goacc_asyncqueue *aq)
{
/* Return early if there is no offload code. */
if (sizeof (OFFLOAD_PLUGINS) == sizeof (""))
@@ -3697,7 +3699,6 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
unsigned short *kinds;
const bool short_mapkind = true;
const int typemask = short_mapkind ? 0xff : 0x7;
struct gomp_device_descr *devicep = resolve_device (dev_num, false);
reverse_splay_tree_key n;
struct reverse_splay_tree_key_s k;
@@ -4108,6 +4109,134 @@ gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
}
}
static struct target_rev_queue_s
{
uint64_t fn_ptr;
uint64_t mapnum;
uint64_t devaddrs_ptr;
uint64_t sizes_ptr;
uint64_t kinds_ptr;
struct gomp_device_descr *devicep;
volatile int *signal;
bool use_aq;
struct target_rev_queue_s *next;
} *target_rev_queue_head = NULL, *target_rev_queue_last = NULL;
static gomp_mutex_t target_rev_queue_lock = 0;
static int target_rev_thread_count = 0;
static void *
gomp_target_rev_worker_thread (void *)
{
struct target_rev_queue_s *rev_kernel = NULL;
struct goacc_asyncqueue *aq = NULL;
struct gomp_device_descr *aq_devicep;
while (1)
{
gomp_mutex_lock (&target_rev_queue_lock);
/* Take a reverse-offload kernel request from the queue. */
rev_kernel = target_rev_queue_head;
if (rev_kernel)
{
target_rev_queue_head = rev_kernel->next;
if (target_rev_queue_head == NULL)
target_rev_queue_last = NULL;
}
if (rev_kernel == NULL)
{
target_rev_thread_count--;
gomp_mutex_unlock (&target_rev_queue_lock);
break;
}
gomp_mutex_unlock (&target_rev_queue_lock);
/* Ensure we have a suitable device queue for the memory transfers. */
if (rev_kernel->use_aq)
{
if (aq && aq_devicep != rev_kernel->devicep)
{
aq_devicep->openacc.async.destruct_func (aq);
aq = NULL;
}
if (!aq)
{
aq_devicep = rev_kernel->devicep;
aq = aq_devicep->openacc.async.construct_func (aq_devicep->target_id);
}
}
/* Run the kernel on the host. */
gomp_target_rev_internal (rev_kernel->fn_ptr, rev_kernel->mapnum,
rev_kernel->devaddrs_ptr, rev_kernel->sizes_ptr,
rev_kernel->kinds_ptr, rev_kernel->devicep, aq);
/* Signal the device that the reverse-offload is completed. */
int one = 1;
gomp_copy_host2dev (rev_kernel->devicep, aq, (void*)rev_kernel->signal,
&one, sizeof (one), false, NULL);
/* We're done with this request. */
free (rev_kernel);
/* Loop around and see if another request is waiting. */
}
if (aq)
aq_devicep->openacc.async.destruct_func (aq);
return NULL;
}
void
gomp_target_rev (uint64_t fn_ptr, uint64_t mapnum, uint64_t devaddrs_ptr,
uint64_t sizes_ptr, uint64_t kinds_ptr, int dev_num,
volatile int *signal, bool use_aq)
{
struct gomp_device_descr *devicep = resolve_device (dev_num, false);
/* Create a new queue node. */
struct target_rev_queue_s *newreq = gomp_malloc (sizeof (*newreq));
newreq->fn_ptr = fn_ptr;
newreq->mapnum = mapnum;
newreq->devaddrs_ptr = devaddrs_ptr;
newreq->sizes_ptr = sizes_ptr;
newreq->kinds_ptr = kinds_ptr;
newreq->devicep = devicep;
newreq->signal = signal;
newreq->use_aq = use_aq;
newreq->next = NULL;
gomp_mutex_lock (&target_rev_queue_lock);
/* Enqueue the reverse-offload request. */
if (target_rev_queue_last)
{
target_rev_queue_last->next = newreq;
target_rev_queue_last = newreq;
}
else
target_rev_queue_last = target_rev_queue_head = newreq;
/* Launch a new thread to process the request asynchronously.
If the thread pool limit has been reached then an existing thread will
pick up the job when it is ready. */
if (target_rev_thread_count < gomp_reverse_offload_threads)
{
target_rev_thread_count++;
gomp_mutex_unlock (&target_rev_queue_lock);
pthread_t t;
pthread_create (&t, NULL, gomp_target_rev_worker_thread, NULL);
}
else
gomp_mutex_unlock (&target_rev_queue_lock);
}
/* Host fallback for GOMP_target_data{,_ext} routines. */
static void

libgomp/testsuite/libgomp.c/reverse-offload-threads-1.c

@@ -0,0 +1,26 @@
/* { dg-do run } */
/* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */
/* Test that the reverse offload message buffers can cope with a lot of
requests. */
#pragma omp requires reverse_offload
int main ()
{
#pragma omp target teams distribute parallel for collapse(2)
for (int i=0; i < 100; i++)
for (int j=0; j < 16; j++)
{
int val = 0;
#pragma omp target device ( ancestor:1 ) firstprivate(i,j) map(from:val)
{
val = i + j;
}
if (val != i + j)
__builtin_abort ();
}
return 0;
}

libgomp/testsuite/libgomp.c/reverse-offload-threads-2.c

@@ -0,0 +1,31 @@
/* { dg-do run } */
/* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */
/* Test that the reverse offload message buffers can cope with multiple
requests from multiple kernels. */
#pragma omp requires reverse_offload
int main ()
{
for (int n=0; n < 5; n++)
{
#pragma omp target teams distribute parallel for nowait collapse(2)
for (int i=0; i < 32; i++)
for (int j=0; j < 16; j++)
{
int val = 0;
#pragma omp target device ( ancestor:1 ) firstprivate(i,j) map(from:val)
{
val = i + j;
}
if (val != i + j)
__builtin_abort ();
}
}
#pragma omp taskwait
return 0;
}