mirror of
https://gcc.gnu.org/git/gcc.git
synced 2026-02-22 03:46:53 -05:00
libgomp, nvptx: CUDA pinned memory
Use CUDA to pin memory, instead of Linux mlock, when available.
There are two advantages: firstly, this gives a significant speed boost for
NVPTX offloading, and secondly, it side-steps the usual OS ulimit/rlimit
setting.
The design adds a device-independent plugin API for allocating pinned memory,
and then implements it for NVPTX. At present, the other supported devices do
not have equivalent capabilities (or requirements).
libgomp/ChangeLog:
* config/linux/allocator.c: Include assert.h.
(using_device_for_page_locked): New variable.
(linux_memspace_alloc): Add init0 parameter. Support device pinning.
(linux_memspace_calloc): Set init0 to true.
(linux_memspace_free): Support device pinning.
(linux_memspace_realloc): Support device pinning.
(MEMSPACE_ALLOC): Set init0 to false.
* libgomp-plugin.h
(GOMP_OFFLOAD_page_locked_host_alloc): New prototype.
(GOMP_OFFLOAD_page_locked_host_free): Likewise.
* libgomp.h (gomp_page_locked_host_alloc): Likewise.
(gomp_page_locked_host_free): Likewise.
(struct gomp_device_descr): Add page_locked_host_alloc_func and
page_locked_host_free_func.
* libgomp.texi: Adjust the docs for the pinned trait.
* plugin/plugin-nvptx.c
(GOMP_OFFLOAD_page_locked_host_alloc): New function.
(GOMP_OFFLOAD_page_locked_host_free): Likewise.
* target.c (device_for_page_locked): New variable.
(get_device_for_page_locked): New function.
(gomp_page_locked_host_alloc): Likewise.
(gomp_page_locked_host_free): Likewise.
(gomp_load_plugin_for_device): Add page_locked_host_alloc and
page_locked_host_free.
* testsuite/libgomp.c/alloc-pinned-1.c: Change expectations for NVPTX
devices.
* testsuite/libgomp.c/alloc-pinned-2.c: Likewise.
* testsuite/libgomp.c/alloc-pinned-3.c: Likewise.
* testsuite/libgomp.c/alloc-pinned-4.c: Likewise.
* testsuite/libgomp.c/alloc-pinned-5.c: Likewise.
* testsuite/libgomp.c/alloc-pinned-6.c: Likewise.
Co-Authored-By: Thomas Schwinge <thomas@codesourcery.com>
(cherry picked from commit 3b8d9d579c)
This commit is contained in:
@@ -36,6 +36,11 @@
|
||||
|
||||
/* Implement malloc routines that can handle pinned memory on Linux.
|
||||
|
||||
Given that pinned memory is typically used to help host <-> device memory
|
||||
transfers, we attempt to allocate such memory using a device (really:
|
||||
libgomp plugin), but fall back to mmap plus mlock if no suitable device is
|
||||
available.
|
||||
|
||||
It's possible to use mlock on any heap memory, but using munlock is
|
||||
problematic if there are multiple pinned allocations on the same page.
|
||||
Tracking all that manually would be possible, but adds overhead. This may
|
||||
@@ -49,49 +54,75 @@
|
||||
#define _GNU_SOURCE
|
||||
#include <sys/mman.h>
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
#include "libgomp.h"
|
||||
#ifdef HAVE_INTTYPES_H
|
||||
# include <inttypes.h> /* For PRIu64. */
|
||||
#endif
|
||||
|
||||
static int using_device_for_page_locked
|
||||
= /* uninitialized */ -1;
|
||||
|
||||
static void *
|
||||
linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin)
|
||||
linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin,
|
||||
bool init0)
|
||||
{
|
||||
(void)memspace;
|
||||
void *addr;
|
||||
|
||||
if (pin)
|
||||
{
|
||||
/* Note that mmap always returns zeroed memory and is therefore also a
|
||||
suitable implementation of calloc. */
|
||||
void *addr = mmap (NULL, size, PROT_READ | PROT_WRITE,
|
||||
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
|
||||
if (addr == MAP_FAILED)
|
||||
return NULL;
|
||||
|
||||
if (mlock (addr, size))
|
||||
int using_device = __atomic_load_n (&using_device_for_page_locked,
|
||||
MEMMODEL_RELAXED);
|
||||
if (using_device != 0)
|
||||
{
|
||||
#ifdef HAVE_INTTYPES_H
|
||||
gomp_debug (0, "libgomp: failed to pin %"PRIu64" bytes of"
|
||||
" memory (ulimit too low?)\n", (uint64_t) size);
|
||||
#else
|
||||
gomp_debug (0, "libgomp: failed to pin %lu bytes of"
|
||||
" memory (ulimit too low?)\n", (unsigned long) size);
|
||||
#endif
|
||||
munmap (addr, size);
|
||||
return NULL;
|
||||
using_device = gomp_page_locked_host_alloc (&addr, size);
|
||||
int using_device_old
|
||||
= __atomic_exchange_n (&using_device_for_page_locked,
|
||||
using_device, MEMMODEL_RELAXED);
|
||||
assert (using_device_old == -1
|
||||
/* We shouldn't have concurrently changed our mind. */
|
||||
|| using_device_old == using_device);
|
||||
}
|
||||
if (using_device == 0)
|
||||
{
|
||||
addr = mmap (NULL, size, PROT_READ | PROT_WRITE,
|
||||
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
|
||||
if (addr == MAP_FAILED)
|
||||
addr = NULL;
|
||||
else
|
||||
{
|
||||
/* 'mmap' zero-initializes. */
|
||||
init0 = false;
|
||||
|
||||
return addr;
|
||||
if (mlock (addr, size))
|
||||
{
|
||||
#ifdef HAVE_INTTYPES_H
|
||||
gomp_debug (0, "libgomp: failed to pin %"PRIu64" bytes of"
|
||||
" memory (ulimit too low?)\n", (uint64_t) size);
|
||||
#else
|
||||
gomp_debug (0, "libgomp: failed to pin %lu bytes of memory"
|
||||
" (ulimit too low?)\n", (unsigned long) size);
|
||||
#endif
|
||||
munmap (addr, size);
|
||||
addr = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
return malloc (size);
|
||||
addr = malloc (size);
|
||||
|
||||
if (addr && init0)
|
||||
memset (addr, 0, size);
|
||||
|
||||
return addr;
|
||||
}
|
||||
|
||||
static void *
|
||||
linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
|
||||
{
|
||||
if (pin)
|
||||
return linux_memspace_alloc (memspace, size, pin);
|
||||
return linux_memspace_alloc (memspace, size, pin, true);
|
||||
else
|
||||
return calloc (1, size);
|
||||
}
|
||||
@@ -100,10 +131,17 @@ static void
|
||||
linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
|
||||
int pin)
|
||||
{
|
||||
(void)memspace;
|
||||
|
||||
if (pin)
|
||||
munmap (addr, size);
|
||||
{
|
||||
int using_device
|
||||
= __atomic_load_n (&using_device_for_page_locked,
|
||||
MEMMODEL_RELAXED);
|
||||
if (using_device == 1)
|
||||
gomp_page_locked_host_free (addr);
|
||||
else
|
||||
/* 'munlock'ing is implicit with following 'munmap'. */
|
||||
munmap (addr, size);
|
||||
}
|
||||
else
|
||||
free (addr);
|
||||
}
|
||||
@@ -114,6 +152,14 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
|
||||
{
|
||||
if (oldpin && pin)
|
||||
{
|
||||
/* We can only expect to be able to just 'mremap' if not using a device
|
||||
for page-locked memory. */
|
||||
int using_device
|
||||
= __atomic_load_n (&using_device_for_page_locked,
|
||||
MEMMODEL_RELAXED);
|
||||
if (using_device != 0)
|
||||
goto manual_realloc;
|
||||
|
||||
void *newaddr = mremap (addr, oldsize, size, MREMAP_MAYMOVE);
|
||||
if (newaddr == MAP_FAILED)
|
||||
return NULL;
|
||||
@@ -121,18 +167,19 @@ linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
|
||||
return newaddr;
|
||||
}
|
||||
else if (oldpin || pin)
|
||||
{
|
||||
void *newaddr = linux_memspace_alloc (memspace, size, pin);
|
||||
if (newaddr)
|
||||
{
|
||||
memcpy (newaddr, addr, oldsize < size ? oldsize : size);
|
||||
linux_memspace_free (memspace, addr, oldsize, oldpin);
|
||||
}
|
||||
|
||||
return newaddr;
|
||||
}
|
||||
goto manual_realloc;
|
||||
else
|
||||
return realloc (addr, size);
|
||||
|
||||
manual_realloc:;
|
||||
void *newaddr = linux_memspace_alloc (memspace, size, pin, false);
|
||||
if (newaddr)
|
||||
{
|
||||
memcpy (newaddr, addr, oldsize < size ? oldsize : size);
|
||||
linux_memspace_free (memspace, addr, oldsize, oldpin);
|
||||
}
|
||||
|
||||
return newaddr;
|
||||
}
|
||||
|
||||
static int
|
||||
@@ -143,7 +190,7 @@ linux_memspace_validate (omp_memspace_handle_t, unsigned, int)
|
||||
}
|
||||
|
||||
#define MEMSPACE_ALLOC(MEMSPACE, SIZE, PIN) \
|
||||
linux_memspace_alloc (MEMSPACE, SIZE, PIN)
|
||||
linux_memspace_alloc (MEMSPACE, SIZE, PIN, false)
|
||||
#define MEMSPACE_CALLOC(MEMSPACE, SIZE, PIN) \
|
||||
linux_memspace_calloc (MEMSPACE, SIZE, PIN)
|
||||
#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE, OLDPIN, PIN) \
|
||||
|
||||
@@ -167,6 +167,8 @@ extern int GOMP_OFFLOAD_load_image (int, unsigned, const void *,
|
||||
extern bool GOMP_OFFLOAD_unload_image (int, unsigned, const void *);
|
||||
extern void *GOMP_OFFLOAD_alloc (int, size_t);
|
||||
extern bool GOMP_OFFLOAD_free (int, void *);
|
||||
extern bool GOMP_OFFLOAD_page_locked_host_alloc (void **, size_t);
|
||||
extern bool GOMP_OFFLOAD_page_locked_host_free (void *);
|
||||
extern bool GOMP_OFFLOAD_dev2host (int, void *, const void *, size_t);
|
||||
extern bool GOMP_OFFLOAD_host2dev (int, void *, const void *, size_t);
|
||||
extern bool GOMP_OFFLOAD_dev2dev (int, void *, const void *, size_t);
|
||||
|
||||
@@ -1136,6 +1136,8 @@ extern int gomp_get_num_devices (void);
|
||||
extern bool gomp_target_task_fn (void *);
|
||||
extern void gomp_target_rev (uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,
|
||||
int, volatile int *, bool);
|
||||
extern bool gomp_page_locked_host_alloc (void **, size_t);
|
||||
extern void gomp_page_locked_host_free (void *);
|
||||
|
||||
/* Splay tree definitions. */
|
||||
typedef struct splay_tree_node_s *splay_tree_node;
|
||||
@@ -1435,6 +1437,8 @@ struct gomp_device_descr
|
||||
__typeof (GOMP_OFFLOAD_unload_image) *unload_image_func;
|
||||
__typeof (GOMP_OFFLOAD_alloc) *alloc_func;
|
||||
__typeof (GOMP_OFFLOAD_free) *free_func;
|
||||
__typeof (GOMP_OFFLOAD_page_locked_host_alloc) *page_locked_host_alloc_func;
|
||||
__typeof (GOMP_OFFLOAD_page_locked_host_free) *page_locked_host_free_func;
|
||||
__typeof (GOMP_OFFLOAD_dev2host) *dev2host_func;
|
||||
__typeof (GOMP_OFFLOAD_host2dev) *host2dev_func;
|
||||
__typeof (GOMP_OFFLOAD_dev2dev) *dev2dev_func;
|
||||
|
||||
@@ -6957,8 +6957,11 @@ a @code{nearest} allocation.
|
||||
|
||||
Additional notes regarding the traits:
|
||||
@itemize
|
||||
@item The @code{pinned} trait is supported on Linux hosts, but is subject to
|
||||
the OS @code{ulimit}/@code{rlimit} locked memory settings.
|
||||
@item The @code{pinned} trait is supported on Linux hosts, but is usually
|
||||
subject to the OS @code{ulimit}/@code{rlimit} locked memory settings (see
|
||||
@ref{Offload-Target Specifics} for exceptions). It currently uses
|
||||
@code{mmap} and is therefore optimized for few allocations, including
|
||||
large data.
|
||||
@item The default for the @code{pool_size} trait is no pool and for every
|
||||
(re)allocation the associated library routine is called, which might
|
||||
internally use a memory pool.
|
||||
@@ -7065,6 +7068,12 @@ The implementation remark:
|
||||
@code{omp_thread_mem_alloc}, all use low-latency memory as first
|
||||
preference, and fall back to main graphics memory when the low-latency
|
||||
pool is exhausted.
|
||||
@item Pinned memory allocated using @code{omp_alloc} with the
|
||||
@code{ompx_gnu_pinned_mem_alloc} allocator or the @code{pinned} trait is
|
||||
obtained via the CUDA API when an NVPTX device is present. This provides
|
||||
a performance boost for NVPTX offload code and also allows unlimited use
|
||||
of pinned memory regardless of the OS @code{ulimit}/@code{rlimit}
|
||||
settings.
|
||||
@item The OpenMP routines @code{omp_target_memcpy_rect} and
|
||||
@code{omp_target_memcpy_rect_async} and the @code{target update}
|
||||
directive for non-contiguous list items use the 3D memory-copy function
|
||||
|
||||
@@ -1826,6 +1826,39 @@ GOMP_OFFLOAD_free (int ord, void *ptr)
|
||||
&& nvptx_free (ptr, ptx_devices[ord]));
|
||||
}
|
||||
|
||||
bool
|
||||
GOMP_OFFLOAD_page_locked_host_alloc (void **ptr, size_t size)
|
||||
{
|
||||
if (size == 0)
|
||||
{
|
||||
/* Special case to ensure omp_alloc specification compliance. */
|
||||
*ptr = NULL;
|
||||
return true;
|
||||
}
|
||||
|
||||
CUresult r;
|
||||
|
||||
unsigned int flags = 0;
|
||||
/* Given 'CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING', we don't need
|
||||
'flags |= CU_MEMHOSTALLOC_PORTABLE;' here. */
|
||||
r = CUDA_CALL_NOCHECK (cuMemHostAlloc, ptr, size, flags);
|
||||
if (r == CUDA_ERROR_OUT_OF_MEMORY)
|
||||
*ptr = NULL;
|
||||
else if (r != CUDA_SUCCESS)
|
||||
{
|
||||
GOMP_PLUGIN_error ("cuMemHostAlloc error: %s", cuda_error (r));
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
GOMP_OFFLOAD_page_locked_host_free (void *ptr)
|
||||
{
|
||||
CUDA_CALL (cuMemFreeHost, ptr);
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
GOMP_OFFLOAD_openacc_exec (void (*fn) (void *),
|
||||
size_t mapnum __attribute__((unused)),
|
||||
|
||||
113
libgomp/target.c
113
libgomp/target.c
@@ -5037,6 +5037,117 @@ omp_target_free (void *device_ptr, int device_num)
|
||||
gomp_mutex_unlock (&devicep->lock);
|
||||
}
|
||||
|
||||
/* Device (really: libgomp plugin) to use for page-locked memory. We
|
||||
assume there is either none or exactly one such device for the lifetime of
|
||||
the process. */
|
||||
|
||||
static struct gomp_device_descr *device_for_page_locked
|
||||
= /* uninitialized */ (void *) -1;
|
||||
|
||||
static struct gomp_device_descr *
|
||||
get_device_for_page_locked (void)
|
||||
{
|
||||
struct gomp_device_descr *device;
|
||||
#ifdef HAVE_SYNC_BUILTINS
|
||||
device
|
||||
= __atomic_load_n (&device_for_page_locked, MEMMODEL_RELAXED);
|
||||
if (device == (void *) -1)
|
||||
{
|
||||
gomp_init_targets_once ();
|
||||
|
||||
device = NULL;
|
||||
for (int i = 0; i < num_devices; ++i)
|
||||
{
|
||||
/* We consider only the first device of potentially several of the
|
||||
same type as this functionality is not specific to an individual
|
||||
offloading device, but instead relates to the host-side
|
||||
implementation of the respective offloading implementation. */
|
||||
if (devices[i].target_id != 0)
|
||||
continue;
|
||||
|
||||
if (!devices[i].page_locked_host_alloc_func)
|
||||
continue;
|
||||
|
||||
if (device)
|
||||
gomp_fatal ("Unclear how %s and %s libgomp plugins may"
|
||||
" simultaneously provide functionality"
|
||||
" for page-locked memory",
|
||||
device->name, devices[i].name);
|
||||
|
||||
device = &devices[i];
|
||||
gomp_debug (0, "Using device %s for page-locked memory\n",
|
||||
device->name);
|
||||
}
|
||||
|
||||
struct gomp_device_descr *device_old
|
||||
= __atomic_exchange_n (&device_for_page_locked, device,
|
||||
MEMMODEL_RELAXED);
|
||||
assert (device_old == (void *) -1
|
||||
/* We shouldn't have concurrently found a different or no
|
||||
device. */
|
||||
|| device_old == device);
|
||||
}
|
||||
#else /* !HAVE_SYNC_BUILTINS */
|
||||
(void) &device_for_page_locked;
|
||||
device = NULL;
|
||||
#endif /* HAVE_SYNC_BUILTINS */
|
||||
|
||||
return device;
|
||||
}
|
||||
|
||||
/* Allocate page-locked host memory.
|
||||
Returns whether we have a device capable of that. */
|
||||
|
||||
attribute_hidden bool
|
||||
gomp_page_locked_host_alloc (void **ptr, size_t size)
|
||||
{
|
||||
struct gomp_device_descr *device = get_device_for_page_locked ();
|
||||
if (device)
|
||||
{
|
||||
gomp_mutex_lock (&device->lock);
|
||||
if (device->state == GOMP_DEVICE_UNINITIALIZED)
|
||||
gomp_init_device (device);
|
||||
else if (device->state == GOMP_DEVICE_FINALIZED)
|
||||
{
|
||||
gomp_mutex_unlock (&device->lock);
|
||||
gomp_fatal ("Device %s used for page-locked memory is finalized",
|
||||
device->name);
|
||||
}
|
||||
gomp_mutex_unlock (&device->lock);
|
||||
|
||||
if (!device->page_locked_host_alloc_func (ptr, size))
|
||||
gomp_fatal ("Failed to allocate page-locked host memory"
|
||||
" via %s libgomp plugin",
|
||||
device->name);
|
||||
}
|
||||
return device != NULL;
|
||||
}
|
||||
|
||||
/* Free page-locked host memory.
|
||||
This must only be called if 'gomp_page_locked_host_alloc' returned
|
||||
'true'. */
|
||||
|
||||
attribute_hidden void
|
||||
gomp_page_locked_host_free (void *ptr)
|
||||
{
|
||||
struct gomp_device_descr *device = get_device_for_page_locked ();
|
||||
assert (device);
|
||||
|
||||
gomp_mutex_lock (&device->lock);
|
||||
assert (device->state != GOMP_DEVICE_UNINITIALIZED);
|
||||
if (device->state == GOMP_DEVICE_FINALIZED)
|
||||
{
|
||||
gomp_mutex_unlock (&device->lock);
|
||||
return;
|
||||
}
|
||||
gomp_mutex_unlock (&device->lock);
|
||||
|
||||
if (!device->page_locked_host_free_func (ptr))
|
||||
gomp_fatal ("Failed to free page-locked host memory"
|
||||
" via %s libgomp plugin",
|
||||
device->name);
|
||||
}
|
||||
|
||||
int
|
||||
omp_target_is_present (const void *ptr, int device_num)
|
||||
{
|
||||
@@ -6234,6 +6345,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device,
|
||||
DLSYM (unload_image);
|
||||
DLSYM (alloc);
|
||||
DLSYM (free);
|
||||
DLSYM_OPT (page_locked_host_alloc, page_locked_host_alloc);
|
||||
DLSYM_OPT (page_locked_host_free, page_locked_host_free);
|
||||
DLSYM (dev2host);
|
||||
DLSYM (host2dev);
|
||||
DLSYM_OPT (memcpy2d, memcpy2d);
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
/* { dg-skip-if "Pinning not implemented on this host" { ! *-*-linux-gnu* } } */
|
||||
|
||||
/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
|
||||
|
||||
/* Test that pinned memory works. */
|
||||
|
||||
#include <stdio.h>
|
||||
@@ -63,10 +65,16 @@ verify0 (char *p, size_t s)
|
||||
int
|
||||
main ()
|
||||
{
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
/* Go big or go home.
|
||||
The OS ulimit does not affect memory locked via CUDA for NVPTX devices. */
|
||||
const int SIZE = 40 * 1024 * 1024;
|
||||
#else
|
||||
/* Allocate at least a page each time, allowing space for overhead,
|
||||
but stay within the ulimit. */
|
||||
const int SIZE = PAGE_SIZE - 128;
|
||||
CHECK_SIZE (SIZE * 5); // This is intended to help diagnose failures
|
||||
#endif
|
||||
|
||||
const omp_alloctrait_t traits[] = {
|
||||
{ omp_atk_pinned, 1 }
|
||||
@@ -88,21 +96,39 @@ main ()
|
||||
abort ();
|
||||
|
||||
int amount = get_pinned_mem ();
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
/* This doesn't show up as process 'VmLck'ed memory. */
|
||||
if (amount != 0)
|
||||
abort ();
|
||||
#else
|
||||
if (amount == 0)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
p = omp_realloc (p, SIZE * 2, allocator, allocator);
|
||||
|
||||
int amount2 = get_pinned_mem ();
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
/* This doesn't show up as process 'VmLck'ed memory. */
|
||||
if (amount2 != 0)
|
||||
abort ();
|
||||
#else
|
||||
if (amount2 <= amount)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
/* SIZE*2 ensures that it doesn't slot into the space possibly
|
||||
vacated by realloc. */
|
||||
p = omp_calloc (1, SIZE * 2, allocator);
|
||||
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
/* This doesn't show up as process 'VmLck'ed memory. */
|
||||
if (get_pinned_mem () != 0)
|
||||
abort ();
|
||||
#else
|
||||
if (get_pinned_mem () <= amount2)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
verify0 (p, SIZE * 2);
|
||||
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
/* { dg-skip-if "Pinning not implemented on this host" { ! *-*-linux-gnu* } } */
|
||||
|
||||
/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
|
||||
|
||||
/* Test that pinned memory works (pool_size code path). */
|
||||
|
||||
#include <stdio.h>
|
||||
@@ -63,10 +65,16 @@ verify0 (char *p, size_t s)
|
||||
int
|
||||
main ()
|
||||
{
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
/* Go big or go home.
|
||||
The OS ulimit does not affect memory locked via CUDA for NVPTX devices. */
|
||||
const int SIZE = 40 * 1024 * 1024;
|
||||
#else
|
||||
/* Allocate at least a page each time, allowing space for overhead,
|
||||
but stay within the ulimit. */
|
||||
const int SIZE = PAGE_SIZE - 128;
|
||||
CHECK_SIZE (SIZE * 5); // This is intended to help diagnose failures
|
||||
#endif
|
||||
|
||||
const omp_alloctrait_t traits[] = {
|
||||
{ omp_atk_pinned, 1 },
|
||||
@@ -89,16 +97,28 @@ main ()
|
||||
abort ();
|
||||
|
||||
int amount = get_pinned_mem ();
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
/* This doesn't show up as process 'VmLck'ed memory. */
|
||||
if (amount != 0)
|
||||
abort ();
|
||||
#else
|
||||
if (amount == 0)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
p = omp_realloc (p, SIZE * 2, allocator, allocator);
|
||||
if (!p)
|
||||
abort ();
|
||||
|
||||
int amount2 = get_pinned_mem ();
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
/* This doesn't show up as process 'VmLck'ed memory. */
|
||||
if (amount2 != 0)
|
||||
abort ();
|
||||
#else
|
||||
if (amount2 <= amount)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
/* SIZE*2 ensures that it doesn't slot into the space possibly
|
||||
vacated by realloc. */
|
||||
@@ -106,8 +126,14 @@ main ()
|
||||
if (!p)
|
||||
abort ();
|
||||
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
/* This doesn't show up as process 'VmLck'ed memory. */
|
||||
if (get_pinned_mem () != 0)
|
||||
abort ();
|
||||
#else
|
||||
if (get_pinned_mem () <= amount2)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
verify0 (p, SIZE * 2);
|
||||
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
/* { dg-do run } */
|
||||
|
||||
/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
|
||||
|
||||
/* Test that pinned memory fails correctly. */
|
||||
|
||||
#include <stdio.h>
|
||||
@@ -75,8 +77,15 @@ verify0 (char *p, size_t s)
|
||||
int
|
||||
main ()
|
||||
{
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
/* Go big or go home.
|
||||
The OS ulimit does not affect memory locked via CUDA for NVPTX devices. */
|
||||
const int SIZE = 40 * 1024 * 1024;
|
||||
#else
|
||||
/* This needs to be large enough to cover multiple pages. */
|
||||
const int SIZE = PAGE_SIZE * 4;
|
||||
#endif
|
||||
const int PIN_LIMIT = PAGE_SIZE * 2;
|
||||
|
||||
/* Pinned memory, no fallback. */
|
||||
const omp_alloctrait_t traits1[] = {
|
||||
@@ -101,23 +110,34 @@ main ()
|
||||
#endif
|
||||
|
||||
/* Ensure that the limit is smaller than the allocation. */
|
||||
set_pin_limit (SIZE / 2);
|
||||
set_pin_limit (PIN_LIMIT);
|
||||
|
||||
// Sanity check
|
||||
if (get_pinned_mem () != 0)
|
||||
abort ();
|
||||
|
||||
// Should fail
|
||||
void *p1 = omp_alloc (SIZE, allocator1);
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
// Doesn't care about 'set_pin_limit'.
|
||||
if (!p1)
|
||||
abort ();
|
||||
#else
|
||||
// Should fail
|
||||
if (p1)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
// Should fail
|
||||
void *p2 = omp_calloc (1, SIZE, allocator1);
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
// Doesn't care about 'set_pin_limit'.
|
||||
if (!p2)
|
||||
abort ();
|
||||
#else
|
||||
// Should fail
|
||||
if (p2)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
// Should fall back
|
||||
void *p3 = omp_alloc (SIZE, allocator2);
|
||||
if (!p3)
|
||||
abort ();
|
||||
@@ -128,16 +148,29 @@ main ()
|
||||
abort ();
|
||||
verify0 (p4, SIZE);
|
||||
|
||||
// Should fail to realloc
|
||||
void *notpinned = omp_alloc (SIZE, omp_default_mem_alloc);
|
||||
void *p5 = omp_realloc (notpinned, SIZE, allocator1, omp_default_mem_alloc);
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
// Doesn't care about 'set_pin_limit'; does reallocate.
|
||||
if (!notpinned || !p5 || p5 == notpinned)
|
||||
abort ();
|
||||
#else
|
||||
// Should fail to realloc
|
||||
if (!notpinned || p5)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
// Should fall back to no realloc needed
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
void *p6 = omp_realloc (p5, SIZE, allocator2, allocator1);
|
||||
// Does reallocate.
|
||||
if (p5 == p6)
|
||||
abort ();
|
||||
#else
|
||||
void *p6 = omp_realloc (notpinned, SIZE, allocator2, omp_default_mem_alloc);
|
||||
// Should fall back to no realloc needed
|
||||
if (p6 != notpinned)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
// No memory should have been pinned
|
||||
int amount = get_pinned_mem ();
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
/* { dg-do run } */
|
||||
|
||||
/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
|
||||
|
||||
/* Test that pinned memory fails correctly, pool_size code path. */
|
||||
|
||||
#include <stdio.h>
|
||||
@@ -75,8 +77,15 @@ verify0 (char *p, size_t s)
|
||||
int
|
||||
main ()
|
||||
{
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
/* Go big or go home.
|
||||
The OS ulimit does not affect memory locked via CUDA for NVPTX devices. */
|
||||
const int SIZE = 40 * 1024 * 1024;
|
||||
#else
|
||||
/* This needs to be large enough to cover multiple pages. */
|
||||
const int SIZE = PAGE_SIZE * 4;
|
||||
#endif
|
||||
const int PIN_LIMIT = PAGE_SIZE * 2;
|
||||
|
||||
/* Pinned memory, no fallback. */
|
||||
const omp_alloctrait_t traits1[] = {
|
||||
@@ -103,21 +112,33 @@ main ()
|
||||
#endif
|
||||
|
||||
/* Ensure that the limit is smaller than the allocation. */
|
||||
set_pin_limit (SIZE / 2);
|
||||
set_pin_limit (PIN_LIMIT);
|
||||
|
||||
// Sanity check
|
||||
if (get_pinned_mem () != 0)
|
||||
abort ();
|
||||
|
||||
// Should fail
|
||||
void *p = omp_alloc (SIZE, allocator1);
|
||||
if (p)
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
// Doesn't care about 'set_pin_limit'.
|
||||
if (!p)
|
||||
abort ();
|
||||
|
||||
#else
|
||||
// Should fail
|
||||
p = omp_calloc (1, SIZE, allocator1);
|
||||
if (p)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
p = omp_calloc (1, SIZE, allocator1);
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
// Doesn't care about 'set_pin_limit'.
|
||||
if (!p)
|
||||
abort ();
|
||||
#else
|
||||
// Should fail
|
||||
if (p)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
// Should fall back
|
||||
p = omp_alloc (SIZE, allocator2);
|
||||
@@ -130,16 +151,29 @@ main ()
|
||||
abort ();
|
||||
verify0 (p, SIZE);
|
||||
|
||||
// Should fail to realloc
|
||||
void *notpinned = omp_alloc (SIZE, omp_default_mem_alloc);
|
||||
p = omp_realloc (notpinned, SIZE, allocator1, omp_default_mem_alloc);
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
// Doesn't care about 'set_pin_limit'; does reallocate.
|
||||
if (!notpinned || !p || p == notpinned)
|
||||
abort ();
|
||||
#else
|
||||
// Should fail to realloc
|
||||
if (!notpinned || p)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
// Should fall back to no realloc needed
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
void *p_ = omp_realloc (p, SIZE, allocator2, allocator1);
|
||||
// Does reallocate.
|
||||
if (p_ == p)
|
||||
abort ();
|
||||
#else
|
||||
p = omp_realloc (notpinned, SIZE, allocator2, omp_default_mem_alloc);
|
||||
// Should fall back to no realloc needed
|
||||
if (p != notpinned)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
// No memory should have been pinned
|
||||
int amount = get_pinned_mem ();
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
/* { dg-skip-if "Pinning not implemented on this host" { ! *-*-linux-gnu* } } */
|
||||
|
||||
/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
|
||||
|
||||
/* Test that ompx_gnu_pinned_mem_alloc works. */
|
||||
|
||||
#include <stdio.h>
|
||||
@@ -63,10 +65,16 @@ verify0 (char *p, size_t s)
|
||||
int
|
||||
main ()
|
||||
{
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
/* Go big or go home.
|
||||
The OS ulimit does not affect memory locked via CUDA for NVPTX devices. */
|
||||
const int SIZE = 40 * 1024 * 1024;
|
||||
#else
|
||||
/* Allocate at least a page each time, allowing space for overhead,
|
||||
but stay within the ulimit. */
|
||||
const int SIZE = PAGE_SIZE - 128;
|
||||
CHECK_SIZE (SIZE * 5);
|
||||
#endif
|
||||
|
||||
// Sanity check
|
||||
if (get_pinned_mem () != 0)
|
||||
@@ -77,22 +85,40 @@ main ()
|
||||
abort ();
|
||||
|
||||
int amount = get_pinned_mem ();
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
/* This doesn't show up as process 'VmLck'ed memory. */
|
||||
if (amount != 0)
|
||||
abort ();
|
||||
#else
|
||||
if (amount == 0)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
p = omp_realloc (p, SIZE * 2, ompx_gnu_pinned_mem_alloc,
|
||||
ompx_gnu_pinned_mem_alloc);
|
||||
|
||||
int amount2 = get_pinned_mem ();
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
/* This doesn't show up as process 'VmLck'ed memory. */
|
||||
if (amount2 != 0)
|
||||
abort ();
|
||||
#else
|
||||
if (amount2 <= amount)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
/* SIZE*2 ensures that it doesn't slot into the space possibly
|
||||
vacated by realloc. */
|
||||
p = omp_calloc (1, SIZE * 2, ompx_gnu_pinned_mem_alloc);
|
||||
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
/* This doesn't show up as process 'VmLck'ed memory. */
|
||||
if (get_pinned_mem () != 0)
|
||||
abort ();
|
||||
#else
|
||||
if (get_pinned_mem () <= amount2)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
verify0 (p, SIZE * 2);
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
/* { dg-do run } */
|
||||
/* { dg-additional-options -DOFFLOAD_DEVICE_NVPTX { target offload_device_nvptx } } */
|
||||
|
||||
/* Test that ompx_gnu_pinned_mem_alloc fails correctly. */
|
||||
|
||||
@@ -66,32 +67,57 @@ set_pin_limit (int size)
|
||||
int
|
||||
main ()
|
||||
{
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
/* Go big or go home.
|
||||
The OS ulimit does not affect memory locked via CUDA for NVPTX devices. */
|
||||
const int SIZE = 40 * 1024 * 1024;
|
||||
#else
|
||||
/* This needs to cover multiple pages and exceed the pin limit set below. */
|
||||
const int SIZE = PAGE_SIZE * 4;
|
||||
#endif
|
||||
const int PIN_LIMIT = PAGE_SIZE*2;
|
||||
|
||||
/* Ensure that the limit is smaller than the allocation. */
|
||||
set_pin_limit (SIZE / 2);
|
||||
set_pin_limit (PIN_LIMIT);
|
||||
|
||||
// Sanity check
|
||||
if (get_pinned_mem () != 0)
|
||||
abort ();
|
||||
|
||||
// Should fail
|
||||
void *p = omp_alloc (SIZE, ompx_gnu_pinned_mem_alloc);
|
||||
if (p)
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
// Doesn't care about 'set_pin_limit'.
|
||||
if (!p)
|
||||
abort ();
|
||||
|
||||
#else
|
||||
// Should fail
|
||||
p = omp_calloc (1, SIZE, ompx_gnu_pinned_mem_alloc);
|
||||
if (p)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
p = omp_calloc (1, SIZE, ompx_gnu_pinned_mem_alloc);
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
// Doesn't care about 'set_pin_limit'.
|
||||
if (!p)
|
||||
abort ();
|
||||
#else
|
||||
// Should fail
|
||||
if (p)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
// Should fail to realloc
|
||||
void *notpinned = omp_alloc (SIZE, omp_default_mem_alloc);
|
||||
p = omp_realloc (notpinned, SIZE, ompx_gnu_pinned_mem_alloc,
|
||||
omp_default_mem_alloc);
|
||||
#ifdef OFFLOAD_DEVICE_NVPTX
|
||||
// Doesn't care about 'set_pin_limit'; does reallocate.
|
||||
if (!notpinned || !p || p == notpinned)
|
||||
abort ();
|
||||
#else
|
||||
// Should fail to realloc
|
||||
if (!notpinned || p)
|
||||
abort ();
|
||||
#endif
|
||||
|
||||
// No memory should have been pinned
|
||||
int amount = get_pinned_mem ();
|
||||
|
||||
Reference in New Issue
Block a user