mirror of
https://forge.sourceware.org/marek/gcc.git
synced 2026-02-22 03:47:02 -05:00
This patch introduces a new custom memory allocator for use with pinned memory (in the case where the Cuda allocator isn't available). In future, this allocator will also be used for Managed Memory. Both memories are incompatible with the system malloc because allocated memory cannot share a page with memory allocated for other purposes. This means that small allocations will no longer consume an entire page of pinned memory. Unfortunately, it also means that pinned memory pages will never be unmapped (although they may be reused). This isn't a technical limitation; the "free" algorithm could be extended in future, if needed. The implementation is not perfect; there are various corner cases (especially related to extending onto new pages) where allocations and reallocations may be sub-optimal, but it should still be a step forward in support for small allocations. I have considered using libmemkind's "fixed" memory but rejected it for three reasons: 1) libmemkind may not always be present at runtime, 2) there's no currently documented means to extend a "fixed" kind one page at a time (although the code appears to have an undocumented function that may do the job, and/or extending libmemkind to support the MAP_LOCKED mmap flag with its regular kinds would be straight-forward), 3) Managed Memory benefits from having the metadata located in different memory and using an external implementation makes it hard to guarantee this. libgomp/ChangeLog: * Makefile.am (libgomp_la_SOURCES): Add simple-allocator.c. * Makefile.in: Regenerate. * basic-allocator.c: Mention simple-allocator in the comment. * config/linux/allocator.c: Include unistd.h. (pin_ctx): New variable. (ctxlock): New variable. (linux_init_pin_ctx): New function. (linux_memspace_alloc): Use simple-allocator for pinned memory. (linux_memspace_free): Likewise. (linux_memspace_realloc): Likewise. * libgomp.h (gomp_simple_alloc_init_context): New prototype. (gomp_simple_alloc_register_memory): New prototype. 
(gomp_simple_alloc): New prototype. (gomp_simple_free): New prototype. (gomp_simple_realloc): New prototype. * libgomp.texi: Update pinned memory trait documentation. * testsuite/libgomp.c/alloc-pinned-8.c: New test. * simple-allocator.c: New file.
240 lines
7.1 KiB
C
240 lines
7.1 KiB
C
/* Copyright (C) 2022-2025 Free Software Foundation, Inc.
|
|
Contributed by Jakub Jelinek <jakub@redhat.com>.
|
|
|
|
This file is part of the GNU Offloading and Multi Processing Library
|
|
(libgomp).
|
|
|
|
Libgomp is free software; you can redistribute it and/or modify it
|
|
under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 3, or (at your option)
|
|
any later version.
|
|
|
|
Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
FOR A PARTICULAR PURPOSE. See the GNU General Public License for
|
|
more details.
|
|
|
|
Under Section 7 of GPL version 3, you are granted additional
|
|
permissions described in the GCC Runtime Library Exception, version
|
|
3.1, as published by the Free Software Foundation.
|
|
|
|
You should have received a copy of the GNU General Public License and
|
|
a copy of the GCC Runtime Library Exception along with this program;
|
|
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
/* This file contains wrappers for the system allocation routines. Most
|
|
places in the OpenMP API do not make any provision for failure, so in
|
|
general we cannot allow memory allocation to fail. */
|
|
|
|
#define _GNU_SOURCE
|
|
#include "libgomp.h"
|
|
#if defined(PLUGIN_SUPPORT) && defined(LIBGOMP_USE_PTHREADS)
|
|
#define LIBGOMP_USE_MEMKIND
|
|
#define LIBGOMP_USE_LIBNUMA
|
|
#endif
|
|
|
|
/* Implement malloc routines that can handle pinned memory on Linux.
|
|
|
|
Given that pinned memory is typically used to help host <-> device memory
|
|
transfers, we attempt to allocate such memory using a device (really:
|
|
libgomp plugin), but fall back to mmap plus mlock if no suitable device is
|
|
available.
|
|
|
|
It's possible to use mlock on any heap memory, but using munlock is
|
|
problematic if there are multiple pinned allocations on the same page.
|
|
Tracking all that manually would be possible, but adds overhead. This may
|
|
be worth it if there are a lot of small allocations getting pinned, but
|
|
this seems less likely in a HPC application.
|
|
|
|
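   Instead we obtain whole pages with mmap and mlock them, so that pinned
   memory never shares a page with memory allocated for other purposes, and
   hand those pages to libgomp's "simple" allocator, which carves the
   individual pinned allocations out of them.  Small pinned allocations
   therefore no longer consume an entire page each.  The pages are never
   unmapped, but freed space is reused by later pinned allocations.  */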
|
|
|
|
#define _GNU_SOURCE
|
|
#include <sys/mman.h>
|
|
#include <unistd.h>
|
|
#include <string.h>
|
|
#include <assert.h>
|
|
#include "libgomp.h"
|
|
#ifdef HAVE_INTTYPES_H
|
|
# include <inttypes.h> /* For PRIu64. */
|
|
#endif
|
|
|
|
static int using_device_for_page_locked
|
|
= /* uninitialized */ -1;
|
|
|
|
|
|
static gomp_simple_alloc_ctx_p pin_ctx = NULL;
|
|
static pthread_once_t ctxlock = PTHREAD_ONCE_INIT;
|
|
|
|
static void
|
|
linux_init_pin_ctx ()
|
|
{
|
|
pin_ctx = gomp_simple_alloc_init_context ();
|
|
}
|
|
|
|
static void *
|
|
linux_memspace_alloc (omp_memspace_handle_t memspace, size_t size, int pin,
|
|
bool init0)
|
|
{
|
|
void *addr = NULL;
|
|
|
|
if (pin)
|
|
{
|
|
int using_device = __atomic_load_n (&using_device_for_page_locked,
|
|
MEMMODEL_RELAXED);
|
|
if (using_device != 0)
|
|
{
|
|
using_device = gomp_page_locked_host_alloc (&addr, size);
|
|
int using_device_old
|
|
= __atomic_exchange_n (&using_device_for_page_locked,
|
|
using_device, MEMMODEL_RELAXED);
|
|
assert (using_device_old == -1
|
|
/* We shouldn't have concurrently changed our mind. */
|
|
|| using_device_old == using_device);
|
|
}
|
|
if (using_device == 0)
|
|
{
|
|
static int pagesize = 0;
|
|
static void *addrhint = NULL;
|
|
|
|
if (!pagesize)
|
|
pagesize = sysconf(_SC_PAGE_SIZE);
|
|
|
|
while (1)
|
|
{
|
|
addr = gomp_simple_alloc (pin_ctx, size);
|
|
if (addr)
|
|
break;
|
|
|
|
/* Round up to a whole page. */
|
|
size_t misalignment = size % pagesize;
|
|
size_t mmap_size = (misalignment > 0
|
|
? size + pagesize - misalignment
|
|
: size);
|
|
void *newpage = mmap (addrhint, mmap_size, PROT_READ | PROT_WRITE,
|
|
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
|
|
if (newpage == MAP_FAILED)
|
|
break;
|
|
else
|
|
{
|
|
if (mlock (newpage, size))
|
|
{
|
|
#ifdef HAVE_INTTYPES_H
|
|
gomp_debug (0, "libgomp: failed to pin %"PRIu64" bytes"
|
|
" of memory (ulimit too low?)\n",
|
|
(uint64_t) size);
|
|
#else
|
|
gomp_debug (0, "libgomp: failed to pin %lu bytes of"
|
|
" memory (ulimit too low?)\n",
|
|
(unsigned long) size);
|
|
#endif
|
|
munmap (newpage, size);
|
|
break;
|
|
}
|
|
|
|
addrhint = newpage + mmap_size;
|
|
|
|
pthread_once (&ctxlock, linux_init_pin_ctx);
|
|
gomp_simple_alloc_register_memory (pin_ctx, newpage,
|
|
mmap_size);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else
|
|
addr = malloc (size);
|
|
|
|
if (addr && init0)
|
|
memset (addr, 0, size);
|
|
|
|
return addr;
|
|
}
|
|
|
|
static void *
|
|
linux_memspace_calloc (omp_memspace_handle_t memspace, size_t size, int pin)
|
|
{
|
|
if (pin)
|
|
return linux_memspace_alloc (memspace, size, pin, true);
|
|
else
|
|
return calloc (1, size);
|
|
}
|
|
|
|
static void
|
|
linux_memspace_free (omp_memspace_handle_t memspace, void *addr, size_t size,
|
|
int pin)
|
|
{
|
|
if (pin)
|
|
{
|
|
int using_device
|
|
= __atomic_load_n (&using_device_for_page_locked,
|
|
MEMMODEL_RELAXED);
|
|
if (using_device == 1)
|
|
gomp_page_locked_host_free (addr);
|
|
else
|
|
/* The "simple" allocator does not (currently) munmap locked pages
|
|
(meaning that the number of locked pages never decreases), but it
|
|
can reuse the freed memory in subsequent gomp_simple_alloc calls. */
|
|
gomp_simple_free (pin_ctx, addr);
|
|
}
|
|
else
|
|
free (addr);
|
|
}
|
|
|
|
static void *
|
|
linux_memspace_realloc (omp_memspace_handle_t memspace, void *addr,
|
|
size_t oldsize, size_t size, int oldpin, int pin)
|
|
{
|
|
if (oldpin && pin)
|
|
{
|
|
int using_device
|
|
= __atomic_load_n (&using_device_for_page_locked,
|
|
MEMMODEL_RELAXED);
|
|
/* The device plugin API does not support realloc,
|
|
but the gomp_simple_alloc allocator does. */
|
|
if (using_device == 0)
|
|
{
|
|
/* This can fail if there is insufficient pinned memory free. */
|
|
void *newaddr = gomp_simple_realloc (pin_ctx, addr, size);
|
|
if (newaddr)
|
|
return newaddr;
|
|
}
|
|
}
|
|
else if (oldpin || pin)
|
|
/* Moving from pinned to unpinned memory cannot be done in-place. */
|
|
;
|
|
else
|
|
return realloc (addr, size);
|
|
|
|
/* In-place reallocation failed. Fall back to copy. */
|
|
void *newaddr = linux_memspace_alloc (memspace, size, pin, false);
|
|
if (newaddr)
|
|
{
|
|
memcpy (newaddr, addr, oldsize < size ? oldsize : size);
|
|
linux_memspace_free (memspace, addr, oldsize, oldpin);
|
|
}
|
|
|
|
return newaddr;
|
|
}
|
|
|
|
static int
|
|
linux_memspace_validate (omp_memspace_handle_t, unsigned, int)
|
|
{
|
|
/* Everything should be accepted on Linux, including pinning. */
|
|
return 1;
|
|
}
|
|
|
|
/* Memory-space hooks, defined ahead of the inclusion of ../../allocator.c.
   Each one forwards to the Linux implementation above.  */

/* Is the requested combination supported?  */
#define MEMSPACE_VALIDATE(MEMSPACE, ACCESS, PIN) \
  linux_memspace_validate (MEMSPACE, ACCESS, PIN)

/* Plain allocation; the memory is not cleared.  */
#define MEMSPACE_ALLOC(MEMSPACE, SIZE, PIN) \
  linux_memspace_alloc (MEMSPACE, SIZE, PIN, false)

/* Zero-initialized allocation.  */
#define MEMSPACE_CALLOC(MEMSPACE, SIZE, PIN) \
  linux_memspace_calloc (MEMSPACE, SIZE, PIN)

/* Resize, possibly changing between pinned and unpinned memory.  */
#define MEMSPACE_REALLOC(MEMSPACE, ADDR, OLDSIZE, SIZE, OLDPIN, PIN) \
  linux_memspace_realloc (MEMSPACE, ADDR, OLDSIZE, SIZE, OLDPIN, PIN)

/* Release.  */
#define MEMSPACE_FREE(MEMSPACE, ADDR, SIZE, PIN) \
  linux_memspace_free (MEMSPACE, ADDR, SIZE, PIN)
|
|
|
|
#include "../../allocator.c"
|