This patch implements "managed" memory for AMD GCN GPUs in OpenMP. It builds on the support added to the NVPTX libgomp for CUDA Managed Memory, a week or two ago.

These features were first posted here a few years ago, as part of a larger Unified Shared Memory patch series, and then in a slightly changed version just over a year ago. Hopefully this time the controversial bits have been removed.

Since we do not use HIP we cannot use hipMallocManaged, so this patch attempts to replicate the same effect by setting the appropriate attributes. This works on more devices than support proper USM, but still I cannot be sure that the settings are correct for every device out there (I have tested on gfx900, gfx906, gfx908, gfx90a, and gfx1100).

The HSA header file update uses the most recent files relicensed for us by AMD, at the time of the first patch posting. Those files have certainly moved on in the upstream sources, but I did not ask to get those relicensed.

include/ChangeLog:

        * hsa.h: Import newer version.
        * hsa_ext_amd.h: Likewise.
        * hsa_ext_image.h: Likewise.

libgomp/ChangeLog:

        * Makefile.in: Regenerate.
        * libgomp-plugin.h (gomp_simple_alloc_init_context): New prototype.
        (gomp_simple_alloc_register_memory): New prototype.
        (gomp_simple_alloc): New prototype.
        (gomp_simple_free): New prototype.
        (gomp_simple_realloc): New prototype.
        * libgomp.h (gomp_simple_alloc_init_context): Move to
        libgomp-plugin.h.
        (gomp_simple_alloc_register_memory): Likewise.
        (gomp_simple_alloc): Likewise.
        (gomp_simple_free): Likewise.
        (gomp_simple_realloc): Likewise.
        * libgomp.texi: Update AMD managed memory description.
        * plugin/Makefrag.am (libgomp_plugin_gcn_la_SOURCES): Add
        simple-allocator.c and plugin/mutex.c.
        * plugin/plugin-gcn.c: Include sys/mman.h and unistd.h.
        (struct hsa_runtime_fn_info): Add hsa_amd_svm_attributes_set_fn.
        (dump_hsa_system_info): Add HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED and
        HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT to the GCN_DEBUG
        output.
        (init_hsa_runtime_functions): Add hsa_amd_svm_attributes_set.
        (isa_matches_agent): Add a new error message for the case where
        the ISA doesn't match but the name does.
        (managed_ctx): New variable.
        (managed_heap_create): New function.
        (GOMP_OFFLOAD_managed_alloc): Likewise.
        (GOMP_OFFLOAD_managed_free): Likewise.
        * simple-allocator.c (gomp_fatal): New macro.
        * testsuite/lib/libgomp.exp (check_effective_target_omp_managedmem):
        Add amdgcn support checker.
        (check_effective_target_offload_target_amdgcn_with_xnack): New.
        * testsuite/libgomp.c-c++-common/requires-4.c: Ignore xnack warning.
        * testsuite/libgomp.c-c++-common/requires-4a.c: Ignore xnack warning.
        * testsuite/libgomp.c-c++-common/requires-5.c: Ignore xnack warning.
        * testsuite/libgomp.c++/alloc-managed-1.C: Add -mxnack=on, if needed.
        * testsuite/libgomp.c/alloc-managed-1.c: Likewise.
        * testsuite/libgomp.c/alloc-managed-2.c: Likewise.
        * testsuite/libgomp.c/alloc-managed-3.c: Likewise.
        * testsuite/libgomp.c/alloc-managed-4.c: Likewise.
        * testsuite/libgomp.fortran/alloc-managed-1.f90: Likewise.
        * plugin/mutex.c: New file.
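For illustration only (not part of the patch): the sketch below shows how a plugin-side caller might drive the simple allocator, assuming it is built inside the libgomp tree where libgomp-plugin.h provides the gomp_simple_* declarations from simple-allocator.c below. The example_* names and the plain malloc'd pool are hypothetical placeholders; in the real GCN plugin the pool would be managed memory prepared by managed_heap_create (which sets the SVM attributes via hsa_amd_svm_attributes_set) before being registered.

#include <stdlib.h>
#include "libgomp-plugin.h"   /* Assumed to declare gomp_simple_alloc_ctx_p
                                 and the gomp_simple_* prototypes.  */

static gomp_simple_alloc_ctx_p example_ctx;

/* Hypothetical set-up: in the real plugin the pool would be managed
   memory configured for device access, not a plain host malloc.  */
static void
example_heap_init (size_t pool_size)
{
  char *pool = malloc (pool_size);

  example_ctx = gomp_simple_alloc_init_context ();
  gomp_simple_alloc_register_memory (example_ctx, pool, pool_size);
}

/* Serve an allocation from the registered pool.  The size is rounded up
   to 128-byte granularity; NULL is returned if the pool is exhausted or
   nothing has been registered.  */
static void *
example_alloc (size_t size)
{
  return gomp_simple_alloc (example_ctx, size);
}

static void
example_free (void *ptr)
{
  gomp_simple_free (example_ctx, ptr);
}

Note that gomp_simple_alloc never grows the heap itself; once the registered pool is used up it returns NULL, and it is up to the plugin to register further memory regions.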
/* Copyright (C) 2025 Free Software Foundation, Inc.

   This file is part of the GNU Offloading and Multi Processing Library
   (libgomp).

   Libgomp is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
   more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* This is a simple "malloc" implementation intended for use with device
   Managed Memory and Pinned Memory.  It allocates memory from a pool
   allocated and configured by the device plugin, or the OS-specific
   allocator (for pinned).

   Unlike the "basic" allocator, this implementation keeps the allocated/free
   chain in a side-table (splay tree) to ensure that the allocation routine
   does not have the side-effect of migrating all the managed memory pages
   back into host memory.  Keeping the meta-data elsewhere is also useful for
   pinned memory, which is typically an extremely limited resource.  */

#include <string.h>
#include "libgomp.h"

/* Use a splay tree to track allocations.  */

typedef struct simple_alloc_splay_tree_node_s *simple_alloc_splay_tree_node;
typedef struct simple_alloc_splay_tree_s *simple_alloc_splay_tree;
typedef struct simple_alloc_splay_tree_key_s *simple_alloc_splay_tree_key;

struct simple_alloc_splay_tree_key_s {
  void *base;
  size_t size;
};

static inline int
simple_alloc_splay_compare (simple_alloc_splay_tree_key x,
                            simple_alloc_splay_tree_key y)
{
  return (x->base == y->base ? 0
          : x->base > y->base ? 1
          : -1);
}

#define splay_tree_prefix simple_alloc
#include "splay-tree.h"

/* 128-byte granularity means GPU cache-line aligned.  */
#define ALIGN(VAR) (((VAR) + 127) & ~127)

/* The context data prevents the need for global state.  */
struct gomp_simple_alloc_context {
  struct simple_alloc_splay_tree_s allocations;
  struct simple_alloc_splay_tree_s free_space;
  gomp_mutex_t lock;
};

gomp_simple_alloc_ctx_p
gomp_simple_alloc_init_context (void)
{
  return calloc (1, sizeof (struct gomp_simple_alloc_context));
}

/* Coalesce contiguous free space into one entry.  This considers the entries
   either side of the root node only, so it should be called each time a new
   entry is inserted into the root.  */

static void
simple_alloc_coalesce_free_space (gomp_simple_alloc_ctx_p ctx)
{
  simple_alloc_splay_tree_node prev, next, node = ctx->free_space.root;

  for (prev = node->left; prev && prev->right; prev = prev->right)
    ;
  for (next = node->right; next && next->left; next = next->left)
    ;

  /* Coalesce adjacent free chunks.  */
  if (next
      && node->key.base + node->key.size == next->key.base)
    {
      /* Free chunk follows.  */
      node->key.size += next->key.size;
      simple_alloc_splay_tree_remove (&ctx->free_space, &next->key);
      free (next);
    }
  if (prev
      && prev->key.base + prev->key.size == node->key.base)
    {
      /* Free chunk precedes.  */
      prev->key.size += node->key.size;
      simple_alloc_splay_tree_remove (&ctx->free_space, &node->key);
      free (node);
    }
}

/* Add a new memory region into the free chain.  This is how our heap is
   initialized and extended (using memory acquired by an external caller).
   If the new region is contiguous with an existing region then any free
   space will be coalesced.  */

void
gomp_simple_alloc_register_memory (gomp_simple_alloc_ctx_p ctx, char *base,
                                   size_t size)
{
  if (base == NULL || ctx == NULL)
    return;

  gomp_mutex_lock (&ctx->lock);

  simple_alloc_splay_tree_node node;
  node = malloc (sizeof (struct simple_alloc_splay_tree_node_s));
  node->key.base = base;
  node->key.size = size;
  node->left = NULL;
  node->right = NULL;
  simple_alloc_splay_tree_insert (&ctx->free_space, node);
  simple_alloc_coalesce_free_space (ctx);

  gomp_mutex_unlock (&ctx->lock);
}

/* This splay_tree_foreach callback selects the first free space large enough
   to hold the allocation needed.  Since the splay_tree walk may start in the
   middle the "first" isn't necessarily the "leftmost" entry.  */

struct simple_alloc_callback_data {
  size_t size;
  simple_alloc_splay_tree_node found;
};

static int
simple_alloc_callback (simple_alloc_splay_tree_key key, void *data)
{
  struct simple_alloc_callback_data *cbd
    = (struct simple_alloc_callback_data *)data;

  if (key->size >= cbd->size)
    {
      cbd->found = (simple_alloc_splay_tree_node)key;
      return 1;
    }

  return 0;
}

/* Simple "malloc".  Selects and moves an address range from ctx->free_space
   to ctx->allocations, while leaving any excess in ctx->free_space.  */

void *
gomp_simple_alloc (gomp_simple_alloc_ctx_p ctx, size_t size)
{
  if (ctx == NULL)
    return NULL;

  /* Memory is allocated in N-byte granularity.  */
  size = ALIGN (size);

  gomp_mutex_lock (&ctx->lock);

  if (!ctx->free_space.root)
    {
      /* No memory registered, or no free space.  */
      gomp_mutex_unlock (&ctx->lock);
      return NULL;
    }

  /* Find a suitable free block.  */
  struct simple_alloc_callback_data cbd = {size, NULL};
  simple_alloc_splay_tree_foreach_lazy (&ctx->free_space,
                                        simple_alloc_callback, &cbd);
  simple_alloc_splay_tree_node freenode = cbd.found;

  void *result = NULL;
  if (freenode)
    {
      /* Allocation successful.  */
      result = freenode->key.base;
      simple_alloc_splay_tree_node allocnode = malloc (sizeof (*allocnode));
      allocnode->key.base = result;
      allocnode->key.size = size;
      allocnode->left = NULL;
      allocnode->right = NULL;
      simple_alloc_splay_tree_insert (&ctx->allocations, allocnode);

      /* Update the free chain.  */
      size_t stillfree_size = freenode->key.size - size;
      if (stillfree_size > 0)
        {
          freenode->key.base = freenode->key.base + size;
          freenode->key.size = stillfree_size;
        }
      else
        {
          simple_alloc_splay_tree_remove (&ctx->free_space, &freenode->key);
          free (freenode);
        }
    }

  gomp_mutex_unlock (&ctx->lock);

  return result;
}

/* Simple "free".  Moves an address range from ctx->allocations to
   ctx->free_space and merges that record with any contiguous free memory.  */

void
gomp_simple_free (gomp_simple_alloc_ctx_p ctx, void *addr)
{
  if (ctx == NULL)
    return;

  gomp_mutex_lock (&ctx->lock);

  /* Convert the memory map to free.  */
  struct simple_alloc_splay_tree_key_s key = {addr};
  simple_alloc_splay_tree_key found
    = simple_alloc_splay_tree_lookup (&ctx->allocations, &key);
  if (!found)
    GOMP_PLUGIN_fatal ("invalid free");
  simple_alloc_splay_tree_remove (&ctx->allocations, &key);
  simple_alloc_splay_tree_insert (&ctx->free_space,
                                  (simple_alloc_splay_tree_node)found);
  simple_alloc_coalesce_free_space (ctx);

  gomp_mutex_unlock (&ctx->lock);
}

/* Simple "realloc".  Works in-place, if possible; reallocates otherwise.  */

void *
gomp_simple_realloc (gomp_simple_alloc_ctx_p ctx, void *addr, size_t newsize)
{
  if (ctx == NULL)
    return NULL;

  newsize = ALIGN (newsize);

  gomp_mutex_lock (&ctx->lock);

  /* Find the existing allocation record.  */
  struct simple_alloc_splay_tree_key_s key = {addr};
  simple_alloc_splay_tree_key found
    = simple_alloc_splay_tree_lookup (&ctx->allocations, &key);
  if (!found)
    GOMP_PLUGIN_fatal ("invalid realloc");

  if (newsize == found->size)
    ; /* Nothing to do.  */
  else if (newsize < found->size)
    {
      /* We're reducing the allocation size.  */
      simple_alloc_splay_tree_node newfree = malloc (sizeof (*newfree));
      newfree->key.base = found->base + newsize;
      newfree->key.size = found->size - newsize;
      newfree->left = NULL;
      newfree->right = NULL;
      /* Shrink the allocation record to match before releasing the tail.  */
      found->size = newsize;
      simple_alloc_splay_tree_insert (&ctx->free_space, newfree);
      simple_alloc_coalesce_free_space (ctx);
    }
  else
    {
      /* We're extending the allocation.  */
      struct simple_alloc_splay_tree_key_s freekey = {addr + found->size};
      simple_alloc_splay_tree_key foundfree;
      foundfree = simple_alloc_splay_tree_lookup (&ctx->free_space, &freekey);
      if (foundfree && foundfree->size >= newsize - found->size)
        {
          /* Allocation can be expanded in place: take the extra bytes from
             the front of the following free chunk.  */
          foundfree->base += newsize - found->size;
          foundfree->size -= newsize - found->size;
          found->size = newsize;

          if (foundfree->size == 0)
            simple_alloc_splay_tree_remove (&ctx->free_space, foundfree);
        }
      else
        {
          /* Allocation must be relocated.
             Release the lock and use alloc/free.  */
          gomp_mutex_unlock (&ctx->lock);

          void *newaddr = gomp_simple_alloc (ctx, newsize);
          if (!newaddr)
            return NULL;

          memcpy (newaddr, addr, found->size);
          gomp_simple_free (ctx, addr);
          return newaddr;
        }
    }

  gomp_mutex_unlock (&ctx->lock);
  return addr;
}

/* Include the splay tree code inline, with the prefixes added.  */
#define splay_tree_prefix simple_alloc
#define splay_tree_c
#define gomp_fatal GOMP_PLUGIN_fatal /* So it links into a plugin.  */
#include "splay-tree.h"