diff --git a/include/hsa.h b/include/hsa.h index 3c7be95d7fd0..28867a91a7cd 100644 --- a/include/hsa.h +++ b/include/hsa.h @@ -1,6 +1,6 @@ //////////////////////////////////////////////////////////////////////////////// // -// Copyright (C) 2014-2020 Advanced Micro Devices Inc. All rights reserved. +// Copyright (C) 2014-2022 Advanced Micro Devices Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person or organization // obtaining a copy of the software and accompanying documentation covered by @@ -467,7 +467,19 @@ typedef enum { * String containing the ROCr build identifier. */ HSA_AMD_SYSTEM_INFO_BUILD_VERSION = 0x200, - + /** + * Returns true if hsa_amd_svm_* APIs are supported by the driver. The type of + * this attribute is bool. + */ + HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED = 0x201, + // TODO: Should this be per Agent? + /** + * Returns true if all Agents have access to system allocated memory (such as + * that allocated by mmap, malloc, or new) by default. + * If false then system allocated memory may only be made SVM accessible to + * an Agent by declaration of accessibility with hsa_amd_svm_set_attributes. + * The type of this attribute is bool. + */ HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT = 0x202 } hsa_system_info_t; @@ -986,8 +998,16 @@ typedef enum { * Minor version of the HSA runtime specification supported by the * agent. The type of this attribute is uint16_t. */ - HSA_AGENT_INFO_VERSION_MINOR = 22 - + HSA_AGENT_INFO_VERSION_MINOR = 22, + /** + * This enum does not have a fixed underlying type, thus in C++ post D2338: + * If the enumeration type does not have a fixed underlying type, the value is + * unchanged if the original value is within the range of the enumeration + * values (9.7.1 [dcl.enum]), and otherwise, the behavior is + * undefined. + * Thus increase the range of this enum to encompass vendor extensions. + */ + HSA_AGENT_INFO_LAST = INT32_MAX } hsa_agent_info_t; /** diff --git a/include/hsa_ext_amd.h b/include/hsa_ext_amd.h index e29e88090eb0..16a6aa01d259 100644 --- a/include/hsa_ext_amd.h +++ b/include/hsa_ext_amd.h @@ -1,6 +1,6 @@ //////////////////////////////////////////////////////////////////////////////// // -// Copyright (C) 2014-2020 Advanced Micro Devices Inc. All rights reserved. +// Copyright (C) 2014-2022 Advanced Micro Devices Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person or organization // obtaining a copy of the software and accompanying documentation covered by @@ -41,6 +41,115 @@ extern "C" { #endif +/** \addtogroup aql Architected Queuing Language + * @{ + */ + +/** + * @brief A fixed-size type used to represent ::hsa_signal_condition_t constants. + */ +typedef uint32_t hsa_signal_condition32_t; + +/** + * @brief AMD vendor specific packet type. + */ +typedef enum { + /** + * Packet used by agents to delay processing of subsequent packets until a + * configurable condition is satisfied by an HSA signal. Only kernel dispatch + * queues created from AMD GPU Agents support this packet. + */ + HSA_AMD_PACKET_TYPE_BARRIER_VALUE = 2, +} hsa_amd_packet_type_t; + +/** + * @brief A fixed-size type used to represent ::hsa_amd_packet_type_t constants. + */ +typedef uint8_t hsa_amd_packet_type8_t; + +/** + * @brief AMD vendor specific AQL packet header + */ +typedef struct hsa_amd_packet_header_s { + /** + * Packet header. Used to configure multiple packet parameters such as the + * packet type. The parameters are described by ::hsa_packet_header_t. + */ + uint16_t header; + + /** + *Format of the vendor specific packet. + */ + hsa_amd_packet_type8_t AmdFormat; + + /** + * Reserved. Must be 0. + */ + uint8_t reserved; +} hsa_amd_vendor_packet_header_t; + +/** + * @brief AMD barrier value packet. Halts packet processing and waits for + * (signal_value & ::mask) ::cond ::value to be satisfied, where signal_value + * is the value of the signal ::signal. + */ +typedef struct hsa_amd_barrier_value_packet_s { + /** + * AMD vendor specific packet header. + */ + hsa_amd_vendor_packet_header_t header; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved0; + + /** + * Dependent signal object. A signal with a handle value of 0 is + * allowed and is interpreted by the packet processor a satisfied + * dependency. + */ + hsa_signal_t signal; + + /** + * Value to compare against. + */ + hsa_signal_value_t value; + + /** + * Bit mask to be combined by bitwise AND with ::signal's value. + */ + hsa_signal_value_t mask; + + /** + * Comparison operation. See ::hsa_signal_condition_t. + */ + hsa_signal_condition32_t cond; + + /** + * Reserved. Must be 0. + */ + uint32_t reserved1; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved2; + + /** + * Reserved. Must be 0. + */ + uint64_t reserved3; + + /** + * Signal used to indicate completion of the job. The application can use the + * special signal handle 0 to indicate that no signal is used. + */ + hsa_signal_t completion_signal; +} hsa_amd_barrier_value_packet_t; + +/** @} */ + /** * @brief Enumeration constants added to ::hsa_status_t. * @@ -61,6 +170,20 @@ enum { * Agent executed an invalid shader instruction. */ HSA_STATUS_ERROR_ILLEGAL_INSTRUCTION = 42, + + /** + * Agent attempted to access an inaccessible address. + * See hsa_amd_register_system_event_handler and + * HSA_AMD_GPU_MEMORY_FAULT_EVENT for more information on illegal accesses. + */ + HSA_STATUS_ERROR_MEMORY_FAULT = 43, + + /** + * The CU mask was successfully set but the mask attempted to enable a CU + * which was disabled for the process. CUs disabled for the process remain + * disabled. + */ + HSA_STATUS_CU_MASK_REDUCED = 44, }; /** @@ -479,6 +602,37 @@ hsa_status_t HSA_API hsa_amd_signal_create(hsa_signal_value_t initial_value, uin const hsa_agent_t* consumers, uint64_t attributes, hsa_signal_t* signal); +/** + * @brief Returns a pointer to the value of a signal. + * + * Use of this API does not modify the lifetime of ::signal and any + * hsa_signal_value_t retrieved by this API has lifetime equal to that of + * ::signal. + * + * This API is intended for partial interoperability with non-HSA compatible + * devices and should not be used where HSA interfaces are available. + * + * Use of the signal value must comply with use restritions of ::signal. + * Use may result in data races if the operations performed are not platform + * atomic. Use with HSA_AMD_SIGNAL_AMD_GPU_ONLY or HSA_AMD_SIGNAL_IPC + * attributed signals is required. + * + * @param[in] Signal handle to extract the signal value pointer from. + * + * @param[out] Location where the extracted signal value pointer will be placed. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL signal is not a valid hsa_signal_t + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT value_ptr is NULL. + */ +hsa_status_t hsa_amd_signal_value_pointer(hsa_signal_t signal, + volatile hsa_signal_value_t** value_ptr); + /** * @brief Asyncronous signal handler function type. * @@ -621,31 +775,68 @@ hsa_status_t HSA_API hsa_amd_image_get_info_max_dim(hsa_agent_t agent, void* value); /** - * @brief Set a CU affinity to specific queues within the process, this function - * call is "atomic". + * @brief Set a queue's CU affinity mask. + * + * @details Enables the queue to run on only selected CUs. The given mask is + * combined by bitwise AND with any device wide mask in HSA_CU_MASK before + * being applied. + * If num_cu_mask_count is 0 then the request is interpreted as a request to + * enable all CUs and no cu_mask array need be given. * * @param[in] queue A pointer to HSA queue. * - * @param[in] num_cu_mask_count Size of CUMask bit array passed in. + * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits. * * @param[in] cu_mask Bit-vector representing the CU mask. * * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. * + * @retval ::HSA_STATUS_CU_MASK_REDUCED The function was successfully executed + * but the given mask attempted to enable a CU which was disabled by + * HSA_CU_MASK. CUs disabled by HSA_CU_MASK remain disabled. + * + * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been + * initialized. + * + * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid. + * + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is not + * a multiple of 32 or @p num_cu_mask_count is not 0 and cu_mask is NULL. + * Devices with work group processors must even-index contiguous pairwise + * CU enable e.g. 0x33(b'110011) is valid while 0x5(0x101) and 0x6(b'0110) + * are invalid. + * + */ +hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, + uint32_t num_cu_mask_count, + const uint32_t* cu_mask); + +/** + * @brief Retrieve a queue's CU affinity mask. + * + * @details Returns the first num_cu_mask_count bits of a queue's CU mask. + * Ensure that num_cu_mask_count is at least as large as + * HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT to retrieve the entire mask. + * + * @param[in] queue A pointer to HSA queue. + * + * @param[in] num_cu_mask_count Size of CUMask bit array passed in, in bits. + * + * @param[out] cu_mask Bit-vector representing the CU mask. + * + * @retval ::HSA_STATUS_SUCCESS The function has been executed successfully. + * * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * * @retval ::HSA_STATUS_ERROR_INVALID_QUEUE @p queue is NULL or invalid. * - * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is not - * multiple of 32 or @p cu_mask is NULL. - * - * @retval ::HSA_STATUS_ERROR failed to call thunk api + * @retval ::HSA_STATUS_ERROR_INVALID_ARGUMENT @p num_cu_mask_count is 0, not + * a multiple of 32 or @p cu_mask is NULL. * */ -hsa_status_t HSA_API hsa_amd_queue_cu_set_mask(const hsa_queue_t* queue, - uint32_t num_cu_mask_count, - const uint32_t* cu_mask); +hsa_status_t HSA_API hsa_amd_queue_cu_get_mask(const hsa_queue_t* queue, uint32_t num_cu_mask_count, + uint32_t* cu_mask); /** * @brief Memory segments associated with a memory pool. @@ -778,6 +969,24 @@ typedef enum { HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16, } hsa_amd_memory_pool_info_t; +/** + * @brief Memory pool flag used to specify allocation directives + * + */ +typedef enum hsa_amd_memory_pool_flag_s { + /** + * Allocates memory that conforms to standard HSA memory consistency model + */ + HSA_AMD_MEMORY_POOL_STANDARD_FLAG = 0, + /** + * Allocates fine grain memory type where memory ordering is per point to point + * connection. Atomic memory operations on these memory buffers are not + * guaranteed to be visible at system scope. + */ + HSA_AMD_MEMORY_POOL_PCIE_FLAG = 1, + +} hsa_amd_memory_pool_flag_t; + /** * @brief Get the current value of an attribute of a memory pool. * @@ -846,7 +1055,7 @@ hsa_status_t HSA_API hsa_amd_agent_iterate_memory_pools( * ::HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE in @p memory_pool. * * @param[in] flags A bit-field that is used to specify allocation - * directives. Reserved parameter, must be 0. + * directives. * * @param[out] ptr Pointer to the location where to store the base virtual * address of @@ -903,6 +1112,8 @@ hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr); * * @param[in] dst_agent Agent associated with the @p dst. The agent must be able to directly * access both the source and destination buffers in their current locations. + * May be zero in which case the runtime will attempt to discover the destination agent. + * Discovery may have variable and/or high latency. * * @param[in] src A valid pointer to the source of data to be copied. The source * buffer must not overlap with the destination buffer, otherwise the copy will succeed @@ -910,6 +1121,8 @@ hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr); * * @param[in] src_agent Agent associated with the @p src. The agent must be able to directly * access both the source and destination buffers in their current locations. + * May be zero in which case the runtime will attempt to discover the destination agent. + * Discovery may have variable and/or high latency. * * @param[in] size Number of bytes to copy. If @p size is 0, no copy is * performed and the function returns success. Copying a number of bytes larger @@ -920,9 +1133,9 @@ hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr); * * @param[in] dep_signals List of signals that must be waited on before the copy * operation starts. The copy will start after every signal has been observed with - * the value 0. The dependent signal should not include completion signal from hsa_amd_memory_async_copy - * operation to be issued in future as that can result in a deadlock. If @p num_dep_signals is 0, this - * argument is ignored. + * the value 0. The dependent signal should not include completion signal from + * hsa_amd_memory_async_copy operation to be issued in future as that can result + * in a deadlock. If @p num_dep_signals is 0, this argument is ignored. * * @param[in] completion_signal Signal used to indicate completion of the copy * operation. When the copy operation is finished, the value of the signal is @@ -937,7 +1150,7 @@ hsa_status_t HSA_API hsa_amd_memory_pool_free(void* ptr); * @retval ::HSA_STATUS_ERROR_NOT_INITIALIZED The HSA runtime has not been * initialized. * - * @retval ::HSA_STATUS_ERROR_INVALID_AGENT The agent is invalid. + * @retval ::HSA_STATUS_ERROR_INVALID_AGENT An agent is invalid or no discovered agent has access. * * @retval ::HSA_STATUS_ERROR_INVALID_SIGNAL @p completion_signal is invalid. * @@ -1576,6 +1789,12 @@ typedef struct hsa_amd_pointer_info_s { GPU boards) any such agent may be returned. */ hsa_agent_t agentOwner; + /* + Contains a bitfield of hsa_amd_memory_pool_global_flag_t values. + Reports the effective global flags bitmask for the allocation. This field is not meaningful if + the type of the allocation is HSA_EXT_POINTER_TYPE_UNKNOWN. + */ + uint32_t global_flags; } hsa_amd_pointer_info_t; /** @@ -1611,7 +1830,7 @@ typedef struct hsa_amd_pointer_info_s { * * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT NULL in @p ptr or @p info. */ -hsa_status_t HSA_API hsa_amd_pointer_info(void* ptr, +hsa_status_t HSA_API hsa_amd_pointer_info(const void* ptr, hsa_amd_pointer_info_t* info, void* (*alloc)(size_t), uint32_t* num_agents_accessible, @@ -1635,7 +1854,7 @@ hsa_status_t HSA_API hsa_amd_pointer_info(void* ptr, * * @retval HSA_STATUS_ERROR_INVALID_ARGUMENT @p ptr is not known to ROCr. */ -hsa_status_t HSA_API hsa_amd_pointer_info_set_userdata(void* ptr, +hsa_status_t HSA_API hsa_amd_pointer_info_set_userdata(const void* ptr, void* userdata); /** @@ -1809,11 +2028,11 @@ typedef enum { // GPU attempted access to a host only page. HSA_AMD_MEMORY_FAULT_HOST_ONLY = 1 << 3, // DRAM ECC failure. - HSA_AMD_MEMORY_FAULT_DRAM_ECC = 1 << 4, + HSA_AMD_MEMORY_FAULT_DRAMECC = 1 << 4, // Can't determine the exact fault address. HSA_AMD_MEMORY_FAULT_IMPRECISE = 1 << 5, // SRAM ECC failure (ie registers, no fault address). - HSA_AMD_MEMORY_FAULT_SRAM_ECC = 1 << 6, + HSA_AMD_MEMORY_FAULT_SRAMECC = 1 << 6, // GPU reset following unspecified hang. HSA_AMD_MEMORY_FAULT_HANG = 1 << 31 } hsa_amd_memory_fault_reason_t; @@ -1970,6 +2189,181 @@ hsa_status_t HSA_API hsa_amd_register_deallocation_callback(void* ptr, hsa_status_t HSA_API hsa_amd_deregister_deallocation_callback(void* ptr, hsa_amd_deallocation_callback_t callback); +typedef enum hsa_amd_svm_model_s { + /** + * Updates to memory with this attribute conform to HSA memory consistency + * model. + */ + HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED = 0, + /** + * Writes to memory with this attribute can be performed by a single agent + * at a time. + */ + HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED = 1, + /** + * Memory region queried contains subregions with both + * HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED and + * HSA_AMD_SVM_GLOBAL_FLAG_FINE_GRAINED attributes. + * + * This attribute can not be used in hsa_amd_svm_attributes_set. It is a + * possible return from hsa_amd_svm_attributes_get indicating that the query + * region contains both coarse and fine grained memory. + */ + HSA_AMD_SVM_GLOBAL_FLAG_INDETERMINATE = 2 +} hsa_amd_svm_model_t; + +typedef enum hsa_amd_svm_attribute_s { + // Memory model attribute. + // Type of this attribute is hsa_amd_svm_model_t. + HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG = 0, + // Marks the range read only. This allows multiple physical copies to be + // placed local to each accessing device. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_READ_ONLY = 1, + // Automatic migrations should attempt to keep the memory within the xgmi hive + // containing accessible agents. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_HIVE_LOCAL = 2, + // Page granularity to migrate at once. Page granularity is specified as + // log2(page_count). + // Type of this attribute is uint64_t. + HSA_AMD_SVM_ATTRIB_MIGRATION_GRANULARITY = 3, + // Physical location to prefer when automatic migration occurs. + // Set to the null agent handle (handle == 0) to indicate there + // is no preferred location. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_PREFERRED_LOCATION = 4, + // This attribute can not be used in ::hsa_amd_svm_attributes_set (see + // ::hsa_amd_svm_prefetch_async). + // Queries the physical location of most recent prefetch command. + // If the prefetch location has not been set or is not uniform across the + // address range then returned hsa_agent_t::handle will be 0. + // Querying this attribute will return the destination agent of the most + // recent ::hsa_amd_svm_prefetch_async targeting the address range. If + // multiple async prefetches have been issued targeting the region and the + // most recently issued prefetch has completed then the query will return + // the location of the most recently completed prefetch. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION = 5, + // Optimizes with the anticipation that the majority of operations to the + // range will be read operations. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_READ_MOSTLY = 6, + // Allows the execution on GPU. + // Type of this attribute is bool. + HSA_AMD_SVM_ATTRIB_GPU_EXEC = 7, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Enables an agent for access to the range. Access may incur a page fault + // and associated memory migration. Either this or + // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE is required prior to SVM + // access if HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE = 0x200, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Enables an agent for access to the range without page faults. Access + // will not incur a page fault and will not cause access based migration. + // and associated memory migration. Either this or + // HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE is required prior to SVM access if + // HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT is false. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE = 0x201, + // This attribute can not be used in ::hsa_amd_svm_attributes_get. + // Denies an agent access to the memory range. Access will cause a terminal + // segfault. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_AGENT_NO_ACCESS = 0x202, + // This attribute can not be used in ::hsa_amd_svm_attributes_set. + // Returns the access attribute associated with the agent. + // The agent to query must be set in the attribute value field. + // The attribute enum will be replaced with the agent's current access + // attribute for the address range. + // TODO: Clarify KFD return value for non-uniform access attribute. + // Type of this attribute is hsa_agent_t. + HSA_AMD_SVM_ATTRIB_ACCESS_QUERY = 0x203, +} hsa_amd_svm_attribute_t; + +// List type for hsa_amd_svm_attributes_set/get. +typedef struct hsa_amd_svm_attribute_pair_s { + // hsa_amd_svm_attribute_t value. + uint64_t attribute; + // Attribute value. Bit values should be interpreted according to the type + // given in the associated attribute description. + uint64_t value; +} hsa_amd_svm_attribute_pair_t; + +/** + * @brief Sets SVM memory attributes. + * + * If HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT returns false then enabling + * access to an Agent via this API (setting HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE + * or HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE) is required prior to SVM + * memory access by that Agent. + * + * Attributes HSA_AMD_SVM_ATTRIB_ACCESS_QUERY and HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION + * may not be used with this API. + * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in] attribute_list List of attributes to set for the address range. + * + * @param[in] attribute_count Length of @p attribute_list. + */ +hsa_status_t hsa_amd_svm_attributes_set(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +/** + * @brief Gets SVM memory attributes. + * + * Attributes HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE, + * HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE and + * HSA_AMD_SVM_ATTRIB_PREFETCH_LOCATION may not be used with this API. + * + * Note that attribute HSA_AMD_SVM_ATTRIB_ACCESS_QUERY takes as input an + * hsa_agent_t and returns the current access type through its attribute field. + * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in] attribute_list List of attributes to set for the address range. + * + * @param[in] attribute_count Length of @p attribute_list. + */ +hsa_status_t hsa_amd_svm_attributes_get(void* ptr, size_t size, + hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); + +/** + * @brief Asynchronously migrates memory to an agent. + * + * Schedules memory migration to @p agent when @p dep_signals have been observed equal to zero. + * @p completion_signal will decrement when the migration is complete. + * + * @param[in] ptr Will be aligned down to nearest page boundary. + * + * @param[in] size Will be aligned up to nearest page boundary. + * + * @param[in] agent Agent to migrate to. + * + * @param[in] num_dep_signals Number of dependent signals. Can be 0. + * + * @param[in] dep_signals List of signals that must be waited on before the migration + * operation starts. The migration will start after every signal has been observed with + * the value 0. If @p num_dep_signals is 0, this argument is ignored. + * + * @param[in] completion_signal Signal used to indicate completion of the migration + * operation. When the migration operation is finished, the value of the signal is + * decremented. The runtime indicates that an error has occurred during the copy + * operation by setting the value of the completion signal to a negative + * number. If no completion signal is required this handle may be null. + */ +hsa_status_t hsa_amd_svm_prefetch_async(void* ptr, size_t size, hsa_agent_t agent, + uint32_t num_dep_signals, const hsa_signal_t* dep_signals, + hsa_signal_t completion_signal); + #ifdef __cplusplus } // end extern "C" block #endif diff --git a/include/hsa_ext_image.h b/include/hsa_ext_image.h index e94d8da202d6..ab820b3e3c87 100644 --- a/include/hsa_ext_image.h +++ b/include/hsa_ext_image.h @@ -1,6 +1,6 @@ //////////////////////////////////////////////////////////////////////////////// // -// Copyright (C) 2014-2020 Advanced Micro Devices Inc. All rights reserved. +// Copyright (C) 2014-2022 Advanced Micro Devices Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person or organization // obtaining a copy of the software and accompanying documentation covered by diff --git a/libgomp/Makefile.in b/libgomp/Makefile.in index 5f8a5f570647..5dca37c5aac5 100644 --- a/libgomp/Makefile.in +++ b/libgomp/Makefile.in @@ -187,7 +187,9 @@ am__DEPENDENCIES_1 = @PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_DEPENDENCIES = libgomp.la \ @PLUGIN_GCN_TRUE@ $(am__DEPENDENCIES_1) @PLUGIN_GCN_TRUE@am_libgomp_plugin_gcn_la_OBJECTS = \ -@PLUGIN_GCN_TRUE@ libgomp_plugin_gcn_la-plugin-gcn.lo +@PLUGIN_GCN_TRUE@ libgomp_plugin_gcn_la-plugin-gcn.lo \ +@PLUGIN_GCN_TRUE@ libgomp_plugin_gcn_la-simple-allocator.lo \ +@PLUGIN_GCN_TRUE@ libgomp_plugin_gcn_la-mutex.lo libgomp_plugin_gcn_la_OBJECTS = $(am_libgomp_plugin_gcn_la_OBJECTS) AM_V_lt = $(am__v_lt_@AM_V@) am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@) @@ -584,7 +586,9 @@ libgomp_la_SOURCES = alloc.c atomic.c barrier.c critical.c env.c \ # AMD GCN plugin @PLUGIN_GCN_TRUE@libgomp_plugin_gcn_version_info = -version-info $(libtool_VERSION) -@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c +@PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c simple-allocator.c \ +@PLUGIN_GCN_TRUE@ plugin/mutex.c + @PLUGIN_GCN_TRUE@libgomp_plugin_gcn_la_CPPFLAGS = $(AM_CPPFLAGS) \ @PLUGIN_GCN_TRUE@ -D_GNU_SOURCE @@ -760,7 +764,9 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/iter_ull.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp-plugin.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_gcn_la-mutex.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_gcn_la-plugin-gcn.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_gcn_la-simple-allocator.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/lock.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/loop.Plo@am__quote@ @@ -823,6 +829,20 @@ libgomp_plugin_gcn_la-plugin-gcn.lo: plugin/plugin-gcn.c @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libgomp_plugin_gcn_la-plugin-gcn.lo `test -f 'plugin/plugin-gcn.c' || echo '$(srcdir)/'`plugin/plugin-gcn.c +libgomp_plugin_gcn_la-simple-allocator.lo: simple-allocator.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_gcn_la-simple-allocator.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_gcn_la-simple-allocator.Tpo -c -o libgomp_plugin_gcn_la-simple-allocator.lo `test -f 'simple-allocator.c' || echo '$(srcdir)/'`simple-allocator.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libgomp_plugin_gcn_la-simple-allocator.Tpo $(DEPDIR)/libgomp_plugin_gcn_la-simple-allocator.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='simple-allocator.c' object='libgomp_plugin_gcn_la-simple-allocator.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libgomp_plugin_gcn_la-simple-allocator.lo `test -f 'simple-allocator.c' || echo '$(srcdir)/'`simple-allocator.c + +libgomp_plugin_gcn_la-mutex.lo: plugin/mutex.c +@am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_gcn_la-mutex.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_gcn_la-mutex.Tpo -c -o libgomp_plugin_gcn_la-mutex.lo `test -f 'plugin/mutex.c' || echo '$(srcdir)/'`plugin/mutex.c +@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libgomp_plugin_gcn_la-mutex.Tpo $(DEPDIR)/libgomp_plugin_gcn_la-mutex.Plo +@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='plugin/mutex.c' object='libgomp_plugin_gcn_la-mutex.lo' libtool=yes @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_gcn_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_gcn_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o libgomp_plugin_gcn_la-mutex.lo `test -f 'plugin/mutex.c' || echo '$(srcdir)/'`plugin/mutex.c + libgomp_plugin_nvptx_la-plugin-nvptx.lo: plugin/plugin-nvptx.c @am__fastdepCC_TRUE@ $(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(libgomp_plugin_nvptx_la_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(libgomp_plugin_nvptx_la_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT libgomp_plugin_nvptx_la-plugin-nvptx.lo -MD -MP -MF $(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Tpo -c -o libgomp_plugin_nvptx_la-plugin-nvptx.lo `test -f 'plugin/plugin-nvptx.c' || echo '$(srcdir)/'`plugin/plugin-nvptx.c @am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Tpo $(DEPDIR)/libgomp_plugin_nvptx_la-plugin-nvptx.Plo diff --git a/libgomp/libgomp-plugin.h b/libgomp/libgomp-plugin.h index 5b4704484dd0..71e74527e719 100644 --- a/libgomp/libgomp-plugin.h +++ b/libgomp/libgomp-plugin.h @@ -236,6 +236,18 @@ extern const char *GOMP_OFFLOAD_get_interop_type_desc (struct interop_obj_t *, omp_interop_property_t); #endif +/* simple-allocator.c */ + +typedef struct gomp_simple_alloc_context *gomp_simple_alloc_ctx_p; + +gomp_simple_alloc_ctx_p gomp_simple_alloc_init_context (); +void gomp_simple_alloc_register_memory (gomp_simple_alloc_ctx_p ctx, + char *base, size_t size); +void *gomp_simple_alloc (gomp_simple_alloc_ctx_p ctx, size_t size); +void gomp_simple_free (gomp_simple_alloc_ctx_p ctx, void *addr); +void *gomp_simple_realloc (gomp_simple_alloc_ctx_p ctx, void *addr, + size_t newsize); + #ifdef __cplusplus } #endif diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index ff445d1e90c6..46db7d41f322 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -1676,16 +1676,7 @@ gomp_thread_to_pthread_t (struct gomp_thread *thr) } #endif -/* simple-allocator.c */ - -typedef struct gomp_simple_alloc_context *gomp_simple_alloc_ctx_p; - -gomp_simple_alloc_ctx_p gomp_simple_alloc_init_context (); -void gomp_simple_alloc_register_memory (gomp_simple_alloc_ctx_p ctx, - char *base, size_t size); -void *gomp_simple_alloc (gomp_simple_alloc_ctx_p ctx, size_t size); -void gomp_simple_free (gomp_simple_alloc_ctx_p ctx, void *addr); -void *gomp_simple_realloc (gomp_simple_alloc_ctx_p ctx, void *addr, - size_t newsize); +/* simple-allocator.c has its prototypes in libgomp-plugin.h so it's + accessible from both. */ #endif /* LIBGOMP_H */ diff --git a/libgomp/libgomp.texi b/libgomp/libgomp.texi index 2d37c8731744..abe8ef75d8e7 100644 --- a/libgomp/libgomp.texi +++ b/libgomp/libgomp.texi @@ -7185,11 +7185,26 @@ The implementation remark: a performance boost for NVPTX offload code and also allows unlimited use of pinned memory regardless of the OS @code{ulimit}/@code{rlimit} settings. -@item Managed memory allocated with the OpenMP +@item Managed memory allocated on the host with the @code{ompx_gnu_managed_mem_alloc} allocator or in the - @code{ompx_gnu_managed_mem_space} is not currently supported for AMD GPU - devices; attempting to use it in an allocator will trigger the fall-back - trait. + @code{ompx_gnu_managed_mem_space} (both GNU extensions) allocate memory + equivalent to HIP Managed Memory, although @emph{not} actually allocated + using @code{hipMallocManaged}. This memory is accessible by both the + host and the device at the same address, so it need not be mapped with + @code{map} clauses. Instead, use the @code{is_device_ptr} clause or + @code{has_device_addr} clause to indicate that the pointer is already + accessible on the device. The ROCm runtime will automatically handle + data migration between host and device as needed. Not all AMD GPU + devices support this feature, and many that do require that + @code{-mxnack=on} is configured at compile time. If managed memory is + not supported by the default device, as configured at the moment the + allocator is called, then the allocator will use the fall-back setting. + If the default device is configured differently when the memory is freed, + via @code{omp_free} or @code{omp_realloc}, the result may be undefined. + If the current device does not support Unified Shared Memory (or it is + not enabled with @code{HSA_XNACK=1}) then Managed Memory might still + work, but allocations may only be visible to a single device (whichever + was the default device when the @emph{first} allocation was made). @item The OpenMP routines @code{omp_target_memcpy_rect} and @code{omp_target_memcpy_rect_async} and the @code{target update} directive for non-contiguous list items use the 3D memory-copy function diff --git a/libgomp/plugin/Makefrag.am b/libgomp/plugin/Makefrag.am index 9c273e72f789..dbc02f3cda8e 100644 --- a/libgomp/plugin/Makefrag.am +++ b/libgomp/plugin/Makefrag.am @@ -57,7 +57,8 @@ if PLUGIN_GCN # AMD GCN plugin libgomp_plugin_gcn_version_info = -version-info $(libtool_VERSION) toolexeclib_LTLIBRARIES += libgomp-plugin-gcn.la -libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c +libgomp_plugin_gcn_la_SOURCES = plugin/plugin-gcn.c simple-allocator.c \ + plugin/mutex.c libgomp_plugin_gcn_la_CPPFLAGS = $(AM_CPPFLAGS) \ -D_GNU_SOURCE libgomp_plugin_gcn_la_LDFLAGS = $(libgomp_plugin_gcn_version_info) \ diff --git a/libgomp/plugin/mutex.c b/libgomp/plugin/mutex.c new file mode 100644 index 000000000000..e6981ad0c919 --- /dev/null +++ b/libgomp/plugin/mutex.c @@ -0,0 +1,58 @@ +/* Mutex implementation for libgomp plugins. + + Copyright (C) 2025 Free Software Foundation, Inc. + + Contributed by BayLibre + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* This is a minimal implementation of the gomp_mutex_t spinlocks, but + without all the dependencies used by the config/linux/mutex implementation. + + At the time of writing, this is only used by simple_alloc which has + short-lived locks and should be fine with these. The actual locks are in + a header file, so only the fallback "slow" functions are needed here. */ + +#include "config.h" +#include +#include "libgomp.h" + +#ifndef HAVE_SYNC_BUILTINS +#error "HAVE_SYNC_BUILTINS is required to build this" +#endif + +void +gomp_mutex_lock_slow (gomp_mutex_t *mutex, int oldval) +{ + while (oldval == 1) + { + usleep (1); + oldval = __atomic_exchange_n (mutex, 1, __ATOMIC_ACQUIRE); + } +} + +void +gomp_mutex_unlock_slow (gomp_mutex_t *mutex) +{ + GOMP_PLUGIN_fatal ("gomp_mutex_unlock_slow should be unreachable"); +} diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c index cd5a19b03551..ece41c59bbb5 100644 --- a/libgomp/plugin/plugin-gcn.c +++ b/libgomp/plugin/plugin-gcn.c @@ -50,6 +50,8 @@ #include "oacc-plugin.h" #include "oacc-int.h" #include +#include +#include /* Create hash-table for declare target's indirect clause on the host; see build-target-indirect-htab.h for details. */ @@ -228,6 +230,9 @@ struct hsa_runtime_fn_info const hsa_dim3_t *range, hsa_agent_t copy_agent, hsa_amd_copy_direction_t dir, uint32_t num_dep_signals, const hsa_signal_t *dep_signals, hsa_signal_t completion_signal); + hsa_status_t (*hsa_amd_svm_attributes_set_fn) + (void* ptr, size_t size, hsa_amd_svm_attribute_pair_t* attribute_list, + size_t attribute_count); }; /* As an HIP runtime is dlopened, following structure defines function @@ -746,6 +751,24 @@ dump_hsa_system_info (void) } else GCN_WARNING ("HSA_SYSTEM_INFO_EXTENSIONS: FAILED\n"); + + bool svm_supported; + status = hsa_fns.hsa_system_get_info_fn + (HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED, &svm_supported); + if (status == HSA_STATUS_SUCCESS) + GCN_DEBUG ("HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: %s\n", + (svm_supported ? "TRUE" : "FALSE")); + else + GCN_WARNING ("HSA_AMD_SYSTEM_INFO_SVM_SUPPORTED: FAILED\n"); + + bool svm_accessible; + status = hsa_fns.hsa_system_get_info_fn + (HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT, &svm_accessible); + if (status == HSA_STATUS_SUCCESS) + GCN_DEBUG ("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: %s\n", + (svm_accessible ? "TRUE" : "FALSE")); + else + GCN_WARNING ("HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT: FAILED\n"); } /* Dump information about the available hardware. */ @@ -1470,6 +1493,7 @@ init_hsa_runtime_functions (void) DLSYM_OPT_FN (hsa_amd_memory_lock) DLSYM_OPT_FN (hsa_amd_memory_unlock) DLSYM_OPT_FN (hsa_amd_memory_async_copy_rect) + DLSYM_OPT_FN (hsa_amd_svm_attributes_set) return true; #undef DLSYM_OPT_FN #undef DLSYM_FN @@ -2527,6 +2551,13 @@ isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image, "Consider using ROCR_VISIBLE_DEVICES to disable incompatible " "devices or run with LOADER_ENABLE_LOGGING=1 for more details.", device_isa_s, agent_isa_s, agent->device_id); + else if (strcmp (device_isa_s, agent_isa_s) == 0) + snprintf (msg, sizeof msg, + "GCN code object features do not match for an unknown reason " + "(device %d).\n" + "Try to adjust the HSA_XNACK setting (perhaps?), or use\n" + "ROCR_VISIBLE_DEVICES to disable incompatible devices.\n", + agent->device_id); else snprintf (msg, sizeof msg, "GCN code object ISA '%s' is incompatible with GPU ISA '%s' " @@ -3188,6 +3219,125 @@ wait_queue (struct goacc_asyncqueue *aq) } /* }}} */ +/* {{{ Managed Memory + + This implements an allocator equivalent to CUDA "Managed" memory, in which + the pages automatically migrate between host and device memory, as needed. + These allocations are visible from both the host and devices without the + need for explicit mappings. However, OpenMP does need "is_device_ptr" or + "has_device_addr" to function properly. + + There isn't a high-level HSA/ROCr API to allocate managed memory, so we + use regular memory and register it with the driver by setting it to + "coarse-grained" mode, and setting the "accessible by default" attribute + on devices where HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT isn't set + as standard (as it isn't on systems that don't support USM, or when + HSA_XNACK != 1). + + This is in contrast to GOMP_OFFLOAD_alloc which allocates coarse-grained + *GPU memory*, which is not visible on the host. + + It would be possible to register memory returned by malloc, but + experimentation shows that doing so causes memory faults within the HSA + runtime code. Therefore, the Managed memory space is allocated as a + largish block and then subdivided via a custom allocator. The "simple" + allocator is designed specifically to store its free-chain outside of + the registered pages so that allocation does not inadvertently cause + pages to migrate. + + Note: if the user has multiple mismatched devices, and one or more do + not support USM (or XNACK is off), then each page of the Managed heap + could end up associated with a different device (by calling omp_alloc + before and after omp_set_default_device). This issue remains + an *unhandled* edge-case, at present. */ + +gomp_simple_alloc_ctx_p managed_ctx = NULL; + +/* Initialize or extend the Managed memory space. This is called whenever + allocation fails. SIZE is the minimum size required for the failed + allocation to succeed; the function may choose a larger size. + Note that Linux lazy allocation means that the memory returned isn't + guaranteed to actually exist. */ + +static bool +managed_heap_create (struct agent_info *agent, size_t size) +{ + static int lock = 0; + while (__atomic_exchange_n (&lock, 1, __ATOMIC_ACQUIRE) != 0) + ; + + size_t default_size = 1L * 1024 * 1024 * 1024; /* 1GB */ + if (size < default_size) + size = default_size; + + /* Round up to a whole page. */ + int pagesize = getpagesize (); + int misalignment = size % pagesize; + if (misalignment > 0) + size += pagesize - misalignment; + + /* Try to get contiguous memory, but it might not be possible. + The most recent previous allocation is at the head of the list. */ + static void *addrhint = NULL; + void *new_pages = mmap (addrhint, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (!new_pages) + { + GCN_DEBUG ("Could not allocate Managed Memory heap."); + __atomic_store_n (&lock, 0, __ATOMIC_RELEASE); + return false; + } + + /* Register the heap allocation as coarse grained, "Managed" memory. */ + struct hsa_amd_svm_attribute_pair_s attr = { + HSA_AMD_SVM_ATTRIB_GLOBAL_FLAG, + HSA_AMD_SVM_GLOBAL_FLAG_COARSE_GRAINED + }; + hsa_status_t status = hsa_fns.hsa_amd_svm_attributes_set_fn (new_pages, size, + &attr, 1); + if (status != HSA_STATUS_SUCCESS) + GOMP_PLUGIN_fatal ("Failed to allocate Unified Shared Memory;" + " please update your drivers and/or kernel"); + + /* The HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE setting is required on devices + without default SVM. */ + static int svm_accessible = 0xff; /* Use 0xff as "undefined". */ + if (svm_accessible == 0xff) + { + status = hsa_fns.hsa_system_get_info_fn + (HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT, &svm_accessible); + if (status != HSA_STATUS_SUCCESS) + { + GCN_DEBUG ("warning: failed to query " + " HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT\n"); + svm_accessible = false; + } + } + if (svm_accessible == false) + { + struct hsa_amd_svm_attribute_pair_s attr2; + attr2.attribute = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE; + attr2.value = agent->id.handle; + status = hsa_fns.hsa_amd_svm_attributes_set_fn (new_pages, size, &attr2, + 1); + if (status != HSA_STATUS_SUCCESS) + GOMP_PLUGIN_fatal ("Failed to allocate Unified Shared Memory;" + " please update your drivers and/or kernel"); + } + + addrhint = new_pages + size; + + /* Initialize a new Managed memory heap, or add the new memory into an + existing Managed memory heap. */ + if (!managed_ctx) + managed_ctx = gomp_simple_alloc_init_context (); + gomp_simple_alloc_register_memory (managed_ctx, new_pages, size); + + __atomic_store_n (&lock, 0, __ATOMIC_RELEASE); + return true; +} + +/* }}} */ /* {{{ OpenACC support */ /* Execute an OpenACC kernel, synchronously or asynchronously. */ @@ -5061,6 +5211,35 @@ GOMP_OFFLOAD_async_run (int device, void *tgt_fn, void *tgt_vars, GOMP_PLUGIN_target_task_completion, async_data); } +/* Allocate memory suitable for Managed Memory. */ + +void * +GOMP_OFFLOAD_managed_alloc (int device, size_t size) +{ + struct agent_info *agent = get_agent_info (device); + while (1) + { + void *result = gomp_simple_alloc (managed_ctx, size); + if (result) + return result; + + /* Allocation failed. Try again if we can create a new heap block. + Note: it's possible another thread could get to the new memory + first, so the while loop is necessary. */ + if (!managed_heap_create (agent, size)) + return NULL; + } +} + +/* Free memory allocated via GOMP_OFFLOAD_managed_alloc. */ + +bool +GOMP_OFFLOAD_managed_free (int device, void *ptr) +{ + gomp_simple_free (managed_ctx, ptr); + return true; +} + /* }}} */ /* {{{ OpenACC Plugin API */ diff --git a/libgomp/simple-allocator.c b/libgomp/simple-allocator.c index 531bd18e74a6..25ec2c77c727 100644 --- a/libgomp/simple-allocator.c +++ b/libgomp/simple-allocator.c @@ -312,4 +312,5 @@ gomp_simple_realloc (gomp_simple_alloc_ctx_p ctx, void *addr, size_t newsize) /* Include the splay tree code inline, with the prefixes added. */ #define splay_tree_prefix simple_alloc #define splay_tree_c +#define gomp_fatal GOMP_PLUGIN_fatal /* So it links into a plugin. */ #include "splay-tree.h" diff --git a/libgomp/testsuite/lib/libgomp.exp b/libgomp/testsuite/lib/libgomp.exp index ba55cd39e2ba..f5683b50725a 100644 --- a/libgomp/testsuite/lib/libgomp.exp +++ b/libgomp/testsuite/lib/libgomp.exp @@ -729,5 +729,41 @@ proc check_effective_target_omp_managedmem { } { if { [check_effective_target_offload_device_nvptx] } { return 1 } + + if { [libgomp_check_effective_target_offload_target "amdgcn"] } { + if [check_runtime_nocache managed_available_ { + #include + #include + int main () + { + const omp_alloctrait_t traits[] = { + { omp_atk_fallback, omp_atv_null_fb } + }; + omp_allocator_handle_t managed_no_fallback + = omp_init_allocator (ompx_gnu_managed_mem_space, 1, traits); + void *a = omp_alloc (16, managed_no_fallback); + return a == NULL; + } + } ] { + return 1 + } + } + + return 0 +} + +# return 1 if -mxnack=on is accepted + +proc check_effective_target_offload_target_amdgcn_with_xnack { } { + if { [libgomp_check_effective_target_offload_target "amdgcn"] } { + return [check_no_compiler_messages amd_xnack_ executable { + int main () { + #pragma omp target + ; + return 0; + } + } "-foffload-options=amdgcn-amdhsa=-mxnack=on" ] + } + return 0 } diff --git a/libgomp/testsuite/libgomp.c++/alloc-managed-1.C b/libgomp/testsuite/libgomp.c++/alloc-managed-1.C index afd7fd648c67..86de0aac4004 100644 --- a/libgomp/testsuite/libgomp.c++/alloc-managed-1.C +++ b/libgomp/testsuite/libgomp.c++/alloc-managed-1.C @@ -1,5 +1,6 @@ // { dg-do run } // { dg-require-effective-target omp_managedmem } +// { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn_with_xnack } } // Check that the ompx::allocator::gnu_managed_mem allocator can allocate // Managed Memory, and that host and target can see the data, at the same diff --git a/libgomp/testsuite/libgomp.c-c++-common/requires-4.c b/libgomp/testsuite/libgomp.c-c++-common/requires-4.c index 8cb4821ee53c..2ebd9c1c8ce6 100644 --- a/libgomp/testsuite/libgomp.c-c++-common/requires-4.c +++ b/libgomp/testsuite/libgomp.c-c++-common/requires-4.c @@ -3,6 +3,10 @@ /* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */ /* { dg-additional-sources requires-4-aux.c } */ +/* GCC explicitly disables XNACK for gfx908 (and others) as the hardware + support is limited, which results in a diagnostic. */ +/* { dg-excess-errors "Unified Shared Memory is enabled, but XNACK is disabled" { target offload_target_amdgcn } } */ + /* Check no diagnostic by device-compiler's or host compiler's lto1. Other file uses: 'requires reverse_offload', but that's inactive as there are no declare target directives, device constructs nor device routines */ diff --git a/libgomp/testsuite/libgomp.c-c++-common/requires-4a.c b/libgomp/testsuite/libgomp.c-c++-common/requires-4a.c index 0e0db927c2c1..85301cf31d97 100644 --- a/libgomp/testsuite/libgomp.c-c++-common/requires-4a.c +++ b/libgomp/testsuite/libgomp.c-c++-common/requires-4a.c @@ -3,6 +3,10 @@ /* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */ /* { dg-additional-sources requires-4-aux.c } */ +/* GCC explicitly disables XNACK for gfx908 (and others) as the hardware + support is limited, which results in a diagnostic. */ +/* { dg-excess-errors "Unified Shared Memory is enabled, but XNACK is disabled" { target offload_target_amdgcn } } */ + /* Same as requires-4.c, but uses heap memory for 'a'. */ /* Check no diagnostic by device-compiler's or host compiler's lto1. diff --git a/libgomp/testsuite/libgomp.c-c++-common/requires-5.c b/libgomp/testsuite/libgomp.c-c++-common/requires-5.c index d43d78db6fab..4fd7f1c7885c 100644 --- a/libgomp/testsuite/libgomp.c-c++-common/requires-5.c +++ b/libgomp/testsuite/libgomp.c-c++-common/requires-5.c @@ -1,5 +1,6 @@ /* { dg-additional-options "-foffload-options=nvptx-none=-misa=sm_35" { target { offload_target_nvptx } } } */ /* { dg-additional-sources requires-5-aux.c } */ +/* { dg-excess-errors "Unified Shared Memory is enabled, but XNACK is disabled" { target offload_target_amdgcn } } */ /* Depending on offload device capabilities, it may print something like the following (only) if GOMP_DEBUG=1: diff --git a/libgomp/testsuite/libgomp.c/alloc-managed-1.c b/libgomp/testsuite/libgomp.c/alloc-managed-1.c index 31b252fc0ae6..88ddcf36d4af 100644 --- a/libgomp/testsuite/libgomp.c/alloc-managed-1.c +++ b/libgomp/testsuite/libgomp.c/alloc-managed-1.c @@ -1,5 +1,6 @@ /* { dg-do run } */ /* { dg-require-effective-target omp_managedmem } */ +/* { dg-additional-options "-foffload-options=amdgcn-amdhsa=-mxnack=on" { target offload_target_amdgcn_with_xnack } } */ /* Check that omp_alloc can allocate Managed Memory, and that host and target can see the data, at the same address, without a mapping. */ diff --git a/libgomp/testsuite/libgomp.c/alloc-managed-2.c b/libgomp/testsuite/libgomp.c/alloc-managed-2.c index f7fd30a4f679..660f6e6ed255 100644 --- a/libgomp/testsuite/libgomp.c/alloc-managed-2.c +++ b/libgomp/testsuite/libgomp.c/alloc-managed-2.c @@ -1,5 +1,6 @@ /* { dg-do run } */ /* { dg-require-effective-target omp_managedmem } */ +/* { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn_with_xnack } } */ /* Check that omp_calloc can allocate Managed Memory, and that host and target can see the data, at the same address, without a mapping. */ diff --git a/libgomp/testsuite/libgomp.c/alloc-managed-3.c b/libgomp/testsuite/libgomp.c/alloc-managed-3.c index 17828b76962e..fefdeb3a932e 100644 --- a/libgomp/testsuite/libgomp.c/alloc-managed-3.c +++ b/libgomp/testsuite/libgomp.c/alloc-managed-3.c @@ -1,5 +1,6 @@ /* { dg-do run } */ /* { dg-require-effective-target omp_managedmem } */ +/* { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn_with_xnack } } */ /* Check that omp_realloc can allocate Managed Memory, and that host and target can see the data, at the same address, without a mapping. */ diff --git a/libgomp/testsuite/libgomp.c/alloc-managed-4.c b/libgomp/testsuite/libgomp.c/alloc-managed-4.c index 4eaf8259b6fc..577e3e28ec1f 100644 --- a/libgomp/testsuite/libgomp.c/alloc-managed-4.c +++ b/libgomp/testsuite/libgomp.c/alloc-managed-4.c @@ -1,5 +1,6 @@ /* { dg-do run } */ /* { dg-require-effective-target omp_managedmem } */ +/* { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn_with_xnack } } */ /* { dg-shouldfail "" } */ /* { dg-output "libgomp: attempted to free managed memory at 0x\[0-9a-f\]+, but the default device is set to the host device" } */ diff --git a/libgomp/testsuite/libgomp.fortran/alloc-managed-1.f90 b/libgomp/testsuite/libgomp.fortran/alloc-managed-1.f90 index 685aeef7dae2..e19eb043daaa 100644 --- a/libgomp/testsuite/libgomp.fortran/alloc-managed-1.f90 +++ b/libgomp/testsuite/libgomp.fortran/alloc-managed-1.f90 @@ -1,5 +1,6 @@ ! { dg-do run } ! { dg-require-effective-target omp_managedmem } +! { dg-additional-options -foffload-options=amdgcn-amdhsa=-mxnack=on { target offload_target_amdgcn_with_xnack } } ! Check that omp_alloc can allocate Managed Memory, and that host and target ! can see the data, at the same address, without a mapping.