diff --git a/gcc/config/gcn/gcn-run.cc b/gcc/config/gcn/gcn-run.cc index 606772e6212..4232a17b784 100644 --- a/gcc/config/gcn/gcn-run.cc +++ b/gcc/config/gcn/gcn-run.cc @@ -35,6 +35,7 @@ #include #include "hsa.h" +#include "../../../libgomp/config/gcn/libgomp-gcn.h" #ifndef HSA_RUNTIME_LIB #define HSA_RUNTIME_LIB "libhsa-runtime64.so.1" @@ -487,39 +488,16 @@ device_malloc (size_t size, hsa_region_t region) automatically assign the exit value to *return_value. */ struct kernargs { - /* Kernargs. */ - int32_t argc; - int64_t argv; - int64_t out_ptr; - int64_t heap_ptr; - - /* Output data. */ - struct output - { - int return_value; - unsigned int next_output; - struct printf_data - { - int written; - char msg[128]; - int type; - union - { - int64_t ivalue; - double dvalue; - char text[128]; - }; - } queue[1024]; - unsigned int consumed; - } output_data; + union { + struct { + int32_t argc; + int64_t argv; + } args; + struct kernargs_abi abi; + }; + struct output output_data; }; -struct heap -{ - int64_t size; - char data[0]; -} heap; - /* Print any console output from the kernel. We print all entries from "consumed" to the next entry without a "written" flag, or "next_output" is reached. The buffer is circular, but the @@ -687,6 +665,16 @@ main (int argc, char *argv[]) for (int i = 0; i < kernel_argc; i++) args_size += strlen (kernel_argv[i]) + 1; + /* The device stack can be adjusted via an environment variable. */ + char *envvar = getenv ("GCN_STACK_SIZE"); + int stack_size = 1 * 1024 * 1024; /* 1MB default. */ + if (envvar) + { + int val = atoi (envvar); + if (val) + stack_size = val; + } + /* Allocate device memory for both function parameters and the argv data. 
*/ struct kernargs *kernargs = device_malloc (sizeof (*kernargs), @@ -702,11 +690,12 @@ main (int argc, char *argv[]) XHSA (hsa_fns.hsa_memory_assign_agent_fn (heap, device, HSA_ACCESS_PERMISSION_RW), "Assign heap to device agent"); + void *stack = device_malloc (stack_size, heap_region); /* Write the data to the target. */ - kernargs->argc = kernel_argc; - kernargs->argv = (int64_t) args->argv_data; - kernargs->out_ptr = (int64_t) &kernargs->output_data; + kernargs->args.argc = kernel_argc; + kernargs->args.argv = (int64_t) args->argv_data; + kernargs->abi.out_ptr = (int64_t) &kernargs->output_data; kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */ kernargs->output_data.next_output = 0; for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue) @@ -721,8 +710,11 @@ main (int argc, char *argv[]) memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1); offset += arg_len; } - kernargs->heap_ptr = (int64_t) heap; + kernargs->abi.heap_ptr = (int64_t) heap; hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size)); + kernargs->abi.arena_ptr = 0; + kernargs->abi.stack_ptr = (int64_t) stack; + kernargs->abi.stack_size_per_thread = stack_size; /* Run constructors on the GPU. */ run (init_array_kernel, kernargs); diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc index edde7bad518..23ab01e75d8 100644 --- a/gcc/config/gcn/gcn.cc +++ b/gcc/config/gcn/gcn.cc @@ -138,21 +138,6 @@ gcn_option_override (void) : ISA_UNKNOWN); gcc_assert (gcn_isa != ISA_UNKNOWN); - /* The default stack size needs to be small for offload kernels because - there may be many, many threads. Also, a smaller stack gives a - measureable performance boost. But, a small stack is insufficient - for running the testsuite, so we use a larger default for the stand - alone case. */ - if (stack_size_opt == -1) - { - if (flag_openacc || flag_openmp) - /* 512 bytes per work item = 32kB total. */ - stack_size_opt = 512 * 64; - else - /* 1MB total. 
*/ - stack_size_opt = 1048576; - } - /* Reserve 1Kb (somewhat arbitrarily) of LDS space for reduction results and worker broadcasts. */ if (gang_private_size_opt == -1) @@ -228,11 +213,9 @@ static const struct gcn_kernel_arg_type }; static const long default_requested_args - = (1 << PRIVATE_SEGMENT_BUFFER_ARG) - | (1 << DISPATCH_PTR_ARG) + = (1 << DISPATCH_PTR_ARG) | (1 << QUEUE_PTR_ARG) | (1 << KERNARG_SEGMENT_PTR_ARG) - | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG) | (1 << WORKGROUP_ID_X_ARG) | (1 << WORK_ITEM_ID_X_ARG) | (1 << WORK_ITEM_ID_Y_ARG) @@ -1865,10 +1848,14 @@ gcn_addr_space_convert (rtx op, tree from_type, tree to_type) if (AS_LDS_P (as_from) && AS_FLAT_P (as_to)) { - rtx queue = gen_rtx_REG (DImode, - cfun->machine->args.reg[QUEUE_PTR_ARG]); + /* The high bits of the QUEUE_PTR_ARG register are used by + GCN_BUILTIN_FIRST_CALL_THIS_THREAD_P, so mask them out. */ + rtx queue_reg = gen_rtx_REG (DImode, + cfun->machine->args.reg[QUEUE_PTR_ARG]); + rtx queue_ptr = gen_reg_rtx (DImode); + emit_insn (gen_anddi3 (queue_ptr, queue_reg, GEN_INT (0xffffffffffff))); rtx group_seg_aperture_hi = gen_rtx_MEM (SImode, - gen_rtx_PLUS (DImode, queue, + gen_rtx_PLUS (DImode, queue_ptr, gen_int_mode (64, SImode))); rtx tmp = gen_reg_rtx (DImode); @@ -2521,6 +2508,11 @@ gcn_conditional_register_usage (void) fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG]] = 1; fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG] + 1] = 1; } + if (cfun->machine->args.reg[QUEUE_PTR_ARG] >= 0) + { + fixed_regs[cfun->machine->args.reg[QUEUE_PTR_ARG]] = 1; + fixed_regs[cfun->machine->args.reg[QUEUE_PTR_ARG] + 1] = 1; + } if (cfun->machine->args.reg[WORKGROUP_ID_X_ARG] >= 0) fixed_regs[cfun->machine->args.reg[WORKGROUP_ID_X_ARG]] = 1; if (cfun->machine->args.reg[WORK_ITEM_ID_X_ARG] >= 0) @@ -3346,10 +3338,56 @@ gcn_expand_prologue () } else { - rtx wave_offset = gen_rtx_REG (SImode, - cfun->machine->args. 
- reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]); + if (TARGET_PACKED_WORK_ITEMS) + { + /* v0 contains the X, Y and Z dimensions all in one. + Expand them out for ABI compatibility. */ + /* TODO: implement and use zero_extract. */ + rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1)); + emit_insn (gen_andv64si3 (v1, gen_rtx_REG (V64SImode, VGPR_REGNO (0)), + gen_rtx_CONST_INT (VOIDmode, 0x3FF << 10))); + emit_insn (gen_lshrv64si3 (v1, v1, gen_rtx_CONST_INT (VOIDmode, 10))); + emit_insn (gen_prologue_use (v1)); + rtx v2 = gen_rtx_REG (V64SImode, VGPR_REGNO (2)); + emit_insn (gen_andv64si3 (v2, gen_rtx_REG (V64SImode, VGPR_REGNO (0)), + gen_rtx_CONST_INT (VOIDmode, 0x3FF << 20))); + emit_insn (gen_lshrv64si3 (v2, v2, gen_rtx_CONST_INT (VOIDmode, 20))); + emit_insn (gen_prologue_use (v2)); + } + + /* We no longer use the private segment for the stack (it's not + accessible to reverse offload), so we must calculate a wave offset + from the grid dimensions and stack size, which is calculated on the + host, and passed in the kernargs region. + See libgomp-gcn.h for details. 
*/ + rtx wave_offset = gen_rtx_REG (SImode, FIRST_PARM_REG); + + rtx num_waves_mem = gcn_oacc_dim_size (1); + rtx num_waves = gen_rtx_REG (SImode, FIRST_PARM_REG+1); + set_mem_addr_space (num_waves_mem, ADDR_SPACE_SCALAR_FLAT); + emit_move_insn (num_waves, num_waves_mem); + + rtx workgroup_num = gcn_oacc_dim_pos (0); + rtx wave_num = gen_rtx_REG (SImode, FIRST_PARM_REG+2); + emit_move_insn(wave_num, gcn_oacc_dim_pos (1)); + + rtx thread_id = gen_rtx_REG (SImode, FIRST_PARM_REG+3); + emit_insn (gen_mulsi3 (thread_id, num_waves, workgroup_num)); + emit_insn (gen_addsi3_scc (thread_id, thread_id, wave_num)); + + rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg + [KERNARG_SEGMENT_PTR_ARG]); + rtx stack_size_mem = gen_rtx_MEM (SImode, + gen_rtx_PLUS (DImode, kernarg_reg, + GEN_INT (52))); + set_mem_addr_space (stack_size_mem, ADDR_SPACE_SCALAR_FLAT); + emit_move_insn (wave_offset, stack_size_mem); + + emit_insn (gen_mulsi3 (wave_offset, wave_offset, thread_id)); + + /* The FLAT_SCRATCH_INIT is not usually needed, but can be enabled + via the function attributes. */ if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG)) { rtx fs_init_lo = @@ -3386,10 +3424,12 @@ gcn_expand_prologue () HOST_WIDE_INT sp_adjust = (offsets->local_vars + offsets->outgoing_args_size); - /* Initialise FP and SP from the buffer descriptor in s[0:3]. */ - emit_move_insn (fp_lo, gen_rtx_REG (SImode, 0)); - emit_insn (gen_andsi3_scc (fp_hi, gen_rtx_REG (SImode, 1), - gen_int_mode (0xffff, SImode))); + /* Initialize FP and SP from space allocated on the host. 
*/ + rtx stack_addr_mem = gen_rtx_MEM (DImode, + gen_rtx_PLUS (DImode, kernarg_reg, + GEN_INT (40))); + set_mem_addr_space (stack_addr_mem, ADDR_SPACE_SCALAR_FLAT); + emit_move_insn (fp, stack_addr_mem); rtx scc = gen_rtx_REG (BImode, SCC_REG); emit_insn (gen_addsi3_scalar_carry (fp_lo, fp_lo, wave_offset, scc)); emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi, scc)); @@ -3445,25 +3485,6 @@ gcn_expand_prologue () emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG))); } - if (TARGET_PACKED_WORK_ITEMS - && cfun && cfun->machine && !cfun->machine->normal_function) - { - /* v0 conatins the X, Y and Z dimensions all in one. - Expand them out for ABI compatibility. */ - /* TODO: implement and use zero_extract. */ - rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1)); - emit_insn (gen_andv64si3 (v1, gen_rtx_REG (V64SImode, VGPR_REGNO (0)), - gen_rtx_CONST_INT (VOIDmode, 0x3FF << 10))); - emit_insn (gen_lshrv64si3 (v1, v1, gen_rtx_CONST_INT (VOIDmode, 10))); - emit_insn (gen_prologue_use (v1)); - - rtx v2 = gen_rtx_REG (V64SImode, VGPR_REGNO (2)); - emit_insn (gen_andv64si3 (v2, gen_rtx_REG (V64SImode, VGPR_REGNO (0)), - gen_rtx_CONST_INT (VOIDmode, 0x3FF << 20))); - emit_insn (gen_lshrv64si3 (v2, v2, gen_rtx_CONST_INT (VOIDmode, 20))); - emit_insn (gen_prologue_use (v2)); - } - if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp) { /* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel. */ @@ -4504,26 +4525,53 @@ gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ , cf. struct hsa_kernel_dispatch_packet_s in the HSA doc. 
*/ rtx ptr; if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0 - && cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0) + && cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0) { - rtx size_rtx = gen_rtx_REG (DImode, - cfun->machine->args.reg[DISPATCH_PTR_ARG]); - size_rtx = gen_rtx_MEM (SImode, - gen_rtx_PLUS (DImode, size_rtx, - GEN_INT (6*2 + 3*4))); - size_rtx = gen_rtx_MULT (SImode, size_rtx, GEN_INT (64)); + rtx num_waves_mem = gcn_oacc_dim_size (1); + rtx num_waves = gen_reg_rtx (SImode); + set_mem_addr_space (num_waves_mem, ADDR_SPACE_SCALAR_FLAT); + emit_move_insn (num_waves, num_waves_mem); - ptr = gen_rtx_REG (DImode, - cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]); - ptr = gen_rtx_AND (DImode, ptr, GEN_INT (0x0000ffffffffffff)); - ptr = gen_rtx_PLUS (DImode, ptr, size_rtx); - if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0) - { - rtx off; - off = gen_rtx_REG (SImode, - cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]); - ptr = gen_rtx_PLUS (DImode, ptr, off); - } + rtx workgroup_num = gcn_oacc_dim_pos (0); + rtx wave_num = gen_reg_rtx (SImode); + emit_move_insn(wave_num, gcn_oacc_dim_pos (1)); + + rtx thread_id = gen_reg_rtx (SImode); + emit_insn (gen_mulsi3 (thread_id, num_waves, workgroup_num)); + emit_insn (gen_addsi3_scc (thread_id, thread_id, wave_num)); + + rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg + [KERNARG_SEGMENT_PTR_ARG]); + rtx stack_size_mem = gen_rtx_MEM (SImode, + gen_rtx_PLUS (DImode, + kernarg_reg, + GEN_INT (52))); + set_mem_addr_space (stack_size_mem, ADDR_SPACE_SCALAR_FLAT); + rtx stack_size = gen_reg_rtx (SImode); + emit_move_insn (stack_size, stack_size_mem); + + rtx wave_offset = gen_reg_rtx (SImode); + emit_insn (gen_mulsi3 (wave_offset, stack_size, thread_id)); + + rtx stack_limit_offset = gen_reg_rtx (SImode); + emit_insn (gen_addsi3 (stack_limit_offset, wave_offset, + stack_size)); + + rtx stack_limit_offset_di = gen_reg_rtx (DImode); + emit_move_insn 
(gen_rtx_SUBREG (SImode, stack_limit_offset_di, 4), + const0_rtx); + emit_move_insn (gen_rtx_SUBREG (SImode, stack_limit_offset_di, 0), + stack_limit_offset); + + rtx stack_addr_mem = gen_rtx_MEM (DImode, + gen_rtx_PLUS (DImode, + kernarg_reg, + GEN_INT (40))); + set_mem_addr_space (stack_addr_mem, ADDR_SPACE_SCALAR_FLAT); + rtx stack_addr = gen_reg_rtx (DImode); + emit_move_insn (stack_addr, stack_addr_mem); + + ptr = gen_rtx_PLUS (DImode, stack_addr, stack_limit_offset_di); } else { @@ -4551,11 +4599,11 @@ gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ , whether it was the first call. */ rtx result = gen_reg_rtx (BImode); emit_move_insn (result, const0_rtx); - if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0) + if (cfun->machine->args.reg[QUEUE_PTR_ARG] >= 0) { rtx not_first = gen_label_rtx (); rtx reg = gen_rtx_REG (DImode, - cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]); + cfun->machine->args.reg[QUEUE_PTR_ARG]); reg = gcn_operand_part (DImode, reg, 1); rtx cmp = force_reg (SImode, gen_rtx_LSHIFTRT (SImode, reg, GEN_INT (16))); @@ -6041,16 +6089,13 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree) "\t .amdhsa_reserve_vcc\t1\n" "\t .amdhsa_reserve_flat_scratch\t0\n" "\t .amdhsa_reserve_xnack_mask\t%i\n" - "\t .amdhsa_private_segment_fixed_size\t%i\n" + "\t .amdhsa_private_segment_fixed_size\t0\n" "\t .amdhsa_group_segment_fixed_size\t%u\n" "\t .amdhsa_float_denorm_mode_32\t3\n" "\t .amdhsa_float_denorm_mode_16_64\t3\n", vgpr, sgpr, xnack_enabled, - /* workitem_private_segment_bytes_size needs to be - one 64th the wave-front stack size. 
*/ - stack_size_opt / 64, LDS_SIZE); if (gcn_arch == PROCESSOR_GFX90a) fprintf (file, @@ -6075,7 +6120,7 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree) " .kernarg_segment_size: %i\n" " .kernarg_segment_align: %i\n" " .group_segment_fixed_size: %u\n" - " .private_segment_fixed_size: %i\n" + " .private_segment_fixed_size: 0\n" " .wavefront_size: 64\n" " .sgpr_count: %i\n" " .vgpr_count: %i\n" @@ -6083,7 +6128,6 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree) cfun->machine->kernarg_segment_byte_size, cfun->machine->kernarg_segment_alignment, LDS_SIZE, - stack_size_opt / 64, sgpr, vgpr); if (gcn_arch == PROCESSOR_GFX90a) fprintf (file, " .agpr_count: 0\n"); // AGPRs are not used, yet diff --git a/gcc/config/gcn/gcn.h b/gcc/config/gcn/gcn.h index 19ad5214580..4ff9a5d4d12 100644 --- a/gcc/config/gcn/gcn.h +++ b/gcc/config/gcn/gcn.h @@ -183,7 +183,7 @@ #define FIXED_REGISTERS { \ /* Scalars. */ \ - 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \ /* fp sp lr. */ \ 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, \ /* exec_save, cc_save */ \ diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt index e616ea0453f..c5c32bdc833 100644 --- a/gcc/config/gcn/gcn.opt +++ b/gcc/config/gcn/gcn.opt @@ -69,7 +69,7 @@ int stack_size_opt = -1 mstack-size= Target RejectNegative Joined UInteger Var(stack_size_opt) Init(-1) --mstack-size= Set the private segment size per wave-front, in bytes. +Obsolete; use GCN_STACK_SIZE at runtime. 
int gang_private_size_opt = -1 diff --git a/gcc/testsuite/gcc.c-torture/execute/pr47237.c b/gcc/testsuite/gcc.c-torture/execute/pr47237.c index 98124065b2f..944bdb7c93a 100644 --- a/gcc/testsuite/gcc.c-torture/execute/pr47237.c +++ b/gcc/testsuite/gcc.c-torture/execute/pr47237.c @@ -1,4 +1,4 @@ -/* { dg-xfail-if "can cause stack underflow" { nios2-*-* } } */ +/* { dg-xfail-run-if "can cause stack underflow" { nios2-*-* amdgcn-*-* } } */ /* { dg-require-effective-target untyped_assembly } */ #define INTEGER_ARG 5 diff --git a/gcc/testsuite/gcc.dg/builtin-apply3.c b/gcc/testsuite/gcc.dg/builtin-apply3.c index 37c5209b91c..8fc20030ed7 100644 --- a/gcc/testsuite/gcc.dg/builtin-apply3.c +++ b/gcc/testsuite/gcc.dg/builtin-apply3.c @@ -6,6 +6,7 @@ /* { dg-do run } */ /* { dg-require-effective-target untyped_assembly } */ +/* { dg-xfail-run-if "can cause stack underflow" { amdgcn-*-* } } */ #define INTEGER_ARG 5 diff --git a/gcc/testsuite/gcc.dg/builtin-apply4.c b/gcc/testsuite/gcc.dg/builtin-apply4.c index cca9187a1d3..aa491c18de4 100644 --- a/gcc/testsuite/gcc.dg/builtin-apply4.c +++ b/gcc/testsuite/gcc.dg/builtin-apply4.c @@ -3,6 +3,7 @@ /* { dg-additional-options "-mno-mmx" { target { { i?86-*-* x86_64-*-* } && ia32 } } } */ /* { dg-do run } */ /* { dg-require-effective-target untyped_assembly } */ +/* { dg-xfail-run-if "can cause stack underflow" { amdgcn-*-* } } */ extern void abort (void); diff --git a/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-3.c b/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-3.c index 37c5209b91c..8fc20030ed7 100644 --- a/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-3.c +++ b/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-3.c @@ -6,6 +6,7 @@ /* { dg-do run } */ /* { dg-require-effective-target untyped_assembly } */ +/* { dg-xfail-run-if "can cause stack underflow" { amdgcn-*-* } } */ #define INTEGER_ARG 5 diff --git a/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-4.c 
b/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-4.c index 78b10322edc..94b20123724 100644 --- a/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-4.c +++ b/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-4.c @@ -2,6 +2,7 @@ /* { dg-do run } */ /* { dg-additional-options "-fgnu89-inline" } */ /* { dg-require-effective-target untyped_assembly } */ +/* { dg-xfail-run-if "can cause stack underflow" { amdgcn-*-* } } */ extern void abort (void); diff --git a/include/gomp-constants.h b/include/gomp-constants.h index 3f72a15ef55..1b9b07dc245 100644 --- a/include/gomp-constants.h +++ b/include/gomp-constants.h @@ -284,7 +284,7 @@ enum gomp_map_kind to the plugin interface defined in libgomp/libgomp.h. */ #define GOMP_VERSION 2 #define GOMP_VERSION_NVIDIA_PTX 1 -#define GOMP_VERSION_GCN 2 +#define GOMP_VERSION_GCN 3 #define GOMP_VERSION_PACK(LIB, DEV) (((LIB) << 16) | (DEV)) #define GOMP_VERSION_LIB(PACK) (((PACK) >> 16) & 0xffff) diff --git a/libgomp/config/gcn/libgomp-gcn.h b/libgomp/config/gcn/libgomp-gcn.h index cc0fc134915..f62b7dde0e7 100644 --- a/libgomp/config/gcn/libgomp-gcn.h +++ b/libgomp/config/gcn/libgomp-gcn.h @@ -30,6 +30,40 @@ #ifndef LIBGOMP_GCN_H #define LIBGOMP_GCN_H 1 +#define DEFAULT_GCN_STACK_SIZE (32*1024) +#define DEFAULT_TEAM_ARENA_SIZE (64*1024) + +struct heap +{ + int64_t size; + char data[0]; +}; + +/* This struct defines the (unofficial) ABI-defined values the compiler + expects to find in first bytes of the kernargs space. + The plugin may choose to place additional data later in the kernargs + memory allocation, but those are not in any fixed location. */ +struct kernargs_abi { + /* Leave space for the real kernel arguments. + OpenACC and OpenMP only use one pointer. */ + int64_t dummy1; + int64_t dummy2; + + /* A pointer to struct output, below, for console output data. */ + int64_t out_ptr; /* Offset 16. */ + + /* A pointer to struct heap. */ + int64_t heap_ptr; /* Offset 24. 
*/ + + /* A pointer to the ephemeral memory areas. + The team arena is only needed for OpenMP. + Each should have enough space for all the teams and threads. */ + int64_t arena_ptr; /* Offset 32. */ + int64_t stack_ptr; /* Offset 40. */ + int arena_size_per_team; /* Offset 48. */ + int stack_size_per_thread; /* Offset 52. */ +}; + /* This struct is also used in Newlib's libc/sys/amdgcn/write.c. */ struct output { diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c index 527aa088c2a..f03207c84e3 100644 --- a/libgomp/config/gcn/team.c +++ b/libgomp/config/gcn/team.c @@ -60,14 +60,16 @@ gomp_gcn_enter_kernel (void) /* Initialize the team arena for optimized memory allocation. The arena has been allocated on the host side, and the address passed in via the kernargs. Each team takes a small slice of it. */ - void **kernargs = (void**) __builtin_gcn_kernarg_ptr (); - void *team_arena = (kernargs[4] + TEAM_ARENA_SIZE*teamid); + struct kernargs_abi *kernargs = + (struct kernargs_abi*) __builtin_gcn_kernarg_ptr (); + void *team_arena = ((void*)kernargs->arena_ptr + + kernargs->arena_size_per_team * teamid); void * __lds *arena_start = (void * __lds *)TEAM_ARENA_START; void * __lds *arena_free = (void * __lds *)TEAM_ARENA_FREE; void * __lds *arena_end = (void * __lds *)TEAM_ARENA_END; *arena_start = team_arena; *arena_free = team_arena; - *arena_end = team_arena + TEAM_ARENA_SIZE; + *arena_end = team_arena + kernargs->arena_size_per_team; /* Allocate and initialize the team-local-storage data. */ struct gomp_thread *thrs = team_malloc_cleared (sizeof (*thrs) diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index e7e409ff105..ba8fe348aba 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -112,8 +112,8 @@ extern void gomp_aligned_free (void *); /* Optimized allocators for team-specific data that will die with the team. */ #ifdef __AMDGCN__ +#include "libgomp-gcn.h" /* The arena is initialized in config/gcn/team.c. 
*/ -#define TEAM_ARENA_SIZE 64*1024 /* Must match the value in plugin-gcn.c. */ #define TEAM_ARENA_START 16 /* LDS offset of free pointer. */ #define TEAM_ARENA_FREE 24 /* LDS offset of free pointer. */ #define TEAM_ARENA_END 32 /* LDS offset of end pointer. */ @@ -135,7 +135,8 @@ team_malloc (size_t size) { /* While this is experimental, let's make sure we know when OOM happens. */ - const char msg[] = "GCN team arena exhausted\n"; + const char msg[] = "GCN team arena exhausted;" + " configure with GCN_TEAM_ARENA_SIZE=bytes\n"; write (2, msg, sizeof(msg)-1); /* Fall back to using the heap (slowly). */ diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c index b5d9dac7c86..a7b35059ab3 100644 --- a/libgomp/plugin/plugin-gcn.c +++ b/libgomp/plugin/plugin-gcn.c @@ -237,20 +237,7 @@ struct kernel_dispatch in libgomp target code. */ struct kernargs { - /* Leave space for the real kernel arguments. - OpenACC and OpenMP only use one pointer. */ - int64_t dummy1; - int64_t dummy2; - - /* A pointer to struct output, below, for console output data. */ - int64_t out_ptr; - - /* A pointer to struct heap, below. */ - int64_t heap_ptr; - - /* A pointer to an ephemeral memory arena. - Only needed for OpenMP. */ - int64_t arena_ptr; + struct kernargs_abi abi; /* Output data. */ struct output output_data; @@ -426,9 +413,9 @@ struct agent_info /* The HSA memory region from which to allocate device data. */ hsa_region_t data_region; - /* Allocated team arenas. */ - struct team_arena_list *team_arena_list; - pthread_mutex_t team_arena_write_lock; + /* Allocated ephemeral memories (team arena and stack space). */ + struct ephemeral_memories_list *ephemeral_memories_list; + pthread_mutex_t ephemeral_memories_write_lock; /* Read-write lock that protects kernels which are running or about to be run from interference with loading and unloading of images. Needs to be @@ -510,17 +497,18 @@ struct module_info }; /* A linked list of memory arenas allocated on the device. 
- These are only used by OpenMP, as a means to optimize per-team malloc. */ + These are used by OpenMP, as a means to optimize per-team malloc, + and for host-accessible stack space. */ -struct team_arena_list +struct ephemeral_memories_list { - struct team_arena_list *next; + struct ephemeral_memories_list *next; - /* The number of teams determines the size of the allocation. */ - int num_teams; - /* The device address of the arena itself. */ - void *arena; - /* A flag to prevent two asynchronous kernels trying to use the same arena. + /* The size is determined by the number of teams and threads. */ + size_t size; + /* The device address allocated memory. */ + void *address; + /* A flag to prevent two asynchronous kernels trying to use the same memory. The mutex is locked until the kernel exits. */ pthread_mutex_t in_use; }; @@ -539,15 +527,6 @@ struct hsa_context_info char driver_version_s[30]; }; -/* Format of the on-device heap. - - This must match the definition in Newlib and gcn-run. */ - -struct heap { - int64_t size; - char data[0]; -}; - /* }}} */ /* {{{ Global variables */ @@ -565,6 +544,11 @@ static struct hsa_runtime_fn_info hsa_fns; static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE; +/* Ephemeral memory sizes for each kernel launch. */ + +static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE; +static int stack_size = DEFAULT_GCN_STACK_SIZE; + /* Flag to decide whether print to stderr information about what is going on. Set in init_debug depending on environment variables. 
*/ @@ -1020,9 +1004,13 @@ print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent) fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue); fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs); fprintf (stderr, "%*sheap address: %p\n", indent, "", - (void*)kernargs->heap_ptr); - fprintf (stderr, "%*sarena address: %p\n", indent, "", - (void*)kernargs->arena_ptr); + (void*)kernargs->abi.heap_ptr); + fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent, + "", (void*)kernargs->abi.arena_ptr, + kernargs->abi.arena_size_per_team); + fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent, + "", (void*)kernargs->abi.stack_ptr, + kernargs->abi.stack_size_per_thread); fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object); fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "", dispatch->private_segment_size); @@ -1082,6 +1070,22 @@ init_environment_variables (void) if (tmp) gcn_kernel_heap_size = tmp; } + + const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE"); + if (arena) + { + int tmp = atoi (arena); + if (tmp) + team_arena_size = tmp; + } + + const char *stack = secure_getenv ("GCN_STACK_SIZE"); + if (stack) + { + int tmp = atoi (stack); + if (tmp) + stack_size = tmp; + } } /* Return malloc'd string with name of SYMBOL. */ @@ -1693,85 +1697,103 @@ isa_code(const char *isa) { /* }}} */ /* {{{ Run */ -/* Create or reuse a team arena. +/* Create or reuse a team arena and stack space. Team arenas are used by OpenMP to avoid calling malloc multiple times while setting up each team. This is purely a performance optimization. - Allocating an arena also costs performance, albeit on the host side, so - this function will reuse an existing arena if a large enough one is idle. - The arena is released, but not deallocated, when the kernel exits. */ + The stack space is used by all kernels. 
We must allocate it in such a + way that the reverse offload implementation can access the data. -static void * -get_team_arena (struct agent_info *agent, int num_teams) + Allocating this memory costs performance, so this function will reuse an + existing allocation if a large enough one is idle. + The memory lock is released, but not deallocated, when the kernel exits. */ + +static void +configure_ephemeral_memories (struct kernel_info *kernel, + struct kernargs_abi *kernargs, int num_teams, + int num_threads) { - struct team_arena_list **next_ptr = &agent->team_arena_list; - struct team_arena_list *item; + struct agent_info *agent = kernel->agent; + struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list; + struct ephemeral_memories_list *item; + + int actual_arena_size = (kernel->kind == KIND_OPENMP + ? team_arena_size : 0); + int actual_arena_total_size = actual_arena_size * num_teams; + size_t size = (actual_arena_total_size + + num_teams * num_threads * stack_size); for (item = *next_ptr; item; next_ptr = &item->next, item = item->next) { - if (item->num_teams < num_teams) + if (item->size < size) continue; - if (pthread_mutex_trylock (&item->in_use)) - continue; - - return item->arena; + if (pthread_mutex_trylock (&item->in_use) == 0) + break; } - GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams); - - if (pthread_mutex_lock (&agent->team_arena_write_lock)) + if (!item) { - GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex"); - return false; - } - item = malloc (sizeof (*item)); - item->num_teams = num_teams; - item->next = NULL; - *next_ptr = item; + GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads" + " (%zd bytes)\n", (actual_arena_size ? 
"arena and " : ""), + num_teams, num_threads, size); - if (pthread_mutex_init (&item->in_use, NULL)) - { - GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex"); - return false; - } - if (pthread_mutex_lock (&item->in_use)) - { - GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex"); - return false; - } - if (pthread_mutex_unlock (&agent->team_arena_write_lock)) - { - GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex"); - return false; + if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock)) + { + GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex"); + return; + } + item = malloc (sizeof (*item)); + item->size = size; + item->next = NULL; + *next_ptr = item; + + if (pthread_mutex_init (&item->in_use, NULL)) + { + GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex"); + return; + } + if (pthread_mutex_lock (&item->in_use)) + { + GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex"); + return; + } + if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock)) + { + GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex"); + return; + } + + hsa_status_t status; + status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size, + &item->address); + if (status != HSA_STATUS_SUCCESS) + hsa_fatal ("Could not allocate memory for GCN kernel arena", status); + status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id, + HSA_ACCESS_PERMISSION_RW); + if (status != HSA_STATUS_SUCCESS) + hsa_fatal ("Could not assign arena & stack memory to device", status); } - const int TEAM_ARENA_SIZE = 64*1024; /* Must match libgomp.h. 
*/ - hsa_status_t status; - status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, - TEAM_ARENA_SIZE*num_teams, - &item->arena); - if (status != HSA_STATUS_SUCCESS) - hsa_fatal ("Could not allocate memory for GCN kernel arena", status); - status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id, - HSA_ACCESS_PERMISSION_RW); - if (status != HSA_STATUS_SUCCESS) - hsa_fatal ("Could not assign arena memory to device", status); - - return item->arena; + kernargs->arena_ptr = (actual_arena_total_size + ? (uint64_t)item->address + : 0); + kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size; + kernargs->arena_size_per_team = actual_arena_size; + kernargs->stack_size_per_thread = stack_size; } -/* Mark a team arena available for reuse. */ +/* Mark an ephemeral memory space available for reuse. */ static void -release_team_arena (struct agent_info* agent, void *arena) +release_ephemeral_memories (struct agent_info* agent, void *address) { - struct team_arena_list *item; + struct ephemeral_memories_list *item; - for (item = agent->team_arena_list; item; item = item->next) + for (item = agent->ephemeral_memories_list; item; item = item->next) { - if (item->arena == arena) + if (item->address == address) { if (pthread_mutex_unlock (&item->in_use)) GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex"); @@ -1784,22 +1806,22 @@ release_team_arena (struct agent_info* agent, void *arena) /* Clean up all the allocated team arenas. 
*/ static bool -destroy_team_arenas (struct agent_info *agent) +destroy_ephemeral_memories (struct agent_info *agent) { - struct team_arena_list *item, *next; + struct ephemeral_memories_list *item, *next; - for (item = agent->team_arena_list; item; item = next) + for (item = agent->ephemeral_memories_list; item; item = next) { next = item->next; - hsa_fns.hsa_memory_free_fn (item->arena); + hsa_fns.hsa_memory_free_fn (item->address); if (pthread_mutex_destroy (&item->in_use)) { - GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex"); + GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex"); return false; } free (item); } - agent->team_arena_list = NULL; + agent->ephemeral_memories_list = NULL; return true; } @@ -1871,7 +1893,8 @@ alloc_by_agent (struct agent_info *agent, size_t size) the necessary device signals and memory allocations. */ static struct kernel_dispatch * -create_kernel_dispatch (struct kernel_info *kernel, int num_teams) +create_kernel_dispatch (struct kernel_info *kernel, int num_teams, + int num_threads) { struct agent_info *agent = kernel->agent; struct kernel_dispatch *shadow @@ -1906,7 +1929,7 @@ create_kernel_dispatch (struct kernel_info *kernel, int num_teams) struct kernargs *kernargs = shadow->kernarg_address; /* Zero-initialize the output_data (minimum needed). */ - kernargs->out_ptr = (int64_t)&kernargs->output_data; + kernargs->abi.out_ptr = (int64_t)&kernargs->output_data; kernargs->output_data.next_output = 0; for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue) @@ -1916,13 +1939,10 @@ create_kernel_dispatch (struct kernel_info *kernel, int num_teams) kernargs->output_data.consumed = 0; /* Pass in the heap location. */ - kernargs->heap_ptr = (int64_t)kernel->module->heap; + kernargs->abi.heap_ptr = (int64_t)kernel->module->heap; - /* Create an arena. 
*/ - if (kernel->kind == KIND_OPENMP) - kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams); - else - kernargs->arena_ptr = 0; + /* Create the ephemeral memory spaces. */ + configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads); /* Ensure we can recognize unset return values. */ kernargs->output_data.return_value = 0xcafe0000; @@ -2006,9 +2026,10 @@ release_kernel_dispatch (struct kernel_dispatch *shadow) GCN_DEBUG ("Released kernel dispatch: %p\n", shadow); struct kernargs *kernargs = shadow->kernarg_address; - void *arena = (void *)kernargs->arena_ptr; - if (arena) - release_team_arena (shadow->agent, arena); + void *addr = (void *)kernargs->abi.arena_ptr; + if (!addr) + addr = (void *)kernargs->abi.stack_ptr; + release_ephemeral_memories (shadow->agent, addr); hsa_fns.hsa_memory_free_fn (shadow->kernarg_address); @@ -2238,7 +2259,8 @@ run_kernel (struct kernel_info *kernel, void *vars, packet->workgroup_size_z); struct kernel_dispatch *shadow - = create_kernel_dispatch (kernel, packet->grid_size_x); + = create_kernel_dispatch (kernel, packet->grid_size_x, + packet->grid_size_z); shadow->queue = command_q; if (debug) @@ -3280,14 +3302,14 @@ GOMP_OFFLOAD_init_device (int n) GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex"); return false; } - if (pthread_mutex_init (&agent->team_arena_write_lock, NULL)) + if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL)) { GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex"); return false; } agent->async_queues = NULL; agent->omp_async_queue = NULL; - agent->team_arena_list = NULL; + agent->ephemeral_memories_list = NULL; uint32_t queue_size; hsa_status_t status; @@ -3640,7 +3662,7 @@ GOMP_OFFLOAD_fini_device (int n) agent->module = NULL; } - if (!destroy_team_arenas (agent)) + if (!destroy_ephemeral_memories (agent)) return false; if (!destroy_hsa_program (agent)) @@ -3666,9 +3688,9 @@ GOMP_OFFLOAD_fini_device (int n) GOMP_PLUGIN_error 
("Failed to destroy a GCN agent queue mutex"); return false; } - if (pthread_mutex_destroy (&agent->team_arena_write_lock)) + if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock)) { - GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex"); + GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex"); return false; } agent->initialized = false;