diff --git a/gcc/config/gcn/gcn-run.cc b/gcc/config/gcn/gcn-run.cc index 606772e6212..4232a17b784 100644 --- a/gcc/config/gcn/gcn-run.cc +++ b/gcc/config/gcn/gcn-run.cc @@ -35,6 +35,7 @@ #include #include "hsa.h" +#include "../../../libgomp/config/gcn/libgomp-gcn.h" #ifndef HSA_RUNTIME_LIB #define HSA_RUNTIME_LIB "libhsa-runtime64.so.1" @@ -487,39 +488,16 @@ device_malloc (size_t size, hsa_region_t region) automatically assign the exit value to *return_value. */ struct kernargs { - /* Kernargs. */ - int32_t argc; - int64_t argv; - int64_t out_ptr; - int64_t heap_ptr; - - /* Output data. */ - struct output - { - int return_value; - unsigned int next_output; - struct printf_data - { - int written; - char msg[128]; - int type; - union - { - int64_t ivalue; - double dvalue; - char text[128]; - }; - } queue[1024]; - unsigned int consumed; - } output_data; + union { + struct { + int32_t argc; + int64_t argv; + } args; + struct kernargs_abi abi; + }; + struct output output_data; }; -struct heap -{ - int64_t size; - char data[0]; -} heap; - /* Print any console output from the kernel. We print all entries from "consumed" to the next entry without a "written" flag, or "next_output" is reached. The buffer is circular, but the @@ -687,6 +665,16 @@ main (int argc, char *argv[]) for (int i = 0; i < kernel_argc; i++) args_size += strlen (kernel_argv[i]) + 1; + /* The device stack can be adjusted via an environment variable. */ + char *envvar = getenv ("GCN_STACK_SIZE"); + int stack_size = 1 * 1024 * 1024; /* 1MB default. */ + if (envvar) + { + int val = atoi (envvar); + if (val) + stack_size = val; + } + /* Allocate device memory for both function parameters and the argv data. 
*/ struct kernargs *kernargs = device_malloc (sizeof (*kernargs), @@ -702,11 +690,12 @@ main (int argc, char *argv[]) XHSA (hsa_fns.hsa_memory_assign_agent_fn (heap, device, HSA_ACCESS_PERMISSION_RW), "Assign heap to device agent"); + void *stack = device_malloc (stack_size, heap_region); /* Write the data to the target. */ - kernargs->argc = kernel_argc; - kernargs->argv = (int64_t) args->argv_data; - kernargs->out_ptr = (int64_t) &kernargs->output_data; + kernargs->args.argc = kernel_argc; + kernargs->args.argv = (int64_t) args->argv_data; + kernargs->abi.out_ptr = (int64_t) &kernargs->output_data; kernargs->output_data.return_value = 0xcafe0000; /* Default return value. */ kernargs->output_data.next_output = 0; for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue) @@ -721,8 +710,11 @@ main (int argc, char *argv[]) memcpy (&args->strings[offset], kernel_argv[i], arg_len + 1); offset += arg_len; } - kernargs->heap_ptr = (int64_t) heap; + kernargs->abi.heap_ptr = (int64_t) heap; hsa_fns.hsa_memory_copy_fn (&heap->size, &heap_size, sizeof (heap_size)); + kernargs->abi.arena_ptr = 0; + kernargs->abi.stack_ptr = (int64_t) stack; + kernargs->abi.stack_size_per_thread = stack_size; /* Run constructors on the GPU. */ run (init_array_kernel, kernargs); diff --git a/gcc/config/gcn/gcn.cc b/gcc/config/gcn/gcn.cc index edde7bad518..23ab01e75d8 100644 --- a/gcc/config/gcn/gcn.cc +++ b/gcc/config/gcn/gcn.cc @@ -138,21 +138,6 @@ gcn_option_override (void) : ISA_UNKNOWN); gcc_assert (gcn_isa != ISA_UNKNOWN); - /* The default stack size needs to be small for offload kernels because - there may be many, many threads. Also, a smaller stack gives a - measureable performance boost. But, a small stack is insufficient - for running the testsuite, so we use a larger default for the stand - alone case. */ - if (stack_size_opt == -1) - { - if (flag_openacc || flag_openmp) - /* 512 bytes per work item = 32kB total. */ - stack_size_opt = 512 * 64; - else - /* 1MB total. 
*/ - stack_size_opt = 1048576; - } - /* Reserve 1Kb (somewhat arbitrarily) of LDS space for reduction results and worker broadcasts. */ if (gang_private_size_opt == -1) @@ -228,11 +213,9 @@ static const struct gcn_kernel_arg_type }; static const long default_requested_args - = (1 << PRIVATE_SEGMENT_BUFFER_ARG) - | (1 << DISPATCH_PTR_ARG) + = (1 << DISPATCH_PTR_ARG) | (1 << QUEUE_PTR_ARG) | (1 << KERNARG_SEGMENT_PTR_ARG) - | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG) | (1 << WORKGROUP_ID_X_ARG) | (1 << WORK_ITEM_ID_X_ARG) | (1 << WORK_ITEM_ID_Y_ARG) @@ -1865,10 +1848,14 @@ gcn_addr_space_convert (rtx op, tree from_type, tree to_type) if (AS_LDS_P (as_from) && AS_FLAT_P (as_to)) { - rtx queue = gen_rtx_REG (DImode, - cfun->machine->args.reg[QUEUE_PTR_ARG]); + /* The high bits of the QUEUE_PTR_ARG register are used by + GCN_BUILTIN_FIRST_CALL_THIS_THREAD_P, so mask them out. */ + rtx queue_reg = gen_rtx_REG (DImode, + cfun->machine->args.reg[QUEUE_PTR_ARG]); + rtx queue_ptr = gen_reg_rtx (DImode); + emit_insn (gen_anddi3 (queue_ptr, queue_reg, GEN_INT (0xffffffffffff))); rtx group_seg_aperture_hi = gen_rtx_MEM (SImode, - gen_rtx_PLUS (DImode, queue, + gen_rtx_PLUS (DImode, queue_ptr, gen_int_mode (64, SImode))); rtx tmp = gen_reg_rtx (DImode); @@ -2521,6 +2508,11 @@ gcn_conditional_register_usage (void) fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG]] = 1; fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG] + 1] = 1; } + if (cfun->machine->args.reg[QUEUE_PTR_ARG] >= 0) + { + fixed_regs[cfun->machine->args.reg[QUEUE_PTR_ARG]] = 1; + fixed_regs[cfun->machine->args.reg[QUEUE_PTR_ARG] + 1] = 1; + } if (cfun->machine->args.reg[WORKGROUP_ID_X_ARG] >= 0) fixed_regs[cfun->machine->args.reg[WORKGROUP_ID_X_ARG]] = 1; if (cfun->machine->args.reg[WORK_ITEM_ID_X_ARG] >= 0) @@ -3346,10 +3338,56 @@ gcn_expand_prologue () } else { - rtx wave_offset = gen_rtx_REG (SImode, - cfun->machine->args. 
- reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]); + if (TARGET_PACKED_WORK_ITEMS) + { + /* v0 contains the X, Y and Z dimensions all in one. + Expand them out for ABI compatibility. */ + /* TODO: implement and use zero_extract. */ + rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1)); + emit_insn (gen_andv64si3 (v1, gen_rtx_REG (V64SImode, VGPR_REGNO (0)), + gen_rtx_CONST_INT (VOIDmode, 0x3FF << 10))); + emit_insn (gen_lshrv64si3 (v1, v1, gen_rtx_CONST_INT (VOIDmode, 10))); + emit_insn (gen_prologue_use (v1)); + rtx v2 = gen_rtx_REG (V64SImode, VGPR_REGNO (2)); + emit_insn (gen_andv64si3 (v2, gen_rtx_REG (V64SImode, VGPR_REGNO (0)), + gen_rtx_CONST_INT (VOIDmode, 0x3FF << 20))); + emit_insn (gen_lshrv64si3 (v2, v2, gen_rtx_CONST_INT (VOIDmode, 20))); + emit_insn (gen_prologue_use (v2)); + } + + /* We no longer use the private segment for the stack (it's not + accessible to reverse offload), so we must calculate a wave offset + from the grid dimensions and stack size, which is calculated on the + host, and passed in the kernargs region. + See libgomp-gcn.h for details. 
*/ + rtx wave_offset = gen_rtx_REG (SImode, FIRST_PARM_REG); + + rtx num_waves_mem = gcn_oacc_dim_size (1); + rtx num_waves = gen_rtx_REG (SImode, FIRST_PARM_REG+1); + set_mem_addr_space (num_waves_mem, ADDR_SPACE_SCALAR_FLAT); + emit_move_insn (num_waves, num_waves_mem); + + rtx workgroup_num = gcn_oacc_dim_pos (0); + rtx wave_num = gen_rtx_REG (SImode, FIRST_PARM_REG+2); + emit_move_insn(wave_num, gcn_oacc_dim_pos (1)); + + rtx thread_id = gen_rtx_REG (SImode, FIRST_PARM_REG+3); + emit_insn (gen_mulsi3 (thread_id, num_waves, workgroup_num)); + emit_insn (gen_addsi3_scc (thread_id, thread_id, wave_num)); + + rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg + [KERNARG_SEGMENT_PTR_ARG]); + rtx stack_size_mem = gen_rtx_MEM (SImode, + gen_rtx_PLUS (DImode, kernarg_reg, + GEN_INT (52))); + set_mem_addr_space (stack_size_mem, ADDR_SPACE_SCALAR_FLAT); + emit_move_insn (wave_offset, stack_size_mem); + + emit_insn (gen_mulsi3 (wave_offset, wave_offset, thread_id)); + + /* The FLAT_SCRATCH_INIT is not usually needed, but can be enabled + via the function attributes. */ if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG)) { rtx fs_init_lo = @@ -3386,10 +3424,12 @@ gcn_expand_prologue () HOST_WIDE_INT sp_adjust = (offsets->local_vars + offsets->outgoing_args_size); - /* Initialise FP and SP from the buffer descriptor in s[0:3]. */ - emit_move_insn (fp_lo, gen_rtx_REG (SImode, 0)); - emit_insn (gen_andsi3_scc (fp_hi, gen_rtx_REG (SImode, 1), - gen_int_mode (0xffff, SImode))); + /* Initialize FP and SP from space allocated on the host. 
*/ + rtx stack_addr_mem = gen_rtx_MEM (DImode, + gen_rtx_PLUS (DImode, kernarg_reg, + GEN_INT (40))); + set_mem_addr_space (stack_addr_mem, ADDR_SPACE_SCALAR_FLAT); + emit_move_insn (fp, stack_addr_mem); rtx scc = gen_rtx_REG (BImode, SCC_REG); emit_insn (gen_addsi3_scalar_carry (fp_lo, fp_lo, wave_offset, scc)); emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi, scc)); @@ -3445,25 +3485,6 @@ gcn_expand_prologue () emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG))); } - if (TARGET_PACKED_WORK_ITEMS - && cfun && cfun->machine && !cfun->machine->normal_function) - { - /* v0 conatins the X, Y and Z dimensions all in one. - Expand them out for ABI compatibility. */ - /* TODO: implement and use zero_extract. */ - rtx v1 = gen_rtx_REG (V64SImode, VGPR_REGNO (1)); - emit_insn (gen_andv64si3 (v1, gen_rtx_REG (V64SImode, VGPR_REGNO (0)), - gen_rtx_CONST_INT (VOIDmode, 0x3FF << 10))); - emit_insn (gen_lshrv64si3 (v1, v1, gen_rtx_CONST_INT (VOIDmode, 10))); - emit_insn (gen_prologue_use (v1)); - - rtx v2 = gen_rtx_REG (V64SImode, VGPR_REGNO (2)); - emit_insn (gen_andv64si3 (v2, gen_rtx_REG (V64SImode, VGPR_REGNO (0)), - gen_rtx_CONST_INT (VOIDmode, 0x3FF << 20))); - emit_insn (gen_lshrv64si3 (v2, v2, gen_rtx_CONST_INT (VOIDmode, 20))); - emit_insn (gen_prologue_use (v2)); - } - if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp) { /* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel. */ @@ -4504,26 +4525,53 @@ gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ , cf. struct hsa_kernel_dispatch_packet_s in the HSA doc. 
*/ rtx ptr; if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0 - && cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0) + && cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0) { - rtx size_rtx = gen_rtx_REG (DImode, - cfun->machine->args.reg[DISPATCH_PTR_ARG]); - size_rtx = gen_rtx_MEM (SImode, - gen_rtx_PLUS (DImode, size_rtx, - GEN_INT (6*2 + 3*4))); - size_rtx = gen_rtx_MULT (SImode, size_rtx, GEN_INT (64)); + rtx num_waves_mem = gcn_oacc_dim_size (1); + rtx num_waves = gen_reg_rtx (SImode); + set_mem_addr_space (num_waves_mem, ADDR_SPACE_SCALAR_FLAT); + emit_move_insn (num_waves, num_waves_mem); - ptr = gen_rtx_REG (DImode, - cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]); - ptr = gen_rtx_AND (DImode, ptr, GEN_INT (0x0000ffffffffffff)); - ptr = gen_rtx_PLUS (DImode, ptr, size_rtx); - if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0) - { - rtx off; - off = gen_rtx_REG (SImode, - cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]); - ptr = gen_rtx_PLUS (DImode, ptr, off); - } + rtx workgroup_num = gcn_oacc_dim_pos (0); + rtx wave_num = gen_reg_rtx (SImode); + emit_move_insn(wave_num, gcn_oacc_dim_pos (1)); + + rtx thread_id = gen_reg_rtx (SImode); + emit_insn (gen_mulsi3 (thread_id, num_waves, workgroup_num)); + emit_insn (gen_addsi3_scc (thread_id, thread_id, wave_num)); + + rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg + [KERNARG_SEGMENT_PTR_ARG]); + rtx stack_size_mem = gen_rtx_MEM (SImode, + gen_rtx_PLUS (DImode, + kernarg_reg, + GEN_INT (52))); + set_mem_addr_space (stack_size_mem, ADDR_SPACE_SCALAR_FLAT); + rtx stack_size = gen_reg_rtx (SImode); + emit_move_insn (stack_size, stack_size_mem); + + rtx wave_offset = gen_reg_rtx (SImode); + emit_insn (gen_mulsi3 (wave_offset, stack_size, thread_id)); + + rtx stack_limit_offset = gen_reg_rtx (SImode); + emit_insn (gen_addsi3 (stack_limit_offset, wave_offset, + stack_size)); + + rtx stack_limit_offset_di = gen_reg_rtx (DImode); + emit_move_insn 
(gen_rtx_SUBREG (SImode, stack_limit_offset_di, 4), + const0_rtx); + emit_move_insn (gen_rtx_SUBREG (SImode, stack_limit_offset_di, 0), + stack_limit_offset); + + rtx stack_addr_mem = gen_rtx_MEM (DImode, + gen_rtx_PLUS (DImode, + kernarg_reg, + GEN_INT (40))); + set_mem_addr_space (stack_addr_mem, ADDR_SPACE_SCALAR_FLAT); + rtx stack_addr = gen_reg_rtx (DImode); + emit_move_insn (stack_addr, stack_addr_mem); + + ptr = gen_rtx_PLUS (DImode, stack_addr, stack_limit_offset_di); } else { @@ -4551,11 +4599,11 @@ gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ , whether it was the first call. */ rtx result = gen_reg_rtx (BImode); emit_move_insn (result, const0_rtx); - if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0) + if (cfun->machine->args.reg[QUEUE_PTR_ARG] >= 0) { rtx not_first = gen_label_rtx (); rtx reg = gen_rtx_REG (DImode, - cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]); + cfun->machine->args.reg[QUEUE_PTR_ARG]); reg = gcn_operand_part (DImode, reg, 1); rtx cmp = force_reg (SImode, gen_rtx_LSHIFTRT (SImode, reg, GEN_INT (16))); @@ -6041,16 +6089,13 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree) "\t .amdhsa_reserve_vcc\t1\n" "\t .amdhsa_reserve_flat_scratch\t0\n" "\t .amdhsa_reserve_xnack_mask\t%i\n" - "\t .amdhsa_private_segment_fixed_size\t%i\n" + "\t .amdhsa_private_segment_fixed_size\t0\n" "\t .amdhsa_group_segment_fixed_size\t%u\n" "\t .amdhsa_float_denorm_mode_32\t3\n" "\t .amdhsa_float_denorm_mode_16_64\t3\n", vgpr, sgpr, xnack_enabled, - /* workitem_private_segment_bytes_size needs to be - one 64th the wave-front stack size. 
*/ - stack_size_opt / 64, LDS_SIZE); if (gcn_arch == PROCESSOR_GFX90a) fprintf (file, @@ -6075,7 +6120,7 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree) " .kernarg_segment_size: %i\n" " .kernarg_segment_align: %i\n" " .group_segment_fixed_size: %u\n" - " .private_segment_fixed_size: %i\n" + " .private_segment_fixed_size: 0\n" " .wavefront_size: 64\n" " .sgpr_count: %i\n" " .vgpr_count: %i\n" @@ -6083,7 +6128,6 @@ gcn_hsa_declare_function_name (FILE *file, const char *name, tree) cfun->machine->kernarg_segment_byte_size, cfun->machine->kernarg_segment_alignment, LDS_SIZE, - stack_size_opt / 64, sgpr, vgpr); if (gcn_arch == PROCESSOR_GFX90a) fprintf (file, " .agpr_count: 0\n"); // AGPRs are not used, yet diff --git a/gcc/config/gcn/gcn.h b/gcc/config/gcn/gcn.h index 19ad5214580..4ff9a5d4d12 100644 --- a/gcc/config/gcn/gcn.h +++ b/gcc/config/gcn/gcn.h @@ -183,7 +183,7 @@ #define FIXED_REGISTERS { \ /* Scalars. */ \ - 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, \ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, \ /* fp sp lr. */ \ 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, \ /* exec_save, cc_save */ \ diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt index e616ea0453f..c5c32bdc833 100644 --- a/gcc/config/gcn/gcn.opt +++ b/gcc/config/gcn/gcn.opt @@ -69,7 +69,7 @@ int stack_size_opt = -1 mstack-size= Target RejectNegative Joined UInteger Var(stack_size_opt) Init(-1) --mstack-size= Set the private segment size per wave-front, in bytes. +Obsolete; use GCN_STACK_SIZE at runtime. 
int gang_private_size_opt = -1 diff --git a/gcc/testsuite/gcc.c-torture/execute/pr47237.c b/gcc/testsuite/gcc.c-torture/execute/pr47237.c index 98124065b2f..944bdb7c93a 100644 --- a/gcc/testsuite/gcc.c-torture/execute/pr47237.c +++ b/gcc/testsuite/gcc.c-torture/execute/pr47237.c @@ -1,4 +1,4 @@ -/* { dg-xfail-if "can cause stack underflow" { nios2-*-* } } */ +/* { dg-xfail-run-if "can cause stack underflow" { nios2-*-* amdgcn-*-* } } */ /* { dg-require-effective-target untyped_assembly } */ #define INTEGER_ARG 5 diff --git a/gcc/testsuite/gcc.dg/builtin-apply3.c b/gcc/testsuite/gcc.dg/builtin-apply3.c index 37c5209b91c..8fc20030ed7 100644 --- a/gcc/testsuite/gcc.dg/builtin-apply3.c +++ b/gcc/testsuite/gcc.dg/builtin-apply3.c @@ -6,6 +6,7 @@ /* { dg-do run } */ /* { dg-require-effective-target untyped_assembly } */ +/* { dg-xfail-run-if "can cause stack underflow" { amdgcn-*-* } } */ #define INTEGER_ARG 5 diff --git a/gcc/testsuite/gcc.dg/builtin-apply4.c b/gcc/testsuite/gcc.dg/builtin-apply4.c index cca9187a1d3..aa491c18de4 100644 --- a/gcc/testsuite/gcc.dg/builtin-apply4.c +++ b/gcc/testsuite/gcc.dg/builtin-apply4.c @@ -3,6 +3,7 @@ /* { dg-additional-options "-mno-mmx" { target { { i?86-*-* x86_64-*-* } && ia32 } } } */ /* { dg-do run } */ /* { dg-require-effective-target untyped_assembly } */ +/* { dg-xfail-run-if "can cause stack underflow" { amdgcn-*-* } } */ extern void abort (void); diff --git a/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-3.c b/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-3.c index 37c5209b91c..8fc20030ed7 100644 --- a/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-3.c +++ b/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-3.c @@ -6,6 +6,7 @@ /* { dg-do run } */ /* { dg-require-effective-target untyped_assembly } */ +/* { dg-xfail-run-if "can cause stack underflow" { amdgcn-*-* } } */ #define INTEGER_ARG 5 diff --git a/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-4.c 
b/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-4.c index 78b10322edc..94b20123724 100644 --- a/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-4.c +++ b/gcc/testsuite/gcc.dg/torture/stackalign/builtin-apply-4.c @@ -2,6 +2,7 @@ /* { dg-do run } */ /* { dg-additional-options "-fgnu89-inline" } */ /* { dg-require-effective-target untyped_assembly } */ +/* { dg-xfail-run-if "can cause stack underflow" { amdgcn-*-* } } */ extern void abort (void); diff --git a/include/gomp-constants.h b/include/gomp-constants.h index 3f72a15ef55..1b9b07dc245 100644 --- a/include/gomp-constants.h +++ b/include/gomp-constants.h @@ -284,7 +284,7 @@ enum gomp_map_kind to the plugin interface defined in libgomp/libgomp.h. */ #define GOMP_VERSION 2 #define GOMP_VERSION_NVIDIA_PTX 1 -#define GOMP_VERSION_GCN 2 +#define GOMP_VERSION_GCN 3 #define GOMP_VERSION_PACK(LIB, DEV) (((LIB) << 16) | (DEV)) #define GOMP_VERSION_LIB(PACK) (((PACK) >> 16) & 0xffff) diff --git a/libgomp/config/gcn/libgomp-gcn.h b/libgomp/config/gcn/libgomp-gcn.h index cc0fc134915..f62b7dde0e7 100644 --- a/libgomp/config/gcn/libgomp-gcn.h +++ b/libgomp/config/gcn/libgomp-gcn.h @@ -30,6 +30,40 @@ #ifndef LIBGOMP_GCN_H #define LIBGOMP_GCN_H 1 +#define DEFAULT_GCN_STACK_SIZE (32*1024) +#define DEFAULT_TEAM_ARENA_SIZE (64*1024) + +struct heap +{ + int64_t size; + char data[0]; +}; + +/* This struct defines the (unofficial) ABI-defined values the compiler + expects to find in first bytes of the kernargs space. + The plugin may choose to place additional data later in the kernargs + memory allocation, but those are not in any fixed location. */ +struct kernargs_abi { + /* Leave space for the real kernel arguments. + OpenACC and OpenMP only use one pointer. */ + int64_t dummy1; + int64_t dummy2; + + /* A pointer to struct output, below, for console output data. */ + int64_t out_ptr; /* Offset 16. */ + + /* A pointer to struct heap. */ + int64_t heap_ptr; /* Offset 24. 
*/ + + /* A pointer to the ephemeral memory areas. + The team arena is only needed for OpenMP. + Each should have enough space for all the teams and threads. */ + int64_t arena_ptr; /* Offset 32. */ + int64_t stack_ptr; /* Offset 40. */ + int arena_size_per_team; /* Offset 48. */ + int stack_size_per_thread; /* Offset 52. */ +}; + /* This struct is also used in Newlib's libc/sys/amdgcn/write.c. */ struct output { diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c index 527aa088c2a..f03207c84e3 100644 --- a/libgomp/config/gcn/team.c +++ b/libgomp/config/gcn/team.c @@ -60,14 +60,16 @@ gomp_gcn_enter_kernel (void) /* Initialize the team arena for optimized memory allocation. The arena has been allocated on the host side, and the address passed in via the kernargs. Each team takes a small slice of it. */ - void **kernargs = (void**) __builtin_gcn_kernarg_ptr (); - void *team_arena = (kernargs[4] + TEAM_ARENA_SIZE*teamid); + struct kernargs_abi *kernargs = + (struct kernargs_abi*) __builtin_gcn_kernarg_ptr (); + void *team_arena = ((void*)kernargs->arena_ptr + + kernargs->arena_size_per_team * teamid); void * __lds *arena_start = (void * __lds *)TEAM_ARENA_START; void * __lds *arena_free = (void * __lds *)TEAM_ARENA_FREE; void * __lds *arena_end = (void * __lds *)TEAM_ARENA_END; *arena_start = team_arena; *arena_free = team_arena; - *arena_end = team_arena + TEAM_ARENA_SIZE; + *arena_end = team_arena + kernargs->arena_size_per_team; /* Allocate and initialize the team-local-storage data. */ struct gomp_thread *thrs = team_malloc_cleared (sizeof (*thrs) diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index e7e409ff105..ba8fe348aba 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -112,8 +112,8 @@ extern void gomp_aligned_free (void *); /* Optimized allocators for team-specific data that will die with the team. */ #ifdef __AMDGCN__ +#include "libgomp-gcn.h" /* The arena is initialized in config/gcn/team.c. 
*/ -#define TEAM_ARENA_SIZE 64*1024 /* Must match the value in plugin-gcn.c. */ #define TEAM_ARENA_START 16 /* LDS offset of free pointer. */ #define TEAM_ARENA_FREE 24 /* LDS offset of free pointer. */ #define TEAM_ARENA_END 32 /* LDS offset of end pointer. */ @@ -135,7 +135,8 @@ team_malloc (size_t size) { /* While this is experimental, let's make sure we know when OOM happens. */ - const char msg[] = "GCN team arena exhausted\n"; + const char msg[] = "GCN team arena exhausted;" + " configure with GCN_TEAM_ARENA_SIZE=bytes\n"; write (2, msg, sizeof(msg)-1); /* Fall back to using the heap (slowly). */ diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c index b5d9dac7c86..a7b35059ab3 100644 --- a/libgomp/plugin/plugin-gcn.c +++ b/libgomp/plugin/plugin-gcn.c @@ -237,20 +237,7 @@ struct kernel_dispatch in libgomp target code. */ struct kernargs { - /* Leave space for the real kernel arguments. - OpenACC and OpenMP only use one pointer. */ - int64_t dummy1; - int64_t dummy2; - - /* A pointer to struct output, below, for console output data. */ - int64_t out_ptr; - - /* A pointer to struct heap, below. */ - int64_t heap_ptr; - - /* A pointer to an ephemeral memory arena. - Only needed for OpenMP. */ - int64_t arena_ptr; + struct kernargs_abi abi; /* Output data. */ struct output output_data; @@ -426,9 +413,9 @@ struct agent_info /* The HSA memory region from which to allocate device data. */ hsa_region_t data_region; - /* Allocated team arenas. */ - struct team_arena_list *team_arena_list; - pthread_mutex_t team_arena_write_lock; + /* Allocated ephemeral memories (team arena and stack space). */ + struct ephemeral_memories_list *ephemeral_memories_list; + pthread_mutex_t ephemeral_memories_write_lock; /* Read-write lock that protects kernels which are running or about to be run from interference with loading and unloading of images. Needs to be @@ -510,17 +497,18 @@ struct module_info }; /* A linked list of memory arenas allocated on the device. 
- These are only used by OpenMP, as a means to optimize per-team malloc. */ + These are used by OpenMP, as a means to optimize per-team malloc, + and for host-accessible stack space. */ -struct team_arena_list +struct ephemeral_memories_list { - struct team_arena_list *next; + struct ephemeral_memories_list *next; - /* The number of teams determines the size of the allocation. */ - int num_teams; - /* The device address of the arena itself. */ - void *arena; - /* A flag to prevent two asynchronous kernels trying to use the same arena. + /* The size is determined by the number of teams and threads. */ + size_t size; + /* The device address allocated memory. */ + void *address; + /* A flag to prevent two asynchronous kernels trying to use the same memory. The mutex is locked until the kernel exits. */ pthread_mutex_t in_use; }; @@ -539,15 +527,6 @@ struct hsa_context_info char driver_version_s[30]; }; -/* Format of the on-device heap. - - This must match the definition in Newlib and gcn-run. */ - -struct heap { - int64_t size; - char data[0]; -}; - /* }}} */ /* {{{ Global variables */ @@ -565,6 +544,11 @@ static struct hsa_runtime_fn_info hsa_fns; static size_t gcn_kernel_heap_size = DEFAULT_GCN_HEAP_SIZE; +/* Ephemeral memory sizes for each kernel launch. */ + +static int team_arena_size = DEFAULT_TEAM_ARENA_SIZE; +static int stack_size = DEFAULT_GCN_STACK_SIZE; + /* Flag to decide whether print to stderr information about what is going on. Set in init_debug depending on environment variables. 
*/ @@ -1020,9 +1004,13 @@ print_kernel_dispatch (struct kernel_dispatch *dispatch, unsigned indent) fprintf (stderr, "%*squeue: %p\n", indent, "", dispatch->queue); fprintf (stderr, "%*skernarg_address: %p\n", indent, "", kernargs); fprintf (stderr, "%*sheap address: %p\n", indent, "", - (void*)kernargs->heap_ptr); - fprintf (stderr, "%*sarena address: %p\n", indent, "", - (void*)kernargs->arena_ptr); + (void*)kernargs->abi.heap_ptr); + fprintf (stderr, "%*sarena address: %p (%d bytes per workgroup)\n", indent, + "", (void*)kernargs->abi.arena_ptr, + kernargs->abi.arena_size_per_team); + fprintf (stderr, "%*sstack address: %p (%d bytes per wavefront)\n", indent, + "", (void*)kernargs->abi.stack_ptr, + kernargs->abi.stack_size_per_thread); fprintf (stderr, "%*sobject: %lu\n", indent, "", dispatch->object); fprintf (stderr, "%*sprivate_segment_size: %u\n", indent, "", dispatch->private_segment_size); @@ -1082,6 +1070,22 @@ init_environment_variables (void) if (tmp) gcn_kernel_heap_size = tmp; } + + const char *arena = secure_getenv ("GCN_TEAM_ARENA_SIZE"); + if (arena) + { + int tmp = atoi (arena); + if (tmp) + team_arena_size = tmp; + } + + const char *stack = secure_getenv ("GCN_STACK_SIZE"); + if (stack) + { + int tmp = atoi (stack); + if (tmp) + stack_size = tmp; + } } /* Return malloc'd string with name of SYMBOL. */ @@ -1693,85 +1697,103 @@ isa_code(const char *isa) { /* }}} */ /* {{{ Run */ -/* Create or reuse a team arena. +/* Create or reuse a team arena and stack space. Team arenas are used by OpenMP to avoid calling malloc multiple times while setting up each team. This is purely a performance optimization. - Allocating an arena also costs performance, albeit on the host side, so - this function will reuse an existing arena if a large enough one is idle. - The arena is released, but not deallocated, when the kernel exits. */ + The stack space is used by all kernels. 
We must allocate it in such a + way that the reverse offload implementation can access the data. -static void * -get_team_arena (struct agent_info *agent, int num_teams) + Allocating this memory costs performance, so this function will reuse an + existing allocation if a large enough one is idle. + The memory lock is released, but not deallocated, when the kernel exits. */ + +static void +configure_ephemeral_memories (struct kernel_info *kernel, + struct kernargs_abi *kernargs, int num_teams, + int num_threads) { - struct team_arena_list **next_ptr = &agent->team_arena_list; - struct team_arena_list *item; + struct agent_info *agent = kernel->agent; + struct ephemeral_memories_list **next_ptr = &agent->ephemeral_memories_list; + struct ephemeral_memories_list *item; + + int actual_arena_size = (kernel->kind == KIND_OPENMP + ? team_arena_size : 0); + int actual_arena_total_size = actual_arena_size * num_teams; + size_t size = (actual_arena_total_size + + num_teams * num_threads * stack_size); for (item = *next_ptr; item; next_ptr = &item->next, item = item->next) { - if (item->num_teams < num_teams) + if (item->size < size) continue; - if (pthread_mutex_trylock (&item->in_use)) - continue; - - return item->arena; + if (pthread_mutex_trylock (&item->in_use) == 0) + break; } - GCN_DEBUG ("Creating a new arena for %d teams\n", num_teams); - - if (pthread_mutex_lock (&agent->team_arena_write_lock)) + if (!item) { - GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex"); - return false; - } - item = malloc (sizeof (*item)); - item->num_teams = num_teams; - item->next = NULL; - *next_ptr = item; + GCN_DEBUG ("Creating a new %sstack for %d teams with %d threads" + " (%zd bytes)\n", (actual_arena_size ? 
"arena and " : ""), + num_teams, num_threads, size); - if (pthread_mutex_init (&item->in_use, NULL)) - { - GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex"); - return false; - } - if (pthread_mutex_lock (&item->in_use)) - { - GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex"); - return false; - } - if (pthread_mutex_unlock (&agent->team_arena_write_lock)) - { - GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex"); - return false; + if (pthread_mutex_lock (&agent->ephemeral_memories_write_lock)) + { + GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex"); + return; + } + item = malloc (sizeof (*item)); + item->size = size; + item->next = NULL; + *next_ptr = item; + + if (pthread_mutex_init (&item->in_use, NULL)) + { + GOMP_PLUGIN_error ("Failed to initialize a GCN memory write mutex"); + return; + } + if (pthread_mutex_lock (&item->in_use)) + { + GOMP_PLUGIN_error ("Could not lock a GCN agent program mutex"); + return; + } + if (pthread_mutex_unlock (&agent->ephemeral_memories_write_lock)) + { + GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex"); + return; + } + + hsa_status_t status; + status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, size, + &item->address); + if (status != HSA_STATUS_SUCCESS) + hsa_fatal ("Could not allocate memory for GCN kernel arena", status); + status = hsa_fns.hsa_memory_assign_agent_fn (item->address, agent->id, + HSA_ACCESS_PERMISSION_RW); + if (status != HSA_STATUS_SUCCESS) + hsa_fatal ("Could not assign arena & stack memory to device", status); } - const int TEAM_ARENA_SIZE = 64*1024; /* Must match libgomp.h. 
*/ - hsa_status_t status; - status = hsa_fns.hsa_memory_allocate_fn (agent->data_region, - TEAM_ARENA_SIZE*num_teams, - &item->arena); - if (status != HSA_STATUS_SUCCESS) - hsa_fatal ("Could not allocate memory for GCN kernel arena", status); - status = hsa_fns.hsa_memory_assign_agent_fn (item->arena, agent->id, - HSA_ACCESS_PERMISSION_RW); - if (status != HSA_STATUS_SUCCESS) - hsa_fatal ("Could not assign arena memory to device", status); - - return item->arena; + kernargs->arena_ptr = (actual_arena_total_size + ? (uint64_t)item->address + : 0); + kernargs->stack_ptr = (uint64_t)item->address + actual_arena_total_size; + kernargs->arena_size_per_team = actual_arena_size; + kernargs->stack_size_per_thread = stack_size; } -/* Mark a team arena available for reuse. */ +/* Mark an ephemeral memory space available for reuse. */ static void -release_team_arena (struct agent_info* agent, void *arena) +release_ephemeral_memories (struct agent_info* agent, void *address) { - struct team_arena_list *item; + struct ephemeral_memories_list *item; - for (item = agent->team_arena_list; item; item = item->next) + for (item = agent->ephemeral_memories_list; item; item = item->next) { - if (item->arena == arena) + if (item->address == address) { if (pthread_mutex_unlock (&item->in_use)) GOMP_PLUGIN_error ("Could not unlock a GCN agent program mutex"); @@ -1784,22 +1806,22 @@ release_team_arena (struct agent_info* agent, void *arena) /* Clean up all the allocated team arenas. 
*/ static bool -destroy_team_arenas (struct agent_info *agent) +destroy_ephemeral_memories (struct agent_info *agent) { - struct team_arena_list *item, *next; + struct ephemeral_memories_list *item, *next; - for (item = agent->team_arena_list; item; item = next) + for (item = agent->ephemeral_memories_list; item; item = next) { next = item->next; - hsa_fns.hsa_memory_free_fn (item->arena); + hsa_fns.hsa_memory_free_fn (item->address); if (pthread_mutex_destroy (&item->in_use)) { - GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex"); + GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex"); return false; } free (item); } - agent->team_arena_list = NULL; + agent->ephemeral_memories_list = NULL; return true; } @@ -1871,7 +1893,8 @@ alloc_by_agent (struct agent_info *agent, size_t size) the necessary device signals and memory allocations. */ static struct kernel_dispatch * -create_kernel_dispatch (struct kernel_info *kernel, int num_teams) +create_kernel_dispatch (struct kernel_info *kernel, int num_teams, + int num_threads) { struct agent_info *agent = kernel->agent; struct kernel_dispatch *shadow @@ -1906,7 +1929,7 @@ create_kernel_dispatch (struct kernel_info *kernel, int num_teams) struct kernargs *kernargs = shadow->kernarg_address; /* Zero-initialize the output_data (minimum needed). */ - kernargs->out_ptr = (int64_t)&kernargs->output_data; + kernargs->abi.out_ptr = (int64_t)&kernargs->output_data; kernargs->output_data.next_output = 0; for (unsigned i = 0; i < (sizeof (kernargs->output_data.queue) @@ -1916,13 +1939,10 @@ create_kernel_dispatch (struct kernel_info *kernel, int num_teams) kernargs->output_data.consumed = 0; /* Pass in the heap location. */ - kernargs->heap_ptr = (int64_t)kernel->module->heap; + kernargs->abi.heap_ptr = (int64_t)kernel->module->heap; - /* Create an arena. 
*/ - if (kernel->kind == KIND_OPENMP) - kernargs->arena_ptr = (int64_t)get_team_arena (agent, num_teams); - else - kernargs->arena_ptr = 0; + /* Create the ephemeral memory spaces. */ + configure_ephemeral_memories (kernel, &kernargs->abi, num_teams, num_threads); /* Ensure we can recognize unset return values. */ kernargs->output_data.return_value = 0xcafe0000; @@ -2006,9 +2026,10 @@ release_kernel_dispatch (struct kernel_dispatch *shadow) GCN_DEBUG ("Released kernel dispatch: %p\n", shadow); struct kernargs *kernargs = shadow->kernarg_address; - void *arena = (void *)kernargs->arena_ptr; - if (arena) - release_team_arena (shadow->agent, arena); + void *addr = (void *)kernargs->abi.arena_ptr; + if (!addr) + addr = (void *)kernargs->abi.stack_ptr; + release_ephemeral_memories (shadow->agent, addr); hsa_fns.hsa_memory_free_fn (shadow->kernarg_address); @@ -2238,7 +2259,8 @@ run_kernel (struct kernel_info *kernel, void *vars, packet->workgroup_size_z); struct kernel_dispatch *shadow - = create_kernel_dispatch (kernel, packet->grid_size_x); + = create_kernel_dispatch (kernel, packet->grid_size_x, + packet->grid_size_z); shadow->queue = command_q; if (debug) @@ -3280,14 +3302,14 @@ GOMP_OFFLOAD_init_device (int n) GOMP_PLUGIN_error ("Failed to initialize a GCN agent queue mutex"); return false; } - if (pthread_mutex_init (&agent->team_arena_write_lock, NULL)) + if (pthread_mutex_init (&agent->ephemeral_memories_write_lock, NULL)) { GOMP_PLUGIN_error ("Failed to initialize a GCN team arena write mutex"); return false; } agent->async_queues = NULL; agent->omp_async_queue = NULL; - agent->team_arena_list = NULL; + agent->ephemeral_memories_list = NULL; uint32_t queue_size; hsa_status_t status; @@ -3640,7 +3662,7 @@ GOMP_OFFLOAD_fini_device (int n) agent->module = NULL; } - if (!destroy_team_arenas (agent)) + if (!destroy_ephemeral_memories (agent)) return false; if (!destroy_hsa_program (agent)) @@ -3666,9 +3688,9 @@ GOMP_OFFLOAD_fini_device (int n) GOMP_PLUGIN_error 
("Failed to destroy a GCN agent queue mutex"); return false; } - if (pthread_mutex_destroy (&agent->team_arena_write_lock)) + if (pthread_mutex_destroy (&agent->ephemeral_memories_write_lock)) { - GOMP_PLUGIN_error ("Failed to destroy a GCN team arena mutex"); + GOMP_PLUGIN_error ("Failed to destroy a GCN memory mutex"); return false; } agent->initialized = false;