diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h
index 5d62a845bec030bae7a7d26511b6160e15dcdc5c..2d385e0f10438d20c0e8fd36aadc1dd0c8fa27f7 100644
--- a/gcc/config/gcn/gcn-protos.h
+++ b/gcc/config/gcn/gcn-protos.h
@@ -41,7 +41,8 @@ extern bool gcn_global_address_p (rtx);
 extern tree gcn_goacc_adjust_private_decl (location_t, tree var, int level);
 extern tree gcn_goacc_create_worker_broadcast_record (tree record_type,
 						      bool sender,
-						      const char *name);
+						      const char *name,
+						      unsigned HOST_WIDE_INT offset);
 extern void gcn_goacc_reduction (gcall *call);
 extern bool gcn_hard_regno_rename_ok (unsigned int from_reg,
 				      unsigned int to_reg);
diff --git a/gcc/config/gcn/gcn-tree.c b/gcc/config/gcn/gcn-tree.c
index f722d2d3c4eba48a59ffef617c1150a8a9a4bb1f..d8b913b88cf7eda0ad61108821dd8322a459a16e 100644
--- a/gcc/config/gcn/gcn-tree.c
+++ b/gcc/config/gcn/gcn-tree.c
@@ -309,7 +309,6 @@ static tree
 gcn_goacc_get_worker_red_decl (tree type, unsigned offset)
 {
   machine_function *machfun = cfun->machine;
-  tree existing_decl;
 
   if (TREE_CODE (type) == REFERENCE_TYPE)
     type = TREE_TYPE (type);
@@ -319,31 +318,12 @@ gcn_goacc_get_worker_red_decl (tree type, unsigned offset)
 			    (TYPE_QUALS (type)
 			     | ENCODE_QUAL_ADDR_SPACE (ADDR_SPACE_LDS)));
 
-  if (machfun->reduc_decls
-      && offset < machfun->reduc_decls->length ()
-      && (existing_decl = (*machfun->reduc_decls)[offset]))
-    {
-      gcc_assert (TREE_TYPE (existing_decl) == var_type);
-      return existing_decl;
-    }
-  else
-    {
-      char name[50];
-      sprintf (name, ".oacc_reduction_%u", offset);
-      tree decl = create_tmp_var_raw (var_type, name);
-
-      DECL_CONTEXT (decl) = NULL_TREE;
-      TREE_STATIC (decl) = 1;
-
-      varpool_node::finalize_decl (decl);
-
-      vec_safe_grow_cleared (machfun->reduc_decls, offset + 1, true);
-      (*machfun->reduc_decls)[offset] = decl;
+  gcc_assert (offset
+	      < (machfun->reduction_limit - machfun->reduction_base));
+  tree ptr_type = build_pointer_type (var_type);
+  tree addr = build_int_cst (ptr_type, machfun->reduction_base + offset);
 
-      return decl;
-    }
-
-  return NULL_TREE;
+  return build_simple_mem_ref (addr);
 }
 
 /* Expand IFN_GOACC_REDUCTION_SETUP.  */
@@ -500,7 +480,7 @@ gcn_goacc_reduction_teardown (gcall *call)
     }
 
   if (lhs)
-    gimplify_assign (lhs, var, &seq);
+    gimplify_assign (lhs, unshare_expr (var), &seq);
 
   pop_gimplify_context (NULL);
 
@@ -581,27 +561,24 @@ gcn_goacc_adjust_private_decl (location_t, tree var, int level)
 
 tree
 gcn_goacc_create_worker_broadcast_record (tree record_type, bool sender,
-					  const char *name)
+					  const char *name,
+					  unsigned HOST_WIDE_INT offset)
 {
-  tree type = record_type;
-
-  TYPE_ADDR_SPACE (type) = ADDR_SPACE_LDS;
+  tree type = build_qualified_type (record_type,
+				    TYPE_QUALS_NO_ADDR_SPACE (record_type)
+				    | ENCODE_QUAL_ADDR_SPACE (ADDR_SPACE_LDS));
 
   if (!sender)
-    type = build_pointer_type (type);
-
-  tree decl = create_tmp_var_raw (type, name);
-
-  if (sender)
     {
-      DECL_CONTEXT (decl) = NULL_TREE;
-      TREE_STATIC (decl) = 1;
+      tree ptr_type = build_pointer_type (type);
+      return create_tmp_var_raw (ptr_type, name);
     }
 
-  if (sender)
-    varpool_node::finalize_decl (decl);
+  if (record_type == char_type_node)
+    offset = 1;
 
-  return decl;
+  tree ptr_type = build_pointer_type (type);
+  return build_int_cst (ptr_type, offset);
 }
 
 /* }}}  */
diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c
index 9df282774989d16b35e4e78ea9a80053cfb26ebb..b1bfdeac7b64082cb3930fa0c7f6f23534ba3f93 100644
--- a/gcc/config/gcn/gcn.c
+++ b/gcc/config/gcn/gcn.c
@@ -73,14 +73,21 @@ int gcn_isa = 3;		/* Default to GCN3.  */
  
    We want to permit full occupancy, so size accordingly.  */
 
+/* Use this as a default, but allow it to grow if the user requests a large
+   amount of gang-private shared-memory space.  */
+static int acc_lds_size = 0x600;
+
 #define OMP_LDS_SIZE 0x600    /* 0x600 is 1/40 total, rounded down.  */
-#define ACC_LDS_SIZE 32768    /* Half of the total should be fine.  */
+#define ACC_LDS_SIZE acc_lds_size
 #define OTHER_LDS_SIZE 65536  /* If in doubt, reserve all of it.  */
 
 #define LDS_SIZE (flag_openacc ? ACC_LDS_SIZE \
 		  : flag_openmp ? OMP_LDS_SIZE \
 		  : OTHER_LDS_SIZE)
 
+static int gang_private_hwm = 32;
+static hash_map<tree, int> lds_allocs;
+
 /* The number of registers usable by normal non-kernel functions.
    The SGPR count includes any special extra registers such as VCC.  */
 
@@ -99,13 +106,6 @@ gcn_init_machine_status (void)
 
   f = ggc_cleared_alloc<machine_function> ();
 
-  /* Set up LDS allocation for broadcasting for this function.  */
-  f->lds_allocated = 32;
-  f->lds_allocs = hash_map<tree, int>::create_ggc (64);
-
-  /* And LDS temporary decls for worker reductions.  */
-  vec_alloc (f->reduc_decls, 0);
-
   if (TARGET_GCN3)
     f->use_flat_addressing = true;
 
@@ -145,6 +145,24 @@ gcn_option_override (void)
 	stack_size_opt = 1048576;
     }
 
+  /* Reserve 1KB (somewhat arbitrarily) of LDS space for reduction results and
+     worker broadcasts.  */
+  if (gang_private_size_opt == -1)
+    gang_private_size_opt = 512;
+  else if (gang_private_size_opt < gang_private_hwm)
+    gang_private_size_opt = gang_private_hwm;
+  else if (gang_private_size_opt >= acc_lds_size - 1024)
+    {
+      /* We need some space for reductions and worker broadcasting.  If the
+	 user requests a large amount of gang-private LDS space, we might not
+	 have enough left for the former.  Increase the LDS allocation in that
+	 case, although this may reduce the maximum occupancy on the
+	 hardware.  */
+      acc_lds_size = gang_private_size_opt + 1024;
+      if (acc_lds_size > 32768)
+	acc_lds_size = 32768;
+    }
+
   /* The xnack option is a placeholder, for now.  */
   if (flag_xnack)
     sorry ("XNACK support");
@@ -3066,7 +3084,7 @@ gcn_expand_prologue ()
      The low-part is the address of the topmost addressable byte, which is
      size-1.  The high-part is an offset and should be zero.  */
   emit_move_insn (gen_rtx_REG (SImode, M0_REG),
-		  gen_int_mode (LDS_SIZE-1, SImode));
+		  gen_int_mode (LDS_SIZE, SImode));
 
   emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG)));
 
@@ -5161,6 +5179,28 @@ gcn_fixup_accel_lto_options (tree fndecl)
     }
 }
 
+/* Implement TARGET_GOACC_SHARED_MEM_LAYOUT hook.  */
+
+static void
+gcn_shared_mem_layout (unsigned HOST_WIDE_INT *lo,
+		       unsigned HOST_WIDE_INT *hi,
+		       int ARG_UNUSED (dims[GOMP_DIM_MAX]),
+		       unsigned HOST_WIDE_INT
+			 ARG_UNUSED (private_size[GOMP_DIM_MAX]),
+		       unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX])
+{
+  *lo = gang_private_size_opt + reduction_size[GOMP_DIM_WORKER];
+  /* !!! We can maybe use dims[] to estimate the maximum number of work
+     groups/wavefronts/etc. we will launch, and therefore tune the maximum
+     amount of LDS we should use.  For now, use a minimal amount to try to
+     maximise occupancy.  */
+  *hi = acc_lds_size;
+  machine_function *machfun = cfun->machine;
+  machfun->reduction_base = gang_private_size_opt;
+  machfun->reduction_limit
+    = gang_private_size_opt + reduction_size[GOMP_DIM_WORKER];
+}
+
 /* }}}  */
 /* {{{ ASM Output.  */
 
@@ -5488,17 +5528,18 @@ gcn_section_type_flags (tree decl, const char *name, int reloc)
 
 /* Helper function for gcn_asm_output_symbol_ref.
 
-   FIXME: If we want to have propagation blocks allocated separately and
-   statically like this, it would be better done via symbol refs and the
-   assembler/linker.  This is a temporary hack.  */
+   FIXME: This function is used to lay out gang-private variables in LDS
+   on a per-CU basis.
+   There may be cases in which gang-private variables in different compilation
+   units could clobber each other.  In that case we should be relying on the
+   linker to lay out gang-private LDS space, but that doesn't appear to be
+   possible at present.  */
 
 static void
 gcn_print_lds_decl (FILE *f, tree var)
 {
   int *offset;
-  machine_function *machfun = cfun->machine;
-
-  if ((offset = machfun->lds_allocs->get (var)))
+  if ((offset = lds_allocs.get (var)))
     fprintf (f, "%u", (unsigned) *offset);
   else
     {
@@ -5508,14 +5549,14 @@ gcn_print_lds_decl (FILE *f, tree var)
       if (size > align && size > 4 && align < 8)
 	align = 8;
 
-      machfun->lds_allocated = ((machfun->lds_allocated + align - 1)
-				& ~(align - 1));
+      gang_private_hwm = ((gang_private_hwm + align - 1) & ~(align - 1));
 
-      machfun->lds_allocs->put (var, machfun->lds_allocated);
-      fprintf (f, "%u", machfun->lds_allocated);
-      machfun->lds_allocated += size;
-      if (machfun->lds_allocated > LDS_SIZE)
-	error ("local data-share memory exhausted");
+      lds_allocs.put (var, gang_private_hwm);
+      fprintf (f, "%u", gang_private_hwm);
+      gang_private_hwm += size;
+      if (gang_private_hwm > gang_private_size_opt)
+	error ("gang-private data-share memory exhausted (increase with "
+	       "%<-mgang-private-size=<number>%>)");
     }
 }
 
@@ -6515,6 +6556,8 @@ gcn_dwarf_register_span (rtx rtl)
 #define TARGET_GOACC_REDUCTION gcn_goacc_reduction
 #undef  TARGET_GOACC_VALIDATE_DIMS
 #define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
+#undef  TARGET_GOACC_SHARED_MEM_LAYOUT
+#define TARGET_GOACC_SHARED_MEM_LAYOUT gcn_shared_mem_layout
 #undef  TARGET_HARD_REGNO_MODE_OK
 #define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
 #undef  TARGET_HARD_REGNO_NREGS
diff --git a/gcc/config/gcn/gcn.h b/gcc/config/gcn/gcn.h
index 5822ec34aa7bcf32b4593602a8c6d4cb1129c973..b97ec482d852a3a931c9b233dd08368be0d9225b 100644
--- a/gcc/config/gcn/gcn.h
+++ b/gcc/config/gcn/gcn.h
@@ -576,10 +576,8 @@ struct GTY(()) machine_function
   HOST_WIDE_INT local_vars;
   HOST_WIDE_INT callee_saves;
 
-  unsigned lds_allocated;
-  hash_map<tree, int> *lds_allocs;
-
-  vec<tree, va_gc> *reduc_decls;
+  unsigned HOST_WIDE_INT reduction_base;
+  unsigned HOST_WIDE_INT reduction_limit;
 
   bool use_flat_addressing;
 };
diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt
index 6faacca42bbb063a5d0358a1d7092f32eaf8573c..09cf191db016fd7e599fa77f69a5bb4cc051d3fe 100644
--- a/gcc/config/gcn/gcn.opt
+++ b/gcc/config/gcn/gcn.opt
@@ -68,6 +68,12 @@ mstack-size=
 Target RejectNegative Joined UInteger Var(stack_size_opt) Init(-1)
 -mstack-size=<number>	Set the private segment size per wave-front, in bytes.
 
+; A gang-private size of -1 means "choose a default" in gcn_option_override.
+
+mgang-private-size=
+Target RejectNegative Joined UInteger Var(gang_private_size_opt) Init(-1)
+Amount of local data-share (LDS) memory to reserve for gang-private variables.
+
 Wopenacc-dims
 Target Var(warn_openacc_dims) Warning
 Warn about invalid OpenACC dimensions.
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index be8148583d8571b0d035b1938db9d056bfd213a8..902402d75032fdce2cfeaa23a0c82da98d2d47b3 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -6415,7 +6415,7 @@ private variables at OpenACC device-lowering time using the
 @code{TARGET_GOACC_ADJUST_PRIVATE_DECL} target hook.
 @end deftypefn
 
-@deftypefn {Target Hook} tree TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD (tree @var{rec}, bool @var{sender}, const char *@var{name})
+@deftypefn {Target Hook} tree TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD (tree @var{rec}, bool @var{sender}, const char *@var{name}, unsigned HOST_WIDE_INT @var{offset})
 Create a record used to propagate local-variable state from an active
 worker to other workers.  A possible implementation might adjust the type
 of REC to place the new variable in shared GPU memory.
@@ -6424,6 +6424,13 @@ Presence of this target hook indicates that middle end neutering/broadcasting
 be used.
 @end deftypefn
 
+@deftypefn {Target Hook} void TARGET_GOACC_SHARED_MEM_LAYOUT (unsigned HOST_WIDE_INT *@var{}, unsigned HOST_WIDE_INT *@var{}, @var{int[]}, unsigned @var{HOST_WIDE_INT[]}, unsigned @var{HOST_WIDE_INT[]})
+Lay out a fixed shared-memory region on the target.  The LO and HI
+arguments should be set to a range of addresses that can be used for worker
+broadcasting.  The dimensions, reduction size and gang-private size
+arguments are for the current offload region.
+@end deftypefn
+
 @node Anchored Addresses
 @section Anchored Addresses
 @cindex anchored addresses
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index d088eee4afecdbb5575b0f4f796ac344e4449155..86352dc9bd2146587a95b7561151b3f84db8021f 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4228,6 +4228,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD
 
+@hook TARGET_GOACC_SHARED_MEM_LAYOUT
+
 @node Anchored Addresses
 @section Anchored Addresses
 @cindex anchored addresses
diff --git a/gcc/omp-oacc-neuter-broadcast.cc b/gcc/omp-oacc-neuter-broadcast.cc
index 3fe92248c4e2b278cdea217acc1727918fef9fa2..e0bd01311ee1245509edc6142241e509029d3162 100644
--- a/gcc/omp-oacc-neuter-broadcast.cc
+++ b/gcc/omp-oacc-neuter-broadcast.cc
@@ -53,6 +53,8 @@
 #include "tree-cfg.h"
 #include "omp-offload.h"
 #include "attribs.h"
+#include "targhooks.h"
+#include "diagnostic-core.h"
 
 /* Loop structure of the function.  The entire function is described as
    a NULL loop.  */
@@ -968,6 +970,8 @@ build_receiver_ref (tree var, tree receiver_decl, field_map_t *fields)
 static tree
 build_sender_ref (tree var, tree sender_decl, field_map_t *fields)
 {
+  if (POINTER_TYPE_P (TREE_TYPE (sender_decl)))
+    sender_decl = build_simple_mem_ref (sender_decl);
   tree field = *fields->get (var);
   return oacc_build_component_ref (sender_decl, field);
 }
@@ -1005,7 +1009,9 @@ static void
 worker_single_copy (basic_block from, basic_block to,
 		    hash_set<tree> *def_escapes_block,
 		    hash_set<tree> *worker_partitioned_uses,
-		    tree record_type, record_field_map_t *record_field_map)
+		    tree record_type, record_field_map_t *record_field_map,
+		    unsigned HOST_WIDE_INT placement,
+		    bool isolate_broadcasts)
 {
   /* If we only have virtual defs, we'll have no record type, but we still want
      to emit single_copy_start and (particularly) single_copy_end to act as
@@ -1016,10 +1022,12 @@ worker_single_copy (basic_block from, basic_block to,
 
   tree sender_decl
     = targetm.goacc.create_worker_broadcast_record (record_type, true,
-						    ".oacc_worker_o");
+						    ".oacc_worker_o",
+						    placement);
   tree receiver_decl
     = targetm.goacc.create_worker_broadcast_record (record_type, false,
-						    ".oacc_worker_i");
+						    ".oacc_worker_i",
+						    placement);
 
   gimple_stmt_iterator gsi = gsi_last_bb (to);
   if (EDGE_COUNT (to->succs) > 1)
@@ -1033,12 +1041,23 @@ worker_single_copy (basic_block from, basic_block to,
 
   tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
 
-  gimple *call = gimple_build_call (decl, 1,
-				    build_fold_addr_expr (sender_decl));
+  gimple *call
+    = gimple_build_call (decl, 1,
+			 POINTER_TYPE_P (TREE_TYPE (sender_decl))
+			 ? sender_decl : build_fold_addr_expr (sender_decl));
   gimple_call_set_lhs (call, lhs);
   gsi_insert_before (&start, call, GSI_NEW_STMT);
   update_stmt (call);
 
+  /* The shared-memory range for this block overflowed.  Add a barrier before
+     the GOACC_single_copy_start call.  */
+  if (isolate_broadcasts)
+    {
+      decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
+      gimple *acc_bar = gimple_build_call (decl, 0);
+      gsi_insert_before (&start, acc_bar, GSI_SAME_STMT);
+    }
+
   tree conv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
 
   gimple *conv = gimple_build_assign (conv_tmp,
@@ -1206,13 +1225,26 @@ worker_single_copy (basic_block from, basic_block to,
 	}
     }
 
+  /* The shared-memory range for this block overflowed.  Add a barrier at the
+     end.  */
+  if (isolate_broadcasts)
+    {
+      gsi = gsi_start_bb (exit_block);
+      decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
+      gimple *acc_bar = gimple_build_call (decl, 0);
+      gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT);
+    }
+
   /* It's possible for the ET->DEST block (the work done by the active thread)
      to finish with a control-flow insn, e.g. a UNIQUE function call.  Split
      the block and add SENDER_SEQ in the latter part to avoid having control
      flow in the middle of a BB.  */
 
   decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_END);
-  call = gimple_build_call (decl, 1, build_fold_addr_expr (sender_decl));
+  call = gimple_build_call (decl, 1,
+			    POINTER_TYPE_P (TREE_TYPE (sender_decl))
+			    ? sender_decl
+			    : build_fold_addr_expr (sender_decl));
   gimple_seq_add_stmt (&sender_seq, call);
 
   gsi = gsi_last_bb (body);
@@ -1222,12 +1254,16 @@ worker_single_copy (basic_block from, basic_block to,
   gsi_insert_seq_after (&gsi, sender_seq, GSI_CONTINUE_LINKING);
 }
 
+typedef hash_map<basic_block, std::pair<unsigned HOST_WIDE_INT, bool> >
+  blk_offset_map_t;
+
 static void
 neuter_worker_single (parallel_g *par, unsigned outer_mask,
 		      bitmap worker_single, bitmap vector_single,
 		      vec<propagation_set *> *prop_set,
 		      hash_set<tree> *partitioned_var_uses,
-		      record_field_map_t *record_field_map)
+		      record_field_map_t *record_field_map,
+		      blk_offset_map_t *blk_offset_map)
 {
   unsigned mask = outer_mask | par->mask;
 
@@ -1316,9 +1352,19 @@ neuter_worker_single (parallel_g *par, unsigned outer_mask,
 	  tree record_type = (tree) block->aux;
 
 	  if (has_defs)
-	    worker_single_copy (block, block, &def_escapes_block,
-				&worker_partitioned_uses, record_type,
-				record_field_map);
+	    {
+	      std::pair<unsigned HOST_WIDE_INT, bool> *off_rngalloc
+		= blk_offset_map->get (block);
+	      gcc_assert (!record_type || off_rngalloc);
+	      unsigned HOST_WIDE_INT offset
+		= off_rngalloc ? off_rngalloc->first : 0;
+	      bool range_allocated
+		= off_rngalloc ? off_rngalloc->second : true;
+	      worker_single_copy (block, block, &def_escapes_block,
+				  &worker_partitioned_uses, record_type,
+				  record_field_map,
+				  offset, !range_allocated);
+	    }
 	  else
 	    worker_single_simple (block, block, &def_escapes_block);
 	}
@@ -1354,14 +1400,159 @@ neuter_worker_single (parallel_g *par, unsigned outer_mask,
 
   if (par->inner)
     neuter_worker_single (par->inner, mask, worker_single, vector_single,
-			  prop_set, partitioned_var_uses, record_field_map);
+			  prop_set, partitioned_var_uses, record_field_map,
+			  blk_offset_map);
   if (par->next)
     neuter_worker_single (par->next, outer_mask, worker_single, vector_single,
-			  prop_set, partitioned_var_uses, record_field_map);
+			  prop_set, partitioned_var_uses, record_field_map,
+			  blk_offset_map);
+}
+
+static void
+dfs_broadcast_reachable_1 (basic_block bb, sbitmap reachable)
+{
+  if (bb->flags & BB_VISITED)
+    return;
+
+  bb->flags |= BB_VISITED;
+
+  if (bb->succs)
+    {
+      edge e;
+      edge_iterator ei;
+      FOR_EACH_EDGE (e, ei, bb->succs)
+	{
+	  basic_block dest = e->dest;
+	  if (dest->aux)
+	    bitmap_set_bit (reachable, dest->index);
+	  else
+	    dfs_broadcast_reachable_1 (dest, reachable);
+	}
+    }
 }
 
+typedef std::pair<int, tree> idx_decl_pair_t;
+
+typedef auto_vec<splay_tree> used_range_vec_t;
+
 static int
-execute_omp_oacc_neuter_broadcast ()
+sort_size_descending (const void *a, const void *b)
+{
+  const idx_decl_pair_t *pa = (const idx_decl_pair_t *) a;
+  const idx_decl_pair_t *pb = (const idx_decl_pair_t *) b;
+  unsigned HOST_WIDE_INT asize = tree_to_uhwi (TYPE_SIZE_UNIT (pa->second));
+  unsigned HOST_WIDE_INT bsize = tree_to_uhwi (TYPE_SIZE_UNIT (pb->second));
+  return bsize - asize;
+}
+
+class addr_range
+{
+public:
+  addr_range (unsigned HOST_WIDE_INT addr_lo, unsigned HOST_WIDE_INT addr_hi)
+    : lo (addr_lo), hi (addr_hi)
+    { }
+  addr_range (const addr_range &ar) : lo (ar.lo), hi (ar.hi)
+    { }
+  addr_range () : lo (0), hi (0)
+    { }
+
+  bool invalid () { return lo == 0 && hi == 0; }
+
+  unsigned HOST_WIDE_INT lo;
+  unsigned HOST_WIDE_INT hi;
+};
+
+static int
+splay_tree_compare_addr_range (splay_tree_key a, splay_tree_key b)
+{
+  addr_range *ar = (addr_range *) a;
+  addr_range *br = (addr_range *) b;
+  if (ar->lo == br->lo && ar->hi == br->hi)
+    return 0;
+  if (ar->hi <= br->lo)
+    return -1;
+  else if (ar->lo >= br->hi)
+    return 1;
+  return 0;
+}
+
+static void
+splay_tree_free_key (splay_tree_key k)
+{
+  addr_range *ar = (addr_range *) k;
+  delete ar;
+}
+
+static addr_range
+first_fit_range (splay_tree s, unsigned HOST_WIDE_INT size,
+		 unsigned HOST_WIDE_INT align, addr_range *bounds)
+{
+  splay_tree_node min = splay_tree_min (s);
+  if (min)
+    {
+      splay_tree_node next;
+      while ((next = splay_tree_successor (s, min->key)))
+	{
+	  unsigned HOST_WIDE_INT lo = ((addr_range *) min->key)->hi;
+	  unsigned HOST_WIDE_INT hi = ((addr_range *) next->key)->lo;
+	  unsigned HOST_WIDE_INT base = (lo + align - 1) & ~(align - 1);
+	  if (base + size <= hi)
+	    return addr_range (base, base + size);
+	  min = next;
+	}
+
+      unsigned HOST_WIDE_INT base = ((addr_range *)min->key)->hi;
+      base = (base + align - 1) & ~(align - 1);
+      if (base + size <= bounds->hi)
+	return addr_range (base, base + size);
+      else
+	return addr_range ();
+    }
+  else
+    {
+      unsigned HOST_WIDE_INT lo = bounds->lo;
+      lo = (lo + align - 1) & ~(align - 1);
+      if (lo + size <= bounds->hi)
+	return addr_range (lo, lo + size);
+      else
+	return addr_range ();
+    }
+}
+
+static int
+merge_ranges_1 (splay_tree_node n, void *ptr)
+{
+  splay_tree accum = (splay_tree) ptr;
+  addr_range ar = *(addr_range *) n->key;
+
+  splay_tree_node old = splay_tree_lookup (accum, n->key);
+
+  /* We might have an overlap.  Create a new range covering the
+     overlapping parts.  */
+  if (old)
+    {
+      addr_range *old_ar = (addr_range *) old->key;
+      ar.lo = MIN (old_ar->lo, ar.lo);
+      ar.hi = MAX (old_ar->hi, ar.hi);
+      splay_tree_remove (accum, old->key);
+    }
+
+  addr_range *new_ar = new addr_range (ar);
+
+  splay_tree_insert (accum, (splay_tree_key) new_ar, n->value);
+
+  return 0;
+}
+
+static void
+merge_ranges (splay_tree accum, splay_tree sp)
+{
+  splay_tree_foreach (sp, merge_ranges_1, (void *) accum);
+}
+
+static void
+oacc_do_neutering (unsigned HOST_WIDE_INT bounds_lo,
+		   unsigned HOST_WIDE_INT bounds_hi)
 {
   bb_stmt_map_t bb_stmt_map;
   auto_bitmap worker_single, vector_single;
@@ -1450,8 +1641,123 @@ execute_omp_oacc_neuter_broadcast ()
 	}
     }
 
+  sbitmap *reachable
+    = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
+			    last_basic_block_for_fn (cfun));
+
+  bitmap_vector_clear (reachable, last_basic_block_for_fn (cfun));
+
+  auto_vec<std::pair<int, tree> > priority;
+
+  FOR_ALL_BB_FN (bb, cfun)
+    {
+      if (bb->aux)
+	{
+	  tree record_type = (tree) bb->aux;
+
+	  basic_block bb2;
+	  FOR_ALL_BB_FN (bb2, cfun)
+	    bb2->flags &= ~BB_VISITED;
+
+	  priority.safe_push (std::make_pair (bb->index, record_type));
+	  dfs_broadcast_reachable_1 (bb, reachable[bb->index]);
+	}
+    }
+
+  sbitmap *inverted
+    = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
+			    last_basic_block_for_fn (cfun));
+
+  bitmap_vector_clear (inverted, last_basic_block_for_fn (cfun));
+
+  for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
+    {
+      sbitmap_iterator bi;
+      unsigned int j;
+      EXECUTE_IF_SET_IN_BITMAP (reachable[i], 0, j, bi)
+	bitmap_set_bit (inverted[j], i);
+    }
+
+  for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
+    bitmap_ior (reachable[i], reachable[i], inverted[i]);
+
+  sbitmap_vector_free (inverted);
+
+  used_range_vec_t used_ranges;
+
+  used_ranges.safe_grow_cleared (last_basic_block_for_fn (cfun));
+
+  blk_offset_map_t blk_offset_map;
+
+  addr_range worker_shm_bounds (bounds_lo, bounds_hi);
+
+  priority.qsort (sort_size_descending);
+  for (unsigned int i = 0; i < priority.length (); i++)
+    {
+      idx_decl_pair_t p = priority[i];
+      int blkno = p.first;
+      tree record_type = p.second;
+      HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (record_type));
+      HOST_WIDE_INT align = TYPE_ALIGN_UNIT (record_type);
+
+      splay_tree conflicts = splay_tree_new (splay_tree_compare_addr_range,
+					     splay_tree_free_key, NULL);
+
+      if (!used_ranges[blkno])
+	used_ranges[blkno] = splay_tree_new (splay_tree_compare_addr_range,
+					     splay_tree_free_key, NULL);
+      else
+	merge_ranges (conflicts, used_ranges[blkno]);
+
+      sbitmap_iterator bi;
+      unsigned int j;
+      EXECUTE_IF_SET_IN_BITMAP (reachable[blkno], 0, j, bi)
+	if (used_ranges[j])
+	  merge_ranges (conflicts, used_ranges[j]);
+
+      addr_range ar
+	= first_fit_range (conflicts, size, align, &worker_shm_bounds);
+
+      splay_tree_delete (conflicts);
+
+      if (ar.invalid ())
+	{
+	  unsigned HOST_WIDE_INT base;
+	  base = bounds_lo + random () % 512;
+	  base = (base + align - 1) & ~(align - 1);
+	  if (base + size > bounds_hi)
+	    error_at (UNKNOWN_LOCATION, "shared-memory region overflow");
+	  std::pair<unsigned HOST_WIDE_INT, bool> base_inrng
+	    = std::make_pair (base, false);
+	  blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng);
+	}
+      else
+	{
+	  splay_tree_node old = splay_tree_lookup (used_ranges[blkno],
+						   (splay_tree_key) &ar);
+	  if (old)
+	    {
+	      fprintf (stderr, "trying to map [%d..%d] but [%d..%d] is "
+		       "already mapped in block %d\n", (int) ar.lo,
+		       (int) ar.hi, (int) ((addr_range *) old->key)->lo,
+		       (int) ((addr_range *) old->key)->hi, blkno);
+	      abort ();
+	    }
+
+	  addr_range *arp = new addr_range (ar);
+	  splay_tree_insert (used_ranges[blkno], (splay_tree_key) arp,
+			     (splay_tree_value) blkno);
+	  std::pair<unsigned HOST_WIDE_INT, bool> base_inrng
+	    = std::make_pair (ar.lo, true);
+	  blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng);
+	}
+    }
+
+  sbitmap_vector_free (reachable);
+
   neuter_worker_single (par, mask, worker_single, vector_single, &prop_set,
-			&partitioned_var_uses, &record_field_map);
+			&partitioned_var_uses, &record_field_map,
+			&blk_offset_map);
 
   for (auto it : record_field_map)
     delete it.second;
@@ -1478,6 +1784,107 @@ execute_omp_oacc_neuter_broadcast ()
       fprintf (dump_file, "\n\nAfter neutering:\n\n");
       dump_function_to_file (current_function_decl, dump_file, dump_flags);
     }
+}
+
+static int
+execute_omp_oacc_neuter_broadcast ()
+{
+  unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX];
+  unsigned HOST_WIDE_INT private_size[GOMP_DIM_MAX];
+
+  for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
+    {
+      reduction_size[i] = 0;
+      private_size[i] = 0;
+    }
+
+  /* Calculate shared memory size required for reduction variables and
+     gang-private memory for this offloaded function.  */
+  basic_block bb;
+  FOR_ALL_BB_FN (bb, cfun)
+    {
+      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
+	   !gsi_end_p (gsi);
+	   gsi_next (&gsi))
+	{
+	  gimple *stmt = gsi_stmt (gsi);
+	  if (!is_gimple_call (stmt))
+	    continue;
+	  gcall *call = as_a <gcall *> (stmt);
+	  if (!gimple_call_internal_p (call))
+	    continue;
+	  enum internal_fn ifn_code = gimple_call_internal_fn (call);
+	  switch (ifn_code)
+	    {
+	    default: break;
+	    case IFN_GOACC_REDUCTION:
+	      if (integer_minus_onep (gimple_call_arg (call, 3)))
+		continue;
+	      else
+		{
+		  unsigned code = TREE_INT_CST_LOW (gimple_call_arg (call, 0));
+		  /* Only count reduction variables once: the choice to pick
+		     the setup call is fairly arbitrary.  */
+		  if (code == IFN_GOACC_REDUCTION_SETUP)
+		    {
+		      int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
+		      tree var = gimple_call_arg (call, 2);
+		      tree offset = gimple_call_arg (call, 5);
+		      tree var_type = TREE_TYPE (var);
+		      unsigned HOST_WIDE_INT limit
+			= (tree_to_uhwi (offset)
+			   + tree_to_uhwi (TYPE_SIZE_UNIT (var_type)));
+		      reduction_size[level]
+			= MAX (reduction_size[level], limit);
+		    }
+		}
+	      break;
+	    case IFN_UNIQUE:
+	      {
+		enum ifn_unique_kind kind
+		  = ((enum ifn_unique_kind)
+		     TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
+
+		if (kind == IFN_UNIQUE_OACC_PRIVATE)
+		  {
+		    HOST_WIDE_INT level
+		      = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
+		    if (level == -1)
+		      break;
+		    for (unsigned i = 3;
+			 i < gimple_call_num_args (call);
+			 i++)
+		      {
+			tree arg = gimple_call_arg (call, i);
+			gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
+			tree decl = TREE_OPERAND (arg, 0);
+			unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (decl);
+			private_size[level] = ((private_size[level] + align - 1)
+					       & ~(align - 1));
+			unsigned HOST_WIDE_INT decl_size
+			  = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (decl)));
+			private_size[level] += decl_size;
+		      }
+		  }
+	      }
+	      break;
+	    }
+	}
+    }
+
+  int dims[GOMP_DIM_MAX];
+  for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
+    dims[i] = oacc_get_fn_dim_size (current_function_decl, i);
+
+  /* Find bounds of shared-memory buffer space we can use.  */
+  unsigned HOST_WIDE_INT bounds_lo = 0, bounds_hi = 0;
+  if (targetm.goacc.shared_mem_layout)
+    targetm.goacc.shared_mem_layout (&bounds_lo, &bounds_hi, dims,
+				     private_size, reduction_size);
+
+  /* Perform worker partitioning unless we know 'num_workers(1)'.  */
+  if (dims[GOMP_DIM_WORKER] != 1)
+    oacc_do_neutering (bounds_lo, bounds_hi);
 
   return 0;
 }
@@ -1518,12 +1925,6 @@ public:
     if (!attr)
       return false;
 
-    /* Not relevant for 'num_workers(1)'.  */
-    int worker_dim
-      = oacc_get_fn_dim_size (fun->decl, GOMP_DIM_WORKER);
-    if (worker_dim == 1)
-      return false;
-
     return true;
   }
 
diff --git a/gcc/target.def b/gcc/target.def
index bfa819609c21bd71c0cc585c01dba42534453f47..c5d90cace80d75ef17cd5eddf657db8334f786de 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1764,7 +1764,17 @@ of REC to place the new variable in shared GPU memory.\n\
 \n\
 Presence of this target hook indicates that middle end neutering/broadcasting\n\
 be used.",
-tree, (tree rec, bool sender, const char *name),
+tree, (tree rec, bool sender, const char *name, unsigned HOST_WIDE_INT offset),
+NULL)
+
+DEFHOOK
+(shared_mem_layout,
+"Lay out a fixed shared-memory region on the target.  The LO and HI\n\
+arguments should be set to a range of addresses that can be used for worker\n\
+broadcasting.  The dimensions, reduction size and gang-private size\n\
+arguments are for the current offload region.",
+void, (unsigned HOST_WIDE_INT *, unsigned HOST_WIDE_INT *, int[],
+       unsigned HOST_WIDE_INT[], unsigned HOST_WIDE_INT[]),
 NULL)
 
 HOOK_VECTOR_END (goacc)
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-many.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-many.c
index d763a754a11f6f795d66c69353e67addde25394f..37839edfb0938fd66a0ef7cf3d8a17ad70cc694c 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-many.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-many.c
@@ -1,3 +1,7 @@
+/* To avoid 'error: shared-memory region overflow':
+   { dg-additional-options "-foffload-options=amdgcn-amdhsa=-mgang-private-size=64" { target openacc_radeon_accel_selected } }
+*/
+
 #include <assert.h>
 #include <stdio.h>