diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h index 5d62a845bec030bae7a7d26511b6160e15dcdc5c..2d385e0f10438d20c0e8fd36aadc1dd0c8fa27f7 100644 --- a/gcc/config/gcn/gcn-protos.h +++ b/gcc/config/gcn/gcn-protos.h @@ -41,7 +41,8 @@ extern bool gcn_global_address_p (rtx); extern tree gcn_goacc_adjust_private_decl (location_t, tree var, int level); extern tree gcn_goacc_create_worker_broadcast_record (tree record_type, bool sender, - const char *name); + const char *name, + unsigned HOST_WIDE_INT offset); extern void gcn_goacc_reduction (gcall *call); extern bool gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg); diff --git a/gcc/config/gcn/gcn-tree.c b/gcc/config/gcn/gcn-tree.c index f722d2d3c4eba48a59ffef617c1150a8a9a4bb1f..d8b913b88cf7eda0ad61108821dd8322a459a16e 100644 --- a/gcc/config/gcn/gcn-tree.c +++ b/gcc/config/gcn/gcn-tree.c @@ -309,7 +309,6 @@ static tree gcn_goacc_get_worker_red_decl (tree type, unsigned offset) { machine_function *machfun = cfun->machine; - tree existing_decl; if (TREE_CODE (type) == REFERENCE_TYPE) type = TREE_TYPE (type); @@ -319,31 +318,12 @@ gcn_goacc_get_worker_red_decl (tree type, unsigned offset) (TYPE_QUALS (type) | ENCODE_QUAL_ADDR_SPACE (ADDR_SPACE_LDS))); - if (machfun->reduc_decls - && offset < machfun->reduc_decls->length () - && (existing_decl = (*machfun->reduc_decls)[offset])) - { - gcc_assert (TREE_TYPE (existing_decl) == var_type); - return existing_decl; - } - else - { - char name[50]; - sprintf (name, ".oacc_reduction_%u", offset); - tree decl = create_tmp_var_raw (var_type, name); - - DECL_CONTEXT (decl) = NULL_TREE; - TREE_STATIC (decl) = 1; - - varpool_node::finalize_decl (decl); - - vec_safe_grow_cleared (machfun->reduc_decls, offset + 1, true); - (*machfun->reduc_decls)[offset] = decl; + gcc_assert (offset + < (machfun->reduction_limit - machfun->reduction_base)); + tree ptr_type = build_pointer_type (var_type); + tree addr = build_int_cst (ptr_type, 
machfun->reduction_base + offset); - return decl; - } - - return NULL_TREE; + return build_simple_mem_ref (addr); } /* Expand IFN_GOACC_REDUCTION_SETUP. */ @@ -500,7 +480,7 @@ gcn_goacc_reduction_teardown (gcall *call) } if (lhs) - gimplify_assign (lhs, var, &seq); + gimplify_assign (lhs, unshare_expr (var), &seq); pop_gimplify_context (NULL); @@ -581,27 +561,24 @@ gcn_goacc_adjust_private_decl (location_t, tree var, int level) tree gcn_goacc_create_worker_broadcast_record (tree record_type, bool sender, - const char *name) + const char *name, + unsigned HOST_WIDE_INT offset) { - tree type = record_type; - - TYPE_ADDR_SPACE (type) = ADDR_SPACE_LDS; + tree type = build_qualified_type (record_type, + TYPE_QUALS_NO_ADDR_SPACE (record_type) + | ENCODE_QUAL_ADDR_SPACE (ADDR_SPACE_LDS)); if (!sender) - type = build_pointer_type (type); - - tree decl = create_tmp_var_raw (type, name); - - if (sender) { - DECL_CONTEXT (decl) = NULL_TREE; - TREE_STATIC (decl) = 1; + tree ptr_type = build_pointer_type (type); + return create_tmp_var_raw (ptr_type, name); } - if (sender) - varpool_node::finalize_decl (decl); + if (record_type == char_type_node) + offset = 1; - return decl; + tree ptr_type = build_pointer_type (type); + return build_int_cst (ptr_type, offset); } /* }}} */ diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index 9df282774989d16b35e4e78ea9a80053cfb26ebb..b1bfdeac7b64082cb3930fa0c7f6f23534ba3f93 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -73,14 +73,21 @@ int gcn_isa = 3; /* Default to GCN3. */ We want to permit full occupancy, so size accordingly. */ +/* Use this as a default, but allow it to grow if the user requests a large + amount of gang-private shared-memory space. */ +static int acc_lds_size = 0x600; + #define OMP_LDS_SIZE 0x600 /* 0x600 is 1/40 total, rounded down. */ -#define ACC_LDS_SIZE 32768 /* Half of the total should be fine. 
*/ +#define ACC_LDS_SIZE acc_lds_size #define OTHER_LDS_SIZE 65536 /* If in doubt, reserve all of it. */ #define LDS_SIZE (flag_openacc ? ACC_LDS_SIZE \ : flag_openmp ? OMP_LDS_SIZE \ : OTHER_LDS_SIZE) +static int gang_private_hwm = 32; +static hash_map<tree, int> lds_allocs; + /* The number of registers usable by normal non-kernel functions. The SGPR count includes any special extra registers such as VCC. */ @@ -99,13 +106,6 @@ gcn_init_machine_status (void) f = ggc_cleared_alloc<machine_function> (); - /* Set up LDS allocation for broadcasting for this function. */ - f->lds_allocated = 32; - f->lds_allocs = hash_map<tree, int>::create_ggc (64); - - /* And LDS temporary decls for worker reductions. */ - vec_alloc (f->reduc_decls, 0); - if (TARGET_GCN3) f->use_flat_addressing = true; @@ -145,6 +145,24 @@ gcn_option_override (void) stack_size_opt = 1048576; } + /* Reserve 1Kb (somewhat arbitrarily) of LDS space for reduction results and + worker broadcasts. */ + if (gang_private_size_opt == -1) + gang_private_size_opt = 512; + else if (gang_private_size_opt < gang_private_hwm) + gang_private_size_opt = gang_private_hwm; + else if (gang_private_size_opt >= acc_lds_size - 1024) + { + /* We need some space for reductions and worker broadcasting. If the + user requests a large amount of gang-private LDS space, we might not + have enough left for the former. Increase the LDS allocation in that + case, although this may reduce the maximum occupancy on the + hardware. */ + acc_lds_size = gang_private_size_opt + 1024; + if (acc_lds_size > 32768) + acc_lds_size = 32768; + } + /* The xnack option is a placeholder, for now. */ if (flag_xnack) sorry ("XNACK support"); @@ -3066,7 +3084,7 @@ gcn_expand_prologue () The low-part is the address of the topmost addressable byte, which is size-1. The high-part is an offset and should be zero. 
*/ emit_move_insn (gen_rtx_REG (SImode, M0_REG), - gen_int_mode (LDS_SIZE-1, SImode)); + gen_int_mode (LDS_SIZE, SImode)); emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG))); @@ -5161,6 +5179,28 @@ gcn_fixup_accel_lto_options (tree fndecl) } } +/* Implement TARGET_GOACC_SHARED_MEM_LAYOUT hook. */ + +static void +gcn_shared_mem_layout (unsigned HOST_WIDE_INT *lo, + unsigned HOST_WIDE_INT *hi, + int ARG_UNUSED (dims[GOMP_DIM_MAX]), + unsigned HOST_WIDE_INT + ARG_UNUSED (private_size[GOMP_DIM_MAX]), + unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX]) +{ + *lo = gang_private_size_opt + reduction_size[GOMP_DIM_WORKER]; + /* !!! We can maybe use dims[] to estimate the maximum number of work + groups/wavefronts/etc. we will launch, and therefore tune the maximum + amount of LDS we should use. For now, use a minimal amount to try to + maximise occupancy. */ + *hi = acc_lds_size; + machine_function *machfun = cfun->machine; + machfun->reduction_base = gang_private_size_opt; + machfun->reduction_limit + = gang_private_size_opt + reduction_size[GOMP_DIM_WORKER]; +} + /* }}} */ /* {{{ ASM Output. */ @@ -5488,17 +5528,18 @@ gcn_section_type_flags (tree decl, const char *name, int reloc) /* Helper function for gcn_asm_output_symbol_ref. - FIXME: If we want to have propagation blocks allocated separately and - statically like this, it would be better done via symbol refs and the - assembler/linker. This is a temporary hack. */ + FIXME: This function is used to lay out gang-private variables in LDS + on a per-CU basis. + There may be cases in which gang-private variables in different compilation + units could clobber each other. In that case we should be relying on the + linker to lay out gang-private LDS space, but that doesn't appear to be + possible at present. 
*/ static void gcn_print_lds_decl (FILE *f, tree var) { int *offset; - machine_function *machfun = cfun->machine; - - if ((offset = machfun->lds_allocs->get (var))) + if ((offset = lds_allocs.get (var))) fprintf (f, "%u", (unsigned) *offset); else { @@ -5508,14 +5549,14 @@ gcn_print_lds_decl (FILE *f, tree var) if (size > align && size > 4 && align < 8) align = 8; - machfun->lds_allocated = ((machfun->lds_allocated + align - 1) - & ~(align - 1)); + gang_private_hwm = ((gang_private_hwm + align - 1) & ~(align - 1)); - machfun->lds_allocs->put (var, machfun->lds_allocated); - fprintf (f, "%u", machfun->lds_allocated); - machfun->lds_allocated += size; - if (machfun->lds_allocated > LDS_SIZE) - error ("local data-share memory exhausted"); + lds_allocs.put (var, gang_private_hwm); + fprintf (f, "%u", gang_private_hwm); + gang_private_hwm += size; + if (gang_private_hwm > gang_private_size_opt) + error ("gang-private data-share memory exhausted (increase with " + "%<-mgang-private-size=<number>%>)"); } } @@ -6515,6 +6556,8 @@ gcn_dwarf_register_span (rtx rtl) #define TARGET_GOACC_REDUCTION gcn_goacc_reduction #undef TARGET_GOACC_VALIDATE_DIMS #define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims +#undef TARGET_GOACC_SHARED_MEM_LAYOUT +#define TARGET_GOACC_SHARED_MEM_LAYOUT gcn_shared_mem_layout #undef TARGET_HARD_REGNO_MODE_OK #define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok #undef TARGET_HARD_REGNO_NREGS diff --git a/gcc/config/gcn/gcn.h b/gcc/config/gcn/gcn.h index 5822ec34aa7bcf32b4593602a8c6d4cb1129c973..b97ec482d852a3a931c9b233dd08368be0d9225b 100644 --- a/gcc/config/gcn/gcn.h +++ b/gcc/config/gcn/gcn.h @@ -576,10 +576,8 @@ struct GTY(()) machine_function HOST_WIDE_INT local_vars; HOST_WIDE_INT callee_saves; - unsigned lds_allocated; - hash_map<tree, int> *lds_allocs; - - vec<tree, va_gc> *reduc_decls; + unsigned HOST_WIDE_INT reduction_base; + unsigned HOST_WIDE_INT reduction_limit; bool use_flat_addressing; }; diff --git a/gcc/config/gcn/gcn.opt 
b/gcc/config/gcn/gcn.opt index 6faacca42bbb063a5d0358a1d7092f32eaf8573c..09cf191db016fd7e599fa77f69a5bb4cc051d3fe 100644 --- a/gcc/config/gcn/gcn.opt +++ b/gcc/config/gcn/gcn.opt @@ -68,6 +68,12 @@ mstack-size= Target RejectNegative Joined UInteger Var(stack_size_opt) Init(-1) -mstack-size=<number> Set the private segment size per wave-front, in bytes. +int gang_private_size_opt = -1 + +mgang-private-size= +Target RejectNegative Joined UInteger Var(gang_private_size_opt) Init(-1) +Amount of local data-share (LDS) memory to reserve for gang-private variables. + Wopenacc-dims Target Var(warn_openacc_dims) Warning Warn about invalid OpenACC dimensions. diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index be8148583d8571b0d035b1938db9d056bfd213a8..902402d75032fdce2cfeaa23a0c82da98d2d47b3 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -6415,7 +6415,7 @@ private variables at OpenACC device-lowering time using the @code{TARGET_GOACC_ADJUST_PRIVATE_DECL} target hook. @end deftypefn -@deftypefn {Target Hook} tree TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD (tree @var{rec}, bool @var{sender}, const char *@var{name}) +@deftypefn {Target Hook} tree TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD (tree @var{rec}, bool @var{sender}, const char *@var{name}, unsigned HOST_WIDE_INT @var{offset}) Create a record used to propagate local-variable state from an active worker to other workers. A possible implementation might adjust the type of REC to place the new variable in shared GPU memory. @@ -6424,6 +6424,13 @@ Presence of this target hook indicates that middle end neutering/broadcasting be used. @end deftypefn +@deftypefn {Target Hook} void TARGET_GOACC_SHARED_MEM_LAYOUT (unsigned HOST_WIDE_INT *@var{}, unsigned HOST_WIDE_INT *@var{}, @var{int[]}, unsigned @var{HOST_WIDE_INT[]}, unsigned @var{HOST_WIDE_INT[]}) +Lay out a fixed shared-memory region on the target. The LO and HI +arguments should be set to a range of addresses that can be used for worker +broadcasting. 
The dimensions, reduction size and gang-private size +arguments are for the current offload region. +@end deftypefn + @node Anchored Addresses @section Anchored Addresses @cindex anchored addresses diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index d088eee4afecdbb5575b0f4f796ac344e4449155..86352dc9bd2146587a95b7561151b3f84db8021f 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4228,6 +4228,8 @@ address; but often a machine-dependent strategy can generate better code. @hook TARGET_GOACC_CREATE_WORKER_BROADCAST_RECORD +@hook TARGET_GOACC_SHARED_MEM_LAYOUT + @node Anchored Addresses @section Anchored Addresses @cindex anchored addresses diff --git a/gcc/omp-oacc-neuter-broadcast.cc b/gcc/omp-oacc-neuter-broadcast.cc index 3fe92248c4e2b278cdea217acc1727918fef9fa2..e0bd01311ee1245509edc6142241e509029d3162 100644 --- a/gcc/omp-oacc-neuter-broadcast.cc +++ b/gcc/omp-oacc-neuter-broadcast.cc @@ -53,6 +53,8 @@ #include "tree-cfg.h" #include "omp-offload.h" #include "attribs.h" +#include "targhooks.h" +#include "diagnostic-core.h" /* Loop structure of the function. The entire function is described as a NULL loop. 
*/ @@ -968,6 +970,8 @@ build_receiver_ref (tree var, tree receiver_decl, field_map_t *fields) static tree build_sender_ref (tree var, tree sender_decl, field_map_t *fields) { + if (POINTER_TYPE_P (TREE_TYPE (sender_decl))) + sender_decl = build_simple_mem_ref (sender_decl); tree field = *fields->get (var); return oacc_build_component_ref (sender_decl, field); } @@ -1005,7 +1009,9 @@ static void worker_single_copy (basic_block from, basic_block to, hash_set<tree> *def_escapes_block, hash_set<tree> *worker_partitioned_uses, - tree record_type, record_field_map_t *record_field_map) + tree record_type, record_field_map_t *record_field_map, + unsigned HOST_WIDE_INT placement, + bool isolate_broadcasts) { /* If we only have virtual defs, we'll have no record type, but we still want to emit single_copy_start and (particularly) single_copy_end to act as @@ -1016,10 +1022,12 @@ worker_single_copy (basic_block from, basic_block to, tree sender_decl = targetm.goacc.create_worker_broadcast_record (record_type, true, - ".oacc_worker_o"); + ".oacc_worker_o", + placement); tree receiver_decl = targetm.goacc.create_worker_broadcast_record (record_type, false, - ".oacc_worker_i"); + ".oacc_worker_i", + placement); gimple_stmt_iterator gsi = gsi_last_bb (to); if (EDGE_COUNT (to->succs) > 1) @@ -1033,12 +1041,23 @@ worker_single_copy (basic_block from, basic_block to, tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl))); - gimple *call = gimple_build_call (decl, 1, - build_fold_addr_expr (sender_decl)); + gimple *call + = gimple_build_call (decl, 1, + POINTER_TYPE_P (TREE_TYPE (sender_decl)) + ? sender_decl : build_fold_addr_expr (sender_decl)); gimple_call_set_lhs (call, lhs); gsi_insert_before (&start, call, GSI_NEW_STMT); update_stmt (call); + /* The shared-memory range for this block overflowed. Add a barrier before + the GOACC_single_copy_start call. 
*/ + if (isolate_broadcasts) + { + decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER); + gimple *acc_bar = gimple_build_call (decl, 0); + gsi_insert_before (&start, acc_bar, GSI_SAME_STMT); + } + tree conv_tmp = make_ssa_name (TREE_TYPE (receiver_decl)); gimple *conv = gimple_build_assign (conv_tmp, @@ -1206,13 +1225,26 @@ worker_single_copy (basic_block from, basic_block to, } } + /* The shared-memory range for this block overflowed. Add a barrier at the + end. */ + if (isolate_broadcasts) + { + gsi = gsi_start_bb (exit_block); + decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER); + gimple *acc_bar = gimple_build_call (decl, 0); + gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT); + } + /* It's possible for the ET->DEST block (the work done by the active thread) to finish with a control-flow insn, e.g. a UNIQUE function call. Split the block and add SENDER_SEQ in the latter part to avoid having control flow in the middle of a BB. */ decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_END); - call = gimple_build_call (decl, 1, build_fold_addr_expr (sender_decl)); + call = gimple_build_call (decl, 1, + POINTER_TYPE_P (TREE_TYPE (sender_decl)) + ? 
sender_decl + : build_fold_addr_expr (sender_decl)); gimple_seq_add_stmt (&sender_seq, call); gsi = gsi_last_bb (body); @@ -1222,12 +1254,16 @@ worker_single_copy (basic_block from, basic_block to, gsi_insert_seq_after (&gsi, sender_seq, GSI_CONTINUE_LINKING); } +typedef hash_map<basic_block, std::pair<unsigned HOST_WIDE_INT, bool> > + blk_offset_map_t; + static void neuter_worker_single (parallel_g *par, unsigned outer_mask, bitmap worker_single, bitmap vector_single, vec<propagation_set *> *prop_set, hash_set<tree> *partitioned_var_uses, - record_field_map_t *record_field_map) + record_field_map_t *record_field_map, + blk_offset_map_t *blk_offset_map) { unsigned mask = outer_mask | par->mask; @@ -1316,9 +1352,19 @@ neuter_worker_single (parallel_g *par, unsigned outer_mask, tree record_type = (tree) block->aux; if (has_defs) - worker_single_copy (block, block, &def_escapes_block, - &worker_partitioned_uses, record_type, - record_field_map); + { + std::pair<unsigned HOST_WIDE_INT, bool> *off_rngalloc + = blk_offset_map->get (block); + gcc_assert (!record_type || off_rngalloc); + unsigned HOST_WIDE_INT offset + = off_rngalloc ? off_rngalloc->first : 0; + bool range_allocated + = off_rngalloc ? 
off_rngalloc->second : true; + worker_single_copy (block, block, &def_escapes_block, + &worker_partitioned_uses, record_type, + record_field_map, + offset, !range_allocated); + } else worker_single_simple (block, block, &def_escapes_block); } @@ -1354,14 +1400,159 @@ neuter_worker_single (parallel_g *par, unsigned outer_mask, if (par->inner) neuter_worker_single (par->inner, mask, worker_single, vector_single, - prop_set, partitioned_var_uses, record_field_map); + prop_set, partitioned_var_uses, record_field_map, + blk_offset_map); if (par->next) neuter_worker_single (par->next, outer_mask, worker_single, vector_single, - prop_set, partitioned_var_uses, record_field_map); + prop_set, partitioned_var_uses, record_field_map, + blk_offset_map); +} + +static void +dfs_broadcast_reachable_1 (basic_block bb, sbitmap reachable) +{ + if (bb->flags & BB_VISITED) + return; + + bb->flags |= BB_VISITED; + + if (bb->succs) + { + edge e; + edge_iterator ei; + FOR_EACH_EDGE (e, ei, bb->succs) + { + basic_block dest = e->dest; + if (dest->aux) + bitmap_set_bit (reachable, dest->index); + else + dfs_broadcast_reachable_1 (dest, reachable); + } + } } +typedef std::pair<int, tree> idx_decl_pair_t; + +typedef auto_vec<splay_tree> used_range_vec_t; + static int -execute_omp_oacc_neuter_broadcast () +sort_size_descending (const void *a, const void *b) +{ + const idx_decl_pair_t *pa = (const idx_decl_pair_t *) a; + const idx_decl_pair_t *pb = (const idx_decl_pair_t *) b; + unsigned HOST_WIDE_INT asize = tree_to_uhwi (TYPE_SIZE_UNIT (pa->second)); + unsigned HOST_WIDE_INT bsize = tree_to_uhwi (TYPE_SIZE_UNIT (pb->second)); + return asize < bsize ? 1 : asize > bsize ? -1 : 0; +} + +class addr_range +{ +public: + addr_range (unsigned HOST_WIDE_INT addr_lo, unsigned HOST_WIDE_INT addr_hi) + : lo (addr_lo), hi (addr_hi) + { } + addr_range (const addr_range &ar) : lo (ar.lo), hi (ar.hi) + { } + addr_range () : lo (0), hi (0) + { } + + bool invalid () { return lo == 0 && hi == 0; } + + unsigned
HOST_WIDE_INT hi; +}; + +static int +splay_tree_compare_addr_range (splay_tree_key a, splay_tree_key b) +{ + addr_range *ar = (addr_range *) a; + addr_range *br = (addr_range *) b; + if (ar->lo == br->lo && ar->hi == br->hi) + return 0; + if (ar->hi <= br->lo) + return -1; + else if (ar->lo >= br->hi) + return 1; + return 0; +} + +static void +splay_tree_free_key (splay_tree_key k) +{ + addr_range *ar = (addr_range *) k; + delete ar; +} + +static addr_range +first_fit_range (splay_tree s, unsigned HOST_WIDE_INT size, + unsigned HOST_WIDE_INT align, addr_range *bounds) +{ + splay_tree_node min = splay_tree_min (s); + if (min) + { + splay_tree_node next; + while ((next = splay_tree_successor (s, min->key))) + { + unsigned HOST_WIDE_INT lo = ((addr_range *) min->key)->hi; + unsigned HOST_WIDE_INT hi = ((addr_range *) next->key)->lo; + unsigned HOST_WIDE_INT base = (lo + align - 1) & ~(align - 1); + if (base + size <= hi) + return addr_range (base, base + size); + min = next; + } + + unsigned HOST_WIDE_INT base = ((addr_range *)min->key)->hi; + base = (base + align - 1) & ~(align - 1); + if (base + size <= bounds->hi) + return addr_range (base, base + size); + else + return addr_range (); + } + else + { + unsigned HOST_WIDE_INT lo = bounds->lo; + lo = (lo + align - 1) & ~(align - 1); + if (lo + size <= bounds->hi) + return addr_range (lo, lo + size); + else + return addr_range (); + } +} + +static int +merge_ranges_1 (splay_tree_node n, void *ptr) +{ + splay_tree accum = (splay_tree) ptr; + addr_range ar = *(addr_range *) n->key; + + splay_tree_node old = splay_tree_lookup (accum, n->key); + + /* We might have an overlap. Create a new range covering the + overlapping parts. 
*/ + if (old) + { + addr_range *old_ar = (addr_range *) old->key; + ar.lo = MIN (old_ar->lo, ar.lo); + ar.hi = MAX (old_ar->hi, ar.hi); + splay_tree_remove (accum, old->key); + } + + addr_range *new_ar = new addr_range (ar); + + splay_tree_insert (accum, (splay_tree_key) new_ar, n->value); + + return 0; +} + +static void +merge_ranges (splay_tree accum, splay_tree sp) +{ + splay_tree_foreach (sp, merge_ranges_1, (void *) accum); +} + +static void +oacc_do_neutering (unsigned HOST_WIDE_INT bounds_lo, + unsigned HOST_WIDE_INT bounds_hi) { bb_stmt_map_t bb_stmt_map; auto_bitmap worker_single, vector_single; @@ -1450,8 +1641,123 @@ execute_omp_oacc_neuter_broadcast () } } + sbitmap *reachable + = sbitmap_vector_alloc (last_basic_block_for_fn (cfun), + last_basic_block_for_fn (cfun)); + + bitmap_vector_clear (reachable, last_basic_block_for_fn (cfun)); + + auto_vec<std::pair<int, tree> > priority; + + FOR_ALL_BB_FN (bb, cfun) + { + if (bb->aux) + { + tree record_type = (tree) bb->aux; + + basic_block bb2; + FOR_ALL_BB_FN (bb2, cfun) + bb2->flags &= ~BB_VISITED; + + priority.safe_push (std::make_pair (bb->index, record_type)); + dfs_broadcast_reachable_1 (bb, reachable[bb->index]); + } + } + + sbitmap *inverted + = sbitmap_vector_alloc (last_basic_block_for_fn (cfun), + last_basic_block_for_fn (cfun)); + + bitmap_vector_clear (inverted, last_basic_block_for_fn (cfun)); + + for (int i = 0; i < last_basic_block_for_fn (cfun); i++) + { + sbitmap_iterator bi; + unsigned int j; + EXECUTE_IF_SET_IN_BITMAP (reachable[i], 0, j, bi) + bitmap_set_bit (inverted[j], i); + } + + for (int i = 0; i < last_basic_block_for_fn (cfun); i++) + bitmap_ior (reachable[i], reachable[i], inverted[i]); + + sbitmap_vector_free (inverted); + + used_range_vec_t used_ranges; + + used_ranges.safe_grow_cleared (last_basic_block_for_fn (cfun)); + + blk_offset_map_t blk_offset_map; + + addr_range worker_shm_bounds (bounds_lo, bounds_hi); + + priority.qsort (sort_size_descending); + for (unsigned int i = 
0; i < priority.length (); i++) + { + idx_decl_pair_t p = priority[i]; + int blkno = p.first; + tree record_type = p.second; + HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (record_type)); + HOST_WIDE_INT align = TYPE_ALIGN_UNIT (record_type); + + splay_tree conflicts = splay_tree_new (splay_tree_compare_addr_range, + splay_tree_free_key, NULL); + + if (!used_ranges[blkno]) + used_ranges[blkno] = splay_tree_new (splay_tree_compare_addr_range, + splay_tree_free_key, NULL); + else + merge_ranges (conflicts, used_ranges[blkno]); + + sbitmap_iterator bi; + unsigned int j; + EXECUTE_IF_SET_IN_BITMAP (reachable[blkno], 0, j, bi) + if (used_ranges[j]) + merge_ranges (conflicts, used_ranges[j]); + + addr_range ar + = first_fit_range (conflicts, size, align, &worker_shm_bounds); + + splay_tree_delete (conflicts); + + if (ar.invalid ()) + { + unsigned HOST_WIDE_INT base; + base = bounds_lo + random () % 512; + base = (base + align - 1) & ~(align - 1); + if (base + size > bounds_hi) + error_at (UNKNOWN_LOCATION, "shared-memory region overflow"); + std::pair<unsigned HOST_WIDE_INT, bool> base_inrng + = std::make_pair (base, false); + blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng); + } + else + { + splay_tree_node old = splay_tree_lookup (used_ranges[blkno], + (splay_tree_key) &ar); + if (old) + { + fprintf (stderr, "trying to map [%d..%d] but [%d..%d] is " + "already mapped in block %d\n", (int) ar.lo, + (int) ar.hi, (int) ((addr_range *) old->key)->lo, + (int) ((addr_range *) old->key)->hi, blkno); + abort (); + } + + addr_range *arp = new addr_range (ar); + splay_tree_insert (used_ranges[blkno], (splay_tree_key) arp, + (splay_tree_value) blkno); + std::pair<unsigned HOST_WIDE_INT, bool> base_inrng + = std::make_pair (ar.lo, true); + blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng); + } + } + + sbitmap_vector_free (reachable); + neuter_worker_single (par, mask, worker_single, vector_single, &prop_set, - &partitioned_var_uses, 
&record_field_map); + &partitioned_var_uses, &record_field_map, + &blk_offset_map); for (auto it : record_field_map) delete it.second; @@ -1478,6 +1784,107 @@ execute_omp_oacc_neuter_broadcast () fprintf (dump_file, "\n\nAfter neutering:\n\n"); dump_function_to_file (current_function_decl, dump_file, dump_flags); } +} + +static int +execute_omp_oacc_neuter_broadcast () +{ + unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX]; + unsigned HOST_WIDE_INT private_size[GOMP_DIM_MAX]; + + for (unsigned i = 0; i < GOMP_DIM_MAX; i++) + { + reduction_size[i] = 0; + private_size[i] = 0; + } + + /* Calculate shared memory size required for reduction variables and + gang-private memory for this offloaded function. */ + basic_block bb; + FOR_ALL_BB_FN (bb, cfun) + { + for (gimple_stmt_iterator gsi = gsi_start_bb (bb); + !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gimple *stmt = gsi_stmt (gsi); + if (!is_gimple_call (stmt)) + continue; + gcall *call = as_a <gcall *> (stmt); + if (!gimple_call_internal_p (call)) + continue; + enum internal_fn ifn_code = gimple_call_internal_fn (call); + switch (ifn_code) + { + default: break; + case IFN_GOACC_REDUCTION: + if (integer_minus_onep (gimple_call_arg (call, 3))) + continue; + else + { + unsigned code = TREE_INT_CST_LOW (gimple_call_arg (call, 0)); + /* Only count reduction variables once: the choice to pick + the setup call is fairly arbitrary. 
*/ + if (code == IFN_GOACC_REDUCTION_SETUP) + { + int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3)); + tree var = gimple_call_arg (call, 2); + tree offset = gimple_call_arg (call, 5); + tree var_type = TREE_TYPE (var); + unsigned HOST_WIDE_INT limit + = (tree_to_uhwi (offset) + + tree_to_uhwi (TYPE_SIZE_UNIT (var_type))); + reduction_size[level] + = MAX (reduction_size[level], limit); + } + } + break; + case IFN_UNIQUE: + { + enum ifn_unique_kind kind + = ((enum ifn_unique_kind) + TREE_INT_CST_LOW (gimple_call_arg (call, 0))); + + if (kind == IFN_UNIQUE_OACC_PRIVATE) + { + HOST_WIDE_INT level + = TREE_INT_CST_LOW (gimple_call_arg (call, 2)); + if (level == -1) + break; + for (unsigned i = 3; + i < gimple_call_num_args (call); + i++) + { + tree arg = gimple_call_arg (call, i); + gcc_assert (TREE_CODE (arg) == ADDR_EXPR); + tree decl = TREE_OPERAND (arg, 0); + unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (decl); + private_size[level] = ((private_size[level] + align - 1) + & ~(align - 1)); + unsigned HOST_WIDE_INT decl_size + = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (decl))); + private_size[level] += decl_size; + } + } + } + break; + } + } + } + + int dims[GOMP_DIM_MAX]; + for (unsigned i = 0; i < GOMP_DIM_MAX; i++) + dims[i] = oacc_get_fn_dim_size (current_function_decl, i); + + /* Find bounds of shared-memory buffer space we can use. */ + unsigned HOST_WIDE_INT bounds_lo = 0, bounds_hi = 0; + if (targetm.goacc.shared_mem_layout) + targetm.goacc.shared_mem_layout (&bounds_lo, &bounds_hi, dims, + private_size, reduction_size); + + /* Perform worker partitioning unless we know 'num_workers(1)'. */ + if (dims[GOMP_DIM_WORKER] != 1) + oacc_do_neutering (bounds_lo, bounds_hi); return 0; } @@ -1518,12 +1925,6 @@ public: if (!attr) return false; - /* Not relevant for 'num_workers(1)'. 
*/ - int worker_dim - = oacc_get_fn_dim_size (fun->decl, GOMP_DIM_WORKER); - if (worker_dim == 1) - return false; - return true; } diff --git a/gcc/target.def b/gcc/target.def index bfa819609c21bd71c0cc585c01dba42534453f47..c5d90cace80d75ef17cd5eddf657db8334f786de 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -1764,7 +1764,17 @@ of REC to place the new variable in shared GPU memory.\n\ \n\ Presence of this target hook indicates that middle end neutering/broadcasting\n\ be used.", -tree, (tree rec, bool sender, const char *name), +tree, (tree rec, bool sender, const char *name, unsigned HOST_WIDE_INT offset), +NULL) + +DEFHOOK +(shared_mem_layout, +"Lay out a fixed shared-memory region on the target. The LO and HI\n\ +arguments should be set to a range of addresses that can be used for worker\n\ +broadcasting. The dimensions, reduction size and gang-private size\n\ +arguments are for the current offload region.", +void, (unsigned HOST_WIDE_INT *, unsigned HOST_WIDE_INT *, int[], + unsigned HOST_WIDE_INT[], unsigned HOST_WIDE_INT[]), NULL) HOOK_VECTOR_END (goacc) diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-many.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-many.c index d763a754a11f6f795d66c69353e67addde25394f..37839edfb0938fd66a0ef7cf3d8a17ad70cc694c 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-many.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/broadcast-many.c @@ -1,3 +1,7 @@ +/* To avoid 'error: shared-memory region overflow': + { dg-additional-options "-foffload-options=amdgcn-amdhsa=-mgang-private-size=64" { target openacc_radeon_accel_selected } } +*/ + #include <assert.h> #include <stdio.h>