@@ -58,6 +58,7 @@ along with GCC; see the file COPYING3. If not see
#include "graphite.h"
#include "graphite-oacc.h"
#include "stdlib.h"
+#include "internal-fn.h"
struct ast_build_info
{
@@ -1698,6 +1699,127 @@ graphite_oacc_analyze_scop (scop_p scop)
print_isl_schedule (dump_file, scop->original_schedule);
}
+ if (flag_graphite_runtime_alias_checks
+ && scop->unhandled_alias_ddrs.length () > 0)
+ {
+ sese_info_p region = scop->scop_info;
+
+ /* Usually there will be a chunking loop with the actual work loop
+ inside it. In some corner cases there may only be one loop. */
+ loop_p top_loop = region->region.entry->dest->loop_father;
+ loop_p active_loop = top_loop->inner ? top_loop->inner : top_loop;
+ tree cond = generate_alias_cond (scop->unhandled_alias_ddrs, active_loop);
+
+ /* Walk back to GOACC_LOOP block. */
+ basic_block goacc_loop_block = region->region.entry->src;
+
+ /* Find the GOACC_LOOP calls. If there aren't any then this is not an
+ OpenACC kernels loop and will need different handling. */
+ gimple_stmt_iterator gsitop = gsi_start_bb (goacc_loop_block);
+ while (!gsi_end_p (gsitop)
+ && (!is_gimple_call (gsi_stmt (gsitop))
+ || !gimple_call_internal_p (gsi_stmt (gsitop))
+ || (gimple_call_internal_fn (gsi_stmt (gsitop))
+ != IFN_GOACC_LOOP)))
+ gsi_next (&gsitop);
+
+ if (!gsi_end_p (gsitop))
+ {
+ /* Move the GOACC_LOOP CHUNK and STEP calls to after any hoisted
+ statements. There ought not be any problematic dependencies because
+ the chunk size and step are only computed for very specific purposes.
+ They may not be at the very top of the block, but they should be
+	 found together (the asserts test this assumption). */
+ gimple_stmt_iterator gsibottom = gsi_last_bb (goacc_loop_block);
+ gsi_move_after (&gsitop, &gsibottom);
+ gimple_stmt_iterator gsiinsert = gsibottom;
+ gcc_checking_assert (is_gimple_call (gsi_stmt (gsitop))
+ && gimple_call_internal_p (gsi_stmt (gsitop))
+ && (gimple_call_internal_fn (gsi_stmt (gsitop))
+ == IFN_GOACC_LOOP));
+ gsi_move_after (&gsitop, &gsibottom);
+
+ /* Insert "noalias_p = COND" before the GOACC_LOOP statements.
+ Note that these likely depend on some of the hoisted statements. */
+ tree cond_val = force_gimple_operand_gsi (&gsiinsert, cond, true, NULL,
+ true, GSI_NEW_STMT);
+
+ /* Insert the cond_val into each GOACC_LOOP call in the region. */
+ for (int n = -1; n < (int)region->bbs.length (); n++)
+ {
+ /* Cover the region plus goacc_loop_block. */
+ basic_block bb = n < 0 ? goacc_loop_block : region->bbs[n];
+
+ for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
+ !gsi_end_p (gsi);
+ gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+ if (!is_gimple_call (stmt)
+ || !gimple_call_internal_p (stmt))
+ continue;
+
+ gcall *goacc_call = as_a <gcall*> (stmt);
+ if (gimple_call_internal_fn (goacc_call) != IFN_GOACC_LOOP)
+ continue;
+
+ enum ifn_goacc_loop_kind code = (enum ifn_goacc_loop_kind)
+ TREE_INT_CST_LOW (gimple_call_arg (goacc_call, 0));
+ int argno = 0;
+ switch (code)
+ {
+ case IFN_GOACC_LOOP_CHUNKS:
+ case IFN_GOACC_LOOP_STEP:
+ argno = 6;
+ break;
+
+ case IFN_GOACC_LOOP_OFFSET:
+ case IFN_GOACC_LOOP_BOUND:
+ argno = 7;
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ gimple_call_set_arg (goacc_call, argno, cond_val);
+ update_stmt (goacc_call);
+
+ if (dump_enabled_p () && dump_flags & TDF_DETAILS)
+ dump_printf (MSG_NOTE,
+ "Runtime alias condition applied to: %G",
+ goacc_call);
+ }
+ }
+ }
+ else
+ {
+	  /* There weren't any GOACC_LOOP calls where we expected to find them,
+ therefore this isn't an OpenACC parallel loop. If it runs
+ sequentially then there's no need to worry about aliasing, so
+ nothing much to do here. */
+ if (dump_enabled_p ())
+	    dump_printf (MSG_NOTE, "Runtime alias check *not* inserted"
+			 " (GOACC_LOOP not found)");
+
+ /* Unset can_be_parallel, in case something else might use it. */
+ for (unsigned int i = 0; i < region->bbs.length (); i++)
+ if (region->bbs[i]->loop_father)
+ region->bbs[i]->loop_father->can_be_parallel = 0;
+ }
+
+ /* The loop-nest vec is shared by all DDRs. */
+ DDR_LOOP_NEST (scop->unhandled_alias_ddrs[0]).release ();
+
+ unsigned int i;
+ struct data_dependence_relation *ddr;
+
+ FOR_EACH_VEC_ELT (scop->unhandled_alias_ddrs, i, ddr)
+ if (ddr)
+ free_dependence_relation (ddr);
+ scop->unhandled_alias_ddrs.truncate (0);
+ }
+
/* Analyze dependences in SCoP and mark loops as parallelizable accordingly. */
isl_schedule_foreach_schedule_node_top_down (
scop->original_schedule, visit_schedule_loop_node, scop->dependence);
@@ -1679,7 +1679,7 @@ dr_defs_outside_region (const sese_l ®ion, data_reference_p dr)
break;
}
- return opt_result::success ();
+ return res;
}
/* Check that all constituents of DR that are used by the
@@ -1691,21 +1691,23 @@ dr_well_analyzed_for_runtime_alias_check_p (data_reference_p dr)
static const char* error =
"data-reference not well-analyzed for runtime check.";
gimple* stmt = DR_STMT (dr);
+ opt_result res = opt_result::success ();
if (! DR_BASE_ADDRESS (dr))
- return opt_result::failure_at (stmt, "%s no base address.\n", error);
+ res = opt_result::failure_at (stmt, "%s no base address.\n", error);
else if (! DR_OFFSET (dr))
- return opt_result::failure_at (stmt, "%s no offset.\n", error);
+ res = opt_result::failure_at (stmt, "%s no offset.\n", error);
else if (! DR_INIT (dr))
- return opt_result::failure_at (stmt, "%s no init.\n", error);
+ res = opt_result::failure_at (stmt, "%s no init.\n", error);
else if (! DR_STEP (dr))
- return opt_result::failure_at (stmt, "%s no step.\n", error);
+ res = opt_result::failure_at (stmt, "%s no step.\n", error);
else if (! tree_fits_uhwi_p (DR_STEP (dr)))
- return opt_result::failure_at (stmt, "%s step too large.\n", error);
+ res = opt_result::failure_at (stmt, "%s step too large.\n", error);
- DEBUG_PRINT (dump_data_reference (dump_file, dr));
+ if (!res)
+ DEBUG_PRINT (dump_data_reference (dump_file, dr));
- return opt_result::success ();
+ return res;
}
/* Return TRUE if it is possible to create a runtime alias check for
@@ -7762,10 +7762,11 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
ass = gimple_build_assign (chunk_no, expr);
gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 6,
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
build_int_cst (integer_type_node,
IFN_GOACC_LOOP_CHUNKS),
- dir, range, s, chunk_size, gwv);
+ dir, range, s, chunk_size, gwv,
+ integer_one_node);
gimple_call_set_lhs (call, chunk_max);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
@@ -7773,10 +7774,11 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
else
chunk_size = chunk_no;
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 6,
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
build_int_cst (integer_type_node,
IFN_GOACC_LOOP_STEP),
- dir, range, s, chunk_size, gwv);
+ dir, range, s, chunk_size, gwv,
+ integer_one_node);
gimple_call_set_lhs (call, step);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
@@ -7810,20 +7812,20 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
/* Loop offset & bound go into head_bb. */
gsi = gsi_start_bb (head_bb);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 8,
build_int_cst (integer_type_node,
IFN_GOACC_LOOP_OFFSET),
- dir, range, s,
- chunk_size, gwv, chunk_no);
+ dir, range, s, chunk_size, gwv, chunk_no,
+ integer_one_node);
gimple_call_set_lhs (call, offset_init);
gimple_set_location (call, loc);
gsi_insert_after (&gsi, call, GSI_CONTINUE_LINKING);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 8,
build_int_cst (integer_type_node,
IFN_GOACC_LOOP_BOUND),
- dir, range, s,
- chunk_size, gwv, offset_init);
+ dir, range, s, chunk_size, gwv,
+ offset_init, integer_one_node);
gimple_call_set_lhs (call, bound);
gimple_set_location (call, loc);
gsi_insert_after (&gsi, call, GSI_CONTINUE_LINKING);
@@ -7873,22 +7875,25 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
tree chunk = build_int_cst (diff_type, 0); /* Never chunked. */
t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_OFFSET);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range,
- element_s, chunk, e_gwv, chunk);
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 8, t, dir, e_range,
+ element_s, chunk, e_gwv, chunk,
+ integer_one_node);
gimple_call_set_lhs (call, e_offset);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_BOUND);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range,
- element_s, chunk, e_gwv, e_offset);
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 8, t, dir, e_range,
+ element_s, chunk, e_gwv, e_offset,
+ integer_one_node);
gimple_call_set_lhs (call, e_bound);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_STEP);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 6, t, dir, e_range,
- element_s, chunk, e_gwv);
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range,
+ element_s, chunk, e_gwv,
+ integer_one_node);
gimple_call_set_lhs (call, e_step);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
@@ -584,6 +584,7 @@ oacc_xform_loop (gcall *call)
unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
tree vf_by_vectorizer = NULL_TREE;
+ tree noalias = NULL_TREE;
/* Skip lowering if return value of IFN_GOACC_LOOP call is not used. */
if (!lhs)
@@ -648,202 +649,244 @@ oacc_xform_loop (gcall *call)
switch (code)
{
- default: gcc_unreachable ();
+ default:
+ gcc_unreachable ();
case IFN_GOACC_LOOP_CHUNKS:
+ noalias = gimple_call_arg (call, 6);
if (!chunking)
- r = build_int_cst (type, 1);
+ r = build_int_cst (type, 1);
else
- {
- /* chunk_max
- = (range - dir) / (chunks * step * num_threads) + dir */
- tree per = oacc_thread_numbers (false, mask, &seq);
- per = fold_convert (type, per);
- chunk_size = fold_convert (type, chunk_size);
- per = fold_build2 (MULT_EXPR, type, per, chunk_size);
- per = fold_build2 (MULT_EXPR, type, per, step);
- r = fold_build2 (MINUS_EXPR, type, range, dir);
- r = fold_build2 (PLUS_EXPR, type, r, per);
- r = build2 (TRUNC_DIV_EXPR, type, r, per);
- }
+ {
+ /* chunk_max
+ = (range - dir) / (chunks * step * num_threads) + dir */
+ tree per = oacc_thread_numbers (false, mask, &seq);
+ per = fold_convert (type, per);
+ noalias = fold_convert (type, noalias);
+ per = fold_build2 (MULT_EXPR, type, per, noalias);
+ per = fold_build2 (MAX_EXPR, type, per, fold_convert (type, integer_one_node));
+ chunk_size = fold_convert (type, chunk_size);
+ per = fold_build2 (MULT_EXPR, type, per, chunk_size);
+ per = fold_build2 (MULT_EXPR, type, per, step);
+ r = fold_build2 (MINUS_EXPR, type, range, dir);
+ r = fold_build2 (PLUS_EXPR, type, r, per);
+ r = build2 (TRUNC_DIV_EXPR, type, r, per);
+ }
break;
case IFN_GOACC_LOOP_STEP:
+ noalias = gimple_call_arg (call, 6);
{
- if (vf_by_vectorizer)
- r = step;
- else
- {
- /* If striding, step by the entire compute volume, otherwise
- step by the inner volume. */
- unsigned volume = striding ? mask : inner_mask;
-
- r = oacc_thread_numbers (false, volume, &seq);
- r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
- }
+ if (vf_by_vectorizer)
+ r = step;
+ else
+ {
+ /* If striding, step by the entire compute volume, otherwise
+ step by the inner volume. */
+ unsigned volume = striding ? mask : inner_mask;
+
+ noalias = fold_convert (type, noalias);
+ r = oacc_thread_numbers (false, volume, &seq);
+ r = fold_convert (type, r);
+ r = build2 (MULT_EXPR, type, r, noalias);
+	    r = build2 (MAX_EXPR, type, r, fold_convert (type, integer_one_node));
+ r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
+ }
+ break;
}
- break;
-
- case IFN_GOACC_LOOP_OFFSET:
- if (vf_by_vectorizer)
- {
- /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
- the loop. */
- if (flag_tree_loop_vectorize
- || !global_options_set.x_flag_tree_loop_vectorize)
- {
- /* Enable vectorization on non-SIMT targets. */
- basic_block bb = gsi_bb (gsi);
- class loop *chunk_loop = bb->loop_father;
- class loop *inner_loop = chunk_loop->inner;
-
- /* Chunking isn't supported for VF_BY_VECTORIZER loops yet,
- so we know that the outer chunking loop will be executed just
- once and the inner loop is the one which must be
- vectorized (unless it has been optimized out for some
- reason). */
- gcc_assert (!chunking);
-
- if (inner_loop)
- {
- inner_loop->force_vectorize = true;
- inner_loop->safelen = INT_MAX;
-
- cfun->has_force_vectorize_loops = true;
- }
- }
- /* ...and expand the abstract loops such that the vectorizer can
- work on them more effectively.
-
- It might be nicer to merge this code with the "!striding" case
- below, particularly if chunking support is added. */
- tree warppos
- = oacc_thread_numbers (true, mask, vf_by_vectorizer, &seq);
- warppos = fold_convert (diff_type, warppos);
-
- tree volume
- = oacc_thread_numbers (false, mask, vf_by_vectorizer, &seq);
- volume = fold_convert (diff_type, volume);
-
- tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
- chunk_size = fold_build2 (PLUS_EXPR, diff_type, range, per);
- chunk_size = fold_build2 (MINUS_EXPR, diff_type, chunk_size, dir);
- chunk_size = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size,
- per);
-
- warppos = fold_build2 (MULT_EXPR, diff_type, warppos, chunk_size);
-
- tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
- chunk = fold_build2 (MULT_EXPR, diff_type, chunk, volume);
- r = fold_build2 (PLUS_EXPR, diff_type, chunk, warppos);
- }
- else if (striding)
- {
- r = oacc_thread_numbers (true, mask, &seq);
- r = fold_convert (diff_type, r);
- }
- else
- {
- tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
- tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
- tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
- inner_size, outer_size);
-
- volume = fold_convert (diff_type, volume);
- if (chunking)
- chunk_size = fold_convert (diff_type, chunk_size);
- else
- {
- tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
- /* chunk_size = (range + per - 1) / per. */
- chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
- chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
- chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
- }
-
- tree span = build2 (MULT_EXPR, diff_type, chunk_size,
- fold_convert (diff_type, inner_size));
- r = oacc_thread_numbers (true, outer_mask, &seq);
- r = fold_convert (diff_type, r);
- r = build2 (MULT_EXPR, diff_type, r, span);
-
- tree inner = oacc_thread_numbers (true, inner_mask, &seq);
- inner = fold_convert (diff_type, inner);
- r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
-
- if (chunking)
- {
- tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
- tree per
- = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
- per = build2 (MULT_EXPR, diff_type, per, chunk);
-
- r = build2 (PLUS_EXPR, diff_type, r, per);
- }
- }
- r = fold_build2 (MULT_EXPR, diff_type, r, step);
- if (type != diff_type)
- r = fold_convert (type, r);
- break;
-
- case IFN_GOACC_LOOP_BOUND:
- if (vf_by_vectorizer)
- {
- tree volume
- = oacc_thread_numbers (false, mask, vf_by_vectorizer, &seq);
- volume = fold_convert (diff_type, volume);
-
- tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
- chunk_size = fold_build2 (PLUS_EXPR, diff_type, range, per);
- chunk_size = fold_build2 (MINUS_EXPR, diff_type, chunk_size, dir);
- chunk_size = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size,
- per);
-
- vf_by_vectorizer = fold_convert (diff_type, vf_by_vectorizer);
- tree vecsize = fold_build2 (MULT_EXPR, diff_type, chunk_size,
- vf_by_vectorizer);
- vecsize = fold_build2 (MULT_EXPR, diff_type, vecsize, step);
- tree vecend = fold_convert (diff_type, gimple_call_arg (call, 6));
- vecend = fold_build2 (PLUS_EXPR, diff_type, vecend, vecsize);
- r = fold_build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR, diff_type,
- range, vecend);
- }
- else if (striding)
- r = range;
- else
- {
- tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
- tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
- tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
- inner_size, outer_size);
-
- volume = fold_convert (diff_type, volume);
- if (chunking)
- chunk_size = fold_convert (diff_type, chunk_size);
- else
- {
- tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
- /* chunk_size = (range + per - 1) / per. */
- chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
- chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
- chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
- }
-
- tree span = build2 (MULT_EXPR, diff_type, chunk_size,
- fold_convert (diff_type, inner_size));
-
- r = fold_build2 (MULT_EXPR, diff_type, span, step);
+ case IFN_GOACC_LOOP_OFFSET:
+ noalias = gimple_call_arg (call, 7);
+ if (vf_by_vectorizer)
+ {
+ /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
+ the loop. */
+ if (flag_tree_loop_vectorize
+ || !global_options_set.x_flag_tree_loop_vectorize)
+ {
+ /* Enable vectorization on non-SIMT targets. */
+ basic_block bb = gsi_bb (gsi);
+ class loop *chunk_loop = bb->loop_father;
+ class loop *inner_loop = chunk_loop->inner;
+
+ /* Chunking isn't supported for VF_BY_VECTORIZER loops yet,
+ so we know that the outer chunking loop will be executed
+ just once and the inner loop is the one which must be
+ vectorized (unless it has been optimized out for some
+ reason). */
+ gcc_assert (!chunking);
+
+ if (inner_loop)
+ {
+ inner_loop->force_vectorize = true;
+ inner_loop->safelen = INT_MAX;
+
+ cfun->has_force_vectorize_loops = true;
+ }
+ }
+
+ /* ...and expand the abstract loops such that the vectorizer can
+ work on them more effectively.
+
+ It might be nicer to merge this code with the "!striding" case
+ below, particularly if chunking support is added. */
+ tree warppos
+ = oacc_thread_numbers (true, mask, vf_by_vectorizer, &seq);
+ warppos = fold_convert (diff_type, warppos);
+
+ tree volume
+ = oacc_thread_numbers (false, mask, vf_by_vectorizer, &seq);
+ volume = fold_convert (diff_type, volume);
+
+ tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+ chunk_size = fold_build2 (PLUS_EXPR, diff_type, range, per);
+ chunk_size = fold_build2 (MINUS_EXPR, diff_type, chunk_size, dir);
+ chunk_size
+ = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
+
+ warppos = fold_build2 (MULT_EXPR, diff_type, warppos, chunk_size);
+
+ tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
+ chunk = fold_build2 (MULT_EXPR, diff_type, chunk, volume);
+ r = fold_build2 (PLUS_EXPR, diff_type, chunk, warppos);
+ }
+ else if (striding)
+ {
+ r = oacc_thread_numbers (true, mask, &seq);
+ r = fold_convert (diff_type, r);
+ tree tmp1 = build2 (NE_EXPR, boolean_type_node, r,
+ fold_convert (diff_type, integer_zero_node));
+ tree tmp2 = build2 (EQ_EXPR, boolean_type_node, noalias,
+ boolean_false_node);
+ tree tmp3 = build2 (BIT_AND_EXPR, diff_type,
+ fold_convert (diff_type, tmp1),
+ fold_convert (diff_type, tmp2));
+ tree tmp4 = build2 (MULT_EXPR, diff_type, tmp3, range);
+ r = build2 (PLUS_EXPR, diff_type, r, tmp4);
+ }
+ else
+ {
+ tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
+ tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
+ tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
+ inner_size, outer_size);
+
+ volume = fold_convert (diff_type, volume);
+ if (chunking)
+ chunk_size = fold_convert (diff_type, chunk_size);
+ else
+ {
+ tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+ /* chunk_size = (range + per - 1) / per. */
+ chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
+ chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
+ chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
+ }
+
+ /* Curtail the range in all but one thread when there may be
+ aliasing to prevent parallelization. */
+ tree n = oacc_thread_numbers (true, mask, &seq);
+ n = fold_convert (diff_type, n);
+ tree tmp1 = build2 (NE_EXPR, boolean_type_node, n,
+ fold_convert (diff_type, integer_zero_node));
+ tree tmp2 = build2 (EQ_EXPR, boolean_type_node, noalias,
+ boolean_false_node);
+ tree tmp3 = build2 (BIT_AND_EXPR, diff_type,
+ fold_convert (diff_type, tmp1),
+ fold_convert (diff_type, tmp2));
+ range = build2 (MULT_EXPR, diff_type, tmp3, range);
+
+ tree span = build2 (MULT_EXPR, diff_type, chunk_size,
+ fold_convert (diff_type, inner_size));
+ r = oacc_thread_numbers (true, outer_mask, &seq);
+ r = fold_convert (diff_type, r);
+ r = build2 (PLUS_EXPR, diff_type, r, range);
+ r = build2 (MULT_EXPR, diff_type, r, span);
+
+ tree inner = oacc_thread_numbers (true, inner_mask, &seq);
+
+ inner = fold_convert (diff_type, inner);
+ r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
+
+ if (chunking)
+ {
+ tree chunk
+ = fold_convert (diff_type, gimple_call_arg (call, 6));
+ tree per
+ = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
+ per = build2 (MULT_EXPR, diff_type, per, chunk);
+
+ r = build2 (PLUS_EXPR, diff_type, r, per);
+ }
+ }
+ r = fold_build2 (MULT_EXPR, diff_type, r, step);
+ if (type != diff_type)
+ r = fold_convert (type, r);
+ break;
- tree offset = gimple_call_arg (call, 6);
- r = build2 (PLUS_EXPR, diff_type, r,
- fold_convert (diff_type, offset));
- r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
- diff_type, r, range);
- }
- if (diff_type != type)
- r = fold_convert (type, r);
- break;
+ case IFN_GOACC_LOOP_BOUND:
+ if (vf_by_vectorizer)
+ {
+ tree volume
+ = oacc_thread_numbers (false, mask, vf_by_vectorizer, &seq);
+ volume = fold_convert (diff_type, volume);
+
+ tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+ chunk_size = fold_build2 (PLUS_EXPR, diff_type, range, per);
+ chunk_size = fold_build2 (MINUS_EXPR, diff_type, chunk_size, dir);
+ chunk_size
+ = fold_build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
+
+ vf_by_vectorizer = fold_convert (diff_type, vf_by_vectorizer);
+ tree vecsize = fold_build2 (MULT_EXPR, diff_type, chunk_size,
+ vf_by_vectorizer);
+ vecsize = fold_build2 (MULT_EXPR, diff_type, vecsize, step);
+ tree vecend = fold_convert (diff_type, gimple_call_arg (call, 6));
+ vecend = fold_build2 (PLUS_EXPR, diff_type, vecend, vecsize);
+ r = fold_build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
+ diff_type, range, vecend);
+ }
+ else if (striding)
+ r = range;
+ else
+ {
+ noalias = fold_convert (diff_type, gimple_call_arg (call, 7));
+
+ tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
+ tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
+ tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
+ inner_size, outer_size);
+
+ volume = fold_convert (diff_type, volume);
+ volume = fold_build2 (MULT_EXPR, diff_type, volume, noalias);
+ volume
+ = fold_build2 (MAX_EXPR, diff_type, volume, fold_convert (diff_type, integer_one_node));
+ if (chunking)
+ chunk_size = fold_convert (diff_type, chunk_size);
+ else
+ {
+ tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+ /* chunk_size = (range + per - 1) / per. */
+ chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
+ chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
+ chunk_size
+ = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
+ }
+
+ tree span = build2 (MULT_EXPR, diff_type, chunk_size,
+ fold_convert (diff_type, inner_size));
+
+ r = fold_build2 (MULT_EXPR, diff_type, span, step);
+
+ tree offset = gimple_call_arg (call, 6);
+ r = build2 (PLUS_EXPR, diff_type, r,
+ fold_convert (diff_type, offset));
+ r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR, diff_type, r,
+ range);
+ }
+ if (diff_type != type)
+ r = fold_convert (type, r);
+ break;
}
gimplify_assign (lhs, r, &seq);
new file mode 100644
@@ -0,0 +1,79 @@
+/* Test that a simple array copy does the right thing when the input and
+ output data overlap. The GPU kernel should automatically switch to
+ a sequential operation mode in order to give the expected results. */
+
+#include <stdlib.h>
+#include <openacc.h>
+
+void f(int *data, int n, int to, int from, int count)
+{
+ /* We cannot use copyin for two overlapping arrays because we get an error
+ that the memory is already present. We also cannot do the pointer
+ arithmetic inside the kernels region because it just ends up using
+ host pointers (bug?). Using enter data with a single array, and
+ acc_deviceptr solves the problem. */
+#pragma acc enter data copyin(data[0:n])
+
+ int *a = (int*)acc_deviceptr (data+to);
+ int *b = (int*)acc_deviceptr (data+from);
+
+#pragma acc kernels
+ for (int i = 0; i < count; i++)
+ a[i] = b[i];
+
+#pragma acc exit data copyout(data[0:n])
+}
+
+#define N 2000
+
+int data[N];
+
+int
+main ()
+{
+ for (int i=0; i < N; i++)
+ data[i] = i;
+
+ /* Baseline test; no aliasing. The high part of the data is copied to
+ the lower part. */
+ int to = 0;
+ int from = N/2;
+ int count = N/2;
+ f (data, N, to, from, count);
+ for (int i=0; i < N; i++)
+ if (data[i] != (i%count)+count)
+ exit (1);
+
+ /* Check various amounts of data overlap. */
+ int tests[] = {1, 10, N/4, N/2-10, N/2-1};
+ for (int t = 0; t < sizeof (tests)/sizeof(tests[0]); t++)
+ {
+ for (int i=0; i < N; i++)
+ data[i] = i;
+
+ /* Output overlaps the latter part of input; expect the initial no-aliased
+ part of the input to repeat throughout the aliased portion. */
+ to = tests[t];
+ from = 0;
+ count = N-tests[t];
+ f (data, N, to, from, count);
+ for (int i=0; i < N; i++)
+ if (data[i] != i%tests[t])
+ exit (2);
+
+ for (int i=0; i < N; i++)
+ data[i] = i;
+
+ /* Input overlaps the latter part of the output; expect the copy to work
+ in the obvious manner. */
+ to = 0;
+ from = tests[t];
+ count = N-tests[t];
+ f (data, N, to, from, count);
+ for (int i=0; i < count; i++)
+ if (data[i+to] != i+tests[t])
+ exit (3);
+ }
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,90 @@
+/* Test that a simple array copy does the right thing when the input and
+ output data overlap. The GPU kernel should automatically switch to
+ a sequential operation mode in order to give the expected results.
+
+ This test does not check the correctness of the output (there are other
+ tests for that), but checks that the code really does select the faster
+ path, when it can, by comparing the timing. */
+
+/* No optimization means no issue with aliasing.
+ { dg-skip-if "" { *-*-* } { "-O0" } { "" } }
+ { dg-skip-if "" { *-*-* } { "-foffload=disable" } { "" } } */
+
+#include <stdlib.h>
+#include <sys/time.h>
+#include <openacc.h>
+
+void f(int *data, int n, int to, int from, int count)
+{
+ int *a = (int*)acc_deviceptr (data+to);
+ int *b = (int*)acc_deviceptr (data+from);
+
+#pragma acc kernels
+ for (int i = 0; i < count; i++)
+ a[i] = b[i];
+}
+
+#define N 1000000
+int data[N];
+
+int
+main ()
+{
+ struct timeval start, stop, difference;
+ long basetime, aliastime;
+
+ for (int i=0; i < N; i++)
+ data[i] = i;
+
+ /* Ensure that the data copies are outside the timed zone. */
+#pragma acc enter data copyin(data[0:N])
+
+ /* Baseline test; no aliasing. The high part of the data is copied to
+ the lower part. */
+ int to = 0;
+ int from = N/2;
+ int count = N/2;
+ gettimeofday (&start, NULL);
+ f (data, N, to, from, count);
+ gettimeofday (&stop, NULL);
+ timersub (&stop, &start, &difference);
+ basetime = difference.tv_sec * 1000000 + difference.tv_usec;
+
+ /* Check various amounts of data overlap. */
+ int tests[] = {1, 10, N/4, N/2-10, N/2-1};
+ for (int i = 0; i < sizeof (tests)/sizeof(tests[0]); i++)
+ {
+ to = 0;
+ from = N/2 - tests[i];
+ gettimeofday (&start, NULL);
+ f (data, N, to, from, count);
+ gettimeofday (&stop, NULL);
+ timersub (&stop, &start, &difference);
+ aliastime = difference.tv_sec * 1000000 + difference.tv_usec;
+
+ /* If the aliased runtime is less than 200% of the non-aliased runtime
+ then the runtime alias check probably selected the wrong path.
+ (Actually we expect the difference to be far greater than that.) */
+ if (basetime*2 > aliastime)
+ exit (1);
+ }
+
+ /* Repeat the baseline check just to make sure it didn't also get slower
+ after the first run. */
+ to = 0;
+ from = N/2;
+ gettimeofday (&start, NULL);
+ f (data, N, to, from, count);
+ gettimeofday (&stop, NULL);
+ timersub (&stop, &start, &difference);
+ int controltime = difference.tv_sec * 1000000 + difference.tv_usec;
+
+ /* The two times should be roughly the same, but we just check it wouldn't
+ pass the aliastime test above. */
+ if (basetime*2 <= controltime)
+ exit (2);
+
+#pragma acc exit data copyout(data[0:N])
+
+ return 0;
+}