@@ -58,6 +58,7 @@ along with GCC; see the file COPYING3. If not see
#include "graphite.h"
#include "graphite-oacc.h"
#include "stdlib.h"
+#include "internal-fn.h"
struct ast_build_info
{
@@ -1697,6 +1698,127 @@ graphite_oacc_analyze_scop (scop_p scop)
print_isl_schedule (dump_file, scop->original_schedule);
}
+ if (flag_graphite_runtime_alias_checks
+ && scop->unhandled_alias_ddrs.length () > 0)
+ {
+ sese_info_p region = scop->scop_info;
+
+ /* Usually there will be a chunking loop with the actual work loop
+ inside it. In some corner cases there may only be one loop. */
+ loop_p top_loop = region->region.entry->dest->loop_father;
+ loop_p active_loop = top_loop->inner ? top_loop->inner : top_loop;
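+      /* COND evaluates to true at runtime when the accesses described by
+	 the unhandled DDRs cannot overlap, i.e. when parallel execution
+	 is safe.  */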
+ tree cond = generate_alias_cond (scop->unhandled_alias_ddrs, active_loop);
+
+      /* Walk back to the block containing the GOACC_LOOP calls.  */
+ basic_block goacc_loop_block = region->region.entry->src;
+
+ /* Find the GOACC_LOOP calls. If there aren't any then this is not an
+ OpenACC kernels loop and will need different handling. */
+ gimple_stmt_iterator gsitop = gsi_start_bb (goacc_loop_block);
+ while (!gsi_end_p (gsitop)
+ && (!is_gimple_call (gsi_stmt (gsitop))
+ || !gimple_call_internal_p (gsi_stmt (gsitop))
+ || (gimple_call_internal_fn (gsi_stmt (gsitop))
+ != IFN_GOACC_LOOP)))
+ gsi_next (&gsitop);
+
+ if (!gsi_end_p (gsitop))
+ {
+	  /* Move the GOACC_LOOP CHUNKS and STEP calls to after any hoisted
+	     statements.  There should be no problematic dependencies because
+	     the chunk size and step are only computed for very specific
+	     purposes.  They may not be at the very top of the block, but
+	     they should be found together (the assert below checks this
+	     assumption).  */
+ gimple_stmt_iterator gsibottom = gsi_last_bb (goacc_loop_block);
+ gsi_move_after (&gsitop, &gsibottom);
+ gimple_stmt_iterator gsiinsert = gsibottom;
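+	  /* GSIINSERT points at the first moved GOACC_LOOP call; the alias
+	     condition computed below is inserted immediately before it,
+	     i.e. after the hoisted statements.  */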
+ gcc_checking_assert (is_gimple_call (gsi_stmt (gsitop))
+ && gimple_call_internal_p (gsi_stmt (gsitop))
+ && (gimple_call_internal_fn (gsi_stmt (gsitop))
+ == IFN_GOACC_LOOP));
+ gsi_move_after (&gsitop, &gsibottom);
+
+	  /* Insert "noalias_p = COND" before the moved GOACC_LOOP
+	     statements.  Note that COND likely depends on some of the
+	     hoisted statements.  */
+ tree cond_val = force_gimple_operand_gsi (&gsiinsert, cond, true, NULL,
+ true, GSI_NEW_STMT);
+
+ /* Insert the cond_val into each GOACC_LOOP call in the region. */
+ for (int n = -1; n < (int)region->bbs.length (); n++)
+ {
+ /* Cover the region plus goacc_loop_block. */
+ basic_block bb = n < 0 ? goacc_loop_block : region->bbs[n];
+
+ for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
+ !gsi_end_p (gsi);
+ gsi_next (&gsi))
+ {
+ gimple *stmt = gsi_stmt (gsi);
+ if (!is_gimple_call (stmt)
+ || !gimple_call_internal_p (stmt))
+ continue;
+
+ gcall *goacc_call = as_a <gcall*> (stmt);
+ if (gimple_call_internal_fn (goacc_call) != IFN_GOACC_LOOP)
+ continue;
+
+ enum ifn_goacc_loop_kind code = (enum ifn_goacc_loop_kind)
+ TREE_INT_CST_LOW (gimple_call_arg (goacc_call, 0));
+ int argno = 0;
+ switch (code)
+ {
+ case IFN_GOACC_LOOP_CHUNKS:
+ case IFN_GOACC_LOOP_STEP:
+ argno = 6;
+ break;
+
+ case IFN_GOACC_LOOP_OFFSET:
+ case IFN_GOACC_LOOP_BOUND:
+ argno = 7;
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
+
+ gimple_call_set_arg (goacc_call, argno, cond_val);
+ update_stmt (goacc_call);
+
+	      if (dump_enabled_p () && (dump_flags & TDF_DETAILS))
+ dump_printf (MSG_NOTE,
+ "Runtime alias condition applied to: %G",
+ goacc_call);
+ }
+ }
+ }
+ else
+ {
+	  /* There weren't any GOACC_LOOP calls where we expected to find
+	     them, therefore this isn't an OpenACC parallel loop.  If it
+	     runs sequentially then there's no need to worry about aliasing,
+	     so there's nothing much to do here.  */
+	  if (dump_enabled_p ())
+	    dump_printf (MSG_NOTE, "Runtime alias check *not* inserted for"
+			 " bb %d (GOACC_LOOP not found)",
+			 goacc_loop_block->index);
+
+ /* Unset can_be_parallel, in case something else might use it. */
+ for (unsigned int i = 0; i < region->bbs.length (); i++)
+ if (region->bbs[i]->loop_father)
+ region->bbs[i]->loop_father->can_be_parallel = 0;
+ }
+
+ /* The loop-nest vec is shared by all DDRs. */
+ DDR_LOOP_NEST (scop->unhandled_alias_ddrs[0]).release ();
+
+ unsigned int i;
+ struct data_dependence_relation *ddr;
+
+ FOR_EACH_VEC_ELT (scop->unhandled_alias_ddrs, i, ddr)
+ if (ddr)
+ free_dependence_relation (ddr);
+ scop->unhandled_alias_ddrs.truncate (0);
+ }
+
/* Analyze dependences in SCoP and mark loops as parallelizable accordingly. */
isl_schedule_foreach_schedule_node_top_down (
scop->original_schedule, visit_schedule_loop_node, scop->dependence);
@@ -7719,10 +7719,11 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
ass = gimple_build_assign (chunk_no, expr);
gsi_insert_before (&gsi, ass, GSI_SAME_STMT);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 6,
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
build_int_cst (integer_type_node,
IFN_GOACC_LOOP_CHUNKS),
- dir, range, s, chunk_size, gwv);
+ dir, range, s, chunk_size, gwv,
+ integer_one_node);
gimple_call_set_lhs (call, chunk_max);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
@@ -7730,10 +7731,11 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
else
chunk_size = chunk_no;
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 6,
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
build_int_cst (integer_type_node,
IFN_GOACC_LOOP_STEP),
- dir, range, s, chunk_size, gwv);
+ dir, range, s, chunk_size, gwv,
+ integer_one_node);
gimple_call_set_lhs (call, step);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
@@ -7767,20 +7769,20 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
/* Loop offset & bound go into head_bb. */
gsi = gsi_start_bb (head_bb);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 8,
build_int_cst (integer_type_node,
IFN_GOACC_LOOP_OFFSET),
- dir, range, s,
- chunk_size, gwv, chunk_no);
+ dir, range, s, chunk_size, gwv, chunk_no,
+ integer_one_node);
gimple_call_set_lhs (call, offset_init);
gimple_set_location (call, loc);
gsi_insert_after (&gsi, call, GSI_CONTINUE_LINKING);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 8,
build_int_cst (integer_type_node,
IFN_GOACC_LOOP_BOUND),
- dir, range, s,
- chunk_size, gwv, offset_init);
+ dir, range, s, chunk_size, gwv,
+ offset_init, integer_one_node);
gimple_call_set_lhs (call, bound);
gimple_set_location (call, loc);
gsi_insert_after (&gsi, call, GSI_CONTINUE_LINKING);
@@ -7830,22 +7832,25 @@ expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
tree chunk = build_int_cst (diff_type, 0); /* Never chunked. */
t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_OFFSET);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range,
- element_s, chunk, e_gwv, chunk);
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 8, t, dir, e_range,
+ element_s, chunk, e_gwv, chunk,
+ integer_one_node);
gimple_call_set_lhs (call, e_offset);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_BOUND);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range,
- element_s, chunk, e_gwv, e_offset);
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 8, t, dir, e_range,
+ element_s, chunk, e_gwv, e_offset,
+ integer_one_node);
gimple_call_set_lhs (call, e_bound);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_STEP);
- call = gimple_build_call_internal (IFN_GOACC_LOOP, 6, t, dir, e_range,
- element_s, chunk, e_gwv);
+ call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range,
+ element_s, chunk, e_gwv,
+ integer_one_node);
gimple_call_set_lhs (call, e_step);
gimple_set_location (call, loc);
gsi_insert_before (&gsi, call, GSI_SAME_STMT);
@@ -555,6 +555,7 @@ oacc_xform_loop (gcall *call)
bool chunking = false, striding = true;
unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
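+  /* Runtime alias-check result; nonzero means the loop may safely be
+     executed in parallel.  */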
+ tree noalias = NULL_TREE;
/* Skip lowering if return value of IFN_GOACC_LOOP call is not used. */
if (!lhs)
@@ -596,147 +597,165 @@ oacc_xform_loop (gcall *call)
switch (code)
{
- default: gcc_unreachable ();
+ default:
+ gcc_unreachable ();
case IFN_GOACC_LOOP_CHUNKS:
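+      /* The new trailing argument carries the runtime alias-check
+	 result.  */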
+ noalias = gimple_call_arg (call, 6);
if (!chunking)
- r = build_int_cst (type, 1);
+ r = build_int_cst (type, 1);
else
- {
- /* chunk_max
- = (range - dir) / (chunks * step * num_threads) + dir */
- tree per = oacc_thread_numbers (false, mask, &seq);
- per = fold_convert (type, per);
- chunk_size = fold_convert (type, chunk_size);
- per = fold_build2 (MULT_EXPR, type, per, chunk_size);
- per = fold_build2 (MULT_EXPR, type, per, step);
- r = build2 (MINUS_EXPR, type, range, dir);
- r = build2 (PLUS_EXPR, type, r, per);
- r = build2 (TRUNC_DIV_EXPR, type, r, per);
- }
+ {
+	    /* chunk_max = (range - dir + per) / per, where
+	       per = MAX (num_threads * noalias, 1) * chunk_size * step.
+	       If the runtime alias check failed (noalias == 0) this
+	       degenerates to a single thread's worth of chunks.  */
+ tree per = oacc_thread_numbers (false, mask, &seq);
+ per = fold_convert (type, per);
+ noalias = fold_convert (type, noalias);
+ per = fold_build2 (MULT_EXPR, type, per, noalias);
+	    per = fold_build2 (MAX_EXPR, type, per, build_int_cst (type, 1));
+ chunk_size = fold_convert (type, chunk_size);
+ per = fold_build2 (MULT_EXPR, type, per, chunk_size);
+ per = fold_build2 (MULT_EXPR, type, per, step);
+ r = fold_build2 (MINUS_EXPR, type, range, dir);
+ r = fold_build2 (PLUS_EXPR, type, r, per);
+ r = build2 (TRUNC_DIV_EXPR, type, r, per);
+ }
break;
case IFN_GOACC_LOOP_STEP:
+ noalias = gimple_call_arg (call, 6);
{
/* If striding, step by the entire compute volume, otherwise
- step by the inner volume. */
+ step by the inner volume. */
unsigned volume = striding ? mask : inner_mask;
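+	/* Scale the thread volume by NOALIAS: if the runtime alias check
+	   failed, the clamp below yields 1 and the single remaining thread
+	   steps sequentially through every iteration.  */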
+ noalias = fold_convert (type, noalias);
r = oacc_thread_numbers (false, volume, &seq);
+ r = fold_convert (type, r);
+ r = build2 (MULT_EXPR, type, r, noalias);
+	r = build2 (MAX_EXPR, type, r, build_int_cst (type, 1));
r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
+ break;
}
- break;
-
- case IFN_GOACC_LOOP_OFFSET:
- /* Enable vectorization on non-SIMT targets. */
- if (!targetm.simt.vf
- && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
- /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
- the loop. */
- && (flag_tree_loop_vectorize
- || !OPTION_SET_P (flag_tree_loop_vectorize)))
- {
- basic_block bb = gsi_bb (gsi);
- class loop *parent = bb->loop_father;
- class loop *body = parent->inner;
-
- parent->force_vectorize = true;
- parent->safelen = INT_MAX;
-
- /* "Chunking loops" may have inner loops. */
- if (parent->inner)
- {
- body->force_vectorize = true;
- body->safelen = INT_MAX;
- }
-
- cfun->has_force_vectorize_loops = true;
- }
- if (striding)
- {
- r = oacc_thread_numbers (true, mask, &seq);
- r = fold_convert (diff_type, r);
- }
- else
- {
- tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
- tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
- tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
- inner_size, outer_size);
-
- volume = fold_convert (diff_type, volume);
- if (chunking)
- chunk_size = fold_convert (diff_type, chunk_size);
- else
- {
- tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
- chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
- chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
- chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
- }
-
- tree span = build2 (MULT_EXPR, diff_type, chunk_size,
- fold_convert (diff_type, inner_size));
- r = oacc_thread_numbers (true, outer_mask, &seq);
- r = fold_convert (diff_type, r);
- r = build2 (MULT_EXPR, diff_type, r, span);
-
- tree inner = oacc_thread_numbers (true, inner_mask, &seq);
- inner = fold_convert (diff_type, inner);
- r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
-
- if (chunking)
- {
- tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
- tree per
- = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
- per = build2 (MULT_EXPR, diff_type, per, chunk);
-
- r = build2 (PLUS_EXPR, diff_type, r, per);
- }
- }
- r = fold_build2 (MULT_EXPR, diff_type, r, step);
- if (type != diff_type)
- r = fold_convert (type, r);
- break;
-
- case IFN_GOACC_LOOP_BOUND:
- if (striding)
- r = range;
- else
- {
- tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
- tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
- tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
- inner_size, outer_size);
-
- volume = fold_convert (diff_type, volume);
- if (chunking)
- chunk_size = fold_convert (diff_type, chunk_size);
- else
- {
- tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
-
- chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
- chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
- chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
- }
-
- tree span = build2 (MULT_EXPR, diff_type, chunk_size,
- fold_convert (diff_type, inner_size));
-
- r = fold_build2 (MULT_EXPR, diff_type, span, step);
-
- tree offset = gimple_call_arg (call, 6);
- r = build2 (PLUS_EXPR, diff_type, r,
- fold_convert (diff_type, offset));
- r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
- diff_type, r, range);
- }
- if (diff_type != type)
- r = fold_convert (type, r);
- break;
+ case IFN_GOACC_LOOP_OFFSET:
+ noalias = gimple_call_arg (call, 7);
+ if (striding)
+ {
+ r = oacc_thread_numbers (true, mask, &seq);
+ r = fold_convert (diff_type, r);
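+	  /* If the runtime alias check failed (NOALIAS is false), every
+	     thread except thread 0 gets its offset pushed past RANGE
+	     below, so only thread 0 executes any iterations.  */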
+	  tree tmp1 = build2 (NE_EXPR, boolean_type_node, r,
+			      build_int_cst (diff_type, 0));
+	  tree tmp2 = build2 (EQ_EXPR, boolean_type_node, noalias,
+			      build_int_cst (TREE_TYPE (noalias), 0));
+	  tree tmp3 = build2 (BIT_AND_EXPR, diff_type,
+			      fold_convert (diff_type, tmp1),
+			      fold_convert (diff_type, tmp2));
+	  tree tmp4 = build2 (MULT_EXPR, diff_type, tmp3, range);
+	  r = build2 (PLUS_EXPR, diff_type, r, tmp4);
+ }
+ else
+ {
+ tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
+ tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
+ tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
+ inner_size, outer_size);
+
+ volume = fold_convert (diff_type, volume);
+ if (chunking)
+ chunk_size = fold_convert (diff_type, chunk_size);
+ else
+ {
+ tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+		  /* chunk_size = (range - dir + per) / per.  */
+ chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
+ chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
+ chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
+ }
+
+	  /* When the runtime alias check failed (noalias == 0), push the
+	     offset of every thread except thread 0 beyond the iteration
+	     range, so that only one thread does any work.  */
+	  tree n = oacc_thread_numbers (true, mask, &seq);
+	  n = fold_convert (diff_type, n);
+	  tree tmp1 = build2 (NE_EXPR, boolean_type_node, n,
+			      build_int_cst (diff_type, 0));
+	  tree tmp2 = build2 (EQ_EXPR, boolean_type_node, noalias,
+			      build_int_cst (TREE_TYPE (noalias), 0));
+	  tree tmp3 = build2 (BIT_AND_EXPR, diff_type,
+			      fold_convert (diff_type, tmp1),
+			      fold_convert (diff_type, tmp2));
+	  range = build2 (MULT_EXPR, diff_type, tmp3, range);
+
+ tree span = build2 (MULT_EXPR, diff_type, chunk_size,
+ fold_convert (diff_type, inner_size));
+ r = oacc_thread_numbers (true, outer_mask, &seq);
+ r = fold_convert (diff_type, r);
+ r = build2 (PLUS_EXPR, diff_type, r, range);
+ r = build2 (MULT_EXPR, diff_type, r, span);
+
+ tree inner = oacc_thread_numbers (true, inner_mask, &seq);
+
+ inner = fold_convert (diff_type, inner);
+ r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
+
+ if (chunking)
+ {
+ tree chunk
+ = fold_convert (diff_type, gimple_call_arg (call, 6));
+ tree per
+ = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
+ per = build2 (MULT_EXPR, diff_type, per, chunk);
+
+ r = build2 (PLUS_EXPR, diff_type, r, per);
+ }
+ }
+ r = fold_build2 (MULT_EXPR, diff_type, r, step);
+ if (type != diff_type)
+ r = fold_convert (type, r);
+ break;
+
+ case IFN_GOACC_LOOP_BOUND:
+ if (striding)
+ r = range;
+ else
+ {
+ noalias = fold_convert (diff_type, gimple_call_arg (call, 7));
+
+ tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
+ tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
+ tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
+ inner_size, outer_size);
+
+ volume = fold_convert (diff_type, volume);
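+	      /* As for CHUNKS and STEP: when aliasing is possible the clamp
+		 below reduces the effective volume to 1, so the bound covers
+		 the whole range for the one running thread.  */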
+ volume = fold_build2 (MULT_EXPR, diff_type, volume, noalias);
+	      volume = fold_build2 (MAX_EXPR, diff_type, volume,
+				    build_int_cst (diff_type, 1));
+ if (chunking)
+ chunk_size = fold_convert (diff_type, chunk_size);
+ else
+ {
+ tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+	      /* chunk_size = (range - dir + per) / per.  */
+ chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
+ chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
+ chunk_size
+ = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
+ }
+
+ tree span = build2 (MULT_EXPR, diff_type, chunk_size,
+ fold_convert (diff_type, inner_size));
+
+ r = fold_build2 (MULT_EXPR, diff_type, span, step);
+
+ tree offset = gimple_call_arg (call, 6);
+ r = build2 (PLUS_EXPR, diff_type, r,
+ fold_convert (diff_type, offset));
+ r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR, diff_type, r,
+ range);
+ }
+ if (diff_type != type)
+ r = fold_convert (type, r);
+ break;
}
gimplify_assign (lhs, r, &seq);
new file mode 100644
@@ -0,0 +1,79 @@
+/* Test that a simple array copy does the right thing when the input and
+ output data overlap. The GPU kernel should automatically switch to
+ a sequential operation mode in order to give the expected results. */
+
+#include <stdlib.h>
+#include <openacc.h>
+
+void f(int *data, int n, int to, int from, int count)
+{
+ /* We cannot use copyin for two overlapping arrays because we get an error
+ that the memory is already present. We also cannot do the pointer
+ arithmetic inside the kernels region because it just ends up using
+ host pointers (bug?). Using enter data with a single array, and
+ acc_deviceptr solves the problem. */
+#pragma acc enter data copyin(data[0:n])
+
+ int *a = (int*)acc_deviceptr (data+to);
+ int *b = (int*)acc_deviceptr (data+from);
+
+#pragma acc kernels
+ for (int i = 0; i < count; i++)
+ a[i] = b[i];
+
+#pragma acc exit data copyout(data[0:n])
+}
+
+#define N 2000
+
+int data[N];
+
+int
+main ()
+{
+ for (int i=0; i < N; i++)
+ data[i] = i;
+
+ /* Baseline test; no aliasing. The high part of the data is copied to
+ the lower part. */
+ int to = 0;
+ int from = N/2;
+ int count = N/2;
+ f (data, N, to, from, count);
+ for (int i=0; i < N; i++)
+ if (data[i] != (i%count)+count)
+ exit (1);
+
+ /* Check various amounts of data overlap. */
+ int tests[] = {1, 10, N/4, N/2-10, N/2-1};
+  for (unsigned t = 0; t < sizeof (tests) / sizeof (tests[0]); t++)
+ {
+ for (int i=0; i < N; i++)
+ data[i] = i;
+
+      /* Output overlaps the latter part of the input; expect the initial
+	 non-aliased part of the input to repeat throughout the aliased
+	 portion.  */
+ to = tests[t];
+ from = 0;
+ count = N-tests[t];
+ f (data, N, to, from, count);
+ for (int i=0; i < N; i++)
+ if (data[i] != i%tests[t])
+ exit (2);
+
+ for (int i=0; i < N; i++)
+ data[i] = i;
+
+ /* Input overlaps the latter part of the output; expect the copy to work
+ in the obvious manner. */
+ to = 0;
+ from = tests[t];
+ count = N-tests[t];
+ f (data, N, to, from, count);
+ for (int i=0; i < count; i++)
+ if (data[i+to] != i+tests[t])
+ exit (3);
+ }
+
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,90 @@
+/* Test that a simple array copy does the right thing when the input and
+ output data overlap. The GPU kernel should automatically switch to
+ a sequential operation mode in order to give the expected results.
+
+ This test does not check the correctness of the output (there are other
+ tests for that), but checks that the code really does select the faster
+ path, when it can, by comparing the timing. */
+
+/* No optimization means no issue with aliasing.
+ { dg-skip-if "" { *-*-* } { "-O0" } { "" } }
+ { dg-skip-if "" { *-*-* } { "-foffload=disable" } { "" } } */
+
+#include <stdlib.h>
+#include <sys/time.h>
+#include <openacc.h>
+
+void f(int *data, int n, int to, int from, int count)
+{
+ int *a = (int*)acc_deviceptr (data+to);
+ int *b = (int*)acc_deviceptr (data+from);
+
+#pragma acc kernels
+ for (int i = 0; i < count; i++)
+ a[i] = b[i];
+}
+
+#define N 1000000
+int data[N];
+
+int
+main ()
+{
+ struct timeval start, stop, difference;
+ long basetime, aliastime;
+
+ for (int i=0; i < N; i++)
+ data[i] = i;
+
+ /* Ensure that the data copies are outside the timed zone. */
+#pragma acc enter data copyin(data[0:N])
+
+ /* Baseline test; no aliasing. The high part of the data is copied to
+ the lower part. */
+ int to = 0;
+ int from = N/2;
+ int count = N/2;
+ gettimeofday (&start, NULL);
+ f (data, N, to, from, count);
+ gettimeofday (&stop, NULL);
+ timersub (&stop, &start, &difference);
+ basetime = difference.tv_sec * 1000000 + difference.tv_usec;
+
+ /* Check various amounts of data overlap. */
+ int tests[] = {1, 10, N/4, N/2-10, N/2-1};
+  for (unsigned i = 0; i < sizeof (tests) / sizeof (tests[0]); i++)
+ {
+ to = 0;
+ from = N/2 - tests[i];
+ gettimeofday (&start, NULL);
+ f (data, N, to, from, count);
+ gettimeofday (&stop, NULL);
+ timersub (&stop, &start, &difference);
+ aliastime = difference.tv_sec * 1000000 + difference.tv_usec;
+
+ /* If the aliased runtime is less than 200% of the non-aliased runtime
+ then the runtime alias check probably selected the wrong path.
+ (Actually we expect the difference to be far greater than that.) */
+ if (basetime*2 > aliastime)
+ exit (1);
+ }
+
+ /* Repeat the baseline check just to make sure it didn't also get slower
+ after the first run. */
+ to = 0;
+ from = N/2;
+ gettimeofday (&start, NULL);
+ f (data, N, to, from, count);
+ gettimeofday (&stop, NULL);
+ timersub (&stop, &start, &difference);
+  long controltime = difference.tv_sec * 1000000 + difference.tv_usec;
+
+ /* The two times should be roughly the same, but we just check it wouldn't
+ pass the aliastime test above. */
+ if (basetime*2 <= controltime)
+ exit (2);
+
+#pragma acc exit data copyout(data[0:N])
+
+ return 0;
+}