[25/40] openacc: Add runtime alias checking for OpenACC kernels

Message ID 20211215155447.19379-26-frederik@codesourcery.com
State New
Headers
Series OpenACC "kernels" Improvements |

Commit Message

Frederik Harwath Dec. 15, 2021, 3:54 p.m. UTC
  From: Andrew Stubbs <ams@codesourcery.com>

This commit adds the code generation for the runtime alias checks for
OpenACC loops that have been analyzed by Graphite.  The runtime alias
check condition gets generated in Graphite. It is evaluated by the
code generated for the IFN_GOACC_LOOP internal function calls.  If
aliasing is detected at runtime, the execution dimensions get adjusted
to execute the affected loops sequentially.

gcc/ChangeLog:

        * graphite-isl-ast-to-gimple.c: Include internal-fn.h.
        (graphite_oacc_analyze_scop): Implement runtime alias checks.
        * omp-expand.c (expand_oacc_for): Add an additional "noalias" parameter
        to GOACC_LOOP internal calls, and initialise it to integer_one_node.
        * omp-offload.c (oacc_xform_loop): Integrate the runtime alias check
        into the GOACC_LOOP expansion.

libgomp/ChangeLog:

        * testsuite/libgomp.oacc-c-c++-common/runtime-alias-check-1.c: New test.
        * testsuite/libgomp.oacc-c-c++-common/runtime-alias-check-2.c: New test.
---
 gcc/graphite-isl-ast-to-gimple.c              | 122 ++++++++
 gcc/omp-expand.c                              |  37 +--
 gcc/omp-offload.c                             | 271 ++++++++++--------
 .../runtime-alias-check-1.c                   |  79 +++++
 .../runtime-alias-check-2.c                   |  90 ++++++
 5 files changed, 457 insertions(+), 142 deletions(-)
 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/runtime-alias-check-1.c
 create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/runtime-alias-check-2.c

--
2.33.0

-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
  

Patch

diff --git a/gcc/graphite-isl-ast-to-gimple.c b/gcc/graphite-isl-ast-to-gimple.c
index e820e2c32202..010adaabb000 100644
--- a/gcc/graphite-isl-ast-to-gimple.c
+++ b/gcc/graphite-isl-ast-to-gimple.c
@@ -58,6 +58,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "graphite.h"
 #include "graphite-oacc.h"
 #include "stdlib.h"
+#include "internal-fn.h"

 struct ast_build_info
 {
@@ -1697,6 +1698,127 @@  graphite_oacc_analyze_scop (scop_p scop)
       print_isl_schedule (dump_file, scop->original_schedule);
     }

+  if (flag_graphite_runtime_alias_checks
+      && scop->unhandled_alias_ddrs.length () > 0)
+    {
+      sese_info_p region = scop->scop_info;
+
+      /* Usually there will be a chunking loop with the actual work loop
+        inside it.  In some corner cases there may only be one loop.  */
+      loop_p top_loop = region->region.entry->dest->loop_father;
+      loop_p active_loop = top_loop->inner ? top_loop->inner : top_loop;
+      tree cond = generate_alias_cond (scop->unhandled_alias_ddrs, active_loop);
+
+      /* Walk back to GOACC_LOOP block.  */
+      basic_block goacc_loop_block = region->region.entry->src;
+
+      /* Find the GOACC_LOOP calls. If there aren't any then this is not an
+        OpenACC kernels loop and will need different handling.  */
+      gimple_stmt_iterator gsitop = gsi_start_bb (goacc_loop_block);
+      while (!gsi_end_p (gsitop)
+            && (!is_gimple_call (gsi_stmt (gsitop))
+                || !gimple_call_internal_p (gsi_stmt (gsitop))
+                || (gimple_call_internal_fn (gsi_stmt (gsitop))
+                    != IFN_GOACC_LOOP)))
+       gsi_next (&gsitop);
+
+      if (!gsi_end_p (gsitop))
+       {
+         /* Move the GOACC_LOOP CHUNK and STEP calls to after any hoisted
+            statements.  There ought not be any problematic dependencies because
+            the chunk size and step are only computed for very specific purposes.
+            They may not be at the very top of the block, but they should be
+            found together (the asserts test this assuption). */
+         gimple_stmt_iterator gsibottom = gsi_last_bb (goacc_loop_block);
+         gsi_move_after (&gsitop, &gsibottom);
+         gimple_stmt_iterator gsiinsert = gsibottom;
+         gcc_checking_assert (is_gimple_call (gsi_stmt (gsitop))
+                              && gimple_call_internal_p (gsi_stmt (gsitop))
+                              && (gimple_call_internal_fn (gsi_stmt (gsitop))
+                                  == IFN_GOACC_LOOP));
+         gsi_move_after (&gsitop, &gsibottom);
+
+         /* Insert "noalias_p = COND" before the GOACC_LOOP statements.
+            Note that these likely depend on some of the hoisted statements.  */
+         tree cond_val = force_gimple_operand_gsi (&gsiinsert, cond, true, NULL,
+                                                   true, GSI_NEW_STMT);
+
+         /* Insert the cond_val into each GOACC_LOOP call in the region.  */
+         for (int n = -1; n < (int)region->bbs.length (); n++)
+           {
+             /* Cover the region plus goacc_loop_block.  */
+             basic_block bb = n < 0 ? goacc_loop_block : region->bbs[n];
+
+             for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
+                  !gsi_end_p (gsi);
+                  gsi_next (&gsi))
+               {
+                 gimple *stmt = gsi_stmt (gsi);
+                 if (!is_gimple_call (stmt)
+                     || !gimple_call_internal_p (stmt))
+                   continue;
+
+                 gcall *goacc_call = as_a <gcall*> (stmt);
+                 if (gimple_call_internal_fn (goacc_call) != IFN_GOACC_LOOP)
+                   continue;
+
+                 enum ifn_goacc_loop_kind code = (enum ifn_goacc_loop_kind)
+                   TREE_INT_CST_LOW (gimple_call_arg (goacc_call, 0));
+                 int argno = 0;
+                 switch (code)
+                   {
+                   case IFN_GOACC_LOOP_CHUNKS:
+                   case IFN_GOACC_LOOP_STEP:
+                     argno = 6;
+                     break;
+
+                   case IFN_GOACC_LOOP_OFFSET:
+                   case IFN_GOACC_LOOP_BOUND:
+                     argno = 7;
+                     break;
+
+                   default:
+                     gcc_unreachable ();
+                   }
+
+                 gimple_call_set_arg (goacc_call, argno, cond_val);
+                 update_stmt (goacc_call);
+
+                 if (dump_enabled_p () && dump_flags & TDF_DETAILS)
+                   dump_printf (MSG_NOTE,
+                                "Runtime alias condition applied to: %G",
+                                goacc_call);
+               }
+           }
+       }
+      else
+       {
+         /* There wasn't any GOACC_LOOP calls where we expected to find them,
+            therefore this isn't an OpenACC parallel loop.  If it runs
+            sequentially then there's no need to worry about aliasing, so
+            nothing much to do here.  */
+         if (dump_enabled_p ())
+           dump_printf (MSG_NOTE, "Runtime alias check *not* inserted for"
+                        " bb %d (GOACC_LOOP not found)");
+
+         /* Unset can_be_parallel, in case something else might use it.  */
+         for (unsigned int i = 0; i < region->bbs.length (); i++)
+           if (region->bbs[i]->loop_father)
+             region->bbs[i]->loop_father->can_be_parallel = 0;
+       }
+
+      /* The loop-nest vec is shared by all DDRs. */
+      DDR_LOOP_NEST (scop->unhandled_alias_ddrs[0]).release ();
+
+      unsigned int i;
+      struct data_dependence_relation *ddr;
+
+      FOR_EACH_VEC_ELT (scop->unhandled_alias_ddrs, i, ddr)
+       if (ddr)
+         free_dependence_relation (ddr);
+      scop->unhandled_alias_ddrs.truncate (0);
+    }
+
   /* Analyze dependences in SCoP and mark loops as parallelizable accordingly. */
   isl_schedule_foreach_schedule_node_top_down (
       scop->original_schedule, visit_schedule_loop_node, scop->dependence);
diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c
index 365d167b6428..585ce798ee15 100644
--- a/gcc/omp-expand.c
+++ b/gcc/omp-expand.c
@@ -7719,10 +7719,11 @@  expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
       ass = gimple_build_assign (chunk_no, expr);
       gsi_insert_before (&gsi, ass, GSI_SAME_STMT);

-      call = gimple_build_call_internal (IFN_GOACC_LOOP, 6,
+      call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
                                         build_int_cst (integer_type_node,
                                                        IFN_GOACC_LOOP_CHUNKS),
-                                        dir, range, s, chunk_size, gwv);
+                                        dir, range, s, chunk_size, gwv,
+                                        integer_one_node);
       gimple_call_set_lhs (call, chunk_max);
       gimple_set_location (call, loc);
       gsi_insert_before (&gsi, call, GSI_SAME_STMT);
@@ -7730,10 +7731,11 @@  expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
   else
     chunk_size = chunk_no;

-  call = gimple_build_call_internal (IFN_GOACC_LOOP, 6,
+  call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
                                     build_int_cst (integer_type_node,
                                                    IFN_GOACC_LOOP_STEP),
-                                    dir, range, s, chunk_size, gwv);
+                                    dir, range, s, chunk_size, gwv,
+                                    integer_one_node);
   gimple_call_set_lhs (call, step);
   gimple_set_location (call, loc);
   gsi_insert_before (&gsi, call, GSI_SAME_STMT);
@@ -7767,20 +7769,20 @@  expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
   /* Loop offset & bound go into head_bb.  */
   gsi = gsi_start_bb (head_bb);

-  call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
+  call = gimple_build_call_internal (IFN_GOACC_LOOP, 8,
                                     build_int_cst (integer_type_node,
                                                    IFN_GOACC_LOOP_OFFSET),
-                                    dir, range, s,
-                                    chunk_size, gwv, chunk_no);
+                                    dir, range, s, chunk_size, gwv, chunk_no,
+                                    integer_one_node);
   gimple_call_set_lhs (call, offset_init);
   gimple_set_location (call, loc);
   gsi_insert_after (&gsi, call, GSI_CONTINUE_LINKING);

-  call = gimple_build_call_internal (IFN_GOACC_LOOP, 7,
+  call = gimple_build_call_internal (IFN_GOACC_LOOP, 8,
                                     build_int_cst (integer_type_node,
                                                    IFN_GOACC_LOOP_BOUND),
-                                    dir, range, s,
-                                    chunk_size, gwv, offset_init);
+                                    dir, range, s, chunk_size, gwv,
+                                    offset_init, integer_one_node);
   gimple_call_set_lhs (call, bound);
   gimple_set_location (call, loc);
   gsi_insert_after (&gsi, call, GSI_CONTINUE_LINKING);
@@ -7830,22 +7832,25 @@  expand_oacc_for (struct omp_region *region, struct omp_for_data *fd)
          tree chunk = build_int_cst (diff_type, 0); /* Never chunked.  */

          t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_OFFSET);
-         call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range,
-                                            element_s, chunk, e_gwv, chunk);
+         call = gimple_build_call_internal (IFN_GOACC_LOOP, 8, t, dir, e_range,
+                                            element_s, chunk, e_gwv, chunk,
+                                            integer_one_node);
          gimple_call_set_lhs (call, e_offset);
          gimple_set_location (call, loc);
          gsi_insert_before (&gsi, call, GSI_SAME_STMT);

          t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_BOUND);
-         call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range,
-                                            element_s, chunk, e_gwv, e_offset);
+         call = gimple_build_call_internal (IFN_GOACC_LOOP, 8, t, dir, e_range,
+                                            element_s, chunk, e_gwv, e_offset,
+                                            integer_one_node);
          gimple_call_set_lhs (call, e_bound);
          gimple_set_location (call, loc);
          gsi_insert_before (&gsi, call, GSI_SAME_STMT);

          t = build_int_cst (integer_type_node, IFN_GOACC_LOOP_STEP);
-         call = gimple_build_call_internal (IFN_GOACC_LOOP, 6, t, dir, e_range,
-                                            element_s, chunk, e_gwv);
+         call = gimple_build_call_internal (IFN_GOACC_LOOP, 7, t, dir, e_range,
+                                            element_s, chunk, e_gwv,
+                                            integer_one_node);
          gimple_call_set_lhs (call, e_step);
          gimple_set_location (call, loc);
          gsi_insert_before (&gsi, call, GSI_SAME_STMT);
diff --git a/gcc/omp-offload.c b/gcc/omp-offload.c
index 392ca56b1f4f..3458a1acbceb 100644
--- a/gcc/omp-offload.c
+++ b/gcc/omp-offload.c
@@ -555,6 +555,7 @@  oacc_xform_loop (gcall *call)
   bool chunking = false, striding = true;
   unsigned outer_mask = mask & (~mask + 1); // Outermost partitioning
   unsigned inner_mask = mask & ~outer_mask; // Inner partitioning (if any)
+  tree noalias = NULL_TREE;

   /* Skip lowering if return value of IFN_GOACC_LOOP call is not used.  */
   if (!lhs)
@@ -596,147 +597,165 @@  oacc_xform_loop (gcall *call)

   switch (code)
     {
-    default: gcc_unreachable ();
+    default:
+      gcc_unreachable ();

     case IFN_GOACC_LOOP_CHUNKS:
+      noalias = gimple_call_arg (call, 6);
       if (!chunking)
-       r = build_int_cst (type, 1);
+        r = build_int_cst (type, 1);
       else
-       {
-         /* chunk_max
-            = (range - dir) / (chunks * step * num_threads) + dir  */
-         tree per = oacc_thread_numbers (false, mask, &seq);
-         per = fold_convert (type, per);
-         chunk_size = fold_convert (type, chunk_size);
-         per = fold_build2 (MULT_EXPR, type, per, chunk_size);
-         per = fold_build2 (MULT_EXPR, type, per, step);
-         r = build2 (MINUS_EXPR, type, range, dir);
-         r = build2 (PLUS_EXPR, type, r, per);
-         r = build2 (TRUNC_DIV_EXPR, type, r, per);
-       }
+        {
+          /* chunk_max
+             = (range - dir) / (chunks * step * num_threads) + dir  */
+          tree per = oacc_thread_numbers (false, mask, &seq);
+          per = fold_convert (type, per);
+          noalias = fold_convert (type, noalias);
+          per = fold_build2 (MULT_EXPR, type, per, noalias);
+          per = fold_build2 (MAX_EXPR, type, per, fold_convert (type, integer_one_node));
+          chunk_size = fold_convert (type, chunk_size);
+          per = fold_build2 (MULT_EXPR, type, per, chunk_size);
+          per = fold_build2 (MULT_EXPR, type, per, step);
+          r = fold_build2 (MINUS_EXPR, type, range, dir);
+          r = fold_build2 (PLUS_EXPR, type, r, per);
+          r = build2 (TRUNC_DIV_EXPR, type, r, per);
+        }
       break;

     case IFN_GOACC_LOOP_STEP:
+      noalias = gimple_call_arg (call, 6);
       {
        /* If striding, step by the entire compute volume, otherwise
-          step by the inner volume.  */
+           step by the inner volume.  */
        unsigned volume = striding ? mask : inner_mask;

+       noalias = fold_convert (type, noalias);
        r = oacc_thread_numbers (false, volume, &seq);
+       r = fold_convert (type, r);
+       r = build2 (MULT_EXPR, type, r, noalias);
+       r = build2 (MAX_EXPR, type, r, fold_convert (type, fold_convert (type, integer_one_node)));
        r = build2 (MULT_EXPR, type, fold_convert (type, r), step);
+        break;
       }
-      break;
-
-    case IFN_GOACC_LOOP_OFFSET:
-      /* Enable vectorization on non-SIMT targets.  */
-      if (!targetm.simt.vf
-         && outer_mask == GOMP_DIM_MASK (GOMP_DIM_VECTOR)
-         /* If not -fno-tree-loop-vectorize, hint that we want to vectorize
-            the loop.  */
-         && (flag_tree_loop_vectorize
-             || !OPTION_SET_P (flag_tree_loop_vectorize)))
-       {
-         basic_block bb = gsi_bb (gsi);
-         class loop *parent = bb->loop_father;
-         class loop *body = parent->inner;
-
-         parent->force_vectorize = true;
-         parent->safelen = INT_MAX;
-
-         /* "Chunking loops" may have inner loops.  */
-         if (parent->inner)
-           {
-             body->force_vectorize = true;
-             body->safelen = INT_MAX;
-           }
-
-         cfun->has_force_vectorize_loops = true;
-       }
-      if (striding)
-       {
-         r = oacc_thread_numbers (true, mask, &seq);
-         r = fold_convert (diff_type, r);
-       }
-      else
-       {
-         tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
-         tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
-         tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
-                                    inner_size, outer_size);
-
-         volume = fold_convert (diff_type, volume);
-         if (chunking)
-           chunk_size = fold_convert (diff_type, chunk_size);
-         else
-           {
-             tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);

-             chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
-             chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
-             chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
-           }
-
-         tree span = build2 (MULT_EXPR, diff_type, chunk_size,
-                             fold_convert (diff_type, inner_size));
-         r = oacc_thread_numbers (true, outer_mask, &seq);
-         r = fold_convert (diff_type, r);
-         r = build2 (MULT_EXPR, diff_type, r, span);
-
-         tree inner = oacc_thread_numbers (true, inner_mask, &seq);
-         inner = fold_convert (diff_type, inner);
-         r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
-
-         if (chunking)
-           {
-             tree chunk = fold_convert (diff_type, gimple_call_arg (call, 6));
-             tree per
-               = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
-             per = build2 (MULT_EXPR, diff_type, per, chunk);
-
-             r = build2 (PLUS_EXPR, diff_type, r, per);
-           }
-       }
-      r = fold_build2 (MULT_EXPR, diff_type, r, step);
-      if (type != diff_type)
-       r = fold_convert (type, r);
-      break;
-
-    case IFN_GOACC_LOOP_BOUND:
-      if (striding)
-       r = range;
-      else
-       {
-         tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
-         tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
-         tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
-                                    inner_size, outer_size);
-
-         volume = fold_convert (diff_type, volume);
-         if (chunking)
-           chunk_size = fold_convert (diff_type, chunk_size);
-         else
-           {
-             tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
-
-             chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
-             chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
-             chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
-           }
-
-         tree span = build2 (MULT_EXPR, diff_type, chunk_size,
-                             fold_convert (diff_type, inner_size));
-
-         r = fold_build2 (MULT_EXPR, diff_type, span, step);
-
-         tree offset = gimple_call_arg (call, 6);
-         r = build2 (PLUS_EXPR, diff_type, r,
-                     fold_convert (diff_type, offset));
-         r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR,
-                     diff_type, r, range);
-       }
-      if (diff_type != type)
-       r = fold_convert (type, r);
-      break;
+      case IFN_GOACC_LOOP_OFFSET:
+       noalias = gimple_call_arg (call, 7);
+        if (striding)
+          {
+            r = oacc_thread_numbers (true, mask, &seq);
+            r = fold_convert (diff_type, r);
+            tree tmp1 = build2 (NE_EXPR, boolean_type_node, r,
+                                fold_convert (diff_type, integer_zero_node));
+            tree tmp2 = build2 (EQ_EXPR, boolean_type_node, noalias,
+                                boolean_false_node);
+            tree tmp3 = build2 (BIT_AND_EXPR, diff_type,
+                                fold_convert (diff_type, tmp1),
+                                fold_convert (diff_type, tmp2));
+            tree tmp4 = build2 (MULT_EXPR, diff_type, tmp3, range);
+            r = build2 (PLUS_EXPR, diff_type, r, tmp4);
+          }
+        else
+          {
+            tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
+            tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
+            tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
+                                       inner_size, outer_size);
+
+            volume = fold_convert (diff_type, volume);
+            if (chunking)
+              chunk_size = fold_convert (diff_type, chunk_size);
+            else
+              {
+                tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+                /* chunk_size = (range + per - 1) / per.  */
+                chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
+                chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
+                chunk_size = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
+              }
+
+            /* Curtail the range in all but one thread when there may be
+               aliasing to prevent parallelization.  */
+            tree n = oacc_thread_numbers (true, mask, &seq);
+            n = fold_convert (diff_type, n);
+            tree tmp1 = build2 (NE_EXPR, boolean_type_node, n,
+                                fold_convert (diff_type, integer_zero_node));
+            tree tmp2 = build2 (EQ_EXPR, boolean_type_node, noalias,
+                                boolean_false_node);
+            tree tmp3 = build2 (BIT_AND_EXPR, diff_type,
+                                fold_convert (diff_type, tmp1),
+                                fold_convert (diff_type, tmp2));
+            range = build2 (MULT_EXPR, diff_type, tmp3, range);
+
+            tree span = build2 (MULT_EXPR, diff_type, chunk_size,
+                                fold_convert (diff_type, inner_size));
+            r = oacc_thread_numbers (true, outer_mask, &seq);
+            r = fold_convert (diff_type, r);
+            r = build2 (PLUS_EXPR, diff_type, r, range);
+            r = build2 (MULT_EXPR, diff_type, r, span);
+
+            tree inner = oacc_thread_numbers (true, inner_mask, &seq);
+
+            inner = fold_convert (diff_type, inner);
+            r = fold_build2 (PLUS_EXPR, diff_type, r, inner);
+
+            if (chunking)
+              {
+                tree chunk
+                    = fold_convert (diff_type, gimple_call_arg (call, 6));
+                tree per
+                    = fold_build2 (MULT_EXPR, diff_type, volume, chunk_size);
+                per = build2 (MULT_EXPR, diff_type, per, chunk);
+
+                r = build2 (PLUS_EXPR, diff_type, r, per);
+              }
+          }
+        r = fold_build2 (MULT_EXPR, diff_type, r, step);
+        if (type != diff_type)
+          r = fold_convert (type, r);
+        break;
+
+      case IFN_GOACC_LOOP_BOUND:
+        if (striding)
+          r = range;
+        else
+          {
+            noalias = fold_convert (diff_type, gimple_call_arg (call, 7));
+
+            tree inner_size = oacc_thread_numbers (false, inner_mask, &seq);
+            tree outer_size = oacc_thread_numbers (false, outer_mask, &seq);
+            tree volume = fold_build2 (MULT_EXPR, TREE_TYPE (inner_size),
+                                       inner_size, outer_size);
+
+            volume = fold_convert (diff_type, volume);
+            volume = fold_build2 (MULT_EXPR, diff_type, volume, noalias);
+            volume
+                = fold_build2 (MAX_EXPR, diff_type, volume, fold_convert (diff_type, integer_one_node));
+            if (chunking)
+              chunk_size = fold_convert (diff_type, chunk_size);
+            else
+              {
+                tree per = fold_build2 (MULT_EXPR, diff_type, volume, step);
+                /* chunk_size = (range + per - 1) / per.  */
+                chunk_size = build2 (MINUS_EXPR, diff_type, range, dir);
+                chunk_size = build2 (PLUS_EXPR, diff_type, chunk_size, per);
+                chunk_size
+                    = build2 (TRUNC_DIV_EXPR, diff_type, chunk_size, per);
+              }
+
+            tree span = build2 (MULT_EXPR, diff_type, chunk_size,
+                                fold_convert (diff_type, inner_size));
+
+            r = fold_build2 (MULT_EXPR, diff_type, span, step);
+
+            tree offset = gimple_call_arg (call, 6);
+            r = build2 (PLUS_EXPR, diff_type, r,
+                        fold_convert (diff_type, offset));
+            r = build2 (integer_onep (dir) ? MIN_EXPR : MAX_EXPR, diff_type, r,
+                        range);
+          }
+        if (diff_type != type)
+          r = fold_convert (type, r);
+        break;
     }

   gimplify_assign (lhs, r, &seq);
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/runtime-alias-check-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/runtime-alias-check-1.c
new file mode 100644
index 000000000000..2fb1c712beb3
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/runtime-alias-check-1.c
@@ -0,0 +1,79 @@ 
+/* Test that a simple array copy does the right thing when the input and
+   output data overlap.  The GPU kernel should automatically switch to
+   a sequential operation mode in order to give the expected results.  */
+
+#include <stdlib.h>
+#include <openacc.h>
+
+void f(int *data, int n, int to, int from, int count)
+{
+  /* We cannot use copyin for two overlapping arrays because we get an error
+     that the memory is already present.  We also cannot do the pointer
+     arithmetic inside the kernels region because it just ends up using
+     host pointers (bug?).  Using enter data with a single array, and
+     acc_deviceptr solves the problem.  */
+#pragma acc enter data copyin(data[0:n])
+
+  int *a = (int*)acc_deviceptr (data+to);
+  int *b = (int*)acc_deviceptr (data+from);
+
+#pragma acc kernels
+  for (int i = 0; i < count; i++)
+    a[i] = b[i];
+
+#pragma acc exit data copyout(data[0:n])
+}
+
+#define N 2000
+
+int data[N];
+
+int
+main ()
+{
+  for (int i=0; i < N; i++)
+    data[i] = i;
+
+  /* Baseline test; no aliasing. The high part of the data is copied to
+     the lower part.  */
+  int to = 0;
+  int from = N/2;
+  int count = N/2;
+  f (data, N, to, from, count);
+  for (int i=0; i < N; i++)
+    if (data[i] != (i%count)+count)
+      exit (1);
+
+  /* Check various amounts of data overlap.  */
+  int tests[] = {1, 10, N/4, N/2-10, N/2-1};
+  for (int t = 0; t < sizeof (tests)/sizeof(tests[0]); t++)
+    {
+      for (int i=0; i < N; i++)
+       data[i] = i;
+
+      /* Output overlaps the latter part of input; expect the initial no-aliased
+        part of the input to repeat throughout the aliased portion.  */
+      to = tests[t];
+      from = 0;
+      count = N-tests[t];
+      f (data, N, to, from, count);
+      for (int i=0; i < N; i++)
+       if (data[i] != i%tests[t])
+       exit (2);
+
+      for (int i=0; i < N; i++)
+       data[i] = i;
+
+      /* Input overlaps the latter part of the output; expect the copy to work
+        in the obvious manner.  */
+      to = 0;
+      from = tests[t];
+      count = N-tests[t];
+      f (data, N, to, from, count);
+      for (int i=0; i < count; i++)
+       if (data[i+to] != i+tests[t])
+       exit (3);
+    }
+
+  return 0;
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/runtime-alias-check-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/runtime-alias-check-2.c
new file mode 100644
index 000000000000..96c03297d5b4
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/runtime-alias-check-2.c
@@ -0,0 +1,90 @@ 
+/* Test that a simple array copy does the right thing when the input and
+   output data overlap.  The GPU kernel should automatically switch to
+   a sequential operation mode in order to give the expected results.
+
+   This test does not check the correctness of the output (there are other
+   tests for that), but checks that the code really does select the faster
+   path, when it can, by comparing the timing.  */
+
+/* No optimization means no issue with aliasing.
+   { dg-skip-if "" { *-*-* } { "-O0" } { "" } }
+   { dg-skip-if "" { *-*-* } { "-foffload=disable" } { "" } } */
+
+#include <stdlib.h>
+#include <sys/time.h>
+#include <openacc.h>
+
+void f(int *data, int n, int to, int from, int count)
+{
+  int *a = (int*)acc_deviceptr (data+to);
+  int *b = (int*)acc_deviceptr (data+from);
+
+#pragma acc kernels
+  for (int i = 0; i < count; i++)
+    a[i] = b[i];
+}
+
+#define N 1000000
+int data[N];
+
+int
+main ()
+{
+  struct timeval start, stop, difference;
+  long basetime, aliastime;
+
+  for (int i=0; i < N; i++)
+    data[i] = i;
+
+  /* Ensure that the data copies are outside the timed zone.  */
+#pragma acc enter data copyin(data[0:N])
+
+  /* Baseline test; no aliasing. The high part of the data is copied to
+     the lower part.  */
+  int to = 0;
+  int from = N/2;
+  int count = N/2;
+  gettimeofday (&start, NULL);
+  f (data, N, to, from, count);
+  gettimeofday (&stop, NULL);
+  timersub (&stop, &start, &difference);
+  basetime = difference.tv_sec * 1000000 + difference.tv_usec;
+
+  /* Check various amounts of data overlap.  */
+  int tests[] = {1, 10, N/4, N/2-10, N/2-1};
+  for (int i = 0; i < sizeof (tests)/sizeof(tests[0]); i++)
+    {
+      to = 0;
+      from = N/2 - tests[i];
+      gettimeofday (&start, NULL);
+      f (data, N, to, from, count);
+      gettimeofday (&stop, NULL);
+      timersub (&stop, &start, &difference);
+      aliastime = difference.tv_sec * 1000000 + difference.tv_usec;
+
+      /* If the aliased runtime is less than 200% of the non-aliased runtime
+        then the runtime alias check probably selected the wrong path.
+        (Actually we expect the difference to be far greater than that.)  */
+      if (basetime*2 > aliastime)
+       exit (1);
+    }
+
+  /* Repeat the baseline check just to make sure it didn't also get slower
+     after the first run.  */
+  to = 0;
+  from = N/2;
+  gettimeofday (&start, NULL);
+  f (data, N, to, from, count);
+  gettimeofday (&stop, NULL);
+  timersub (&stop, &start, &difference);
+  int controltime = difference.tv_sec * 1000000 + difference.tv_usec;
+
+  /* The two times should be roughly the same, but we just check it wouldn't
+     pass the aliastime test above.  */
+  if (basetime*2 <= controltime)
+    exit (2);
+
+#pragma acc exit data copyout(data[0:N])
+
+  return 0;
+}