[OG11,committed,11/22] openacc: Add further kernels tests

Message ID 20211117160330.20029-11-frederik@codesourcery.com
State Committed
Headers
Series OpenACC "kernels" Improvements |

Commit Message

Frederik Harwath Nov. 17, 2021, 4:03 p.m. UTC
  Add some copies of tests to continue covering the old "parloops"-based
"kernels" implementation - until it gets removed from GCC - and
add further tests for the new Graphite-based implementation.

libgomp/ChangeLog:

        * testsuite/libgomp.oacc-fortran/parallel-loop-auto-reduction-2.f90:
        New test.

gcc/testsuite/ChangeLog:

        * c-c++-common/goacc/classify-kernels-unparallelized-graphite.c:
        New test.
        * c-c++-common/goacc/classify-kernels-unparallelized-parloops.c:
        New test.
        * c-c++-common/goacc/kernels-decompose-1-parloops.c: New test.
        * c-c++-common/goacc/kernels-reduction-parloops.c: New test.
        * c-c++-common/goacc/loop-auto-reductions.c: New test.
        * c-c++-common/goacc/note-parallelism-1-kernels-loop-auto-parloops.c:
        New test.
        * c-c++-common/goacc/note-parallelism-kernels-loops-1.c: New test.
        * c-c++-common/goacc/note-parallelism-kernels-loops-parloops.c:
        New test.
        * gfortran.dg/goacc/classify-kernels-unparallelized-parloops.f95:
        New test.
        * gfortran.dg/goacc/kernels-conversion.f95: New test.
        * gfortran.dg/goacc/kernels-decompose-1-parloops.f95: New test.
        * gfortran.dg/goacc/kernels-decompose-parloops-2.f95: New test.
        * gfortran.dg/goacc/kernels-loop-data-parloops-2.f95: New test.
        * gfortran.dg/goacc/kernels-loop-parloops-2.f95: New test.
        * gfortran.dg/goacc/kernels-loop-parloops.f95: New test.
        * gfortran.dg/goacc/kernels-reductions.f90: New test.
---
 ...classify-kernels-unparallelized-graphite.c |  41 +++++
 ...classify-kernels-unparallelized-parloops.c |  47 ++++++
 .../goacc/kernels-decompose-1-parloops.c      | 125 ++++++++++++++
 .../goacc/kernels-reduction-parloops.c        |  36 ++++
 .../c-c++-common/goacc/loop-auto-reductions.c |  22 +++
 ...parallelism-1-kernels-loop-auto-parloops.c | 128 +++++++++++++++
 .../goacc/note-parallelism-kernels-loops-1.c  |  61 +++++++
 .../note-parallelism-kernels-loops-parloops.c |  53 ++++++
 ...assify-kernels-unparallelized-parloops.f95 |  44 +++++
 .../gfortran.dg/goacc/kernels-conversion.f95  |  52 ++++++
 .../goacc/kernels-decompose-1-parloops.f95    | 121 ++++++++++++++
 .../goacc/kernels-decompose-parloops-2.f95    | 154 ++++++++++++++++++
 .../goacc/kernels-loop-data-parloops-2.f95    |  52 ++++++
 .../goacc/kernels-loop-parloops-2.f95         |  45 +++++
 .../goacc/kernels-loop-parloops.f95           |  39 +++++
 .../gfortran.dg/goacc/kernels-reductions.f90  |  37 +++++
 .../parallel-loop-auto-reduction-2.f90        |  98 +++++++++++
 17 files changed, 1155 insertions(+)
 create mode 100644 gcc/testsuite/c-c++-common/goacc/classify-kernels-unparallelized-graphite.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/classify-kernels-unparallelized-parloops.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-decompose-1-parloops.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-reduction-parloops.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/loop-auto-reductions.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-loop-auto-parloops.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loops-1.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loops-parloops.c
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/classify-kernels-unparallelized-parloops.f95
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/kernels-decompose-1-parloops.f95
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/kernels-decompose-parloops-2.f95
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-parloops-2.f95
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/kernels-loop-parloops-2.f95
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/kernels-loop-parloops.f95
 create mode 100644 gcc/testsuite/gfortran.dg/goacc/kernels-reductions.f90
 create mode 100644 libgomp/testsuite/libgomp.oacc-fortran/parallel-loop-auto-reduction-2.f90

--
2.33.0

-----------------
Siemens Electronic Design Automation GmbH; Anschrift: Arnulfstraße 201, 80634 München; Gesellschaft mit beschränkter Haftung; Geschäftsführer: Thomas Heurung, Frank Thürauf; Sitz der Gesellschaft: München; Registergericht München, HRB 106955
  

Patch

diff --git a/gcc/testsuite/c-c++-common/goacc/classify-kernels-unparallelized-graphite.c b/gcc/testsuite/c-c++-common/goacc/classify-kernels-unparallelized-graphite.c
new file mode 100644
index 000000000000..77f4524907a9
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/classify-kernels-unparallelized-graphite.c
@@ -0,0 +1,41 @@ 
+/* Check offloaded function's attributes and classification for unparallelized
+   OpenACC 'kernels' with Graphite kernels handling (default).  */
+
+/* { dg-additional-options "-O2" }
+   { dg-additional-options "-fno-openacc-kernels-annotate-loops" }
+   { dg-additional-options "-fopt-info-optimized-omp" }
+   { dg-additional-options "-fopt-info-note-omp" }
+   { dg-additional-options "-fdump-tree-ompexp" }
+   { dg-additional-options "-fdump-tree-graphite-details" }
+   { dg-additional-options "-fdump-tree-oaccloops1" }
+   { dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose-details" } */
+
+#define N 1024
+
+extern unsigned int *__restrict a;
+extern unsigned int *__restrict b;
+extern unsigned int *__restrict c;
+
+extern unsigned int f (unsigned int);
+#pragma acc routine (f) seq
+
+void KERNELS ()
+{
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+  for (unsigned int i = 0; i < N; i++) /* { dg-message "note: beginning .Graphite. part in OpenACC .kernels. region" } */
+    /* An "extern"al mapping of loop iterations/array indices makes the loop
+       unparallelizable.  */
+    c[i] = a[f (i)] + b[f (i)]; /* { dg-optimized "assigned OpenACC seq loop parallelism" } */
+}
+
+/* Check the offloaded function's attributes.
+   { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc parallel_kernels_graphite, omp target entrypoint\\)\\)" 1 "ompexp" } } */
+
+/* Check that Graphite can handle neither the original nor the offloaded region
+   { dg-final { scan-tree-dump-times "number of SCoPs: 0" 2 "graphite" } } */
+
+/* Check the offloaded function's classification and compute dimensions (will
+   always be 1 x 1 x 1 for non-offloading compilation).
+   { dg-final { scan-tree-dump-times "(?n)Function is parallel_kernels_graphite OpenACC kernels offload" 1 "oaccloops1" } }
+   { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccloops1" } }
+   { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), oacc parallel_kernels_graphite, omp target entrypoint\\)\\)" 1 "oaccloops1" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/classify-kernels-unparallelized-parloops.c b/gcc/testsuite/c-c++-common/goacc/classify-kernels-unparallelized-parloops.c
new file mode 100644
index 000000000000..252ab8eb87b7
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/classify-kernels-unparallelized-parloops.c
@@ -0,0 +1,47 @@ 
+/* Check offloaded function's attributes and classification for unparallelized
+   OpenACC 'kernels' with "parloops" handling.  */
+
+/* { dg-additional-options "-O2" }
+   { dg-additional-options "--param openacc-kernels=decompose-parloops" }
+   { dg-additional-options "-fno-openacc-kernels-annotate-loops" }
+   { dg-additional-options "-fopt-info-note-optimized-omp" }
+   { dg-additional-options "-fdump-tree-ompexp" }
+   { dg-additional-options "-fdump-tree-parloops1-all" }
+   { dg-additional-options "-fdump-tree-oaccloops1" }
+   { dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose-details" } */
+
+/* { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting
+   aspects of that functionality.  */
+
+#define N 1024
+
+extern unsigned int *__restrict a;
+extern unsigned int *__restrict b;
+extern unsigned int *__restrict c;
+
+extern unsigned int f (unsigned int);
+#pragma acc routine (f) seq
+
+void KERNELS ()
+{
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N]) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  for (unsigned int i = 0; i < N; i++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+    /* An "extern"al mapping of loop iterations/array indices makes the loop
+       unparallelizable.  */
+    c[i] = a[f (i)] + b[f (i)];
+}
+
+/* Check the offloaded function's attributes.
+   { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels, omp target entrypoint\\)\\)" 1 "ompexp" } } */
+
+/* Check that exactly one OpenACC kernels construct is analyzed, and that it
+   can't be parallelized.
+   { dg-final { scan-tree-dump-times "FAILED:" 1 "parloops1" } }
+   { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } }
+   { dg-final { scan-tree-dump-not "SUCCESS: may be parallelized" "parloops1" } } */
+
+/* Check the offloaded function's classification and compute dimensions (will
+   always be 1 x 1 x 1 for non-offloading compilation).
+   { dg-final { scan-tree-dump-times "(?n)Function is unparallelized OpenACC kernels offload" 1 "oaccloops1" } }
+   { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccloops1" } }
+   { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), oacc kernels, omp target entrypoint\\)\\)" 1 "oaccloops1" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-decompose-1-parloops.c b/gcc/testsuite/c-c++-common/goacc/kernels-decompose-1-parloops.c
new file mode 100644
index 000000000000..76d528a6d8e1
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-decompose-1-parloops.c
@@ -0,0 +1,125 @@ 
+/* Test OpenACC 'kernels' region decomposition with
+   "decompose-parloops" handling.  */
+/* { dg-additional-options "--param openacc-kernels=decompose-parloops" } */
+/* { dg-additional-options "-fno-openacc-kernels-annotate-loops" } */
+/* { dg-additional-options "-fopt-info-omp-all" } */
+/* { dg-additional-options "-Wopenacc-parallelism" } */
+/* { dg-additional-options "-O2" } for "parloops".  */
+
+/* See also "../../gfortran.dg/goacc/kernels-decompose-1.f95".  */
+
+#pragma acc routine gang
+extern int
+f_g (int);
+
+#pragma acc routine worker
+extern int
+f_w (int);
+
+#pragma acc routine vector
+extern int
+f_v (int);
+
+#pragma acc routine seq
+extern int
+f_s (int);
+
+int
+main ()
+{
+  int x, y, z;
+#define N 10
+  int a[N], b[N], c[N];
+
+#pragma acc kernels
+  {
+    x = 0; /* { dg-message "note: beginning .gang-single. part in OpenACC .kernels. region" } */
+    y = x < 10;
+    z = x++;
+    ;
+  }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+  for (int i = 0; i < N; i++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+    a[i] = 0;
+
+#pragma acc kernels loop /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+  /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (int i = 0; i < N; i++)
+    b[i] = a[N - i - 1];
+
+#pragma acc kernels
+  {
+#pragma acc loop /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+    /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+    for (int i = 0; i < N; i++)
+      b[i] = a[N - i - 1];
+
+#pragma acc loop /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+    /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+    for (int i = 0; i < N; i++)
+      c[i] = a[i] * b[i];
+
+    a[z] = 0; /* { dg-message "note: beginning .gang-single. part in OpenACC .kernels. region" } */
+
+#pragma acc loop /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+    /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+    for (int i = 0; i < N; i++)
+      c[i] += a[i];
+
+#pragma acc loop seq /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+    /* { dg-message "note: parallelized loop nest in OpenACC .kernels. region" "" { target *-*-* } .-1 } */
+    for (int i = 0 + 1; i < N; i++)
+      c[i] += c[i - 1];
+  }
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC worker vector loop parallelism" } */
+  {
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+    /* { dg-message "note: parallelized loop nest in OpenACC .kernels. region" "" { target *-*-* } .-1 } */
+    for (int i = 0; i < N; ++i)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC worker loop parallelism" } */
+      for (int j = 0; j < N; ++j)
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+        /* { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } .-1 } */
+       for (int k = 0; k < N; ++k)
+         a[(i + j + k) % N]
+           = b[j]
+           + f_v (c[k]); /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+
+    //TODO Should the following turn into "gang-single" instead of "parloops"?
+    //TODO The problem is that the first STMT is "if (y <= 4) goto <D.2547>; else goto <D.2548>;", thus "parloops".
+    if (y < 5)
+#pragma acc loop independent /* { dg-missed "unparallelized loop nest in OpenACC .kernels. region: it's executed conditionally" } */
+      for (int j = 0; j < N; ++j)
+       b[j] = f_w (c[j]);
+  }
+
+#pragma acc kernels /* { dg-warning "region contains gang partitioned code but is not gang partitioned" } */
+  {
+    /* { dg-message "note: beginning .gang-single. part in OpenACC .kernels. region" "" { target *-*-* } .+1 } */
+    y = f_g (a[5]); /* { dg-message "optimized: assigned OpenACC gang worker vector loop parallelism" } */
+
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang loop parallelism" } */
+    /* { dg-message "note: parallelized loop nest in OpenACC .kernels. region" "" { target *-*-* } .-1 } */
+    for (int j = 0; j < N; ++j)
+      b[j] = y + f_w (c[j]); /* { dg-message "optimized: assigned OpenACC worker vector loop parallelism" } */
+  }
+
+#pragma acc kernels
+  {
+    y = 3; /* { dg-message "note: beginning .gang-single. part in OpenACC .kernels. region" } */
+
+#pragma acc loop independent /* { dg-message "optimized: assigned OpenACC gang worker loop parallelism" } */
+    /* { dg-message "note: parallelized loop nest in OpenACC .kernels. region" "" { target *-*-* } .-1 } */
+    for (int j = 0; j < N; ++j)
+      b[j] = y + f_v (c[j]); /* { dg-message "optimized: assigned OpenACC vector loop parallelism" } */
+
+    z = 2; /* { dg-message "note: beginning .gang-single. part in OpenACC .kernels. region" } */
+  }
+
+#pragma acc kernels /* { dg-message "note: beginning .gang-single. part in OpenACC .kernels. region" } */
+  ;
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-reduction-parloops.c b/gcc/testsuite/c-c++-common/goacc/kernels-reduction-parloops.c
new file mode 100644
index 000000000000..1449f7a066d4
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-reduction-parloops.c
@@ -0,0 +1,36 @@ 
+/* { dg-additional-options "--param=openacc-kernels=parloops" } as this is
+   specifically testing "parloops" handling.  */
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fno-openacc-kernels-annotate-loops" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+#include <stdlib.h>
+
+#define n 10000
+
+unsigned int a[n];
+
+void  __attribute__((noinline,noclone))
+foo (void)
+{
+  int i;
+  unsigned int sum = 1;
+
+#pragma acc kernels copyin (a[0:n]) copy (sum)
+  {
+    for (i = 0; i < n; ++i)
+      sum += a[i];
+  }
+
+  if (sum != 5001)
+    abort ();
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint, noclone, noinline\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*._omp_fn.0" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/loop-auto-reductions.c b/gcc/testsuite/c-c++-common/goacc/loop-auto-reductions.c
new file mode 100644
index 000000000000..4d033ccff2d9
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/loop-auto-reductions.c
@@ -0,0 +1,22 @@ 
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-graphite-details" } */
+
+#include <stdlib.h>
+
+#define n 10000
+
+unsigned int a[n];
+
+void  __attribute__((noinline,noclone))
+foo (void)
+{
+  int i;
+  unsigned int sum = 1;
+
+#pragma acc parallel copyin (a[0:n])
+  {
+#pragma acc loop auto reduction(+:sum) /* { dg-message "optimized: assigned OpenACC gang vector loop parallelism"} */
+    for (i = 0; i < n; ++i)
+      sum += a[i];
+  }
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-loop-auto-parloops.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-loop-auto-parloops.c
new file mode 100644
index 000000000000..4889c398c06a
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism-1-kernels-loop-auto-parloops.c
@@ -0,0 +1,128 @@ 
+/* Test the output of "-fopt-info-optimized-omp" for an OpenACC 'kernels'
+   construct containing 'loop' constructs with explicit or implicit 'auto'
+   clause that are handled by "parloops".  */
+
+/* { dg-additional-options "--param openacc-kernels=decompose-parloops" } */
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+/* { dg-additional-options "-fopt-info-note-omp" } */
+
+//TODO update accordingly
+/* See also "../../gfortran.dg/goacc/note-parallelism.f90".  */
+
+int
+main ()
+{
+  int x, y, z;
+
+#pragma acc kernels
+ /* Strangely indented to keep this similar to other test cases.  */
+ {
+#pragma acc loop
+  /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop auto gang /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop auto worker /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop auto vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop auto gang vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop auto gang worker /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop auto worker vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop auto gang worker vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop auto gang /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+  /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto worker /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+    for (y = 0; y < 10; y++)
+#pragma acc loop auto vector /* { dg-error ".auto. conflicts with other OpenACC loop specifiers" } */
+      for (z = 0; z < 10; z++)
+       ;
+
+#pragma acc loop auto
+  /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+    ;
+
+#pragma acc loop auto
+  /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+      ;
+
+#pragma acc loop auto
+  /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+#pragma acc loop auto
+      for (z = 0; z < 10; z++)
+       ;
+
+#pragma acc loop
+  /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+#pragma acc loop auto
+      for (z = 0; z < 10; z++)
+       ;
+
+#pragma acc loop auto
+  /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop
+    for (y = 0; y < 10; y++)
+#pragma acc loop auto
+      for (z = 0; z < 10; z++)
+       ;
+
+#pragma acc loop auto
+  /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+#pragma acc loop
+      for (z = 0; z < 10; z++)
+       ;
+
+#pragma acc loop
+  /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .parloops. for analysis" "" { target *-*-* } .-1 } */
+  for (x = 0; x < 10; x++)
+#pragma acc loop auto
+    for (y = 0; y < 10; y++)
+#pragma acc loop
+      for (z = 0; z < 10; z++)
+       ;
+ }
+
+  return 0;
+}
diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loops-1.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loops-1.c
new file mode 100644
index 000000000000..0cd2b9de1743
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loops-1.c
@@ -0,0 +1,61 @@ 
+/* Test the output of "-fopt-info-optimized-omp" for an OpenACC "kernels"
+   construct containing loops.  */
+
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+/* { dg-additional-options "-fopt-info-note-omp" } */
+/* { dg-additional-options "-O2" } */
+
+//TODO update accordingly
+/* See also "../../gfortran.dg/goacc/note-parallelism.f90".  */
+
+int
+main ()
+{
+  int x, y, z;
+
+#pragma acc kernels
+  for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+    /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */
+    ;
+
+#pragma acc kernels
+  for (x = 0; x < 10; x++)  /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+    /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */
+    ;
+
+#pragma acc kernels
+  for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+    /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */
+    for (y = 0; y < 10; y++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+      for (z = 0; z < 10; z++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+       ;
+
+#pragma acc kernels
+  for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+    /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */
+    ;
+
+#pragma acc kernels
+  for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+    /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */
+    for (y = 0; y < 10; y++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+      ;
+
+#pragma acc kernels
+  for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+    /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */
+    for (y = 0; y < 10; y++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+      for (z = 0; z < 10; z++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+       ;
+
+#pragma acc kernels
+  for (x = 0; x < 10; x++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */ \
+    /* { dg-message "note: forwarded loop nest in OpenACC .kernels. region to .Graphite. for analysis" "" { target *-*-* } .-1 } */
+    for (y = 0; y < 10; y++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+      for (z = 0; z < 10; z++) /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+       ;
+
+  return 0;
+}
+
+/* { dg-prune-output ".auto. loop cannot be parallel" } */
diff --git a/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loops-parloops.c b/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loops-parloops.c
new file mode 100644
index 000000000000..a3fea483a951
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/note-parallelism-kernels-loops-parloops.c
@@ -0,0 +1,53 @@ 
+/* Test the output of "-fopt-info-optimized-omp" for an OpenACC 'kernels'
+   construct containing loops.  */
+
+/* { dg-additional-options "--param openacc-kernels=decompose-parloops" } */
+/* { dg-additional-options "-fno-openacc-kernels-annotate-loops" } */
+/* { dg-additional-options "-fopt-info-optimized-omp" } */
+/* { dg-additional-options "-fopt-info-note-omp" } */
+/* { dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose-details" } */
+// TODO update accordingly
+/* See also "../../gfortran.dg/goacc/note-parallelism.f90".  */
+
+int
+main ()
+{
+  int x, y, z;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+    ;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+    ;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+    for (y = 0; y < 10; y++)
+      for (z = 0; z < 10; z++)
+       ;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+    ;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+    for (y = 0; y < 10; y++)
+      ;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+    for (y = 0; y < 10; y++)
+      for (z = 0; z < 10; z++)
+       ;
+
+#pragma acc kernels /* { dg-message "optimized: assigned OpenACC seq loop parallelism" } */
+  for (x = 0; x < 10; x++) /* { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" } */
+    for (y = 0; y < 10; y++)
+      for (z = 0; z < 10; z++)
+       ;
+
+  return 0;
+}
diff --git a/gcc/testsuite/gfortran.dg/goacc/classify-kernels-unparallelized-parloops.f95 b/gcc/testsuite/gfortran.dg/goacc/classify-kernels-unparallelized-parloops.f95
new file mode 100644
index 000000000000..c9e24449db16
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/classify-kernels-unparallelized-parloops.f95
@@ -0,0 +1,44 @@ 
+! Check offloaded function's attributes and classification for unparallelized
+! OpenACC kernels that are handled by "parloops".
+
+! { dg-additional-options "--param openacc-kernels=decompose-parloops" }
+! { dg-additional-options "-O2" }
+! { dg-additional-options "-fno-openacc-kernels-annotate-loops" }
+! { dg-additional-options "-fopt-info-optimized-note-omp" }
+! { dg-additional-options "-fdump-tree-ompexp" }
+! { dg-additional-options "-fdump-tree-parloops1-all" }
+! { dg-additional-options "-fdump-tree-oaccloops1" }
+
+program main
+  implicit none
+  integer, parameter :: n = 1024
+  integer, dimension (0:n-1) :: a, b, c
+  integer :: i
+
+  ! A function call in a data-reference makes the loop unparallelizable
+  integer, external :: f
+
+  call setup(a, b)
+
+  !$acc kernels copyin (a(0:n-1), b(0:n-1)) copyout (c(0:n-1)) ! { dg-message "optimized: assigned OpenACC seq loop parallelism" }
+  do i = 0, n - 1
+                  ! { dg-message "note: beginning .parloops. part in OpenACC .kernels. region" "" { target *-*-* } .-1 }
+     c(i) = a(f (i)) + b(f (i))
+  end do
+  !$acc end kernels
+end program main
+
+! Check the offloaded function's attributes.
+! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels, omp target entrypoint\\)\\)" 1 "ompexp" } }
+
+! Check that exactly one OpenACC kernels construct is analyzed, and that it
+! can't be parallelized.
+! { dg-final { scan-tree-dump-times "FAILED:" 1 "parloops1" } }
+! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } }
+! { dg-final { scan-tree-dump-not "SUCCESS: may be parallelized" "parloops1" } }
+
+! Check the offloaded function's classification and compute dimensions (will
+! always be 1 x 1 x 1 for non-offloading compilation).
+! { dg-final { scan-tree-dump-times "(?n)Function is unparallelized OpenACC kernels offload" 1 "oaccloops1" } }
+! { dg-final { scan-tree-dump-times "(?n)Compute dimensions \\\[1, 1, 1\\\]" 1 "oaccloops1" } }
+! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc function \\(1, 1, 1\\), oacc kernels, omp target entrypoint\\)\\)" 1 "oaccloops1" } }
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
new file mode 100644
index 000000000000..fe287c38c387
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-conversion.f95
@@ -0,0 +1,52 @@ 
+! { dg-additional-options "-fdump-tree-omp_oacc_kernels_decompose" }
+
+program main
+  implicit none
+  integer, parameter         :: N = 1024
+  integer, dimension (1:N)   :: a
+  integer                    :: i, sum  ! NOTE(review): sum is never initialized; harmless if compile-only - confirm.
+
+  !$acc kernels copyin(a(1:N)) copy(sum)
+
+  ! Part 1 (plain "loop"): presumably one of the five "oacc_parallel_kernels_graphite" regions scanned for below.
+  !$acc loop
+  do i = 1, N
+    sum = sum + a(i)
+  end do
+
+  ! Part 2 (straight-line statements): presumably also an "oacc_parallel_kernels_graphite" region.
+  sum = sum + 1
+  a(1) = a(1) + 1
+
+  ! Part 3 ("loop independent"): presumably also an "oacc_parallel_kernels_graphite" region.
+  !$acc loop independent
+  do i = 1, N
+    sum = sum + a(i)
+  end do
+
+  ! Part 4 (conditionally executed loop): presumably also an "oacc_parallel_kernels_graphite" region.
+  if (sum .gt. 10) then
+    !$acc loop
+    do i = 1, N
+      sum = sum + a(i)
+    end do
+  end if
+
+  ! Part 5 ("loop auto"): presumably also an "oacc_parallel_kernels_graphite" region.
+  !$acc loop auto
+  do i = 1, N
+    sum = sum + a(i)
+  end do
+
+  !$acc end kernels
+end program main
+
+! Check that the kernels region is split into a data region and enclosed
+! parallel regions.
+! { dg-final { scan-tree-dump-times "oacc_data_kernels" 1 "omp_oacc_kernels_decompose" } }
+! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_graphite " 5 "omp_oacc_kernels_decompose" } }
+
+! Each of the parallel regions is async, and there is a final call to
+! __builtin_GOACC_wait.
+! { dg-final { scan-tree-dump-times "oacc_parallel_kernels_graphite async\\(-1\\)" 5 "omp_oacc_kernels_decompose" } }
+! { dg-final { scan-tree-dump-times "__builtin_GOACC_wait" 1 "omp_oacc_kernels_decompose" } }
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-1-parloops.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-1-parloops.f95
new file mode 100644
index 000000000000..3ecf84da8367
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-1-parloops.f95
@@ -0,0 +1,121 @@ 
+! Test OpenACC 'kernels' construct decomposition with "decompose-parloops"
+! handling
+
+! { dg-additional-options "--param openacc-kernels=decompose-parloops" }
+! { dg-additional-options "-fopt-info-optimized-omp" }
+! { dg-additional-options "-Wopenacc-parallelism" }
+! { dg-additional-options "-O2" } for "parloops".
+
+! See also "../../c-c++-common/goacc/kernels-decompose-1.c".
+
+program main
+  implicit none
+
+  integer, external :: f_g
+  !$acc routine (f_g) gang
+  integer, external :: f_w
+  !$acc routine (f_w) worker
+  integer, external :: f_v
+  !$acc routine (f_v) vector
+  integer, external :: f_s  ! NOTE(review): f_s is declared but never called in this program.
+  !$acc routine (f_s) seq
+
+  integer :: i, j, k
+  integer :: x, y, z
+  logical :: y_l
+  integer, parameter :: N = 10
+  integer :: a(N), b(N), c(N)
+
+  !$acc kernels
+  x = 0  ! This region contains scalar assignments only, no loop.
+  y = 0
+  y_l = x < 10
+  z = x
+  x = x + 1
+  !$acc end kernels
+
+  !$acc kernels
+  do i = 1, N
+     ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } .-1 }
+     a(i) = 0  ! Loop without a "loop" directive, directly inside a kernels region.
+  end do
+  !$acc end kernels
+
+  !$acc kernels loop ! { dg-optimized "assigned OpenACC gang loop parallelism" }
+  do i = 1, N
+     b(i) = a(N - i + 1)
+  end do
+
+  !$acc kernels
+  !$acc loop ! { dg-optimized "assigned OpenACC gang loop parallelism" }
+  do i = 1, N
+     b(i) = a(N - i + 1)
+  end do
+
+  !$acc loop ! { dg-optimized "assigned OpenACC gang loop parallelism" }
+  do i = 1, N
+     c(i) = a(i) * b(i)
+  end do
+
+  a(z) = 0  ! NOTE(review): z is 0 here (z = x above), so this is a(0), outside a(1:N) - confirm.
+
+  !$acc loop ! { dg-optimized "assigned OpenACC gang loop parallelism" }
+  do i = 1, N
+     c(i) = c(i) + a(i)
+  end do
+
+  !$acc loop seq ! { dg-optimized "assigned OpenACC seq loop parallelism" }
+  do i = 1 + 1, N
+     c(i) = c(i) + c(i - 1)  ! Each iteration reads the element written by the previous one.
+  end do
+  !$acc end kernels
+
+  !$acc kernels ! { dg-optimized "assigned OpenACC worker vector loop parallelism" }
+  !$acc loop independent ! { dg-optimized "assigned OpenACC gang loop parallelism" }
+  do i = 1, N
+     !$acc loop independent ! { dg-optimized "assigned OpenACC worker loop parallelism" }
+     do j = 1, N
+        !$acc loop independent ! { dg-optimized "assigned OpenACC seq loop parallelism" }
+        ! { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } .-1 }
+        ! { dg-bogus "optimized: assigned OpenACC vector loop parallelism" "" { target *-*-* } .-2 }
+        do k = 1, N
+           a(1 + mod(i + j + k, N)) &
+                = b(j) &
+                + f_v (c(k)) ! { dg-optimized "assigned OpenACC vector loop parallelism" }
+        end do
+     end do
+  end do
+
+  !TODO Should the following turn into "gang-single" instead of "parloops"?
+  !TODO The problem is that the first STMT is "if (y <= 4) goto <D.2547>; else goto <D.2548>;", thus "parloops".
+  if (y < 5) then
+     !$acc loop independent
+     do j = 1, N
+        b(j) = f_w (c(j))
+     end do
+  end if
+  !$acc end kernels
+
+  !$acc kernels  ! { dg-warning "region contains gang partitioned code but is not gang partitioned" }
+  y = f_g (a(5)) ! { dg-optimized "assigned OpenACC gang worker vector loop parallelism" }
+
+  !$acc loop independent ! { dg-optimized "assigned OpenACC gang loop parallelism" }
+  do j = 1, N
+     b(j) = y + f_w (c(j)) ! { dg-optimized "assigned OpenACC worker vector loop parallelism" }
+  end do
+  !$acc end kernels
+
+  !$acc kernels
+  y = 3
+
+  !$acc loop independent ! { dg-optimized "assigned OpenACC gang worker loop parallelism" }
+  do j = 1, N
+     b(j) = y + f_v (c(j)) ! { dg-optimized "assigned OpenACC vector loop parallelism" }
+  end do
+
+  z = 2
+  !$acc end kernels
+
+  !$acc kernels
+  !$acc end kernels
+end program main
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-parloops-2.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-parloops-2.f95
new file mode 100644
index 000000000000..fc126ea5e037
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-decompose-parloops-2.f95
@@ -0,0 +1,154 @@ 
+! Test OpenACC 'kernels' construct decomposition with 'decompose-parloops' handling.
+
+! { dg-additional-options "-fopenacc-kernels-annotate-loops" }
+! { dg-additional-options "-fopt-info-omp-all" }
+! { dg-additional-options "--param=openacc-kernels=decompose-parloops" }
+! { dg-additional-options "-O2" } for 'parloops'.
+
+! { dg-additional-options "-Wopenacc-parallelism" } for testing/documenting
+! aspects of that functionality.
+
+! See also '../../c-c++-common/goacc/kernels-decompose-2.c'.
+
+! It's only with Tcl 8.5 (released in 2007) that "the variable 'varName'
+! passed to 'incr' may be unset, and in that case, it will be set to [...]",
+! so to maintain compatibility with earlier Tcl releases, we manually
+! initialize counter variables:
+! { dg-line l_dummy[variable c_loop_i 0 c_loop_j 0 c_loop_k 0 c_part 0] }
+! { dg-message "dummy" "" { target iN-VAl-Id } l_dummy } to avoid
+! "WARNING: dg-line var l_dummy defined, but not used".
+
+program main
+  implicit none
+
+  integer, external :: f_g
+  !$acc routine (f_g) gang
+  integer, external :: f_w
+  !$acc routine (f_w) worker
+  integer, external :: f_v
+  !$acc routine (f_v) vector
+  integer, external :: f_s  ! NOTE(review): f_s is declared but never called in this program.
+  !$acc routine (f_s) seq
+
+  integer :: i, j, k
+  integer :: x, y, z
+  logical :: y_l
+  integer, parameter :: N = 10
+  integer :: a(N), b(N), c(N)
+
+  !$acc kernels
+  x = 0 ! { dg-message "note: beginning 'gang-single' part in OpenACC 'kernels' region" }
+  y = 0
+  y_l = x < 10
+  z = x
+  x = x + 1
+  ;
+  !$acc end kernels
+
+  !$acc kernels
+  do i = 1, N  ! { dg-line l_loop_i[incr c_loop_i] }
+     ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
+     ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+     a(i) = 0
+  end do
+  !$acc end kernels
+
+  !$acc kernels loop ! { dg-line l_loop_i[incr c_loop_i] }
+  ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
+  ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+  do i = 1, N
+     b(i) = a(N - i + 1)
+  end do
+
+  !$acc kernels
+  !$acc loop ! { dg-line l_loop_i[incr c_loop_i] }
+  ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
+  ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+  do i = 1, N
+     b(i) = a(N - i + 1)
+  end do
+
+  !$acc loop ! { dg-line l_loop_i[incr c_loop_i] }
+  ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
+  ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+  do i = 1, N
+     c(i) = a(i) * b(i)
+  end do
+
+  a(z) = 0  ! NOTE(review): z is 0 here (z = x above), so this is a(0), outside a(1:N) - confirm.
+
+  !$acc loop ! { dg-line l_loop_i[incr c_loop_i] }
+  ! { dg-message "note: forwarded loop nest in OpenACC 'kernels' region to 'parloops' for analysis" "" { target *-*-* } l_loop_i$c_loop_i }
+  ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+  do i = 1, N
+     c(i) = c(i) + a(i)
+  end do
+
+  !$acc loop seq ! { dg-line l_loop_i[incr c_loop_i] }
+  ! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i }
+  ! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+  do i = 1 + 1, N
+     c(i) = c(i) + c(i - 1)  ! Each iteration reads the element written by the previous one.
+  end do
+  !$acc end kernels
+
+  !$acc kernels ! { dg-optimized "assigned OpenACC worker vector loop parallelism" }
+  !$acc loop independent ! { dg-line l_loop_i[incr c_loop_i] }
+  ! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_i$c_loop_i }
+  ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_i$c_loop_i }
+  do i = 1, N
+     !$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] }
+     ! { dg-optimized "assigned OpenACC worker loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j }
+     do j = 1, N
+        !$acc loop independent ! { dg-line l_loop_k[incr c_loop_k] }
+        ! { dg-warning "insufficient partitioning available to parallelize loop" "" { target *-*-* } l_loop_k$c_loop_k }
+        ! { dg-optimized "assigned OpenACC seq loop parallelism" "" { target *-*-* } l_loop_k$c_loop_k }
+        do k = 1, N
+           a(1 + mod(i + j + k, N)) &
+                = b(j) &
+                + f_v (c(k)) ! { dg-optimized "assigned OpenACC vector loop parallelism" }
+        end do
+     end do
+  end do
+
+  !TODO Should the following turn into "gang-single" instead of "parloops"?
+  !TODO The problem is that the first STMT is 'if (y <= 4) goto <D.2547>; else goto <D.2548>;', thus "parloops".
+  if (y < 5) then ! { dg-message "note: beginning 'parloops' part in OpenACC 'kernels' region" }
+     !$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] }
+     ! { dg-missed "unparallelized loop nest in OpenACC 'kernels' region: it's executed conditionally" "" { target *-*-* } l_loop_j$c_loop_j }
+     do j = 1, N
+        b(j) = f_w (c(j))
+     end do
+  end if
+  !$acc end kernels
+
+  !$acc kernels
+  ! { dg-bogus "\[Ww\]arning: region contains gang partitioned code but is not gang partitioned" "TODO 'kernels'" { xfail *-*-* } .-1 }
+  y = f_g (a(5)) ! { dg-line l_part[incr c_part] }
+  !TODO If such a construct is placed in its own part (like it is, here), can't this actually use gang parallelism, instead of "gang-single"?
+  ! { dg-optimized "assigned OpenACC gang worker vector loop parallelism" "" { target *-*-* } l_part$c_part }
+
+  !$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] }
+  ! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_j$c_loop_j }
+  ! { dg-optimized "assigned OpenACC gang loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j }
+  do j = 1, N
+     b(j) = y + f_w (c(j)) ! { dg-optimized "assigned OpenACC worker vector loop parallelism" }
+  end do
+  !$acc end kernels
+
+  !$acc kernels
+  y = 3
+
+  !$acc loop independent ! { dg-line l_loop_j[incr c_loop_j] }
+  ! { dg-message "note: parallelized loop nest in OpenACC 'kernels' region" "" { target *-*-* } l_loop_j$c_loop_j }
+  ! { dg-optimized "assigned OpenACC gang worker loop parallelism" "" { target *-*-* } l_loop_j$c_loop_j }
+  do j = 1, N
+     b(j) = y + f_v (c(j)) ! { dg-optimized "assigned OpenACC vector loop parallelism" }
+  end do
+
+  z = 2
+  !$acc end kernels
+
+  !$acc kernels
+  !$acc end kernels
+end program main
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-parloops-2.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-parloops-2.f95
new file mode 100644
index 000000000000..c92ad4ccf6f2
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-data-parloops-2.f95
@@ -0,0 +1,52 @@ 
+! { dg-additional-options "--param=openacc-kernels=decompose-parloops" } as this is
+! specifically testing "parloops" handling.
+! { dg-additional-options "-O2" }
+! { dg-additional-options "-fopenacc-kernels-annotate-loops" }
+! { dg-additional-options "-fdump-tree-parloops1-all" }
+! { dg-additional-options "-fdump-tree-optimized" }
+
+program main
+  implicit none
+  integer, parameter         :: n = 1024
+  integer, dimension (0:n-1) :: a, b, c
+  integer                    :: i, ii
+
+  !$acc data copyout (a(0:n-1))
+  !$acc kernels present (a(0:n-1))
+  do i = 0, n - 1
+     a(i) = i * 2  ! First kernels loop: fills A inside an enclosing data region.
+  end do
+  !$acc end kernels
+  !$acc end data
+
+  !$acc data copyout (b(0:n-1))
+  !$acc kernels present (b(0:n-1))
+  do i = 0, n -1
+     b(i) = i * 4  ! Second kernels loop: fills B inside an enclosing data region.
+  end do
+  !$acc end kernels
+  !$acc end data
+
+  !$acc data copyin (a(0:n-1), b(0:n-1)) copyout (c(0:n-1))
+  !$acc kernels present (a(0:n-1), b(0:n-1), c(0:n-1))
+  do ii = 0, n - 1
+     c(ii) = a(ii) + b(ii)  ! Third kernels loop: element-wise sum of A and B into C.
+  end do
+  !$acc end kernels
+  !$acc end data
+
+  do i = 0, n - 1
+     if (c(i) .ne. a(i) + b(i)) STOP 1  ! Check outside any OpenACC region: C must equal A + B.
+  end do
+
+end program main
+
+! Check that only three loops are analyzed, and that all can be parallelized.
+! { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops1" } }
+! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 3 "parloops1" } }
+! { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } }
+
+! Check that each of the three loops has been split off into its own function.
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.0 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.1 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.2 " 1 "optimized" } }
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-parloops-2.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-parloops-2.f95
new file mode 100644
index 000000000000..634445ad4a1b
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-parloops-2.f95
@@ -0,0 +1,45 @@ 
+! { dg-additional-options "--param openacc-kernels=decompose-parloops" } as this is
+! specifically testing "parloops" handling.
+! { dg-additional-options "-O2" }
+! { dg-additional-options "-fdump-tree-parloops1-all" }
+! { dg-additional-options "-fdump-tree-optimized" }
+
+program main
+  implicit none
+  integer, parameter         :: n = 1024
+  integer, dimension (0:n-1) :: a, b, c
+  integer                    :: i, ii
+
+  !$acc kernels copyout (a(0:n-1))
+  do i = 0, n - 1
+     a(i) = i * 2  ! First kernels loop: fills A.
+  end do
+  !$acc end kernels
+
+  !$acc kernels copyout (b(0:n-1))
+  do i = 0, n -1
+     b(i) = i * 4  ! Second kernels loop: fills B.
+  end do
+  !$acc end kernels
+
+  !$acc kernels copyin (a(0:n-1), b(0:n-1)) copyout (c(0:n-1))
+  do ii = 0, n - 1
+     c(ii) = a(ii) + b(ii)  ! Third kernels loop: element-wise sum of A and B into C.
+  end do
+  !$acc end kernels
+
+  do i = 0, n - 1
+     if (c(i) .ne. a(i) + b(i)) STOP 1  ! Check outside any OpenACC region: C must equal A + B.
+  end do
+
+end program main
+
+! Check that only three loops are analyzed, and that all can be parallelized.
+! { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops1" } }
+! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 3 "parloops1" } }
+! { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } }
+
+! Check that each of the three loops has been split off into its own function.
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.0 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.1 " 1 "optimized" } }
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.2 " 1 "optimized" } }
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-parloops.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-parloops.f95
new file mode 100644
index 000000000000..c6fa14f5920f
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-parloops.f95
@@ -0,0 +1,39 @@ 
+! { dg-additional-options "--param openacc-kernels=decompose-parloops" } as this is
+! specifically testing "parloops" handling.
+! { dg-additional-options "-O2" }
+! { dg-additional-options "-fdump-tree-parloops1-all" }
+! { dg-additional-options "-fdump-tree-optimized" }
+
+program main
+  implicit none
+  integer, parameter         :: n = 1024
+  integer, dimension (0:n-1) :: a, b, c
+  integer                    :: i, ii
+
+  do i = 0, n - 1
+     a(i) = i * 2  ! Initialization of A, outside any kernels region.
+  end do
+
+  do i = 0, n -1
+     b(i) = i * 4  ! Initialization of B, outside any kernels region.
+  end do
+
+  !$acc kernels copyin (a(0:n-1), b(0:n-1)) copyout (c(0:n-1))
+  do ii = 0, n - 1
+     c(ii) = a(ii) + b(ii)  ! The only kernels loop: element-wise sum of A and B into C.
+  end do
+  !$acc end kernels
+
+  do i = 0, n - 1
+     if (c(i) .ne. a(i) + b(i)) STOP 1  ! Check outside any OpenACC region: C must equal A + B.
+  end do
+
+end program main
+
+! Check that only one loop is analyzed, and that it can be parallelized.
+! { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } }
+! { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } }
+! { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } }
+
+! Check that the loop has been split off into a function.
+! { dg-final { scan-tree-dump-times "(?n);; Function MAIN__._omp_fn.0 " 1 "optimized" } }
diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-reductions.f90 b/gcc/testsuite/gfortran.dg/goacc/kernels-reductions.f90
new file mode 100644
index 000000000000..2036395bf594
--- /dev/null
+++ b/gcc/testsuite/gfortran.dg/goacc/kernels-reductions.f90
@@ -0,0 +1,37 @@ 
+! { dg-additional-options "--param openacc-kernels=decompose" }
+
+! A regression test checking that the reduction clause lowering does
+! not fail if a subroutine argument is used as a reduction variable in
+! a kernels region.
+
+! This was fine ...
+subroutine reduction_var_not_argument(res)
+  implicit none
+  real :: res, tmp  ! TMP is the local reduction variable; RES only receives its final value.
+  integer :: i, n   ! N was implicitly typed before. NOTE(review): N and TMP are never initialized - confirm OK.
+
+  !$acc kernels
+  !$acc loop reduction(+:tmp)
+  do i=0,n-1
+     tmp = tmp + 1
+  end do
+  !$acc end kernels
+
+  res = tmp
+end subroutine reduction_var_not_argument
+
+! ... but this led to problems because ARG
+! was a pointer type that did not get dereferenced.
+subroutine reduction_var_as_argument(arg)
+  implicit none
+  real :: arg      ! The dummy argument itself is used as the reduction variable below.
+  integer :: i, n  ! N was implicitly typed before. NOTE(review): N is never initialized - confirm OK.
+  !$acc kernels
+  !$acc loop reduction(+:arg)
+  do i=0,n-1
+     arg = arg + 1
+  end do
+  !$acc end kernels
+end subroutine reduction_var_as_argument
+
+
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/parallel-loop-auto-reduction-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/parallel-loop-auto-reduction-2.f90
new file mode 100644
index 000000000000..0e9da426d998
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-fortran/parallel-loop-auto-reduction-2.f90
@@ -0,0 +1,98 @@ 
+! Check that the Graphite-based "auto" loop and "kernels" handling
+! is able to assign the parallelism dimensions correctly for a simple
+! loop-nest with reductions. All loops should be parallelized.
+
+! { dg-additional-options "-O2 -g" }
+! { dg-additional-options "-foffload=-fdump-tree-oaccloops1-details" }
+! { dg-additional-options "-foffload=-fopt-info-optimized" }
+! { dg-additional-options "-fdump-tree-oaccloops1-details" }
+! { dg-additional-options "-fopt-info-optimized" }
+
+module test
+  implicit none
+
+  integer, parameter :: n = 10000
+  integer :: a(n,n)
+  integer :: sums(n,n)  ! NOTE(review): sums is not referenced anywhere else in this file - confirm it is needed.
+
+contains
+  function sum_loop_auto() result(sum)  ! Accumulates a per-row value of A into SUM in a "parallel" region.
+    integer :: i, j
+    integer :: sum, max_val
+
+    sum = 0
+    max_val = 0
+
+    !$acc parallel copyin (a) reduction(+:sum)
+    !$acc loop auto reduction(+:sum) reduction(max:max_val) ! { dg-optimized "assigned OpenACC gang worker loop parallelism" }
+    ! { dg-optimized ".auto. loop can be parallel" "" { target *-*-* } .-1 }
+    do i = 1,size (a, 1)
+       !$acc loop auto reduction(max:max_val) ! { dg-optimized "assigned OpenACC vector loop parallelism" }
+       ! { dg-optimized ".auto. loop can be parallel" "" { target *-*-* } .-1 }
+       do j = 1,size(a, 2)
+          max_val = a(i,j)  ! NOTE(review): assigns, rather than max(max_val, a(i,j)) - confirm intended.
+       end do
+       sum = sum + max_val
+    end do
+    !$acc end parallel
+  end function sum_loop_auto
+
+  function sum_kernels() result(sum)  ! Same loop nest as sum_loop_auto, but inside a "kernels" region.
+    integer :: i, j
+    integer :: sum, max_val
+
+    sum = 0
+    max_val = 0
+
+    !$acc kernels
+    ! { dg-optimized {'map\(force_tofrom:max_val [^)]+\)' optimized to 'map\(to:max_val [^)]+\)'} "" { target *-*-* } .-1 }
+    !$acc loop reduction(+:sum) reduction(max:max_val) ! { dg-optimized "assigned OpenACC gang worker loop parallelism" }
+    ! { dg-optimized ".auto. loop can be parallel" "" { target *-*-* } .-1 }
+    ! { dg-optimized "forwarded loop nest in OpenACC .kernels. construct to .Graphite." "" { target *-*-* } .-2 }
+    do i = 1,size (a, 1)
+       !$acc loop reduction(max:max_val) ! { dg-optimized "assigned OpenACC vector loop parallelism" }
+       ! { dg-optimized ".auto. loop can be parallel" "" { target *-*-* } .-1 }
+       do j = 1,size(a, 2)
+          max_val = a(i,j)  ! NOTE(review): assigns, rather than max(max_val, a(i,j)) - confirm intended.
+       end do
+       sum = sum + max_val
+    end do
+    !$acc end kernels
+  end function sum_kernels
+end module test
+
+program main
+  use test
+
+  implicit none
+
+  integer :: result, i, j
+
+  ! We sum the maxima of n rows, each containing numbers
+  ! 1..n
+  integer, parameter :: expected_sum = n * n
+
+  do i = 1, size (a, 1) ! { dg-optimized "loop nest optimized" }
+     do j = 1, size (a, 2)
+        a(i, j) = j
+     end do
+  end do
+
+  ! Run both variants; a distinct stop code identifies which one failed.
+  result = sum_loop_auto()
+  if (result /= expected_sum) then
+     write (*, *) "Wrong result:", result
+     stop 1  ! "loop auto" variant produced the wrong sum.
+  endif
+
+  result = sum_kernels()
+  if (result /= expected_sum) then
+     write (*, *) "Wrong result:", result
+     stop 2  ! "kernels" variant produced the wrong sum.
+  endif
+end program main
+
+! This ensures that the dg-optimized assertions above hold for both the host
+! and the offload compiler, because the output goes to stderr and the dump file.
+! { dg-final { scan-offload-tree-dump-times "optimized: assigned OpenACC .*? parallelism" 4 "oaccloops1" } }
+! { dg-final { scan-tree-dump-times "optimized: assigned OpenACC .*? parallelism" 4 "oaccloops1" } }