X86: Add an option -muse-unaligned-vector-move

Message ID 20211020053026.67998-1-dianhong.xu@intel.com
State New
Series X86: Add an option -muse-unaligned-vector-move

Commit Message

Li, Pan2 via Gcc-patches Oct. 20, 2021, 5:30 a.m. UTC
  From: dianhong xu <dianhong.xu@intel.com>

Add the -muse-unaligned-vector-move option to emit unaligned vector move
instructions.
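
For example (an illustrative sketch, not taken from the patch or its tests;
it assumes -O2 -mavx on x86-64, where the pointed-to storage is known to be
32-byte aligned):

  typedef float v8sf __attribute__ ((vector_size (32)));

  v8sf
  load (v8sf *p)
  {
    /* Normally emitted as "vmovaps (%rdi), %ymm0"; with
       -muse-unaligned-vector-move it becomes "vmovups (%rdi), %ymm0".  */
    return *p;
  }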

gcc/ChangeLog:

	* config/i386/i386-options.c (ix86_target_string): Add
	-muse-unaligned-vector-move.
	* config/i386/i386.c (ix86_get_ssemov): Emit unaligned vector moves
	if the new option is used.
	* config/i386/i386.opt (muse-unaligned-vector-move): New.
	* config/i386/sse.md: Emit unaligned vector moves if this new option
	is used.
	* doc/invoke.texi: Document -muse-unaligned-vector-move.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/avx2-vector-unaligned-load-store-1.c: New test.
	* gcc.target/i386/avx2-vector-unaligned-load-store-2.c: New test.
	* gcc.target/i386/avx2-vector-unaligned-load-store-3.c: New test.
	* gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c: New test.
---
 gcc/config/i386/i386-options.c                |   3 +-
 gcc/config/i386/i386.c                        |  41 +++----
 gcc/config/i386/i386.opt                      |   4 +
 gcc/config/i386/sse.md                        |  30 +++--
 gcc/doc/invoke.texi                           |   7 ++
 .../i386/avx2-vector-unaligned-load-store-1.c | 102 +++++++++++++++++
 .../i386/avx2-vector-unaligned-load-store-2.c | 107 ++++++++++++++++++
 .../i386/avx2-vector-unaligned-load-store-3.c |  11 ++
 .../avx512vl-vector-unaligned-load-store-1.c  |  13 +++
 9 files changed, 287 insertions(+), 31 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
  

Comments

Richard Biener Oct. 20, 2021, 7:02 a.m. UTC | #1
On Wed, Oct 20, 2021 at 7:31 AM dianhong.xu--- via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> From: dianhong xu <dianhong.xu@intel.com>
>
> Add -muse-unaligned-vector-move option to emit unaligned vector move
> instaructions.

Why would you ever want to have such an option?!  Should the documentation
at least read "emit unaligned vector moves even for aligned storage or when
using aligned move intrinsics"?

Richard.

> gcc/ChangeLog:
>
>         * config/i386/i386-options.c (ix86_target_string): Add
>         -muse-unaligned-vector-move.
>         * config/i386/i386.c (ix86_get_ssemov): Emit unaligned vector if use
>         the new option.
>         * config/i386/i386.opt (muse-unaligned-vector-move): New.
>         * config/i386/sse.md: Emit unaligned vector if use this new option
>         * doc/invoke.texi: Document -muse-unaligned-vector-move
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/avx2-vector-unaligned-load-store-1.c: New test.
>         * gcc.target/i386/avx2-vector-unaligned-load-store-2.c: New test.
>         * gcc.target/i386/avx2-vector-unaligned-load-store-3.c: New test.
>         * gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c: New test.
> ---
>  gcc/config/i386/i386-options.c                |   3 +-
>  gcc/config/i386/i386.c                        |  41 +++----
>  gcc/config/i386/i386.opt                      |   4 +
>  gcc/config/i386/sse.md                        |  30 +++--
>  gcc/doc/invoke.texi                           |   7 ++
>  .../i386/avx2-vector-unaligned-load-store-1.c | 102 +++++++++++++++++
>  .../i386/avx2-vector-unaligned-load-store-2.c | 107 ++++++++++++++++++
>  .../i386/avx2-vector-unaligned-load-store-3.c |  11 ++
>  .../avx512vl-vector-unaligned-load-store-1.c  |  13 +++
>  9 files changed, 287 insertions(+), 31 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
>
> diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
> index c9523b26f49..eacbd0f5451 100644
> --- a/gcc/config/i386/i386-options.c
> +++ b/gcc/config/i386/i386-options.c
> @@ -397,7 +397,8 @@ ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
>      { "-mstv",                         MASK_STV },
>      { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
>      { "-mavx256-split-unaligned-store",        MASK_AVX256_SPLIT_UNALIGNED_STORE },
> -    { "-mcall-ms2sysv-xlogues",                MASK_CALL_MS2SYSV_XLOGUES }
> +    { "-mcall-ms2sysv-xlogues",                MASK_CALL_MS2SYSV_XLOGUES },
> +    { "-muse-unaligned-vector-move",   MASK_USE_UNALIGNED_VECTOR_MOVE }
>    };
>
>    /* Additional flag options.  */
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index f111411e599..7581e854021 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -5323,8 +5323,9 @@ ix86_get_ssemov (rtx *operands, unsigned size,
>                  enum attr_mode insn_mode, machine_mode mode)
>  {
>    char buf[128];
> -  bool misaligned_p = (misaligned_operand (operands[0], mode)
> -                      || misaligned_operand (operands[1], mode));
> +  bool need_unaligned_p = (TARGET_USE_UNALIGNED_VECTOR_MOVE
> +                          || misaligned_operand (operands[0], mode)
> +                          || misaligned_operand (operands[1], mode));
>    bool evex_reg_p = (size == 64
>                      || EXT_REX_SSE_REG_P (operands[0])
>                      || EXT_REX_SSE_REG_P (operands[1]));
> @@ -5380,17 +5381,17 @@ ix86_get_ssemov (rtx *operands, unsigned size,
>         {
>         case opcode_int:
>           if (scalar_mode == E_HFmode)
> -           opcode = (misaligned_p
> +           opcode = (need_unaligned_p
>                       ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
>                       : "vmovdqa64");
>           else
> -           opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> +           opcode = need_unaligned_p ? "vmovdqu32" : "vmovdqa32";
>           break;
>         case opcode_float:
> -         opcode = misaligned_p ? "vmovups" : "vmovaps";
> +         opcode = need_unaligned_p ? "vmovups" : "vmovaps";
>           break;
>         case opcode_double:
> -         opcode = misaligned_p ? "vmovupd" : "vmovapd";
> +         opcode = need_unaligned_p ? "vmovupd" : "vmovapd";
>           break;
>         }
>      }
> @@ -5399,21 +5400,21 @@ ix86_get_ssemov (rtx *operands, unsigned size,
>        switch (scalar_mode)
>         {
>         case E_HFmode:
> -         opcode = (misaligned_p
> +         opcode = (need_unaligned_p
>                     ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
>                     : "vmovdqa64");
>           break;
>         case E_SFmode:
> -         opcode = misaligned_p ? "%vmovups" : "%vmovaps";
> +         opcode = need_unaligned_p ? "%vmovups" : "%vmovaps";
>           break;
>         case E_DFmode:
> -         opcode = misaligned_p ? "%vmovupd" : "%vmovapd";
> +         opcode = need_unaligned_p ? "%vmovupd" : "%vmovapd";
>           break;
>         case E_TFmode:
>           if (evex_reg_p)
> -           opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> +           opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
>           else
> -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
>           break;
>         default:
>           gcc_unreachable ();
> @@ -5425,13 +5426,13 @@ ix86_get_ssemov (rtx *operands, unsigned size,
>         {
>         case E_QImode:
>           if (evex_reg_p)
> -           opcode = (misaligned_p
> +           opcode = (need_unaligned_p
>                       ? (TARGET_AVX512BW
>                          ? "vmovdqu8"
>                          : "vmovdqu64")
>                       : "vmovdqa64");
>           else
> -           opcode = (misaligned_p
> +           opcode = (need_unaligned_p
>                       ? (TARGET_AVX512BW
>                          ? "vmovdqu8"
>                          : "%vmovdqu")
> @@ -5439,13 +5440,13 @@ ix86_get_ssemov (rtx *operands, unsigned size,
>           break;
>         case E_HImode:
>           if (evex_reg_p)
> -           opcode = (misaligned_p
> +           opcode = (need_unaligned_p
>                       ? (TARGET_AVX512BW
>                          ? "vmovdqu16"
>                          : "vmovdqu64")
>                       : "vmovdqa64");
>           else
> -           opcode = (misaligned_p
> +           opcode = (need_unaligned_p
>                       ? (TARGET_AVX512BW
>                          ? "vmovdqu16"
>                          : "%vmovdqu")
> @@ -5453,20 +5454,20 @@ ix86_get_ssemov (rtx *operands, unsigned size,
>           break;
>         case E_SImode:
>           if (evex_reg_p)
> -           opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> +           opcode = need_unaligned_p ? "vmovdqu32" : "vmovdqa32";
>           else
> -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
>           break;
>         case E_DImode:
>         case E_TImode:
>         case E_OImode:
>           if (evex_reg_p)
> -           opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> +           opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
>           else
> -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
>           break;
>         case E_XImode:
> -         opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> +         opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
>           break;
>         default:
>           gcc_unreachable ();
> diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
> index ad366974b5b..2162d10925a 100644
> --- a/gcc/config/i386/i386.opt
> +++ b/gcc/config/i386/i386.opt
> @@ -1170,3 +1170,7 @@ Support MWAIT and MONITOR built-in functions and code generation.
>  mavx512fp16
>  Target Mask(ISA2_AVX512FP16) Var(ix86_isa_flags2) Save
>  Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and AVX512FP16 built-in functions and code generation.
> +
> +muse-unaligned-vector-move
> +Target Mask(USE_UNALIGNED_VECTOR_MOVE) Save
> +Emit unaligned vector move instructions.
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index fbf056bf9e6..dc99597f195 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -17059,24 +17059,28 @@
>    switch (<MODE>mode)
>      {
>      case E_V8DFmode:
> -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> +         || misaligned_operand (operands[2], <ssequartermode>mode))
>         return "vmovupd\t{%2, %x0|%x0, %2}";
>        else
>         return "vmovapd\t{%2, %x0|%x0, %2}";
>      case E_V16SFmode:
> -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> +         || misaligned_operand (operands[2], <ssequartermode>mode))
>         return "vmovups\t{%2, %x0|%x0, %2}";
>        else
>         return "vmovaps\t{%2, %x0|%x0, %2}";
>      case E_V8DImode:
> -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> +         || misaligned_operand (operands[2], <ssequartermode>mode))
>         return which_alternative == 2 ? "vmovdqu64\t{%2, %x0|%x0, %2}"
>                                       : "vmovdqu\t{%2, %x0|%x0, %2}";
>        else
>         return which_alternative == 2 ? "vmovdqa64\t{%2, %x0|%x0, %2}"
>                                       : "vmovdqa\t{%2, %x0|%x0, %2}";
>      case E_V16SImode:
> -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> +         || misaligned_operand (operands[2], <ssequartermode>mode))
>         return which_alternative == 2 ? "vmovdqu32\t{%2, %x0|%x0, %2}"
>                                       : "vmovdqu\t{%2, %x0|%x0, %2}";
>        else
> @@ -25238,27 +25242,32 @@
>        switch (get_attr_mode (insn))
>         {
>         case MODE_V16SF:
> -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
>             return "vmovups\t{%1, %t0|%t0, %1}";
>           else
>             return "vmovaps\t{%1, %t0|%t0, %1}";
>         case MODE_V8DF:
> -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
>             return "vmovupd\t{%1, %t0|%t0, %1}";
>           else
>             return "vmovapd\t{%1, %t0|%t0, %1}";
>         case MODE_V8SF:
> -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
>             return "vmovups\t{%1, %x0|%x0, %1}";
>           else
>             return "vmovaps\t{%1, %x0|%x0, %1}";
>         case MODE_V4DF:
> -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
>             return "vmovupd\t{%1, %x0|%x0, %1}";
>           else
>             return "vmovapd\t{%1, %x0|%x0, %1}";
>         case MODE_XI:
> -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
>             {
>               if (which_alternative == 2)
>                 return "vmovdqu\t{%1, %t0|%t0, %1}";
> @@ -25277,7 +25286,8 @@
>                 return "vmovdqa32\t{%1, %t0|%t0, %1}";
>             }
>         case MODE_OI:
> -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
>             {
>               if (which_alternative == 2)
>                 return "vmovdqu\t{%1, %x0|%x0, %1}";
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 0cc8a8edd05..13777d62437 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -1418,6 +1418,7 @@ See RS/6000 and PowerPC Options.
>  -mstack-protector-guard-offset=@var{offset} @gol
>  -mstack-protector-guard-symbol=@var{symbol} @gol
>  -mgeneral-regs-only  -mcall-ms2sysv-xlogues @gol
> +-muse-unaligned-vector-move @gol
>  -mindirect-branch=@var{choice}  -mfunction-return=@var{choice} @gol
>  -mindirect-branch-register -mneeded}
>
> @@ -31808,6 +31809,12 @@ resulting in fairly lengthy prologues and epilogues.  Using
>  use stubs in the static portion of libgcc to perform these saves and restores,
>  thus reducing function size at the cost of a few extra instructions.
>
> +@item -muse-unaligned-vector-move
> +@opindex muse-unaligned-vector-move
> +@opindex mno-use-unaligned-vector-move
> +Use @option{-muse-unaligned-vector-move} to emits unaligned vector move
> +instructions like vmovdqu, vmovups, vmovupd.
> +
>  @item -mtls-dialect=@var{type}
>  @opindex mtls-dialect
>  Generate code to access thread-local storage using the @samp{gnu} or
> diff --git a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> new file mode 100644
> index 00000000000..d21eee562ac
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> @@ -0,0 +1,102 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx2 -muse-unaligned-vector-move" } */
> +
> +#define N 1024
> +
> +char **cp;
> +char **ep;
> +char **fp;
> +
> +void
> +test_char ()
> +{
> +  int i;
> +  char **ap = __builtin_assume_aligned (ep, 32);
> +  char **zp;
> +  for (i = 128; i > 0; i--)
> +  {
> +    *ap++ = *cp++;
> +    *zp++ = *fp++;
> +  }
> +}
> +
> +float f1[N], f2[N], f3[N];
> +
> +void
> +test_float (void)
> +{
> +  for (int i = 0; i < N; i++)
> +  {
> +    f3[i] = f1[i] * f2[i];
> +  }
> +}
> +
> +double d1[N], d2[N], d3[N];
> +
> +void
> +test_double_load (void)
> +{
> +  for (int i = 0; i < N; i++)
> +  {
> +    d3[i] = d1[i] * d2[i];
> +
> +  }
> +}
> +
> +unsigned char uc1[N], uc2[N], uc3[N];
> +void
> +test_unchar ()
> +{
> +   for (int i=0;i<N;i++) {
> +     uc3[i] = uc1[i] * uc2[i];
> +   }
> +}
> +
> +short st1[N], st2[N], st3[N];
> +void
> +test_short ()
> +{
> +   for (int i=0;i<N;i++) {
> +     st3[i] = st1[i] * st2[i];
> +   }
> +}
> +
> +int n1[N], n2[N], n3[N];
> +void
> +test_int ()
> +{
> +   for (int i=0;i<N;i++) {
> +     n3[i] = n1[i] * n2[i];
> +   }
> +}
> +
> +long l1[N], l2[N], l3[N];
> +
> +void
> +test_long ()
> +{
> +  for (int i=0; i<N; i++)
> +  {
> +    l3[i] = l1[i] *l2[i];
> +  }
> +}
> +
> +long long ll1[N], ll2[N], ll3[N];
> +
> +void
> +test_long_long()
> +{
> +  for (int i=0;i<N;i++)
> +  {
> +    ll3[i] = ll1[i]*ll2[i];
> +  }
> +}
> +
> +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vmovapd" } } */
> +/* { dg-final { scan-assembler-times "vmovdqu" 19 { target lp64 } } } */
> +/* { dg-final { scan-assembler-times "vmovdqu" 46 { target x32 } } } */
> +/* { dg-final { scan-assembler-times "vmovdqu" 47 { target ia32 } } } */
> +/* { dg-final { scan-assembler-times "vmovups" 2 } } */
> +/* { dg-final { scan-assembler-times "vmovupd" 2 } } */
> diff --git a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> new file mode 100644
> index 00000000000..65c81105ebd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> @@ -0,0 +1,107 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx2 -muse-unaligned-vector-move" } */
> +
> +#include <immintrin.h>
> +__m128 value128;
> +char src128[16];
> +
> +__m256 value256;
> +float src256[8];
> +
> +void add128(__m128* pointer) {
> +    value128 = _mm_add_ps(value128, *pointer);
> +}
> +
> +void add256(__m256* pointer) {
> +    value256 = _mm256_add_ps(value256, *pointer);
> +}
> +
> +__m128d value128d;
> +__m128d aux128d;
> +float src128f[4];
> +float res128f[4];
> +double src128d[2];
> +double res128d[2];
> +
> +void add128d(__m128d* pointer, __m128d aux, __m128d* res128d) {
> +    value128d = _mm_add_pd(value128d, *pointer);
> +    __m128d s1 = _mm_add_pd(aux, *pointer);
> +    *res128d = _mm_add_pd(s1, value128d);
> +}
> +
> +__m256d value256d;
> +__m256d aux256d;
> +float src256f[8];
> +float res256f[8];
> +double src256d[4];
> +double res256d[4];
> +
> +void add256d(__m256d* pointer, __m256d aux, __m256d* res) {
> +    value256d = _mm256_add_pd(value256d, *pointer);
> +    __m256d s1 = _mm256_add_pd(aux, *pointer);
> +    *res = _mm256_add_pd(s1, value256d);
> +}
> +
> +__m256i value256i;
> +__m256i aux256i;
> +char src256c[32];
> +char res256c[32];
> +short src256s[16];
> +short res256s[16];
> +int src256i[8];
> +int res256i[8];
> +long long src256l[4];
> +long long res256l[4];
> +
> +void add256i(__m256i* pointer, __m256i aux, __m256i* res) {
> +    value256i = _mm256_add_epi32(value256i, *pointer);
> +    __m256i s1 = _mm256_add_epi32(aux, *pointer);
> +    *res = _mm256_add_epi32(s1, value256i);
> +}
> +
> +void foo1() {
> +    add128((__m128*)src128);
> +}
> +
> +void foo2() {
> +    add256((__m256*)src256);
> +}
> +
> +void foo3() {
> +    add128d((__m128d*)src128d, aux128d, (__m128d*)res128d);
> +}
> +
> +void foo4() {
> +    add128d((__m128d*)src128f, aux128d, (__m128d*)res128f);
> +}
> +
> +void foo5() {
> +    add256d((__m256d*)src256f, aux256d, (__m256d*)res256f);
> +}
> +
> +void foo6() {
> +    add256d((__m256d*)src256d, aux256d, (__m256d*)res256d);
> +}
> +
> +void foo7() {
> +    add256i((__m256i*)src256c, aux256i, (__m256i*)res256c);
> +}
> +
> +void foo8() {
> +    add256i((__m256i*)src256s, aux256i, (__m256i*)res256s);
> +}
> +
> +void foo9() {
> +    add256i((__m256i*)src256i, aux256i, (__m256i*)res256i);
> +}
> +
> +void foo11() {
> +    add256i((__m256i*)src256l, aux256i, (__m256i*)res256l);
> +}
> +
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vmovapd" } } */
> +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> +/* { dg-final { scan-assembler "vmovups" } } */
> +/* { dg-final { scan-assembler "vmovupd" } } */
> +/* { dg-final { scan-assembler "vmovdqu" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> new file mode 100644
> index 00000000000..59924304bae
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx2 -mno-use-unaligned-vector-move" } */
> +
> +#include "avx2-vector-unaligned-load-store-2.c"
> +
> +/* { dg-final { scan-assembler-not "vmovups" { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-not "vmovupd" { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-not "vmovdqu" } } */
> +/* { dg-final { scan-assembler "vmovaps" } } */
> +/* { dg-final { scan-assembler "vmovapd" } } */
> +/* { dg-final { scan-assembler "vmovdqa" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> new file mode 100644
> index 00000000000..3759fd9f2f4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -mavx512vl -muse-unaligned-vector-move" } */
> +
> +#include "avx2-vector-unaligned-load-store-1.c"
> +
> +/* { dg-final { scan-assembler-not "vmovdqa32" } } */
> +/* { dg-final { scan-assembler-not "vmovdqa64" } } */
> +/* { dg-final { scan-assembler-not "vmovaps" } } */
> +/* { dg-final { scan-assembler-not "vmovapd" } } */
> +/* { dg-final { scan-assembler "vmovdqu32" } } */
> +/* { dg-final { scan-assembler "vmovdqu64" } } */
> +/* { dg-final { scan-assembler "vmovups" } } */
> +/* { dg-final { scan-assembler "vmovupd" } } */
> --
> 2.18.1
>
  
Richard Biener Oct. 20, 2021, 7:04 a.m. UTC | #2
On Wed, Oct 20, 2021 at 9:02 AM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Wed, Oct 20, 2021 at 7:31 AM dianhong.xu--- via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > From: dianhong xu <dianhong.xu@intel.com>
> >
> > Add -muse-unaligned-vector-move option to emit unaligned vector move
> > instaructions.
>
> Why would you ever want to have such option?!  Should the documentation
> at least read "emit unaligned vector moves even for aligned storage or when
> using aligned move intrinsics"?

And does it even work?  I fail to see adjustments to memory operands of
SSE/AVX instructions that have to be aligned and now would need to be
pushed to separate unaligned moves with an extra register?
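
For instance (a minimal sketch of the situation in question, not code from
the patch; the folding shown in the comment is an assumption about typical
non-AVX code generation):

  #include <xmmintrin.h>

  __m128 acc;

  void
  accumulate (const __m128 *p)
  {
    /* Without AVX, GCC will typically fold one load into the arithmetic
       instruction, e.g. "addps (%rdi), %xmm0".  That folded memory operand
       must be 16-byte aligned no matter which move mnemonic is chosen, so
       truly unaligned data would need a separate unaligned load into a
       scratch register first.  */
    acc = _mm_add_ps (acc, *p);
  }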

Richard.

>
> Richard.
>
> > gcc/ChangeLog:
> >
> >         * config/i386/i386-options.c (ix86_target_string): Add
> >         -muse-unaligned-vector-move.
> >         * config/i386/i386.c (ix86_get_ssemov): Emit unaligned vector if use
> >         the new option.
> >         * config/i386/i386.opt (muse-unaligned-vector-move): New.
> >         * config/i386/sse.md: Emit unaligned vector if use this new option
> >         * doc/invoke.texi: Document -muse-unaligned-vector-move
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/avx2-vector-unaligned-load-store-1.c: New test.
> >         * gcc.target/i386/avx2-vector-unaligned-load-store-2.c: New test.
> >         * gcc.target/i386/avx2-vector-unaligned-load-store-3.c: New test.
> >         * gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c: New test.
> > ---
> >  gcc/config/i386/i386-options.c                |   3 +-
> >  gcc/config/i386/i386.c                        |  41 +++----
> >  gcc/config/i386/i386.opt                      |   4 +
> >  gcc/config/i386/sse.md                        |  30 +++--
> >  gcc/doc/invoke.texi                           |   7 ++
> >  .../i386/avx2-vector-unaligned-load-store-1.c | 102 +++++++++++++++++
> >  .../i386/avx2-vector-unaligned-load-store-2.c | 107 ++++++++++++++++++
> >  .../i386/avx2-vector-unaligned-load-store-3.c |  11 ++
> >  .../avx512vl-vector-unaligned-load-store-1.c  |  13 +++
> >  9 files changed, 287 insertions(+), 31 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> >
> > diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
> > index c9523b26f49..eacbd0f5451 100644
> > --- a/gcc/config/i386/i386-options.c
> > +++ b/gcc/config/i386/i386-options.c
> > @@ -397,7 +397,8 @@ ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
> >      { "-mstv",                         MASK_STV },
> >      { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
> >      { "-mavx256-split-unaligned-store",        MASK_AVX256_SPLIT_UNALIGNED_STORE },
> > -    { "-mcall-ms2sysv-xlogues",                MASK_CALL_MS2SYSV_XLOGUES }
> > +    { "-mcall-ms2sysv-xlogues",                MASK_CALL_MS2SYSV_XLOGUES },
> > +    { "-muse-unaligned-vector-move",   MASK_USE_UNALIGNED_VECTOR_MOVE }
> >    };
> >
> >    /* Additional flag options.  */
> > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > index f111411e599..7581e854021 100644
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -5323,8 +5323,9 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> >                  enum attr_mode insn_mode, machine_mode mode)
> >  {
> >    char buf[128];
> > -  bool misaligned_p = (misaligned_operand (operands[0], mode)
> > -                      || misaligned_operand (operands[1], mode));
> > +  bool need_unaligned_p = (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +                          || misaligned_operand (operands[0], mode)
> > +                          || misaligned_operand (operands[1], mode));
> >    bool evex_reg_p = (size == 64
> >                      || EXT_REX_SSE_REG_P (operands[0])
> >                      || EXT_REX_SSE_REG_P (operands[1]));
> > @@ -5380,17 +5381,17 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> >         {
> >         case opcode_int:
> >           if (scalar_mode == E_HFmode)
> > -           opcode = (misaligned_p
> > +           opcode = (need_unaligned_p
> >                       ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
> >                       : "vmovdqa64");
> >           else
> > -           opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> > +           opcode = need_unaligned_p ? "vmovdqu32" : "vmovdqa32";
> >           break;
> >         case opcode_float:
> > -         opcode = misaligned_p ? "vmovups" : "vmovaps";
> > +         opcode = need_unaligned_p ? "vmovups" : "vmovaps";
> >           break;
> >         case opcode_double:
> > -         opcode = misaligned_p ? "vmovupd" : "vmovapd";
> > +         opcode = need_unaligned_p ? "vmovupd" : "vmovapd";
> >           break;
> >         }
> >      }
> > @@ -5399,21 +5400,21 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> >        switch (scalar_mode)
> >         {
> >         case E_HFmode:
> > -         opcode = (misaligned_p
> > +         opcode = (need_unaligned_p
> >                     ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
> >                     : "vmovdqa64");
> >           break;
> >         case E_SFmode:
> > -         opcode = misaligned_p ? "%vmovups" : "%vmovaps";
> > +         opcode = need_unaligned_p ? "%vmovups" : "%vmovaps";
> >           break;
> >         case E_DFmode:
> > -         opcode = misaligned_p ? "%vmovupd" : "%vmovapd";
> > +         opcode = need_unaligned_p ? "%vmovupd" : "%vmovapd";
> >           break;
> >         case E_TFmode:
> >           if (evex_reg_p)
> > -           opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > +           opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
> >           else
> > -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> > +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
> >           break;
> >         default:
> >           gcc_unreachable ();
> > @@ -5425,13 +5426,13 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> >         {
> >         case E_QImode:
> >           if (evex_reg_p)
> > -           opcode = (misaligned_p
> > +           opcode = (need_unaligned_p
> >                       ? (TARGET_AVX512BW
> >                          ? "vmovdqu8"
> >                          : "vmovdqu64")
> >                       : "vmovdqa64");
> >           else
> > -           opcode = (misaligned_p
> > +           opcode = (need_unaligned_p
> >                       ? (TARGET_AVX512BW
> >                          ? "vmovdqu8"
> >                          : "%vmovdqu")
> > @@ -5439,13 +5440,13 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> >           break;
> >         case E_HImode:
> >           if (evex_reg_p)
> > -           opcode = (misaligned_p
> > +           opcode = (need_unaligned_p
> >                       ? (TARGET_AVX512BW
> >                          ? "vmovdqu16"
> >                          : "vmovdqu64")
> >                       : "vmovdqa64");
> >           else
> > -           opcode = (misaligned_p
> > +           opcode = (need_unaligned_p
> >                       ? (TARGET_AVX512BW
> >                          ? "vmovdqu16"
> >                          : "%vmovdqu")
> > @@ -5453,20 +5454,20 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> >           break;
> >         case E_SImode:
> >           if (evex_reg_p)
> > -           opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> > +           opcode = need_unaligned_p ? "vmovdqu32" : "vmovdqa32";
> >           else
> > -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> > +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
> >           break;
> >         case E_DImode:
> >         case E_TImode:
> >         case E_OImode:
> >           if (evex_reg_p)
> > -           opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > +           opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
> >           else
> > -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> > +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
> >           break;
> >         case E_XImode:
> > -         opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > +         opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
> >           break;
> >         default:
> >           gcc_unreachable ();
> > diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
> > index ad366974b5b..2162d10925a 100644
> > --- a/gcc/config/i386/i386.opt
> > +++ b/gcc/config/i386/i386.opt
> > @@ -1170,3 +1170,7 @@ Support MWAIT and MONITOR built-in functions and code generation.
> >  mavx512fp16
> >  Target Mask(ISA2_AVX512FP16) Var(ix86_isa_flags2) Save
> >  Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and AVX512FP16 built-in functions and code generation.
> > +
> > +muse-unaligned-vector-move
> > +Target Mask(USE_UNALIGNED_VECTOR_MOVE) Save
> > +Emit unaligned vector move instructions.
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index fbf056bf9e6..dc99597f195 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -17059,24 +17059,28 @@
> >    switch (<MODE>mode)
> >      {
> >      case E_V8DFmode:
> > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +         || misaligned_operand (operands[2], <ssequartermode>mode))
> >         return "vmovupd\t{%2, %x0|%x0, %2}";
> >        else
> >         return "vmovapd\t{%2, %x0|%x0, %2}";
> >      case E_V16SFmode:
> > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +         || misaligned_operand (operands[2], <ssequartermode>mode))
> >         return "vmovups\t{%2, %x0|%x0, %2}";
> >        else
> >         return "vmovaps\t{%2, %x0|%x0, %2}";
> >      case E_V8DImode:
> > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +         || misaligned_operand (operands[2], <ssequartermode>mode))
> >         return which_alternative == 2 ? "vmovdqu64\t{%2, %x0|%x0, %2}"
> >                                       : "vmovdqu\t{%2, %x0|%x0, %2}";
> >        else
> >         return which_alternative == 2 ? "vmovdqa64\t{%2, %x0|%x0, %2}"
> >                                       : "vmovdqa\t{%2, %x0|%x0, %2}";
> >      case E_V16SImode:
> > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +         || misaligned_operand (operands[2], <ssequartermode>mode))
> >         return which_alternative == 2 ? "vmovdqu32\t{%2, %x0|%x0, %2}"
> >                                       : "vmovdqu\t{%2, %x0|%x0, %2}";
> >        else
> > @@ -25238,27 +25242,32 @@
> >        switch (get_attr_mode (insn))
> >         {
> >         case MODE_V16SF:
> > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
> >             return "vmovups\t{%1, %t0|%t0, %1}";
> >           else
> >             return "vmovaps\t{%1, %t0|%t0, %1}";
> >         case MODE_V8DF:
> > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
> >             return "vmovupd\t{%1, %t0|%t0, %1}";
> >           else
> >             return "vmovapd\t{%1, %t0|%t0, %1}";
> >         case MODE_V8SF:
> > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
> >             return "vmovups\t{%1, %x0|%x0, %1}";
> >           else
> >             return "vmovaps\t{%1, %x0|%x0, %1}";
> >         case MODE_V4DF:
> > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
> >             return "vmovupd\t{%1, %x0|%x0, %1}";
> >           else
> >             return "vmovapd\t{%1, %x0|%x0, %1}";
> >         case MODE_XI:
> > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
> >             {
> >               if (which_alternative == 2)
> >                 return "vmovdqu\t{%1, %t0|%t0, %1}";
> > @@ -25277,7 +25286,8 @@
> >                 return "vmovdqa32\t{%1, %t0|%t0, %1}";
> >             }
> >         case MODE_OI:
> > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
> >             {
> >               if (which_alternative == 2)
> >                 return "vmovdqu\t{%1, %x0|%x0, %1}";
> > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> > index 0cc8a8edd05..13777d62437 100644
> > --- a/gcc/doc/invoke.texi
> > +++ b/gcc/doc/invoke.texi
> > @@ -1418,6 +1418,7 @@ See RS/6000 and PowerPC Options.
> >  -mstack-protector-guard-offset=@var{offset} @gol
> >  -mstack-protector-guard-symbol=@var{symbol} @gol
> >  -mgeneral-regs-only  -mcall-ms2sysv-xlogues @gol
> > +-muse-unaligned-vector-move @gol
> >  -mindirect-branch=@var{choice}  -mfunction-return=@var{choice} @gol
> >  -mindirect-branch-register -mneeded}
> >
> > @@ -31808,6 +31809,12 @@ resulting in fairly lengthy prologues and epilogues.  Using
> >  use stubs in the static portion of libgcc to perform these saves and restores,
> >  thus reducing function size at the cost of a few extra instructions.
> >
> > +@item -muse-unaligned-vector-move
> > +@opindex muse-unaligned-vector-move
> > +@opindex mno-use-unaligned-vector-move
> > +Use @option{-muse-unaligned-vector-move} to emits unaligned vector move
> > +instructions like vmovdqu, vmovups, vmovupd.
> > +
> >  @item -mtls-dialect=@var{type}
> >  @opindex mtls-dialect
> >  Generate code to access thread-local storage using the @samp{gnu} or
> > diff --git a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> > new file mode 100644
> > index 00000000000..d21eee562ac
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> > @@ -0,0 +1,102 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -mavx2 -muse-unaligned-vector-move" } */
> > +
> > +#define N 1024
> > +
> > +char **cp;
> > +char **ep;
> > +char **fp;
> > +
> > +void
> > +test_char ()
> > +{
> > +  int i;
> > +  char **ap = __builtin_assume_aligned (ep, 32);
> > +  char **zp;
> > +  for (i = 128; i > 0; i--)
> > +  {
> > +    *ap++ = *cp++;
> > +    *zp++ = *fp++;
> > +  }
> > +}
> > +
> > +float f1[N], f2[N], f3[N];
> > +
> > +void
> > +test_float (void)
> > +{
> > +  for (int i = 0; i < N; i++)
> > +  {
> > +    f3[i] = f1[i] * f2[i];
> > +  }
> > +}
> > +
> > +double d1[N], d2[N], d3[N];
> > +
> > +void
> > +test_double_load (void)
> > +{
> > +  for (int i = 0; i < N; i++)
> > +  {
> > +    d3[i] = d1[i] * d2[i];
> > +
> > +  }
> > +}
> > +
> > +unsigned char uc1[N], uc2[N], uc3[N];
> > +void
> > +test_unchar ()
> > +{
> > +   for (int i=0;i<N;i++) {
> > +     uc3[i] = uc1[i] * uc2[i];
> > +   }
> > +}
> > +
> > +short st1[N], st2[N], st3[N];
> > +void
> > +test_short ()
> > +{
> > +   for (int i=0;i<N;i++) {
> > +     st3[i] = st1[i] * st2[i];
> > +   }
> > +}
> > +
> > +int n1[N], n2[N], n3[N];
> > +void
> > +test_int ()
> > +{
> > +   for (int i=0;i<N;i++) {
> > +     n3[i] = n1[i] * n2[i];
> > +   }
> > +}
> > +
> > +long l1[N], l2[N], l3[N];
> > +
> > +void
> > +test_long ()
> > +{
> > +  for (int i=0; i<N; i++)
> > +  {
> > +    l3[i] = l1[i] *l2[i];
> > +  }
> > +}
> > +
> > +long long ll1[N], ll2[N], ll3[N];
> > +
> > +void
> > +test_long_long()
> > +{
> > +  for (int i=0;i<N;i++)
> > +  {
> > +    ll3[i] = ll1[i]*ll2[i];
> > +  }
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> > +/* { dg-final { scan-assembler-not "vmovaps" } } */
> > +/* { dg-final { scan-assembler-not "vmovapd" } } */
> > +/* { dg-final { scan-assembler-times "vmovdqu" 19 { target lp64 } } } */
> > +/* { dg-final { scan-assembler-times "vmovdqu" 46 { target x32 } } } */
> > +/* { dg-final { scan-assembler-times "vmovdqu" 47 { target ia32 } } } */
> > +/* { dg-final { scan-assembler-times "vmovups" 2 } } */
> > +/* { dg-final { scan-assembler-times "vmovupd" 2 } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> > new file mode 100644
> > index 00000000000..65c81105ebd
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> > @@ -0,0 +1,107 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -mavx2 -muse-unaligned-vector-move" } */
> > +
> > +#include <immintrin.h>
> > +__m128 value128;
> > +char src128[16];
> > +
> > +__m256 value256;
> > +float src256[8];
> > +
> > +void add128(__m128* pointer) {
> > +    value128 = _mm_add_ps(value128, *pointer);
> > +}
> > +
> > +void add256(__m256* pointer) {
> > +    value256 = _mm256_add_ps(value256, *pointer);
> > +}
> > +
> > +__m128d value128d;
> > +__m128d aux128d;
> > +float src128f[4];
> > +float res128f[4];
> > +double src128d[2];
> > +double res128d[2];
> > +
> > +void add128d(__m128d* pointer, __m128d aux, __m128d* res128d) {
> > +    value128d = _mm_add_pd(value128d, *pointer);
> > +    __m128d s1 = _mm_add_pd(aux, *pointer);
> > +    *res128d = _mm_add_pd(s1, value128d);
> > +}
> > +
> > +__m256d value256d;
> > +__m256d aux256d;
> > +float src256f[8];
> > +float res256f[8];
> > +double src256d[4];
> > +double res256d[4];
> > +
> > +void add256d(__m256d* pointer, __m256d aux, __m256d* res) {
> > +    value256d = _mm256_add_pd(value256d, *pointer);
> > +    __m256d s1 = _mm256_add_pd(aux, *pointer);
> > +    *res = _mm256_add_pd(s1, value256d);
> > +}
> > +
> > +__m256i value256i;
> > +__m256i aux256i;
> > +char src256c[32];
> > +char res256c[32];
> > +short src256s[16];
> > +short res256s[16];
> > +int src256i[8];
> > +int res256i[8];
> > +long long src256l[4];
> > +long long res256l[4];
> > +
> > +void add256i(__m256i* pointer, __m256i aux, __m256i* res) {
> > +    value256i = _mm256_add_epi32(value256i, *pointer);
> > +    __m256i s1 = _mm256_add_epi32(aux, *pointer);
> > +    *res = _mm256_add_epi32(s1, value256i);
> > +}
> > +
> > +void foo1() {
> > +    add128((__m128*)src128);
> > +}
> > +
> > +void foo2() {
> > +    add256((__m256*)src256);
> > +}
> > +
> > +void foo3() {
> > +    add128d((__m128d*)src128d, aux128d, (__m128d*)res128d);
> > +}
> > +
> > +void foo4() {
> > +    add128d((__m128d*)src128f, aux128d, (__m128d*)res128f);
> > +}
> > +
> > +void foo5() {
> > +    add256d((__m256d*)src256f, aux256d, (__m256d*)res256f);
> > +}
> > +
> > +void foo6() {
> > +    add256d((__m256d*)src256d, aux256d, (__m256d*)res256d);
> > +}
> > +
> > +void foo7() {
> > +    add256i((__m256i*)src256c, aux256i, (__m256i*)res256c);
> > +}
> > +
> > +void foo8() {
> > +    add256i((__m256i*)src256s, aux256i, (__m256i*)res256s);
> > +}
> > +
> > +void foo9() {
> > +    add256i((__m256i*)src256i, aux256i, (__m256i*)res256i);
> > +}
> > +
> > +void foo11() {
> > +    add256i((__m256i*)src256l, aux256i, (__m256i*)res256l);
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "vmovaps" } } */
> > +/* { dg-final { scan-assembler-not "vmovapd" } } */
> > +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> > +/* { dg-final { scan-assembler "vmovups" } } */
> > +/* { dg-final { scan-assembler "vmovupd" } } */
> > +/* { dg-final { scan-assembler "vmovdqu" } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> > new file mode 100644
> > index 00000000000..59924304bae
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> > @@ -0,0 +1,11 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -mavx2 -mno-use-unaligned-vector-move" } */
> > +
> > +#include "avx2-vector-unaligned-load-store-2.c"
> > +
> > +/* { dg-final { scan-assembler-not "vmovups" { target { ! ia32 } } } } */
> > +/* { dg-final { scan-assembler-not "vmovupd" { target { ! ia32 } } } } */
> > +/* { dg-final { scan-assembler-not "vmovdqu" } } */
> > +/* { dg-final { scan-assembler "vmovaps" } } */
> > +/* { dg-final { scan-assembler "vmovapd" } } */
> > +/* { dg-final { scan-assembler "vmovdqa" } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> > new file mode 100644
> > index 00000000000..3759fd9f2f4
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> > @@ -0,0 +1,13 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -mavx512vl -muse-unaligned-vector-move" } */
> > +
> > +#include "avx2-vector-unaligned-load-store-1.c"
> > +
> > +/* { dg-final { scan-assembler-not "vmovdqa32" } } */
> > +/* { dg-final { scan-assembler-not "vmovdqa64" } } */
> > +/* { dg-final { scan-assembler-not "vmovaps" } } */
> > +/* { dg-final { scan-assembler-not "vmovapd" } } */
> > +/* { dg-final { scan-assembler "vmovdqu32" } } */
> > +/* { dg-final { scan-assembler "vmovdqu64" } } */
> > +/* { dg-final { scan-assembler "vmovups" } } */
> > +/* { dg-final { scan-assembler "vmovupd" } } */
> > --
> > 2.18.1
> >
  
Xu Dianhong Oct. 20, 2021, 7:48 a.m. UTC | #3
Thanks for the comments.

> And does it even work?
It works; I verified in the test cases that with this option the compiler
emits unaligned vector moves.
>I fail to see adjustments to memory operands of
SSE/AVX instructions that have to be aligned
I changed all the vector moves in "get_ssemov" without checking whether the
move has memory operands or not.
>and now would need to be
pushed to separate unaligned moves with an extra register?
I don't think it uses an extra register. I'm not sure I understood your
question: this patch only changes the final mnemonic of the SSE move from
the aligned form to the unaligned form; it does not change the operands.
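
Concretely, only the mnemonic printed for the move changes, e.g. (a sketch of
the intended before/after, not output copied from the tests):

  vmovaps  (%rax), %ymm0   ->   vmovups  (%rax), %ymm0
  vmovdqa  (%rax), %ymm1   ->   vmovdqu  (%rax), %ymm1

Memory operands folded into other instructions are not touched by the patch.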

On Wed, Oct 20, 2021 at 3:04 PM Richard Biener <richard.guenther@gmail.com>
wrote:

> On Wed, Oct 20, 2021 at 9:02 AM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On Wed, Oct 20, 2021 at 7:31 AM dianhong.xu--- via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > From: dianhong xu <dianhong.xu@intel.com>
> > >
> > > Add -muse-unaligned-vector-move option to emit unaligned vector move
> > > instaructions.
> >
> > Why would you ever want to have such option?!  Should the documentation
> > at least read "emit unaligned vector moves even for aligned storage or
> when
> > using aligned move intrinsics"?
>
> And does it even work?  I fail to see adjustments to memory operands of
> SSE/AVX instructions that have to be aligned and now would need to be
> pushed to separate unaligned moves with an extra register?
>
> Richard.
>
> >
> > Richard.
> >
> > > gcc/ChangeLog:
> > >
> > >         * config/i386/i386-options.c (ix86_target_string): Add
> > >         -muse-unaligned-vector-move.
> > >         * config/i386/i386.c (ix86_get_ssemov): Emit unaligned vector
> if use
> > >         the new option.
> > >         * config/i386/i386.opt (muse-unaligned-vector-move): New.
> > >         * config/i386/sse.md: Emit unaligned vector if use this new
> option
> > >         * doc/invoke.texi: Document -muse-unaligned-vector-move
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         * gcc.target/i386/avx2-vector-unaligned-load-store-1.c: New
> test.
> > >         * gcc.target/i386/avx2-vector-unaligned-load-store-2.c: New
> test.
> > >         * gcc.target/i386/avx2-vector-unaligned-load-store-3.c: New
> test.
> > >         * gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c:
> New test.
> > > ---
> > >  gcc/config/i386/i386-options.c                |   3 +-
> > >  gcc/config/i386/i386.c                        |  41 +++----
> > >  gcc/config/i386/i386.opt                      |   4 +
> > >  gcc/config/i386/sse.md                        |  30 +++--
> > >  gcc/doc/invoke.texi                           |   7 ++
> > >  .../i386/avx2-vector-unaligned-load-store-1.c | 102 +++++++++++++++++
> > >  .../i386/avx2-vector-unaligned-load-store-2.c | 107 ++++++++++++++++++
> > >  .../i386/avx2-vector-unaligned-load-store-3.c |  11 ++
> > >  .../avx512vl-vector-unaligned-load-store-1.c  |  13 +++
> > >  9 files changed, 287 insertions(+), 31 deletions(-)
> > >  create mode 100644
> gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> > >  create mode 100644
> gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> > >  create mode 100644
> gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> > >  create mode 100644
> gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> > >
> > > diff --git a/gcc/config/i386/i386-options.c
> b/gcc/config/i386/i386-options.c
> > > index c9523b26f49..eacbd0f5451 100644
> > > --- a/gcc/config/i386/i386-options.c
> > > +++ b/gcc/config/i386/i386-options.c
> > > @@ -397,7 +397,8 @@ ix86_target_string (HOST_WIDE_INT isa,
> HOST_WIDE_INT isa2,
> > >      { "-mstv",                         MASK_STV },
> > >      { "-mavx256-split-unaligned-load",
> MASK_AVX256_SPLIT_UNALIGNED_LOAD },
> > >      { "-mavx256-split-unaligned-store",
> MASK_AVX256_SPLIT_UNALIGNED_STORE },
> > > -    { "-mcall-ms2sysv-xlogues",
> MASK_CALL_MS2SYSV_XLOGUES }
> > > +    { "-mcall-ms2sysv-xlogues",
> MASK_CALL_MS2SYSV_XLOGUES },
> > > +    { "-muse-unaligned-vector-move",   MASK_USE_UNALIGNED_VECTOR_MOVE
> }
> > >    };
> > >
> > >    /* Additional flag options.  */
> > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > > index f111411e599..7581e854021 100644
> > > --- a/gcc/config/i386/i386.c
> > > +++ b/gcc/config/i386/i386.c
> > > @@ -5323,8 +5323,9 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> > >                  enum attr_mode insn_mode, machine_mode mode)
> > >  {
> > >    char buf[128];
> > > -  bool misaligned_p = (misaligned_operand (operands[0], mode)
> > > -                      || misaligned_operand (operands[1], mode));
> > > +  bool need_unaligned_p = (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > > +                          || misaligned_operand (operands[0], mode)
> > > +                          || misaligned_operand (operands[1], mode));
> > >    bool evex_reg_p = (size == 64
> > >                      || EXT_REX_SSE_REG_P (operands[0])
> > >                      || EXT_REX_SSE_REG_P (operands[1]));
> > > @@ -5380,17 +5381,17 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> > >         {
> > >         case opcode_int:
> > >           if (scalar_mode == E_HFmode)
> > > -           opcode = (misaligned_p
> > > +           opcode = (need_unaligned_p
> > >                       ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
> > >                       : "vmovdqa64");
> > >           else
> > > -           opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> > > +           opcode = need_unaligned_p ? "vmovdqu32" : "vmovdqa32";
> > >           break;
> > >         case opcode_float:
> > > -         opcode = misaligned_p ? "vmovups" : "vmovaps";
> > > +         opcode = need_unaligned_p ? "vmovups" : "vmovaps";
> > >           break;
> > >         case opcode_double:
> > > -         opcode = misaligned_p ? "vmovupd" : "vmovapd";
> > > +         opcode = need_unaligned_p ? "vmovupd" : "vmovapd";
> > >           break;
> > >         }
> > >      }
> > > @@ -5399,21 +5400,21 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> > >        switch (scalar_mode)
> > >         {
> > >         case E_HFmode:
> > > -         opcode = (misaligned_p
> > > +         opcode = (need_unaligned_p
> > >                     ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
> > >                     : "vmovdqa64");
> > >           break;
> > >         case E_SFmode:
> > > -         opcode = misaligned_p ? "%vmovups" : "%vmovaps";
> > > +         opcode = need_unaligned_p ? "%vmovups" : "%vmovaps";
> > >           break;
> > >         case E_DFmode:
> > > -         opcode = misaligned_p ? "%vmovupd" : "%vmovapd";
> > > +         opcode = need_unaligned_p ? "%vmovupd" : "%vmovapd";
> > >           break;
> > >         case E_TFmode:
> > >           if (evex_reg_p)
> > > -           opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > > +           opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
> > >           else
> > > -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> > > +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
> > >           break;
> > >         default:
> > >           gcc_unreachable ();
> > > @@ -5425,13 +5426,13 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> > >         {
> > >         case E_QImode:
> > >           if (evex_reg_p)
> > > -           opcode = (misaligned_p
> > > +           opcode = (need_unaligned_p
> > >                       ? (TARGET_AVX512BW
> > >                          ? "vmovdqu8"
> > >                          : "vmovdqu64")
> > >                       : "vmovdqa64");
> > >           else
> > > -           opcode = (misaligned_p
> > > +           opcode = (need_unaligned_p
> > >                       ? (TARGET_AVX512BW
> > >                          ? "vmovdqu8"
> > >                          : "%vmovdqu")
> > > @@ -5439,13 +5440,13 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> > >           break;
> > >         case E_HImode:
> > >           if (evex_reg_p)
> > > -           opcode = (misaligned_p
> > > +           opcode = (need_unaligned_p
> > >                       ? (TARGET_AVX512BW
> > >                          ? "vmovdqu16"
> > >                          : "vmovdqu64")
> > >                       : "vmovdqa64");
> > >           else
> > > -           opcode = (misaligned_p
> > > +           opcode = (need_unaligned_p
> > >                       ? (TARGET_AVX512BW
> > >                          ? "vmovdqu16"
> > >                          : "%vmovdqu")
> > > @@ -5453,20 +5454,20 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> > >           break;
> > >         case E_SImode:
> > >           if (evex_reg_p)
> > > -           opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> > > +           opcode = need_unaligned_p ? "vmovdqu32" : "vmovdqa32";
> > >           else
> > > -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> > > +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
> > >           break;
> > >         case E_DImode:
> > >         case E_TImode:
> > >         case E_OImode:
> > >           if (evex_reg_p)
> > > -           opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > > +           opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
> > >           else
> > > -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> > > +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
> > >           break;
> > >         case E_XImode:
> > > -         opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > > +         opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
> > >           break;
> > >         default:
> > >           gcc_unreachable ();
> > > diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
> > > index ad366974b5b..2162d10925a 100644
> > > --- a/gcc/config/i386/i386.opt
> > > +++ b/gcc/config/i386/i386.opt
> > > @@ -1170,3 +1170,7 @@ Support MWAIT and MONITOR built-in functions and
> code generation.
> > >  mavx512fp16
> > >  Target Mask(ISA2_AVX512FP16) Var(ix86_isa_flags2) Save
> > >  Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2,
> AVX512F and AVX512FP16 built-in functions and code generation.
> > > +
> > > +muse-unaligned-vector-move
> > > +Target Mask(USE_UNALIGNED_VECTOR_MOVE) Save
> > > +Emit unaligned vector move instructions.
> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > > index fbf056bf9e6..dc99597f195 100644
> > > --- a/gcc/config/i386/sse.md
> > > +++ b/gcc/config/i386/sse.md
> > > @@ -17059,24 +17059,28 @@
> > >    switch (<MODE>mode)
> > >      {
> > >      case E_V8DFmode:
> > > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> > > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > > +         || misaligned_operand (operands[2], <ssequartermode>mode))
> > >         return "vmovupd\t{%2, %x0|%x0, %2}";
> > >        else
> > >         return "vmovapd\t{%2, %x0|%x0, %2}";
> > >      case E_V16SFmode:
> > > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> > > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > > +         || misaligned_operand (operands[2], <ssequartermode>mode))
> > >         return "vmovups\t{%2, %x0|%x0, %2}";
> > >        else
> > >         return "vmovaps\t{%2, %x0|%x0, %2}";
> > >      case E_V8DImode:
> > > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> > > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > > +         || misaligned_operand (operands[2], <ssequartermode>mode))
> > >         return which_alternative == 2 ? "vmovdqu64\t{%2, %x0|%x0, %2}"
> > >                                       : "vmovdqu\t{%2, %x0|%x0, %2}";
> > >        else
> > >         return which_alternative == 2 ? "vmovdqa64\t{%2, %x0|%x0, %2}"
> > >                                       : "vmovdqa\t{%2, %x0|%x0, %2}";
> > >      case E_V16SImode:
> > > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> > > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > > +         || misaligned_operand (operands[2], <ssequartermode>mode))
> > >         return which_alternative == 2 ? "vmovdqu32\t{%2, %x0|%x0, %2}"
> > >                                       : "vmovdqu\t{%2, %x0|%x0, %2}";
> > >        else
> > > @@ -25238,27 +25242,32 @@
> > >        switch (get_attr_mode (insn))
> > >         {
> > >         case MODE_V16SF:
> > > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > > +             || misaligned_operand (operands[1],
> <ssehalfvecmode>mode))
> > >             return "vmovups\t{%1, %t0|%t0, %1}";
> > >           else
> > >             return "vmovaps\t{%1, %t0|%t0, %1}";
> > >         case MODE_V8DF:
> > > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > > +             || misaligned_operand (operands[1],
> <ssehalfvecmode>mode))
> > >             return "vmovupd\t{%1, %t0|%t0, %1}";
> > >           else
> > >             return "vmovapd\t{%1, %t0|%t0, %1}";
> > >         case MODE_V8SF:
> > > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > > +             || misaligned_operand (operands[1],
> <ssehalfvecmode>mode))
> > >             return "vmovups\t{%1, %x0|%x0, %1}";
> > >           else
> > >             return "vmovaps\t{%1, %x0|%x0, %1}";
> > >         case MODE_V4DF:
> > > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > > +             || misaligned_operand (operands[1],
> <ssehalfvecmode>mode))
> > >             return "vmovupd\t{%1, %x0|%x0, %1}";
> > >           else
> > >             return "vmovapd\t{%1, %x0|%x0, %1}";
> > >         case MODE_XI:
> > > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > > +             || misaligned_operand (operands[1],
> <ssehalfvecmode>mode))
> > >             {
> > >               if (which_alternative == 2)
> > >                 return "vmovdqu\t{%1, %t0|%t0, %1}";
> > > @@ -25277,7 +25286,8 @@
> > >                 return "vmovdqa32\t{%1, %t0|%t0, %1}";
> > >             }
> > >         case MODE_OI:
> > > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > > +             || misaligned_operand (operands[1],
> <ssehalfvecmode>mode))
> > >             {
> > >               if (which_alternative == 2)
> > >                 return "vmovdqu\t{%1, %x0|%x0, %1}";
> > > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> > > index 0cc8a8edd05..13777d62437 100644
> > > --- a/gcc/doc/invoke.texi
> > > +++ b/gcc/doc/invoke.texi
> > > @@ -1418,6 +1418,7 @@ See RS/6000 and PowerPC Options.
> > >  -mstack-protector-guard-offset=@var{offset} @gol
> > >  -mstack-protector-guard-symbol=@var{symbol} @gol
> > >  -mgeneral-regs-only  -mcall-ms2sysv-xlogues @gol
> > > +-muse-unaligned-vector-move @gol
> > >  -mindirect-branch=@var{choice}  -mfunction-return=@var{choice} @gol
> > >  -mindirect-branch-register -mneeded}
> > >
> > > @@ -31808,6 +31809,12 @@ resulting in fairly lengthy prologues and
> epilogues.  Using
> > >  use stubs in the static portion of libgcc to perform these saves and
> restores,
> > >  thus reducing function size at the cost of a few extra instructions.
> > >
> > > +@item -muse-unaligned-vector-move
> > > +@opindex muse-unaligned-vector-move
> > > +@opindex mno-use-unaligned-vector-move
> > > +Use @option{-muse-unaligned-vector-move} to emits unaligned vector
> move
> > > +instructions like vmovdqu, vmovups, vmovupd.
> > > +
> > >  @item -mtls-dialect=@var{type}
> > >  @opindex mtls-dialect
> > >  Generate code to access thread-local storage using the @samp{gnu} or
> > > diff --git
> a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> > > new file mode 100644
> > > index 00000000000..d21eee562ac
> > > --- /dev/null
> > > +++
> b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> > > @@ -0,0 +1,102 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O3 -mavx2 -muse-unaligned-vector-move" } */
> > > +
> > > +#define N 1024
> > > +
> > > +char **cp;
> > > +char **ep;
> > > +char **fp;
> > > +
> > > +void
> > > +test_char ()
> > > +{
> > > +  int i;
> > > +  char **ap = __builtin_assume_aligned (ep, 32);
> > > +  char **zp;
> > > +  for (i = 128; i > 0; i--)
> > > +  {
> > > +    *ap++ = *cp++;
> > > +    *zp++ = *fp++;
> > > +  }
> > > +}
> > > +
> > > +float f1[N], f2[N], f3[N];
> > > +
> > > +void
> > > +test_float (void)
> > > +{
> > > +  for (int i = 0; i < N; i++)
> > > +  {
> > > +    f3[i] = f1[i] * f2[i];
> > > +  }
> > > +}
> > > +
> > > +double d1[N], d2[N], d3[N];
> > > +
> > > +void
> > > +test_double_load (void)
> > > +{
> > > +  for (int i = 0; i < N; i++)
> > > +  {
> > > +    d3[i] = d1[i] * d2[i];
> > > +
> > > +  }
> > > +}
> > > +
> > > +unsigned char uc1[N], uc2[N], uc3[N];
> > > +void
> > > +test_unchar ()
> > > +{
> > > +   for (int i=0;i<N;i++) {
> > > +     uc3[i] = uc1[i] * uc2[i];
> > > +   }
> > > +}
> > > +
> > > +short st1[N], st2[N], st3[N];
> > > +void
> > > +test_short ()
> > > +{
> > > +   for (int i=0;i<N;i++) {
> > > +     st3[i] = st1[i] * st2[i];
> > > +   }
> > > +}
> > > +
> > > +int n1[N], n2[N], n3[N];
> > > +void
> > > +test_int ()
> > > +{
> > > +   for (int i=0;i<N;i++) {
> > > +     n3[i] = n1[i] * n2[i];
> > > +   }
> > > +}
> > > +
> > > +long l1[N], l2[N], l3[N];
> > > +
> > > +void
> > > +test_long ()
> > > +{
> > > +  for (int i=0; i<N; i++)
> > > +  {
> > > +    l3[i] = l1[i] *l2[i];
> > > +  }
> > > +}
> > > +
> > > +long long ll1[N], ll2[N], ll3[N];
> > > +
> > > +void
> > > +test_long_long()
> > > +{
> > > +  for (int i=0;i<N;i++)
> > > +  {
> > > +    ll3[i] = ll1[i]*ll2[i];
> > > +  }
> > > +}
> > > +
> > > +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> > > +/* { dg-final { scan-assembler-not "vmovaps" } } */
> > > +/* { dg-final { scan-assembler-not "vmovapd" } } */
> > > +/* { dg-final { scan-assembler-times "vmovdqu" 19 { target lp64 } } }
> */
> > > +/* { dg-final { scan-assembler-times "vmovdqu" 46 { target x32 } } }
> */
> > > +/* { dg-final { scan-assembler-times "vmovdqu" 47 { target ia32 } } }
> */
> > > +/* { dg-final { scan-assembler-times "vmovups" 2 } } */
> > > +/* { dg-final { scan-assembler-times "vmovupd" 2 } } */
> > > diff --git
> a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> > > new file mode 100644
> > > index 00000000000..65c81105ebd
> > > --- /dev/null
> > > +++
> b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> > > @@ -0,0 +1,107 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O3 -mavx2 -muse-unaligned-vector-move" } */
> > > +
> > > +#include <immintrin.h>
> > > +__m128 value128;
> > > +char src128[16];
> > > +
> > > +__m256 value256;
> > > +float src256[8];
> > > +
> > > +void add128(__m128* pointer) {
> > > +    value128 = _mm_add_ps(value128, *pointer);
> > > +}
> > > +
> > > +void add256(__m256* pointer) {
> > > +    value256 = _mm256_add_ps(value256, *pointer);
> > > +}
> > > +
> > > +__m128d value128d;
> > > +__m128d aux128d;
> > > +float src128f[4];
> > > +float res128f[4];
> > > +double src128d[2];
> > > +double res128d[2];
> > > +
> > > +void add128d(__m128d* pointer, __m128d aux, __m128d* res128d) {
> > > +    value128d = _mm_add_pd(value128d, *pointer);
> > > +    __m128d s1 = _mm_add_pd(aux, *pointer);
> > > +    *res128d = _mm_add_pd(s1, value128d);
> > > +}
> > > +
> > > +__m256d value256d;
> > > +__m256d aux256d;
> > > +float src256f[8];
> > > +float res256f[8];
> > > +double src256d[4];
> > > +double res256d[4];
> > > +
> > > +void add256d(__m256d* pointer, __m256d aux, __m256d* res) {
> > > +    value256d = _mm256_add_pd(value256d, *pointer);
> > > +    __m256d s1 = _mm256_add_pd(aux, *pointer);
> > > +    *res = _mm256_add_pd(s1, value256d);
> > > +}
> > > +
> > > +__m256i value256i;
> > > +__m256i aux256i;
> > > +char src256c[32];
> > > +char res256c[32];
> > > +short src256s[16];
> > > +short res256s[16];
> > > +int src256i[8];
> > > +int res256i[8];
> > > +long long src256l[4];
> > > +long long res256l[4];
> > > +
> > > +void add256i(__m256i* pointer, __m256i aux, __m256i* res) {
> > > +    value256i = _mm256_add_epi32(value256i, *pointer);
> > > +    __m256i s1 = _mm256_add_epi32(aux, *pointer);
> > > +    *res = _mm256_add_epi32(s1, value256i);
> > > +}
> > > +
> > > +void foo1() {
> > > +    add128((__m128*)src128);
> > > +}
> > > +
> > > +void foo2() {
> > > +    add256((__m256*)src256);
> > > +}
> > > +
> > > +void foo3() {
> > > +    add128d((__m128d*)src128d, aux128d, (__m128d*)res128d);
> > > +}
> > > +
> > > +void foo4() {
> > > +    add128d((__m128d*)src128f, aux128d, (__m128d*)res128f);
> > > +}
> > > +
> > > +void foo5() {
> > > +    add256d((__m256d*)src256f, aux256d, (__m256d*)res256f);
> > > +}
> > > +
> > > +void foo6() {
> > > +    add256d((__m256d*)src256d, aux256d, (__m256d*)res256d);
> > > +}
> > > +
> > > +void foo7() {
> > > +    add256i((__m256i*)src256c, aux256i, (__m256i*)res256c);
> > > +}
> > > +
> > > +void foo8() {
> > > +    add256i((__m256i*)src256s, aux256i, (__m256i*)res256s);
> > > +}
> > > +
> > > +void foo9() {
> > > +    add256i((__m256i*)src256i, aux256i, (__m256i*)res256i);
> > > +}
> > > +
> > > +void foo11() {
> > > +    add256i((__m256i*)src256l, aux256i, (__m256i*)res256l);
> > > +}
> > > +
> > > +/* { dg-final { scan-assembler-not "vmovaps" } } */
> > > +/* { dg-final { scan-assembler-not "vmovapd" } } */
> > > +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> > > +/* { dg-final { scan-assembler "vmovups" } } */
> > > +/* { dg-final { scan-assembler "vmovupd" } } */
> > > +/* { dg-final { scan-assembler "vmovdqu" } } */
> > > diff --git
> a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> > > new file mode 100644
> > > index 00000000000..59924304bae
> > > --- /dev/null
> > > +++
> b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> > > @@ -0,0 +1,11 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O3 -mavx2 -mno-use-unaligned-vector-move" } */
> > > +
> > > +#include "avx2-vector-unaligned-load-store-2.c"
> > > +
> > > +/* { dg-final { scan-assembler-not "vmovups" { target { ! ia32 } } }
> } */
> > > +/* { dg-final { scan-assembler-not "vmovupd" { target { ! ia32 } } }
> } */
> > > +/* { dg-final { scan-assembler-not "vmovdqu" } } */
> > > +/* { dg-final { scan-assembler "vmovaps" } } */
> > > +/* { dg-final { scan-assembler "vmovapd" } } */
> > > +/* { dg-final { scan-assembler "vmovdqa" } } */
> > > diff --git
> a/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> b/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> > > new file mode 100644
> > > index 00000000000..3759fd9f2f4
> > > --- /dev/null
> > > +++
> b/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> > > @@ -0,0 +1,13 @@
> > > +/* { dg-do compile } */
> > > +/* { dg-options "-O3 -mavx512vl -muse-unaligned-vector-move" } */
> > > +
> > > +#include "avx2-vector-unaligned-load-store-1.c"
> > > +
> > > +/* { dg-final { scan-assembler-not "vmovdqa32" } } */
> > > +/* { dg-final { scan-assembler-not "vmovdqa64" } } */
> > > +/* { dg-final { scan-assembler-not "vmovaps" } } */
> > > +/* { dg-final { scan-assembler-not "vmovapd" } } */
> > > +/* { dg-final { scan-assembler "vmovdqu32" } } */
> > > +/* { dg-final { scan-assembler "vmovdqu64" } } */
> > > +/* { dg-final { scan-assembler "vmovups" } } */
> > > +/* { dg-final { scan-assembler "vmovupd" } } */
> > > --
> > > 2.18.1
> > >
>
  
Xu Dianhong Oct. 20, 2021, 7:53 a.m. UTC | #4
Thanks for the comments.

> Why would you ever want to have such option?!
I need to ask @H. J. Lu for help to answer this question; he knows more
about the background, so I may not be able to explain it clearly myself.

> Should the documentation at least read "emit unaligned vector moves even
> for aligned storage or when using aligned move intrinsics"?
Thanks for the suggestion. I'll add that wording to the documentation later.

On Wed, Oct 20, 2021 at 3:02 PM Richard Biener <richard.guenther@gmail.com>
wrote:

> On Wed, Oct 20, 2021 at 7:31 AM dianhong.xu--- via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > From: dianhong xu <dianhong.xu@intel.com>
> >
> > Add -muse-unaligned-vector-move option to emit unaligned vector move
> > instaructions.
>
> Why would you ever want to have such option?!  Should the documentation
> at least read "emit unaligned vector moves even for aligned storage or when
> using aligned move intrinsics"?
>
> Richard.
>
> > gcc/ChangeLog:
> >
> >         * config/i386/i386-options.c (ix86_target_string): Add
> >         -muse-unaligned-vector-move.
> >         * config/i386/i386.c (ix86_get_ssemov): Emit unaligned vector if
> use
> >         the new option.
> >         * config/i386/i386.opt (muse-unaligned-vector-move): New.
> >         * config/i386/sse.md: Emit unaligned vector if use this new
> option
> >         * doc/invoke.texi: Document -muse-unaligned-vector-move
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.target/i386/avx2-vector-unaligned-load-store-1.c: New test.
> >         * gcc.target/i386/avx2-vector-unaligned-load-store-2.c: New test.
> >         * gcc.target/i386/avx2-vector-unaligned-load-store-3.c: New test.
> >         * gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c: New
> test.
> > ---
> >  gcc/config/i386/i386-options.c                |   3 +-
> >  gcc/config/i386/i386.c                        |  41 +++----
> >  gcc/config/i386/i386.opt                      |   4 +
> >  gcc/config/i386/sse.md                        |  30 +++--
> >  gcc/doc/invoke.texi                           |   7 ++
> >  .../i386/avx2-vector-unaligned-load-store-1.c | 102 +++++++++++++++++
> >  .../i386/avx2-vector-unaligned-load-store-2.c | 107 ++++++++++++++++++
> >  .../i386/avx2-vector-unaligned-load-store-3.c |  11 ++
> >  .../avx512vl-vector-unaligned-load-store-1.c  |  13 +++
> >  9 files changed, 287 insertions(+), 31 deletions(-)
> >  create mode 100644
> gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> >  create mode 100644
> gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> >  create mode 100644
> gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> >  create mode 100644
> gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> >
> > diff --git a/gcc/config/i386/i386-options.c
> b/gcc/config/i386/i386-options.c
> > index c9523b26f49..eacbd0f5451 100644
> > --- a/gcc/config/i386/i386-options.c
> > +++ b/gcc/config/i386/i386-options.c
> > @@ -397,7 +397,8 @@ ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT
> isa2,
> >      { "-mstv",                         MASK_STV },
> >      { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD
> },
> >      { "-mavx256-split-unaligned-store",
> MASK_AVX256_SPLIT_UNALIGNED_STORE },
> > -    { "-mcall-ms2sysv-xlogues",
> MASK_CALL_MS2SYSV_XLOGUES }
> > +    { "-mcall-ms2sysv-xlogues",
> MASK_CALL_MS2SYSV_XLOGUES },
> > +    { "-muse-unaligned-vector-move",   MASK_USE_UNALIGNED_VECTOR_MOVE }
> >    };
> >
> >    /* Additional flag options.  */
> > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> > index f111411e599..7581e854021 100644
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -5323,8 +5323,9 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> >                  enum attr_mode insn_mode, machine_mode mode)
> >  {
> >    char buf[128];
> > -  bool misaligned_p = (misaligned_operand (operands[0], mode)
> > -                      || misaligned_operand (operands[1], mode));
> > +  bool need_unaligned_p = (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +                          || misaligned_operand (operands[0], mode)
> > +                          || misaligned_operand (operands[1], mode));
> >    bool evex_reg_p = (size == 64
> >                      || EXT_REX_SSE_REG_P (operands[0])
> >                      || EXT_REX_SSE_REG_P (operands[1]));
> > @@ -5380,17 +5381,17 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> >         {
> >         case opcode_int:
> >           if (scalar_mode == E_HFmode)
> > -           opcode = (misaligned_p
> > +           opcode = (need_unaligned_p
> >                       ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
> >                       : "vmovdqa64");
> >           else
> > -           opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> > +           opcode = need_unaligned_p ? "vmovdqu32" : "vmovdqa32";
> >           break;
> >         case opcode_float:
> > -         opcode = misaligned_p ? "vmovups" : "vmovaps";
> > +         opcode = need_unaligned_p ? "vmovups" : "vmovaps";
> >           break;
> >         case opcode_double:
> > -         opcode = misaligned_p ? "vmovupd" : "vmovapd";
> > +         opcode = need_unaligned_p ? "vmovupd" : "vmovapd";
> >           break;
> >         }
> >      }
> > @@ -5399,21 +5400,21 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> >        switch (scalar_mode)
> >         {
> >         case E_HFmode:
> > -         opcode = (misaligned_p
> > +         opcode = (need_unaligned_p
> >                     ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
> >                     : "vmovdqa64");
> >           break;
> >         case E_SFmode:
> > -         opcode = misaligned_p ? "%vmovups" : "%vmovaps";
> > +         opcode = need_unaligned_p ? "%vmovups" : "%vmovaps";
> >           break;
> >         case E_DFmode:
> > -         opcode = misaligned_p ? "%vmovupd" : "%vmovapd";
> > +         opcode = need_unaligned_p ? "%vmovupd" : "%vmovapd";
> >           break;
> >         case E_TFmode:
> >           if (evex_reg_p)
> > -           opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > +           opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
> >           else
> > -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> > +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
> >           break;
> >         default:
> >           gcc_unreachable ();
> > @@ -5425,13 +5426,13 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> >         {
> >         case E_QImode:
> >           if (evex_reg_p)
> > -           opcode = (misaligned_p
> > +           opcode = (need_unaligned_p
> >                       ? (TARGET_AVX512BW
> >                          ? "vmovdqu8"
> >                          : "vmovdqu64")
> >                       : "vmovdqa64");
> >           else
> > -           opcode = (misaligned_p
> > +           opcode = (need_unaligned_p
> >                       ? (TARGET_AVX512BW
> >                          ? "vmovdqu8"
> >                          : "%vmovdqu")
> > @@ -5439,13 +5440,13 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> >           break;
> >         case E_HImode:
> >           if (evex_reg_p)
> > -           opcode = (misaligned_p
> > +           opcode = (need_unaligned_p
> >                       ? (TARGET_AVX512BW
> >                          ? "vmovdqu16"
> >                          : "vmovdqu64")
> >                       : "vmovdqa64");
> >           else
> > -           opcode = (misaligned_p
> > +           opcode = (need_unaligned_p
> >                       ? (TARGET_AVX512BW
> >                          ? "vmovdqu16"
> >                          : "%vmovdqu")
> > @@ -5453,20 +5454,20 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> >           break;
> >         case E_SImode:
> >           if (evex_reg_p)
> > -           opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> > +           opcode = need_unaligned_p ? "vmovdqu32" : "vmovdqa32";
> >           else
> > -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> > +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
> >           break;
> >         case E_DImode:
> >         case E_TImode:
> >         case E_OImode:
> >           if (evex_reg_p)
> > -           opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > +           opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
> >           else
> > -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> > +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
> >           break;
> >         case E_XImode:
> > -         opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> > +         opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
> >           break;
> >         default:
> >           gcc_unreachable ();
> > diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
> > index ad366974b5b..2162d10925a 100644
> > --- a/gcc/config/i386/i386.opt
> > +++ b/gcc/config/i386/i386.opt
> > @@ -1170,3 +1170,7 @@ Support MWAIT and MONITOR built-in functions and
> code generation.
> >  mavx512fp16
> >  Target Mask(ISA2_AVX512FP16) Var(ix86_isa_flags2) Save
> >  Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F
> and AVX512FP16 built-in functions and code generation.
> > +
> > +muse-unaligned-vector-move
> > +Target Mask(USE_UNALIGNED_VECTOR_MOVE) Save
> > +Emit unaligned vector move instructions.
> > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> > index fbf056bf9e6..dc99597f195 100644
> > --- a/gcc/config/i386/sse.md
> > +++ b/gcc/config/i386/sse.md
> > @@ -17059,24 +17059,28 @@
> >    switch (<MODE>mode)
> >      {
> >      case E_V8DFmode:
> > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +         || misaligned_operand (operands[2], <ssequartermode>mode))
> >         return "vmovupd\t{%2, %x0|%x0, %2}";
> >        else
> >         return "vmovapd\t{%2, %x0|%x0, %2}";
> >      case E_V16SFmode:
> > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +         || misaligned_operand (operands[2], <ssequartermode>mode))
> >         return "vmovups\t{%2, %x0|%x0, %2}";
> >        else
> >         return "vmovaps\t{%2, %x0|%x0, %2}";
> >      case E_V8DImode:
> > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +         || misaligned_operand (operands[2], <ssequartermode>mode))
> >         return which_alternative == 2 ? "vmovdqu64\t{%2, %x0|%x0, %2}"
> >                                       : "vmovdqu\t{%2, %x0|%x0, %2}";
> >        else
> >         return which_alternative == 2 ? "vmovdqa64\t{%2, %x0|%x0, %2}"
> >                                       : "vmovdqa\t{%2, %x0|%x0, %2}";
> >      case E_V16SImode:
> > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +         || misaligned_operand (operands[2], <ssequartermode>mode))
> >         return which_alternative == 2 ? "vmovdqu32\t{%2, %x0|%x0, %2}"
> >                                       : "vmovdqu\t{%2, %x0|%x0, %2}";
> >        else
> > @@ -25238,27 +25242,32 @@
> >        switch (get_attr_mode (insn))
> >         {
> >         case MODE_V16SF:
> > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
> >             return "vmovups\t{%1, %t0|%t0, %1}";
> >           else
> >             return "vmovaps\t{%1, %t0|%t0, %1}";
> >         case MODE_V8DF:
> > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
> >             return "vmovupd\t{%1, %t0|%t0, %1}";
> >           else
> >             return "vmovapd\t{%1, %t0|%t0, %1}";
> >         case MODE_V8SF:
> > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
> >             return "vmovups\t{%1, %x0|%x0, %1}";
> >           else
> >             return "vmovaps\t{%1, %x0|%x0, %1}";
> >         case MODE_V4DF:
> > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
> >             return "vmovupd\t{%1, %x0|%x0, %1}";
> >           else
> >             return "vmovapd\t{%1, %x0|%x0, %1}";
> >         case MODE_XI:
> > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
> >             {
> >               if (which_alternative == 2)
> >                 return "vmovdqu\t{%1, %t0|%t0, %1}";
> > @@ -25277,7 +25286,8 @@
> >                 return "vmovdqa32\t{%1, %t0|%t0, %1}";
> >             }
> >         case MODE_OI:
> > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
> > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
> >             {
> >               if (which_alternative == 2)
> >                 return "vmovdqu\t{%1, %x0|%x0, %1}";
> > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> > index 0cc8a8edd05..13777d62437 100644
> > --- a/gcc/doc/invoke.texi
> > +++ b/gcc/doc/invoke.texi
> > @@ -1418,6 +1418,7 @@ See RS/6000 and PowerPC Options.
> >  -mstack-protector-guard-offset=@var{offset} @gol
> >  -mstack-protector-guard-symbol=@var{symbol} @gol
> >  -mgeneral-regs-only  -mcall-ms2sysv-xlogues @gol
> > +-muse-unaligned-vector-move @gol
> >  -mindirect-branch=@var{choice}  -mfunction-return=@var{choice} @gol
> >  -mindirect-branch-register -mneeded}
> >
> > @@ -31808,6 +31809,12 @@ resulting in fairly lengthy prologues and
> epilogues.  Using
> >  use stubs in the static portion of libgcc to perform these saves and
> restores,
> >  thus reducing function size at the cost of a few extra instructions.
> >
> > +@item -muse-unaligned-vector-move
> > +@opindex muse-unaligned-vector-move
> > +@opindex mno-use-unaligned-vector-move
> > +Use @option{-muse-unaligned-vector-move} to emits unaligned vector move
> > +instructions like vmovdqu, vmovups, vmovupd.
> > +
> >  @item -mtls-dialect=@var{type}
> >  @opindex mtls-dialect
> >  Generate code to access thread-local storage using the @samp{gnu} or
> > diff --git
> a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> > new file mode 100644
> > index 00000000000..d21eee562ac
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> > @@ -0,0 +1,102 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -mavx2 -muse-unaligned-vector-move" } */
> > +
> > +#define N 1024
> > +
> > +char **cp;
> > +char **ep;
> > +char **fp;
> > +
> > +void
> > +test_char ()
> > +{
> > +  int i;
> > +  char **ap = __builtin_assume_aligned (ep, 32);
> > +  char **zp;
> > +  for (i = 128; i > 0; i--)
> > +  {
> > +    *ap++ = *cp++;
> > +    *zp++ = *fp++;
> > +  }
> > +}
> > +
> > +float f1[N], f2[N], f3[N];
> > +
> > +void
> > +test_float (void)
> > +{
> > +  for (int i = 0; i < N; i++)
> > +  {
> > +    f3[i] = f1[i] * f2[i];
> > +  }
> > +}
> > +
> > +double d1[N], d2[N], d3[N];
> > +
> > +void
> > +test_double_load (void)
> > +{
> > +  for (int i = 0; i < N; i++)
> > +  {
> > +    d3[i] = d1[i] * d2[i];
> > +
> > +  }
> > +}
> > +
> > +unsigned char uc1[N], uc2[N], uc3[N];
> > +void
> > +test_unchar ()
> > +{
> > +   for (int i=0;i<N;i++) {
> > +     uc3[i] = uc1[i] * uc2[i];
> > +   }
> > +}
> > +
> > +short st1[N], st2[N], st3[N];
> > +void
> > +test_short ()
> > +{
> > +   for (int i=0;i<N;i++) {
> > +     st3[i] = st1[i] * st2[i];
> > +   }
> > +}
> > +
> > +int n1[N], n2[N], n3[N];
> > +void
> > +test_int ()
> > +{
> > +   for (int i=0;i<N;i++) {
> > +     n3[i] = n1[i] * n2[i];
> > +   }
> > +}
> > +
> > +long l1[N], l2[N], l3[N];
> > +
> > +void
> > +test_long ()
> > +{
> > +  for (int i=0; i<N; i++)
> > +  {
> > +    l3[i] = l1[i] *l2[i];
> > +  }
> > +}
> > +
> > +long long ll1[N], ll2[N], ll3[N];
> > +
> > +void
> > +test_long_long()
> > +{
> > +  for (int i=0;i<N;i++)
> > +  {
> > +    ll3[i] = ll1[i]*ll2[i];
> > +  }
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> > +/* { dg-final { scan-assembler-not "vmovaps" } } */
> > +/* { dg-final { scan-assembler-not "vmovapd" } } */
> > +/* { dg-final { scan-assembler-times "vmovdqu" 19 { target lp64 } } } */
> > +/* { dg-final { scan-assembler-times "vmovdqu" 46 { target x32 } } } */
> > +/* { dg-final { scan-assembler-times "vmovdqu" 47 { target ia32 } } } */
> > +/* { dg-final { scan-assembler-times "vmovups" 2 } } */
> > +/* { dg-final { scan-assembler-times "vmovupd" 2 } } */
> > diff --git
> a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> > new file mode 100644
> > index 00000000000..65c81105ebd
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> > @@ -0,0 +1,107 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -mavx2 -muse-unaligned-vector-move" } */
> > +
> > +#include <immintrin.h>
> > +__m128 value128;
> > +char src128[16];
> > +
> > +__m256 value256;
> > +float src256[8];
> > +
> > +void add128(__m128* pointer) {
> > +    value128 = _mm_add_ps(value128, *pointer);
> > +}
> > +
> > +void add256(__m256* pointer) {
> > +    value256 = _mm256_add_ps(value256, *pointer);
> > +}
> > +
> > +__m128d value128d;
> > +__m128d aux128d;
> > +float src128f[4];
> > +float res128f[4];
> > +double src128d[2];
> > +double res128d[2];
> > +
> > +void add128d(__m128d* pointer, __m128d aux, __m128d* res128d) {
> > +    value128d = _mm_add_pd(value128d, *pointer);
> > +    __m128d s1 = _mm_add_pd(aux, *pointer);
> > +    *res128d = _mm_add_pd(s1, value128d);
> > +}
> > +
> > +__m256d value256d;
> > +__m256d aux256d;
> > +float src256f[8];
> > +float res256f[8];
> > +double src256d[4];
> > +double res256d[4];
> > +
> > +void add256d(__m256d* pointer, __m256d aux, __m256d* res) {
> > +    value256d = _mm256_add_pd(value256d, *pointer);
> > +    __m256d s1 = _mm256_add_pd(aux, *pointer);
> > +    *res = _mm256_add_pd(s1, value256d);
> > +}
> > +
> > +__m256i value256i;
> > +__m256i aux256i;
> > +char src256c[32];
> > +char res256c[32];
> > +short src256s[16];
> > +short res256s[16];
> > +int src256i[8];
> > +int res256i[8];
> > +long long src256l[4];
> > +long long res256l[4];
> > +
> > +void add256i(__m256i* pointer, __m256i aux, __m256i* res) {
> > +    value256i = _mm256_add_epi32(value256i, *pointer);
> > +    __m256i s1 = _mm256_add_epi32(aux, *pointer);
> > +    *res = _mm256_add_epi32(s1, value256i);
> > +}
> > +
> > +void foo1() {
> > +    add128((__m128*)src128);
> > +}
> > +
> > +void foo2() {
> > +    add256((__m256*)src256);
> > +}
> > +
> > +void foo3() {
> > +    add128d((__m128d*)src128d, aux128d, (__m128d*)res128d);
> > +}
> > +
> > +void foo4() {
> > +    add128d((__m128d*)src128f, aux128d, (__m128d*)res128f);
> > +}
> > +
> > +void foo5() {
> > +    add256d((__m256d*)src256f, aux256d, (__m256d*)res256f);
> > +}
> > +
> > +void foo6() {
> > +    add256d((__m256d*)src256d, aux256d, (__m256d*)res256d);
> > +}
> > +
> > +void foo7() {
> > +    add256i((__m256i*)src256c, aux256i, (__m256i*)res256c);
> > +}
> > +
> > +void foo8() {
> > +    add256i((__m256i*)src256s, aux256i, (__m256i*)res256s);
> > +}
> > +
> > +void foo9() {
> > +    add256i((__m256i*)src256i, aux256i, (__m256i*)res256i);
> > +}
> > +
> > +void foo11() {
> > +    add256i((__m256i*)src256l, aux256i, (__m256i*)res256l);
> > +}
> > +
> > +/* { dg-final { scan-assembler-not "vmovaps" } } */
> > +/* { dg-final { scan-assembler-not "vmovapd" } } */
> > +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> > +/* { dg-final { scan-assembler "vmovups" } } */
> > +/* { dg-final { scan-assembler "vmovupd" } } */
> > +/* { dg-final { scan-assembler "vmovdqu" } } */
> > diff --git
> a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> > new file mode 100644
> > index 00000000000..59924304bae
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> > @@ -0,0 +1,11 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -mavx2 -mno-use-unaligned-vector-move" } */
> > +
> > +#include "avx2-vector-unaligned-load-store-2.c"
> > +
> > +/* { dg-final { scan-assembler-not "vmovups" { target { ! ia32 } } } }
> */
> > +/* { dg-final { scan-assembler-not "vmovupd" { target { ! ia32 } } } }
> */
> > +/* { dg-final { scan-assembler-not "vmovdqu" } } */
> > +/* { dg-final { scan-assembler "vmovaps" } } */
> > +/* { dg-final { scan-assembler "vmovapd" } } */
> > +/* { dg-final { scan-assembler "vmovdqa" } } */
> > diff --git
> a/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> b/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> > new file mode 100644
> > index 00000000000..3759fd9f2f4
> > --- /dev/null
> > +++
> b/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> > @@ -0,0 +1,13 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O3 -mavx512vl -muse-unaligned-vector-move" } */
> > +
> > +#include "avx2-vector-unaligned-load-store-1.c"
> > +
> > +/* { dg-final { scan-assembler-not "vmovdqa32" } } */
> > +/* { dg-final { scan-assembler-not "vmovdqa64" } } */
> > +/* { dg-final { scan-assembler-not "vmovaps" } } */
> > +/* { dg-final { scan-assembler-not "vmovapd" } } */
> > +/* { dg-final { scan-assembler "vmovdqu32" } } */
> > +/* { dg-final { scan-assembler "vmovdqu64" } } */
> > +/* { dg-final { scan-assembler "vmovups" } } */
> > +/* { dg-final { scan-assembler "vmovupd" } } */
> > --
> > 2.18.1
> >
>
  
Richard Biener Oct. 20, 2021, 8:06 a.m. UTC | #5
On Wed, Oct 20, 2021 at 9:48 AM Xu Dianhong <dianhong7@gmail.com> wrote:
>
> Thanks for the comments.
>
> > And does it even work?
> It works; I checked it in the test cases, and when this option is used the
> compiler emits unaligned vector moves.
> > I fail to see adjustments to memory operands of SSE/AVX instructions that
> > have to be aligned
> I changed all vector moves in "get_ssemov" without checking whether the move
> has memory operands or not.
> > and now would need to be pushed to separate unaligned moves with an extra
> > register?
> I don't think it uses an extra register. I'm not sure I understood your
> question: this patch just changes the final mnemonic of an SSE move from the
> aligned form to the unaligned form; it does not change the operands.

For example

typedef double v2df __attribute__((vector_size(16)));

v2df a, b;

void foo ()
{
  a += b;
}

will compile to

foo:
.LFB0:
        .cfi_startproc
        movapd  a(%rip), %xmm0
        addpd   b(%rip), %xmm0
        movaps  %xmm0, a(%rip)
        ret

what should -muse-unaligned-vector-move do here?  The addpd b(%rip), %xmm0
instruction implies an aligned move from b(%rip).

It looks like your patch could be better implemented in the assembler, by just
using the unaligned encodings for the aligned moves?
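
(For illustration only, a sketch rather than what the patch currently
generates: splitting the folded aligned operand out through an extra register
would look roughly like

        movupd  a(%rip), %xmm0
        movupd  b(%rip), %xmm1    # scratch register instead of the folded aligned load
        addpd   %xmm1, %xmm0
        movups  %xmm0, a(%rip)
        ret

whereas a purely mnemonic rewrite can only touch the standalone mov*
instructions.)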

Richard.

> On Wed, Oct 20, 2021 at 3:04 PM Richard Biener <richard.guenther@gmail.com> wrote:
>>
>> On Wed, Oct 20, 2021 at 9:02 AM Richard Biener
>> <richard.guenther@gmail.com> wrote:
>> >
>> > On Wed, Oct 20, 2021 at 7:31 AM dianhong.xu--- via Gcc-patches
>> > <gcc-patches@gcc.gnu.org> wrote:
>> > >
>> > > From: dianhong xu <dianhong.xu@intel.com>
>> > >
>> > > Add -muse-unaligned-vector-move option to emit unaligned vector move
>> > > instaructions.
>> >
>> > Why would you ever want to have such option?!  Should the documentation
>> > at least read "emit unaligned vector moves even for aligned storage or when
>> > using aligned move intrinsics"?
>>
>> And does it even work?  I fail to see adjustments to memory operands of
>> SSE/AVX instructions that have to be aligned and now would need to be
>> pushed to separate unaligned moves with an extra register?
>>
>> Richard.
>>
>> >
>> > Richard.
>> >
>> > > gcc/ChangeLog:
>> > >
>> > >         * config/i386/i386-options.c (ix86_target_string): Add
>> > >         -muse-unaligned-vector-move.
>> > >         * config/i386/i386.c (ix86_get_ssemov): Emit unaligned vector if use
>> > >         the new option.
>> > >         * config/i386/i386.opt (muse-unaligned-vector-move): New.
>> > >         * config/i386/sse.md: Emit unaligned vector if use this new option
>> > >         * doc/invoke.texi: Document -muse-unaligned-vector-move
>> > >
>> > > gcc/testsuite/ChangeLog:
>> > >
>> > >         * gcc.target/i386/avx2-vector-unaligned-load-store-1.c: New test.
>> > >         * gcc.target/i386/avx2-vector-unaligned-load-store-2.c: New test.
>> > >         * gcc.target/i386/avx2-vector-unaligned-load-store-3.c: New test.
>> > >         * gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c: New test.
>> > > ---
>> > >  gcc/config/i386/i386-options.c                |   3 +-
>> > >  gcc/config/i386/i386.c                        |  41 +++----
>> > >  gcc/config/i386/i386.opt                      |   4 +
>> > >  gcc/config/i386/sse.md                        |  30 +++--
>> > >  gcc/doc/invoke.texi                           |   7 ++
>> > >  .../i386/avx2-vector-unaligned-load-store-1.c | 102 +++++++++++++++++
>> > >  .../i386/avx2-vector-unaligned-load-store-2.c | 107 ++++++++++++++++++
>> > >  .../i386/avx2-vector-unaligned-load-store-3.c |  11 ++
>> > >  .../avx512vl-vector-unaligned-load-store-1.c  |  13 +++
>> > >  9 files changed, 287 insertions(+), 31 deletions(-)
>> > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
>> > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
>> > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
>> > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
>> > >
>> > > diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
>> > > index c9523b26f49..eacbd0f5451 100644
>> > > --- a/gcc/config/i386/i386-options.c
>> > > +++ b/gcc/config/i386/i386-options.c
>> > > @@ -397,7 +397,8 @@ ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
>> > >      { "-mstv",                         MASK_STV },
>> > >      { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
>> > >      { "-mavx256-split-unaligned-store",        MASK_AVX256_SPLIT_UNALIGNED_STORE },
>> > > -    { "-mcall-ms2sysv-xlogues",                MASK_CALL_MS2SYSV_XLOGUES }
>> > > +    { "-mcall-ms2sysv-xlogues",                MASK_CALL_MS2SYSV_XLOGUES },
>> > > +    { "-muse-unaligned-vector-move",   MASK_USE_UNALIGNED_VECTOR_MOVE }
>> > >    };
>> > >
>> > >    /* Additional flag options.  */
>> > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
>> > > index f111411e599..7581e854021 100644
>> > > --- a/gcc/config/i386/i386.c
>> > > +++ b/gcc/config/i386/i386.c
>> > > @@ -5323,8 +5323,9 @@ ix86_get_ssemov (rtx *operands, unsigned size,
>> > >                  enum attr_mode insn_mode, machine_mode mode)
>> > >  {
>> > >    char buf[128];
>> > > -  bool misaligned_p = (misaligned_operand (operands[0], mode)
>> > > -                      || misaligned_operand (operands[1], mode));
>> > > +  bool need_unaligned_p = (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> > > +                          || misaligned_operand (operands[0], mode)
>> > > +                          || misaligned_operand (operands[1], mode));
>> > >    bool evex_reg_p = (size == 64
>> > >                      || EXT_REX_SSE_REG_P (operands[0])
>> > >                      || EXT_REX_SSE_REG_P (operands[1]));
>> > > @@ -5380,17 +5381,17 @@ ix86_get_ssemov (rtx *operands, unsigned size,
>> > >         {
>> > >         case opcode_int:
>> > >           if (scalar_mode == E_HFmode)
>> > > -           opcode = (misaligned_p
>> > > +           opcode = (need_unaligned_p
>> > >                       ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
>> > >                       : "vmovdqa64");
>> > >           else
>> > > -           opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
>> > > +           opcode = need_unaligned_p ? "vmovdqu32" : "vmovdqa32";
>> > >           break;
>> > >         case opcode_float:
>> > > -         opcode = misaligned_p ? "vmovups" : "vmovaps";
>> > > +         opcode = need_unaligned_p ? "vmovups" : "vmovaps";
>> > >           break;
>> > >         case opcode_double:
>> > > -         opcode = misaligned_p ? "vmovupd" : "vmovapd";
>> > > +         opcode = need_unaligned_p ? "vmovupd" : "vmovapd";
>> > >           break;
>> > >         }
>> > >      }
>> > > @@ -5399,21 +5400,21 @@ ix86_get_ssemov (rtx *operands, unsigned size,
>> > >        switch (scalar_mode)
>> > >         {
>> > >         case E_HFmode:
>> > > -         opcode = (misaligned_p
>> > > +         opcode = (need_unaligned_p
>> > >                     ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
>> > >                     : "vmovdqa64");
>> > >           break;
>> > >         case E_SFmode:
>> > > -         opcode = misaligned_p ? "%vmovups" : "%vmovaps";
>> > > +         opcode = need_unaligned_p ? "%vmovups" : "%vmovaps";
>> > >           break;
>> > >         case E_DFmode:
>> > > -         opcode = misaligned_p ? "%vmovupd" : "%vmovapd";
>> > > +         opcode = need_unaligned_p ? "%vmovupd" : "%vmovapd";
>> > >           break;
>> > >         case E_TFmode:
>> > >           if (evex_reg_p)
>> > > -           opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
>> > > +           opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
>> > >           else
>> > > -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
>> > > +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
>> > >           break;
>> > >         default:
>> > >           gcc_unreachable ();
>> > > @@ -5425,13 +5426,13 @@ ix86_get_ssemov (rtx *operands, unsigned size,
>> > >         {
>> > >         case E_QImode:
>> > >           if (evex_reg_p)
>> > > -           opcode = (misaligned_p
>> > > +           opcode = (need_unaligned_p
>> > >                       ? (TARGET_AVX512BW
>> > >                          ? "vmovdqu8"
>> > >                          : "vmovdqu64")
>> > >                       : "vmovdqa64");
>> > >           else
>> > > -           opcode = (misaligned_p
>> > > +           opcode = (need_unaligned_p
>> > >                       ? (TARGET_AVX512BW
>> > >                          ? "vmovdqu8"
>> > >                          : "%vmovdqu")
>> > > @@ -5439,13 +5440,13 @@ ix86_get_ssemov (rtx *operands, unsigned size,
>> > >           break;
>> > >         case E_HImode:
>> > >           if (evex_reg_p)
>> > > -           opcode = (misaligned_p
>> > > +           opcode = (need_unaligned_p
>> > >                       ? (TARGET_AVX512BW
>> > >                          ? "vmovdqu16"
>> > >                          : "vmovdqu64")
>> > >                       : "vmovdqa64");
>> > >           else
>> > > -           opcode = (misaligned_p
>> > > +           opcode = (need_unaligned_p
>> > >                       ? (TARGET_AVX512BW
>> > >                          ? "vmovdqu16"
>> > >                          : "%vmovdqu")
>> > > @@ -5453,20 +5454,20 @@ ix86_get_ssemov (rtx *operands, unsigned size,
>> > >           break;
>> > >         case E_SImode:
>> > >           if (evex_reg_p)
>> > > -           opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
>> > > +           opcode = need_unaligned_p ? "vmovdqu32" : "vmovdqa32";
>> > >           else
>> > > -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
>> > > +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
>> > >           break;
>> > >         case E_DImode:
>> > >         case E_TImode:
>> > >         case E_OImode:
>> > >           if (evex_reg_p)
>> > > -           opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
>> > > +           opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
>> > >           else
>> > > -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
>> > > +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
>> > >           break;
>> > >         case E_XImode:
>> > > -         opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
>> > > +         opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
>> > >           break;
>> > >         default:
>> > >           gcc_unreachable ();
>> > > diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
>> > > index ad366974b5b..2162d10925a 100644
>> > > --- a/gcc/config/i386/i386.opt
>> > > +++ b/gcc/config/i386/i386.opt
>> > > @@ -1170,3 +1170,7 @@ Support MWAIT and MONITOR built-in functions and code generation.
>> > >  mavx512fp16
>> > >  Target Mask(ISA2_AVX512FP16) Var(ix86_isa_flags2) Save
>> > >  Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and AVX512FP16 built-in functions and code generation.
>> > > +
>> > > +muse-unaligned-vector-move
>> > > +Target Mask(USE_UNALIGNED_VECTOR_MOVE) Save
>> > > +Emit unaligned vector move instructions.
>> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
>> > > index fbf056bf9e6..dc99597f195 100644
>> > > --- a/gcc/config/i386/sse.md
>> > > +++ b/gcc/config/i386/sse.md
>> > > @@ -17059,24 +17059,28 @@
>> > >    switch (<MODE>mode)
>> > >      {
>> > >      case E_V8DFmode:
>> > > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
>> > > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> > > +         || misaligned_operand (operands[2], <ssequartermode>mode))
>> > >         return "vmovupd\t{%2, %x0|%x0, %2}";
>> > >        else
>> > >         return "vmovapd\t{%2, %x0|%x0, %2}";
>> > >      case E_V16SFmode:
>> > > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
>> > > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> > > +         || misaligned_operand (operands[2], <ssequartermode>mode))
>> > >         return "vmovups\t{%2, %x0|%x0, %2}";
>> > >        else
>> > >         return "vmovaps\t{%2, %x0|%x0, %2}";
>> > >      case E_V8DImode:
>> > > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
>> > > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> > > +         || misaligned_operand (operands[2], <ssequartermode>mode))
>> > >         return which_alternative == 2 ? "vmovdqu64\t{%2, %x0|%x0, %2}"
>> > >                                       : "vmovdqu\t{%2, %x0|%x0, %2}";
>> > >        else
>> > >         return which_alternative == 2 ? "vmovdqa64\t{%2, %x0|%x0, %2}"
>> > >                                       : "vmovdqa\t{%2, %x0|%x0, %2}";
>> > >      case E_V16SImode:
>> > > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
>> > > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> > > +         || misaligned_operand (operands[2], <ssequartermode>mode))
>> > >         return which_alternative == 2 ? "vmovdqu32\t{%2, %x0|%x0, %2}"
>> > >                                       : "vmovdqu\t{%2, %x0|%x0, %2}";
>> > >        else
>> > > @@ -25238,27 +25242,32 @@
>> > >        switch (get_attr_mode (insn))
>> > >         {
>> > >         case MODE_V16SF:
>> > > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> > > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> > >             return "vmovups\t{%1, %t0|%t0, %1}";
>> > >           else
>> > >             return "vmovaps\t{%1, %t0|%t0, %1}";
>> > >         case MODE_V8DF:
>> > > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> > > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> > >             return "vmovupd\t{%1, %t0|%t0, %1}";
>> > >           else
>> > >             return "vmovapd\t{%1, %t0|%t0, %1}";
>> > >         case MODE_V8SF:
>> > > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> > > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> > >             return "vmovups\t{%1, %x0|%x0, %1}";
>> > >           else
>> > >             return "vmovaps\t{%1, %x0|%x0, %1}";
>> > >         case MODE_V4DF:
>> > > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> > > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> > >             return "vmovupd\t{%1, %x0|%x0, %1}";
>> > >           else
>> > >             return "vmovapd\t{%1, %x0|%x0, %1}";
>> > >         case MODE_XI:
>> > > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> > > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> > >             {
>> > >               if (which_alternative == 2)
>> > >                 return "vmovdqu\t{%1, %t0|%t0, %1}";
>> > > @@ -25277,7 +25286,8 @@
>> > >                 return "vmovdqa32\t{%1, %t0|%t0, %1}";
>> > >             }
>> > >         case MODE_OI:
>> > > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> > > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> > >             {
>> > >               if (which_alternative == 2)
>> > >                 return "vmovdqu\t{%1, %x0|%x0, %1}";
>> > > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
>> > > index 0cc8a8edd05..13777d62437 100644
>> > > --- a/gcc/doc/invoke.texi
>> > > +++ b/gcc/doc/invoke.texi
>> > > @@ -1418,6 +1418,7 @@ See RS/6000 and PowerPC Options.
>> > >  -mstack-protector-guard-offset=@var{offset} @gol
>> > >  -mstack-protector-guard-symbol=@var{symbol} @gol
>> > >  -mgeneral-regs-only  -mcall-ms2sysv-xlogues @gol
>> > > +-muse-unaligned-vector-move @gol
>> > >  -mindirect-branch=@var{choice}  -mfunction-return=@var{choice} @gol
>> > >  -mindirect-branch-register -mneeded}
>> > >
>> > > @@ -31808,6 +31809,12 @@ resulting in fairly lengthy prologues and epilogues.  Using
>> > >  use stubs in the static portion of libgcc to perform these saves and restores,
>> > >  thus reducing function size at the cost of a few extra instructions.
>> > >
>> > > +@item -muse-unaligned-vector-move
>> > > +@opindex muse-unaligned-vector-move
>> > > +@opindex mno-use-unaligned-vector-move
>> > > +Use @option{-muse-unaligned-vector-move} to emits unaligned vector move
>> > > +instructions like vmovdqu, vmovups, vmovupd.
>> > > +
>> > >  @item -mtls-dialect=@var{type}
>> > >  @opindex mtls-dialect
>> > >  Generate code to access thread-local storage using the @samp{gnu} or
>> > > diff --git a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
>> > > new file mode 100644
>> > > index 00000000000..d21eee562ac
>> > > --- /dev/null
>> > > +++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
>> > > @@ -0,0 +1,102 @@
>> > > +/* { dg-do compile } */
>> > > +/* { dg-options "-O3 -mavx2 -muse-unaligned-vector-move" } */
>> > > +
>> > > +#define N 1024
>> > > +
>> > > +char **cp;
>> > > +char **ep;
>> > > +char **fp;
>> > > +
>> > > +void
>> > > +test_char ()
>> > > +{
>> > > +  int i;
>> > > +  char **ap = __builtin_assume_aligned (ep, 32);
>> > > +  char **zp;
>> > > +  for (i = 128; i > 0; i--)
>> > > +  {
>> > > +    *ap++ = *cp++;
>> > > +    *zp++ = *fp++;
>> > > +  }
>> > > +}
>> > > +
>> > > +float f1[N], f2[N], f3[N];
>> > > +
>> > > +void
>> > > +test_float (void)
>> > > +{
>> > > +  for (int i = 0; i < N; i++)
>> > > +  {
>> > > +    f3[i] = f1[i] * f2[i];
>> > > +  }
>> > > +}
>> > > +
>> > > +double d1[N], d2[N], d3[N];
>> > > +
>> > > +void
>> > > +test_double_load (void)
>> > > +{
>> > > +  for (int i = 0; i < N; i++)
>> > > +  {
>> > > +    d3[i] = d1[i] * d2[i];
>> > > +
>> > > +  }
>> > > +}
>> > > +
>> > > +unsigned char uc1[N], uc2[N], uc3[N];
>> > > +void
>> > > +test_unchar ()
>> > > +{
>> > > +   for (int i=0;i<N;i++) {
>> > > +     uc3[i] = uc1[i] * uc2[i];
>> > > +   }
>> > > +}
>> > > +
>> > > +short st1[N], st2[N], st3[N];
>> > > +void
>> > > +test_short ()
>> > > +{
>> > > +   for (int i=0;i<N;i++) {
>> > > +     st3[i] = st1[i] * st2[i];
>> > > +   }
>> > > +}
>> > > +
>> > > +int n1[N], n2[N], n3[N];
>> > > +void
>> > > +test_int ()
>> > > +{
>> > > +   for (int i=0;i<N;i++) {
>> > > +     n3[i] = n1[i] * n2[i];
>> > > +   }
>> > > +}
>> > > +
>> > > +long l1[N], l2[N], l3[N];
>> > > +
>> > > +void
>> > > +test_long ()
>> > > +{
>> > > +  for (int i=0; i<N; i++)
>> > > +  {
>> > > +    l3[i] = l1[i] *l2[i];
>> > > +  }
>> > > +}
>> > > +
>> > > +long long ll1[N], ll2[N], ll3[N];
>> > > +
>> > > +void
>> > > +test_long_long()
>> > > +{
>> > > +  for (int i=0;i<N;i++)
>> > > +  {
>> > > +    ll3[i] = ll1[i]*ll2[i];
>> > > +  }
>> > > +}
>> > > +
>> > > +/* { dg-final { scan-assembler-not "vmovdqa" } } */
>> > > +/* { dg-final { scan-assembler-not "vmovaps" } } */
>> > > +/* { dg-final { scan-assembler-not "vmovapd" } } */
>> > > +/* { dg-final { scan-assembler-times "vmovdqu" 19 { target lp64 } } } */
>> > > +/* { dg-final { scan-assembler-times "vmovdqu" 46 { target x32 } } } */
>> > > +/* { dg-final { scan-assembler-times "vmovdqu" 47 { target ia32 } } } */
>> > > +/* { dg-final { scan-assembler-times "vmovups" 2 } } */
>> > > +/* { dg-final { scan-assembler-times "vmovupd" 2 } } */
>> > > diff --git a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
>> > > new file mode 100644
>> > > index 00000000000..65c81105ebd
>> > > --- /dev/null
>> > > +++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
>> > > @@ -0,0 +1,107 @@
>> > > +/* { dg-do compile } */
>> > > +/* { dg-options "-O3 -mavx2 -muse-unaligned-vector-move" } */
>> > > +
>> > > +#include <immintrin.h>
>> > > +__m128 value128;
>> > > +char src128[16];
>> > > +
>> > > +__m256 value256;
>> > > +float src256[8];
>> > > +
>> > > +void add128(__m128* pointer) {
>> > > +    value128 = _mm_add_ps(value128, *pointer);
>> > > +}
>> > > +
>> > > +void add256(__m256* pointer) {
>> > > +    value256 = _mm256_add_ps(value256, *pointer);
>> > > +}
>> > > +
>> > > +__m128d value128d;
>> > > +__m128d aux128d;
>> > > +float src128f[4];
>> > > +float res128f[4];
>> > > +double src128d[2];
>> > > +double res128d[2];
>> > > +
>> > > +void add128d(__m128d* pointer, __m128d aux, __m128d* res128d) {
>> > > +    value128d = _mm_add_pd(value128d, *pointer);
>> > > +    __m128d s1 = _mm_add_pd(aux, *pointer);
>> > > +    *res128d = _mm_add_pd(s1, value128d);
>> > > +}
>> > > +
>> > > +__m256d value256d;
>> > > +__m256d aux256d;
>> > > +float src256f[8];
>> > > +float res256f[8];
>> > > +double src256d[4];
>> > > +double res256d[4];
>> > > +
>> > > +void add256d(__m256d* pointer, __m256d aux, __m256d* res) {
>> > > +    value256d = _mm256_add_pd(value256d, *pointer);
>> > > +    __m256d s1 = _mm256_add_pd(aux, *pointer);
>> > > +    *res = _mm256_add_pd(s1, value256d);
>> > > +}
>> > > +
>> > > +__m256i value256i;
>> > > +__m256i aux256i;
>> > > +char src256c[32];
>> > > +char res256c[32];
>> > > +short src256s[16];
>> > > +short res256s[16];
>> > > +int src256i[8];
>> > > +int res256i[8];
>> > > +long long src256l[4];
>> > > +long long res256l[4];
>> > > +
>> > > +void add256i(__m256i* pointer, __m256i aux, __m256i* res) {
>> > > +    value256i = _mm256_add_epi32(value256i, *pointer);
>> > > +    __m256i s1 = _mm256_add_epi32(aux, *pointer);
>> > > +    *res = _mm256_add_epi32(s1, value256i);
>> > > +}
>> > > +
>> > > +void foo1() {
>> > > +    add128((__m128*)src128);
>> > > +}
>> > > +
>> > > +void foo2() {
>> > > +    add256((__m256*)src256);
>> > > +}
>> > > +
>> > > +void foo3() {
>> > > +    add128d((__m128d*)src128d, aux128d, (__m128d*)res128d);
>> > > +}
>> > > +
>> > > +void foo4() {
>> > > +    add128d((__m128d*)src128f, aux128d, (__m128d*)res128f);
>> > > +}
>> > > +
>> > > +void foo5() {
>> > > +    add256d((__m256d*)src256f, aux256d, (__m256d*)res256f);
>> > > +}
>> > > +
>> > > +void foo6() {
>> > > +    add256d((__m256d*)src256d, aux256d, (__m256d*)res256d);
>> > > +}
>> > > +
>> > > +void foo7() {
>> > > +    add256i((__m256i*)src256c, aux256i, (__m256i*)res256c);
>> > > +}
>> > > +
>> > > +void foo8() {
>> > > +    add256i((__m256i*)src256s, aux256i, (__m256i*)res256s);
>> > > +}
>> > > +
>> > > +void foo9() {
>> > > +    add256i((__m256i*)src256i, aux256i, (__m256i*)res256i);
>> > > +}
>> > > +
>> > > +void foo11() {
>> > > +    add256i((__m256i*)src256l, aux256i, (__m256i*)res256l);
>> > > +}
>> > > +
>> > > +/* { dg-final { scan-assembler-not "vmovaps" } } */
>> > > +/* { dg-final { scan-assembler-not "vmovapd" } } */
>> > > +/* { dg-final { scan-assembler-not "vmovdqa" } } */
>> > > +/* { dg-final { scan-assembler "vmovups" } } */
>> > > +/* { dg-final { scan-assembler "vmovupd" } } */
>> > > +/* { dg-final { scan-assembler "vmovdqu" } } */
>> > > diff --git a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
>> > > new file mode 100644
>> > > index 00000000000..59924304bae
>> > > --- /dev/null
>> > > +++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
>> > > @@ -0,0 +1,11 @@
>> > > +/* { dg-do compile } */
>> > > +/* { dg-options "-O3 -mavx2 -mno-use-unaligned-vector-move" } */
>> > > +
>> > > +#include "avx2-vector-unaligned-load-store-2.c"
>> > > +
>> > > +/* { dg-final { scan-assembler-not "vmovups" { target { ! ia32 } } } } */
>> > > +/* { dg-final { scan-assembler-not "vmovupd" { target { ! ia32 } } } } */
>> > > +/* { dg-final { scan-assembler-not "vmovdqu" } } */
>> > > +/* { dg-final { scan-assembler "vmovaps" } } */
>> > > +/* { dg-final { scan-assembler "vmovapd" } } */
>> > > +/* { dg-final { scan-assembler "vmovdqa" } } */
>> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
>> > > new file mode 100644
>> > > index 00000000000..3759fd9f2f4
>> > > --- /dev/null
>> > > +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
>> > > @@ -0,0 +1,13 @@
>> > > +/* { dg-do compile } */
>> > > +/* { dg-options "-O3 -mavx512vl -muse-unaligned-vector-move" } */
>> > > +
>> > > +#include "avx2-vector-unaligned-load-store-1.c"
>> > > +
>> > > +/* { dg-final { scan-assembler-not "vmovdqa32" } } */
>> > > +/* { dg-final { scan-assembler-not "vmovdqa64" } } */
>> > > +/* { dg-final { scan-assembler-not "vmovaps" } } */
>> > > +/* { dg-final { scan-assembler-not "vmovapd" } } */
>> > > +/* { dg-final { scan-assembler "vmovdqu32" } } */
>> > > +/* { dg-final { scan-assembler "vmovdqu64" } } */
>> > > +/* { dg-final { scan-assembler "vmovups" } } */
>> > > +/* { dg-final { scan-assembler "vmovupd" } } */
>> > > --
>> > > 2.18.1
>> > >
  
Xu Dianhong Oct. 20, 2021, 10:40 a.m. UTC | #6
Many thanks for your explanation. I got the meaning of the operands.
The "addpd b(%rip), %xmm0" instruction needs "b(%rip)" to be aligned,
otherwise it raises an exception ("Real-Address Mode Exceptions").
I had not considered this situation, where "b(%rip)" has an address
dependence on "a(%rip)", before. I think this situation could be resolved
on the assembler side, apart from producing redundant code like
"movapd 0x200b37(%rip),%xmm1, ... addpd 0x200b37(%rip),%xmm0".
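
To spell that out (just a sketch - the choice of %xmm1 as a scratch
register is made up, this is not real compiler or assembler output),
splitting a folded memory operand would look like:

        # the folded memory operand implies an aligned access:
        addpd   b(%rip), %xmm0

        # an unaligned rewrite has to load through a free register:
        movupd  b(%rip), %xmm1
        addpd   %xmm1, %xmm0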

On Wed, Oct 20, 2021 at 4:06 PM Richard Biener <richard.guenther@gmail.com>
wrote:

> On Wed, Oct 20, 2021 at 9:48 AM Xu Dianhong <dianhong7@gmail.com> wrote:
> >
> > Thanks for the comments.
> >
> > > And does it even work?
> > It works, I checked it in the test case, and when using this option, it
> can emit an unaligned vector move.
> > >I fail to see adjustments to memory operands of
> > SSE/AVX instructions that have to be aligned
> > I changed all vector move in "get_ssemov" without checking the move with
> memory operands or not.
> > >and now would need to be
> > pushed to separate unaligned moves with an extra register?
> > I think it did not use an extra register. I'm not sure if I got your
> question, and this patch just change the final operator of SSE MOVE from
> aligned operator to unaligned operator, and I did not change the operands.
>
> For example
>
> typedef double v2df __attribute__((vector_size(16)));
>
> v2df a, b;
>
> void foo ()
> {
>   a += b;
> }
>
> will compile to
>
> foo:
> .LFB0:
>         .cfi_startproc
>         movapd  a(%rip), %xmm0
>         addpd   b(%rip), %xmm0
>         movaps  %xmm0, a(%rip)
>         ret
>
> what should -muse-unaligned-vector-move do here?  The addpd b(%rip), %xmm0
> instruction implies an aligned move from b(%rip).
>
> It looks your patch could be better implemented in the assembler, just
> using
> the unaligned encodings for aligned moves?
>
> Richard.
>
> > On Wed, Oct 20, 2021 at 3:04 PM Richard Biener <
> richard.guenther@gmail.com> wrote:
> >>
> >> On Wed, Oct 20, 2021 at 9:02 AM Richard Biener
> >> <richard.guenther@gmail.com> wrote:
> >> >
> >> > On Wed, Oct 20, 2021 at 7:31 AM dianhong.xu--- via Gcc-patches
> >> > <gcc-patches@gcc.gnu.org> wrote:
> >> > >
> >> > > From: dianhong xu <dianhong.xu@intel.com>
> >> > >
> >> > > Add -muse-unaligned-vector-move option to emit unaligned vector move
> >> > > instaructions.
> >> >
> >> > Why would you ever want to have such option?!  Should the
> documentation
> >> > at least read "emit unaligned vector moves even for aligned storage
> or when
> >> > using aligned move intrinsics"?
> >>
> >> And does it even work?  I fail to see adjustments to memory operands of
> >> SSE/AVX instructions that have to be aligned and now would need to be
> >> pushed to separate unaligned moves with an extra register?
> >>
> >> Richard.
> >>
> >> >
> >> > Richard.
> >> >
> >> > > gcc/ChangeLog:
> >> > >
> >> > >         * config/i386/i386-options.c (ix86_target_string): Add
> >> > >         -muse-unaligned-vector-move.
> >> > >         * config/i386/i386.c (ix86_get_ssemov): Emit unaligned
> vector if use
> >> > >         the new option.
> >> > >         * config/i386/i386.opt (muse-unaligned-vector-move): New.
> >> > >         * config/i386/sse.md: Emit unaligned vector if use this new
> option
> >> > >         * doc/invoke.texi: Document -muse-unaligned-vector-move
> >> > >
> >> > > gcc/testsuite/ChangeLog:
> >> > >
> >> > >         * gcc.target/i386/avx2-vector-unaligned-load-store-1.c: New
> test.
> >> > >         * gcc.target/i386/avx2-vector-unaligned-load-store-2.c: New
> test.
> >> > >         * gcc.target/i386/avx2-vector-unaligned-load-store-3.c: New
> test.
> >> > >         * gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c:
> New test.
> >> > > ---
> >> > >  gcc/config/i386/i386-options.c                |   3 +-
> >> > >  gcc/config/i386/i386.c                        |  41 +++----
> >> > >  gcc/config/i386/i386.opt                      |   4 +
> >> > >  gcc/config/i386/sse.md                        |  30 +++--
> >> > >  gcc/doc/invoke.texi                           |   7 ++
> >> > >  .../i386/avx2-vector-unaligned-load-store-1.c | 102
> +++++++++++++++++
> >> > >  .../i386/avx2-vector-unaligned-load-store-2.c | 107
> ++++++++++++++++++
> >> > >  .../i386/avx2-vector-unaligned-load-store-3.c |  11 ++
> >> > >  .../avx512vl-vector-unaligned-load-store-1.c  |  13 +++
> >> > >  9 files changed, 287 insertions(+), 31 deletions(-)
> >> > >  create mode 100644
> gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> >> > >  create mode 100644
> gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> >> > >  create mode 100644
> gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> >> > >  create mode 100644
> gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> >> > >
> >> > > diff --git a/gcc/config/i386/i386-options.c
> b/gcc/config/i386/i386-options.c
> >> > > index c9523b26f49..eacbd0f5451 100644
> >> > > --- a/gcc/config/i386/i386-options.c
> >> > > +++ b/gcc/config/i386/i386-options.c
> >> > > @@ -397,7 +397,8 @@ ix86_target_string (HOST_WIDE_INT isa,
> HOST_WIDE_INT isa2,
> >> > >      { "-mstv",                         MASK_STV },
> >> > >      { "-mavx256-split-unaligned-load",
> MASK_AVX256_SPLIT_UNALIGNED_LOAD },
> >> > >      { "-mavx256-split-unaligned-store",
> MASK_AVX256_SPLIT_UNALIGNED_STORE },
> >> > > -    { "-mcall-ms2sysv-xlogues",
> MASK_CALL_MS2SYSV_XLOGUES }
> >> > > +    { "-mcall-ms2sysv-xlogues",
> MASK_CALL_MS2SYSV_XLOGUES },
> >> > > +    { "-muse-unaligned-vector-move",
>  MASK_USE_UNALIGNED_VECTOR_MOVE }
> >> > >    };
> >> > >
> >> > >    /* Additional flag options.  */
> >> > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> >> > > index f111411e599..7581e854021 100644
> >> > > --- a/gcc/config/i386/i386.c
> >> > > +++ b/gcc/config/i386/i386.c
> >> > > @@ -5323,8 +5323,9 @@ ix86_get_ssemov (rtx *operands, unsigned size,
> >> > >                  enum attr_mode insn_mode, machine_mode mode)
> >> > >  {
> >> > >    char buf[128];
> >> > > -  bool misaligned_p = (misaligned_operand (operands[0], mode)
> >> > > -                      || misaligned_operand (operands[1], mode));
> >> > > +  bool need_unaligned_p = (TARGET_USE_UNALIGNED_VECTOR_MOVE
> >> > > +                          || misaligned_operand (operands[0], mode)
> >> > > +                          || misaligned_operand (operands[1],
> mode));
> >> > >    bool evex_reg_p = (size == 64
> >> > >                      || EXT_REX_SSE_REG_P (operands[0])
> >> > >                      || EXT_REX_SSE_REG_P (operands[1]));
> >> > > @@ -5380,17 +5381,17 @@ ix86_get_ssemov (rtx *operands, unsigned
> size,
> >> > >         {
> >> > >         case opcode_int:
> >> > >           if (scalar_mode == E_HFmode)
> >> > > -           opcode = (misaligned_p
> >> > > +           opcode = (need_unaligned_p
> >> > >                       ? (TARGET_AVX512BW ? "vmovdqu16" :
> "vmovdqu64")
> >> > >                       : "vmovdqa64");
> >> > >           else
> >> > > -           opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> >> > > +           opcode = need_unaligned_p ? "vmovdqu32" : "vmovdqa32";
> >> > >           break;
> >> > >         case opcode_float:
> >> > > -         opcode = misaligned_p ? "vmovups" : "vmovaps";
> >> > > +         opcode = need_unaligned_p ? "vmovups" : "vmovaps";
> >> > >           break;
> >> > >         case opcode_double:
> >> > > -         opcode = misaligned_p ? "vmovupd" : "vmovapd";
> >> > > +         opcode = need_unaligned_p ? "vmovupd" : "vmovapd";
> >> > >           break;
> >> > >         }
> >> > >      }
> >> > > @@ -5399,21 +5400,21 @@ ix86_get_ssemov (rtx *operands, unsigned
> size,
> >> > >        switch (scalar_mode)
> >> > >         {
> >> > >         case E_HFmode:
> >> > > -         opcode = (misaligned_p
> >> > > +         opcode = (need_unaligned_p
> >> > >                     ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
> >> > >                     : "vmovdqa64");
> >> > >           break;
> >> > >         case E_SFmode:
> >> > > -         opcode = misaligned_p ? "%vmovups" : "%vmovaps";
> >> > > +         opcode = need_unaligned_p ? "%vmovups" : "%vmovaps";
> >> > >           break;
> >> > >         case E_DFmode:
> >> > > -         opcode = misaligned_p ? "%vmovupd" : "%vmovapd";
> >> > > +         opcode = need_unaligned_p ? "%vmovupd" : "%vmovapd";
> >> > >           break;
> >> > >         case E_TFmode:
> >> > >           if (evex_reg_p)
> >> > > -           opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> >> > > +           opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
> >> > >           else
> >> > > -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> >> > > +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
> >> > >           break;
> >> > >         default:
> >> > >           gcc_unreachable ();
> >> > > @@ -5425,13 +5426,13 @@ ix86_get_ssemov (rtx *operands, unsigned
> size,
> >> > >         {
> >> > >         case E_QImode:
> >> > >           if (evex_reg_p)
> >> > > -           opcode = (misaligned_p
> >> > > +           opcode = (need_unaligned_p
> >> > >                       ? (TARGET_AVX512BW
> >> > >                          ? "vmovdqu8"
> >> > >                          : "vmovdqu64")
> >> > >                       : "vmovdqa64");
> >> > >           else
> >> > > -           opcode = (misaligned_p
> >> > > +           opcode = (need_unaligned_p
> >> > >                       ? (TARGET_AVX512BW
> >> > >                          ? "vmovdqu8"
> >> > >                          : "%vmovdqu")
> >> > > @@ -5439,13 +5440,13 @@ ix86_get_ssemov (rtx *operands, unsigned
> size,
> >> > >           break;
> >> > >         case E_HImode:
> >> > >           if (evex_reg_p)
> >> > > -           opcode = (misaligned_p
> >> > > +           opcode = (need_unaligned_p
> >> > >                       ? (TARGET_AVX512BW
> >> > >                          ? "vmovdqu16"
> >> > >                          : "vmovdqu64")
> >> > >                       : "vmovdqa64");
> >> > >           else
> >> > > -           opcode = (misaligned_p
> >> > > +           opcode = (need_unaligned_p
> >> > >                       ? (TARGET_AVX512BW
> >> > >                          ? "vmovdqu16"
> >> > >                          : "%vmovdqu")
> >> > > @@ -5453,20 +5454,20 @@ ix86_get_ssemov (rtx *operands, unsigned
> size,
> >> > >           break;
> >> > >         case E_SImode:
> >> > >           if (evex_reg_p)
> >> > > -           opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
> >> > > +           opcode = need_unaligned_p ? "vmovdqu32" : "vmovdqa32";
> >> > >           else
> >> > > -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> >> > > +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
> >> > >           break;
> >> > >         case E_DImode:
> >> > >         case E_TImode:
> >> > >         case E_OImode:
> >> > >           if (evex_reg_p)
> >> > > -           opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> >> > > +           opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
> >> > >           else
> >> > > -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
> >> > > +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
> >> > >           break;
> >> > >         case E_XImode:
> >> > > -         opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
> >> > > +         opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
> >> > >           break;
> >> > >         default:
> >> > >           gcc_unreachable ();
> >> > > diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
> >> > > index ad366974b5b..2162d10925a 100644
> >> > > --- a/gcc/config/i386/i386.opt
> >> > > +++ b/gcc/config/i386/i386.opt
> >> > > @@ -1170,3 +1170,7 @@ Support MWAIT and MONITOR built-in functions
> and code generation.
> >> > >  mavx512fp16
> >> > >  Target Mask(ISA2_AVX512FP16) Var(ix86_isa_flags2) Save
> >> > >  Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2,
> AVX512F and AVX512FP16 built-in functions and code generation.
> >> > > +
> >> > > +muse-unaligned-vector-move
> >> > > +Target Mask(USE_UNALIGNED_VECTOR_MOVE) Save
> >> > > +Emit unaligned vector move instructions.
> >> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> >> > > index fbf056bf9e6..dc99597f195 100644
> >> > > --- a/gcc/config/i386/sse.md
> >> > > +++ b/gcc/config/i386/sse.md
> >> > > @@ -17059,24 +17059,28 @@
> >> > >    switch (<MODE>mode)
> >> > >      {
> >> > >      case E_V8DFmode:
> >> > > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> >> > > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> >> > > +         || misaligned_operand (operands[2], <ssequartermode>mode))
> >> > >         return "vmovupd\t{%2, %x0|%x0, %2}";
> >> > >        else
> >> > >         return "vmovapd\t{%2, %x0|%x0, %2}";
> >> > >      case E_V16SFmode:
> >> > > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> >> > > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> >> > > +         || misaligned_operand (operands[2], <ssequartermode>mode))
> >> > >         return "vmovups\t{%2, %x0|%x0, %2}";
> >> > >        else
> >> > >         return "vmovaps\t{%2, %x0|%x0, %2}";
> >> > >      case E_V8DImode:
> >> > > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> >> > > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> >> > > +         || misaligned_operand (operands[2], <ssequartermode>mode))
> >> > >         return which_alternative == 2 ? "vmovdqu64\t{%2, %x0|%x0,
> %2}"
> >> > >                                       : "vmovdqu\t{%2, %x0|%x0,
> %2}";
> >> > >        else
> >> > >         return which_alternative == 2 ? "vmovdqa64\t{%2, %x0|%x0,
> %2}"
> >> > >                                       : "vmovdqa\t{%2, %x0|%x0,
> %2}";
> >> > >      case E_V16SImode:
> >> > > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
> >> > > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> >> > > +         || misaligned_operand (operands[2], <ssequartermode>mode))
> >> > >         return which_alternative == 2 ? "vmovdqu32\t{%2, %x0|%x0,
> %2}"
> >> > >                                       : "vmovdqu\t{%2, %x0|%x0,
> %2}";
> >> > >        else
> >> > > @@ -25238,27 +25242,32 @@
> >> > >        switch (get_attr_mode (insn))
> >> > >         {
> >> > >         case MODE_V16SF:
> >> > > -         if (misaligned_operand (operands[1],
> <ssehalfvecmode>mode))
> >> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> >> > > +             || misaligned_operand (operands[1],
> <ssehalfvecmode>mode))
> >> > >             return "vmovups\t{%1, %t0|%t0, %1}";
> >> > >           else
> >> > >             return "vmovaps\t{%1, %t0|%t0, %1}";
> >> > >         case MODE_V8DF:
> >> > > -         if (misaligned_operand (operands[1],
> <ssehalfvecmode>mode))
> >> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> >> > > +             || misaligned_operand (operands[1],
> <ssehalfvecmode>mode))
> >> > >             return "vmovupd\t{%1, %t0|%t0, %1}";
> >> > >           else
> >> > >             return "vmovapd\t{%1, %t0|%t0, %1}";
> >> > >         case MODE_V8SF:
> >> > > -         if (misaligned_operand (operands[1],
> <ssehalfvecmode>mode))
> >> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> >> > > +             || misaligned_operand (operands[1],
> <ssehalfvecmode>mode))
> >> > >             return "vmovups\t{%1, %x0|%x0, %1}";
> >> > >           else
> >> > >             return "vmovaps\t{%1, %x0|%x0, %1}";
> >> > >         case MODE_V4DF:
> >> > > -         if (misaligned_operand (operands[1],
> <ssehalfvecmode>mode))
> >> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> >> > > +             || misaligned_operand (operands[1],
> <ssehalfvecmode>mode))
> >> > >             return "vmovupd\t{%1, %x0|%x0, %1}";
> >> > >           else
> >> > >             return "vmovapd\t{%1, %x0|%x0, %1}";
> >> > >         case MODE_XI:
> >> > > -         if (misaligned_operand (operands[1],
> <ssehalfvecmode>mode))
> >> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> >> > > +             || misaligned_operand (operands[1],
> <ssehalfvecmode>mode))
> >> > >             {
> >> > >               if (which_alternative == 2)
> >> > >                 return "vmovdqu\t{%1, %t0|%t0, %1}";
> >> > > @@ -25277,7 +25286,8 @@
> >> > >                 return "vmovdqa32\t{%1, %t0|%t0, %1}";
> >> > >             }
> >> > >         case MODE_OI:
> >> > > -         if (misaligned_operand (operands[1],
> <ssehalfvecmode>mode))
> >> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
> >> > > +             || misaligned_operand (operands[1],
> <ssehalfvecmode>mode))
> >> > >             {
> >> > >               if (which_alternative == 2)
> >> > >                 return "vmovdqu\t{%1, %x0|%x0, %1}";
> >> > > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> >> > > index 0cc8a8edd05..13777d62437 100644
> >> > > --- a/gcc/doc/invoke.texi
> >> > > +++ b/gcc/doc/invoke.texi
> >> > > @@ -1418,6 +1418,7 @@ See RS/6000 and PowerPC Options.
> >> > >  -mstack-protector-guard-offset=@var{offset} @gol
> >> > >  -mstack-protector-guard-symbol=@var{symbol} @gol
> >> > >  -mgeneral-regs-only  -mcall-ms2sysv-xlogues @gol
> >> > > +-muse-unaligned-vector-move @gol
> >> > >  -mindirect-branch=@var{choice}  -mfunction-return=@var{choice} @gol
> >> > >  -mindirect-branch-register -mneeded}
> >> > >
> >> > > @@ -31808,6 +31809,12 @@ resulting in fairly lengthy prologues and
> epilogues.  Using
> >> > >  use stubs in the static portion of libgcc to perform these saves
> and restores,
> >> > >  thus reducing function size at the cost of a few extra
> instructions.
> >> > >
> >> > > +@item -muse-unaligned-vector-move
> >> > > +@opindex muse-unaligned-vector-move
> >> > > +@opindex mno-use-unaligned-vector-move
> >> > > +Use @option{-muse-unaligned-vector-move} to emits unaligned vector
> move
> >> > > +instructions like vmovdqu, vmovups, vmovupd.
> >> > > +
> >> > >  @item -mtls-dialect=@var{type}
> >> > >  @opindex mtls-dialect
> >> > >  Generate code to access thread-local storage using the @samp{gnu}
> or
> >> > > diff --git
> a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> >> > > new file mode 100644
> >> > > index 00000000000..d21eee562ac
> >> > > --- /dev/null
> >> > > +++
> b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
> >> > > @@ -0,0 +1,102 @@
> >> > > +/* { dg-do compile } */
> >> > > +/* { dg-options "-O3 -mavx2 -muse-unaligned-vector-move" } */
> >> > > +
> >> > > +#define N 1024
> >> > > +
> >> > > +char **cp;
> >> > > +char **ep;
> >> > > +char **fp;
> >> > > +
> >> > > +void
> >> > > +test_char ()
> >> > > +{
> >> > > +  int i;
> >> > > +  char **ap = __builtin_assume_aligned (ep, 32);
> >> > > +  char **zp;
> >> > > +  for (i = 128; i > 0; i--)
> >> > > +  {
> >> > > +    *ap++ = *cp++;
> >> > > +    *zp++ = *fp++;
> >> > > +  }
> >> > > +}
> >> > > +
> >> > > +float f1[N], f2[N], f3[N];
> >> > > +
> >> > > +void
> >> > > +test_float (void)
> >> > > +{
> >> > > +  for (int i = 0; i < N; i++)
> >> > > +  {
> >> > > +    f3[i] = f1[i] * f2[i];
> >> > > +  }
> >> > > +}
> >> > > +
> >> > > +double d1[N], d2[N], d3[N];
> >> > > +
> >> > > +void
> >> > > +test_double_load (void)
> >> > > +{
> >> > > +  for (int i = 0; i < N; i++)
> >> > > +  {
> >> > > +    d3[i] = d1[i] * d2[i];
> >> > > +
> >> > > +  }
> >> > > +}
> >> > > +
> >> > > +unsigned char uc1[N], uc2[N], uc3[N];
> >> > > +void
> >> > > +test_unchar ()
> >> > > +{
> >> > > +   for (int i=0;i<N;i++) {
> >> > > +     uc3[i] = uc1[i] * uc2[i];
> >> > > +   }
> >> > > +}
> >> > > +
> >> > > +short st1[N], st2[N], st3[N];
> >> > > +void
> >> > > +test_short ()
> >> > > +{
> >> > > +   for (int i=0;i<N;i++) {
> >> > > +     st3[i] = st1[i] * st2[i];
> >> > > +   }
> >> > > +}
> >> > > +
> >> > > +int n1[N], n2[N], n3[N];
> >> > > +void
> >> > > +test_int ()
> >> > > +{
> >> > > +   for (int i=0;i<N;i++) {
> >> > > +     n3[i] = n1[i] * n2[i];
> >> > > +   }
> >> > > +}
> >> > > +
> >> > > +long l1[N], l2[N], l3[N];
> >> > > +
> >> > > +void
> >> > > +test_long ()
> >> > > +{
> >> > > +  for (int i=0; i<N; i++)
> >> > > +  {
> >> > > +    l3[i] = l1[i] *l2[i];
> >> > > +  }
> >> > > +}
> >> > > +
> >> > > +long long ll1[N], ll2[N], ll3[N];
> >> > > +
> >> > > +void
> >> > > +test_long_long()
> >> > > +{
> >> > > +  for (int i=0;i<N;i++)
> >> > > +  {
> >> > > +    ll3[i] = ll1[i]*ll2[i];
> >> > > +  }
> >> > > +}
> >> > > +
> >> > > +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> >> > > +/* { dg-final { scan-assembler-not "vmovaps" } } */
> >> > > +/* { dg-final { scan-assembler-not "vmovapd" } } */
> >> > > +/* { dg-final { scan-assembler-times "vmovdqu" 19 { target lp64 }
> } } */
> >> > > +/* { dg-final { scan-assembler-times "vmovdqu" 46 { target x32 } }
> } */
> >> > > +/* { dg-final { scan-assembler-times "vmovdqu" 47 { target ia32 }
> } } */
> >> > > +/* { dg-final { scan-assembler-times "vmovups" 2 } } */
> >> > > +/* { dg-final { scan-assembler-times "vmovupd" 2 } } */
> >> > > diff --git
> a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> >> > > new file mode 100644
> >> > > index 00000000000..65c81105ebd
> >> > > --- /dev/null
> >> > > +++
> b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
> >> > > @@ -0,0 +1,107 @@
> >> > > +/* { dg-do compile } */
> >> > > +/* { dg-options "-O3 -mavx2 -muse-unaligned-vector-move" } */
> >> > > +
> >> > > +#include <immintrin.h>
> >> > > +__m128 value128;
> >> > > +char src128[16];
> >> > > +
> >> > > +__m256 value256;
> >> > > +float src256[8];
> >> > > +
> >> > > +void add128(__m128* pointer) {
> >> > > +    value128 = _mm_add_ps(value128, *pointer);
> >> > > +}
> >> > > +
> >> > > +void add256(__m256* pointer) {
> >> > > +    value256 = _mm256_add_ps(value256, *pointer);
> >> > > +}
> >> > > +
> >> > > +__m128d value128d;
> >> > > +__m128d aux128d;
> >> > > +float src128f[4];
> >> > > +float res128f[4];
> >> > > +double src128d[2];
> >> > > +double res128d[2];
> >> > > +
> >> > > +void add128d(__m128d* pointer, __m128d aux, __m128d* res128d) {
> >> > > +    value128d = _mm_add_pd(value128d, *pointer);
> >> > > +    __m128d s1 = _mm_add_pd(aux, *pointer);
> >> > > +    *res128d = _mm_add_pd(s1, value128d);
> >> > > +}
> >> > > +
> >> > > +__m256d value256d;
> >> > > +__m256d aux256d;
> >> > > +float src256f[8];
> >> > > +float res256f[8];
> >> > > +double src256d[4];
> >> > > +double res256d[4];
> >> > > +
> >> > > +void add256d(__m256d* pointer, __m256d aux, __m256d* res) {
> >> > > +    value256d = _mm256_add_pd(value256d, *pointer);
> >> > > +    __m256d s1 = _mm256_add_pd(aux, *pointer);
> >> > > +    *res = _mm256_add_pd(s1, value256d);
> >> > > +}
> >> > > +
> >> > > +__m256i value256i;
> >> > > +__m256i aux256i;
> >> > > +char src256c[32];
> >> > > +char res256c[32];
> >> > > +short src256s[16];
> >> > > +short res256s[16];
> >> > > +int src256i[8];
> >> > > +int res256i[8];
> >> > > +long long src256l[4];
> >> > > +long long res256l[4];
> >> > > +
> >> > > +void add256i(__m256i* pointer, __m256i aux, __m256i* res) {
> >> > > +    value256i = _mm256_add_epi32(value256i, *pointer);
> >> > > +    __m256i s1 = _mm256_add_epi32(aux, *pointer);
> >> > > +    *res = _mm256_add_epi32(s1, value256i);
> >> > > +}
> >> > > +
> >> > > +void foo1() {
> >> > > +    add128((__m128*)src128);
> >> > > +}
> >> > > +
> >> > > +void foo2() {
> >> > > +    add256((__m256*)src256);
> >> > > +}
> >> > > +
> >> > > +void foo3() {
> >> > > +    add128d((__m128d*)src128d, aux128d, (__m128d*)res128d);
> >> > > +}
> >> > > +
> >> > > +void foo4() {
> >> > > +    add128d((__m128d*)src128f, aux128d, (__m128d*)res128f);
> >> > > +}
> >> > > +
> >> > > +void foo5() {
> >> > > +    add256d((__m256d*)src256f, aux256d, (__m256d*)res256f);
> >> > > +}
> >> > > +
> >> > > +void foo6() {
> >> > > +    add256d((__m256d*)src256d, aux256d, (__m256d*)res256d);
> >> > > +}
> >> > > +
> >> > > +void foo7() {
> >> > > +    add256i((__m256i*)src256c, aux256i, (__m256i*)res256c);
> >> > > +}
> >> > > +
> >> > > +void foo8() {
> >> > > +    add256i((__m256i*)src256s, aux256i, (__m256i*)res256s);
> >> > > +}
> >> > > +
> >> > > +void foo9() {
> >> > > +    add256i((__m256i*)src256i, aux256i, (__m256i*)res256i);
> >> > > +}
> >> > > +
> >> > > +void foo11() {
> >> > > +    add256i((__m256i*)src256l, aux256i, (__m256i*)res256l);
> >> > > +}
> >> > > +
> >> > > +/* { dg-final { scan-assembler-not "vmovaps" } } */
> >> > > +/* { dg-final { scan-assembler-not "vmovapd" } } */
> >> > > +/* { dg-final { scan-assembler-not "vmovdqa" } } */
> >> > > +/* { dg-final { scan-assembler "vmovups" } } */
> >> > > +/* { dg-final { scan-assembler "vmovupd" } } */
> >> > > +/* { dg-final { scan-assembler "vmovdqu" } } */
> >> > > diff --git
> a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> >> > > new file mode 100644
> >> > > index 00000000000..59924304bae
> >> > > --- /dev/null
> >> > > +++
> b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
> >> > > @@ -0,0 +1,11 @@
> >> > > +/* { dg-do compile } */
> >> > > +/* { dg-options "-O3 -mavx2 -mno-use-unaligned-vector-move" } */
> >> > > +
> >> > > +#include "avx2-vector-unaligned-load-store-2.c"
> >> > > +
> >> > > +/* { dg-final { scan-assembler-not "vmovups" { target { ! ia32 } }
> } } */
> >> > > +/* { dg-final { scan-assembler-not "vmovupd" { target { ! ia32 } }
> } } */
> >> > > +/* { dg-final { scan-assembler-not "vmovdqu" } } */
> >> > > +/* { dg-final { scan-assembler "vmovaps" } } */
> >> > > +/* { dg-final { scan-assembler "vmovapd" } } */
> >> > > +/* { dg-final { scan-assembler "vmovdqa" } } */
> >> > > diff --git
> a/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> b/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> >> > > new file mode 100644
> >> > > index 00000000000..3759fd9f2f4
> >> > > --- /dev/null
> >> > > +++
> b/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
> >> > > @@ -0,0 +1,13 @@
> >> > > +/* { dg-do compile } */
> >> > > +/* { dg-options "-O3 -mavx512vl -muse-unaligned-vector-move" } */
> >> > > +
> >> > > +#include "avx2-vector-unaligned-load-store-1.c"
> >> > > +
> >> > > +/* { dg-final { scan-assembler-not "vmovdqa32" } } */
> >> > > +/* { dg-final { scan-assembler-not "vmovdqa64" } } */
> >> > > +/* { dg-final { scan-assembler-not "vmovaps" } } */
> >> > > +/* { dg-final { scan-assembler-not "vmovapd" } } */
> >> > > +/* { dg-final { scan-assembler "vmovdqu32" } } */
> >> > > +/* { dg-final { scan-assembler "vmovdqu64" } } */
> >> > > +/* { dg-final { scan-assembler "vmovups" } } */
> >> > > +/* { dg-final { scan-assembler "vmovupd" } } */
> >> > > --
> >> > > 2.18.1
> >> > >
>
  
Richard Biener Oct. 20, 2021, 11:18 a.m. UTC | #7
On Wed, Oct 20, 2021 at 12:40 PM Xu Dianhong <dianhong7@gmail.com> wrote:
>
> Many thanks for your explanation. I got the meaning of operands.
> The "addpd b(%rip), %xmm0" instruction needs "b(%rip)" aligned otherwise it will rise a "Real-Address Mode Exceptions".
> I haven't considered this situation  "b(%rip)" has an address dependence of "a(%rip)" before. I think this situation could be resolved on the assembler side except for this dummy code like "movapd 0x200b37(%rip),%xmm1, ... addpd  0x200b37(%rip),%xmm0 ".

Of course the compiler will only emit instructions which have the
constraint of aligned memory when the memory is known to be aligned.
That's why I wonder why you would need such an option.  "Real-Address
Mode Exceptions" may point to the issue, but I wonder what's different
in real mode vs. protected mode - even with segmentation the alignment
of objects should prevail, unless you play linker "tricks" that make
global objects have different alignment - but then it's better to
adjust the respective hooks to not falsely claim such alignment.
Consider for example

   if ((uintptr_t)&a & 0x7)
     foo();
   else
     bar();

GCC will optimize the branch statically to always call bar if 'a'
appears to be aligned, even if you later try to "override" this with
an option.  Alignment is not only about moves, it's also about
knowledge about low bits in addresses and about alias analysis, where
alignment constrains how two objects can overlap.
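
For illustration only - a sketch of what a recent GCC would typically
emit at -O2 for that snippet wrapped in a function (the function name
'test' and the exact output are made up; only the shape matters):

test:
        # &a is known to be 16-byte aligned, so (uintptr_t)&a & 0x7
        # folds to 0 at compile time and only the bar() path survives,
        # here as a tail call:
        jmp     bar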

So - do not lie to the compiler!  A late "workaround" avoiding aligned
SSE moves isn't a proper fix.

Richard.

> On Wed, Oct 20, 2021 at 4:06 PM Richard Biener <richard.guenther@gmail.com> wrote:
>>
>> On Wed, Oct 20, 2021 at 9:48 AM Xu Dianhong <dianhong7@gmail.com> wrote:
>> >
>> > Thanks for the comments.
>> >
>> > > And does it even work?
>> > It works, I checked it in the test case, and when using this option, it can emit an unaligned vector move.
>> > >I fail to see adjustments to memory operands of
>> > SSE/AVX instructions that have to be aligned
>> > I changed all vector move in "get_ssemov" without checking the move with memory operands or not.
>> > >and now would need to be
>> > pushed to separate unaligned moves with an extra register?
>> > I think it did not use an extra register. I'm not sure if I got your question, and this patch just change the final operator of SSE MOVE from aligned operator to unaligned operator, and I did not change the operands.
>>
>> For example
>>
>> typedef double v2df __attribute__((vector_size(16)));
>>
>> v2df a, b;
>>
>> void foo ()
>> {
>>   a += b;
>> }
>>
>> will compile to
>>
>> foo:
>> .LFB0:
>>         .cfi_startproc
>>         movapd  a(%rip), %xmm0
>>         addpd   b(%rip), %xmm0
>>         movaps  %xmm0, a(%rip)
>>         ret
>>
>> what should -muse-unaligned-vector-move do here?  The addpd b(%rip), %xmm0
>> instruction implies an aligned move from b(%rip).
>>
>> It looks your patch could be better implemented in the assembler, just using
>> the unaligned encodings for aligned moves?
>>
>> Richard.
>>
>> > On Wed, Oct 20, 2021 at 3:04 PM Richard Biener <richard.guenther@gmail.com> wrote:
>> >>
>> >> On Wed, Oct 20, 2021 at 9:02 AM Richard Biener
>> >> <richard.guenther@gmail.com> wrote:
>> >> >
>> >> > On Wed, Oct 20, 2021 at 7:31 AM dianhong.xu--- via Gcc-patches
>> >> > <gcc-patches@gcc.gnu.org> wrote:
>> >> > >
>> >> > > From: dianhong xu <dianhong.xu@intel.com>
>> >> > >
>> >> > > Add -muse-unaligned-vector-move option to emit unaligned vector move
>> >> > > instaructions.
>> >> >
>> >> > Why would you ever want to have such option?!  Should the documentation
>> >> > at least read "emit unaligned vector moves even for aligned storage or when
>> >> > using aligned move intrinsics"?
>> >>
>> >> And does it even work?  I fail to see adjustments to memory operands of
>> >> SSE/AVX instructions that have to be aligned and now would need to be
>> >> pushed to separate unaligned moves with an extra register?
>> >>
>> >> Richard.
>> >>
>> >> >
>> >> > Richard.
>> >> >
>> >> > > gcc/ChangeLog:
>> >> > >
>> >> > >         * config/i386/i386-options.c (ix86_target_string): Add
>> >> > >         -muse-unaligned-vector-move.
>> >> > >         * config/i386/i386.c (ix86_get_ssemov): Emit unaligned vector if use
>> >> > >         the new option.
>> >> > >         * config/i386/i386.opt (muse-unaligned-vector-move): New.
>> >> > >         * config/i386/sse.md: Emit unaligned vector if use this new option
>> >> > >         * doc/invoke.texi: Document -muse-unaligned-vector-move
>> >> > >
>> >> > > gcc/testsuite/ChangeLog:
>> >> > >
>> >> > >         * gcc.target/i386/avx2-vector-unaligned-load-store-1.c: New test.
>> >> > >         * gcc.target/i386/avx2-vector-unaligned-load-store-2.c: New test.
>> >> > >         * gcc.target/i386/avx2-vector-unaligned-load-store-3.c: New test.
>> >> > >         * gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c: New test.
>> >> > > ---
>> >> > >  gcc/config/i386/i386-options.c                |   3 +-
>> >> > >  gcc/config/i386/i386.c                        |  41 +++----
>> >> > >  gcc/config/i386/i386.opt                      |   4 +
>> >> > >  gcc/config/i386/sse.md                        |  30 +++--
>> >> > >  gcc/doc/invoke.texi                           |   7 ++
>> >> > >  .../i386/avx2-vector-unaligned-load-store-1.c | 102 +++++++++++++++++
>> >> > >  .../i386/avx2-vector-unaligned-load-store-2.c | 107 ++++++++++++++++++
>> >> > >  .../i386/avx2-vector-unaligned-load-store-3.c |  11 ++
>> >> > >  .../avx512vl-vector-unaligned-load-store-1.c  |  13 +++
>> >> > >  9 files changed, 287 insertions(+), 31 deletions(-)
>> >> > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
>> >> > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
>> >> > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
>> >> > >  create mode 100644 gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
>> >> > >
>> >> > > diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
>> >> > > index c9523b26f49..eacbd0f5451 100644
>> >> > > --- a/gcc/config/i386/i386-options.c
>> >> > > +++ b/gcc/config/i386/i386-options.c
>> >> > > @@ -397,7 +397,8 @@ ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
>> >> > >      { "-mstv",                         MASK_STV },
>> >> > >      { "-mavx256-split-unaligned-load", MASK_AVX256_SPLIT_UNALIGNED_LOAD },
>> >> > >      { "-mavx256-split-unaligned-store",        MASK_AVX256_SPLIT_UNALIGNED_STORE },
>> >> > > -    { "-mcall-ms2sysv-xlogues",                MASK_CALL_MS2SYSV_XLOGUES }
>> >> > > +    { "-mcall-ms2sysv-xlogues",                MASK_CALL_MS2SYSV_XLOGUES },
>> >> > > +    { "-muse-unaligned-vector-move",   MASK_USE_UNALIGNED_VECTOR_MOVE }
>> >> > >    };
>> >> > >
>> >> > >    /* Additional flag options.  */
>> >> > > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
>> >> > > index f111411e599..7581e854021 100644
>> >> > > --- a/gcc/config/i386/i386.c
>> >> > > +++ b/gcc/config/i386/i386.c
>> >> > > @@ -5323,8 +5323,9 @@ ix86_get_ssemov (rtx *operands, unsigned size,
>> >> > >                  enum attr_mode insn_mode, machine_mode mode)
>> >> > >  {
>> >> > >    char buf[128];
>> >> > > -  bool misaligned_p = (misaligned_operand (operands[0], mode)
>> >> > > -                      || misaligned_operand (operands[1], mode));
>> >> > > +  bool need_unaligned_p = (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> >> > > +                          || misaligned_operand (operands[0], mode)
>> >> > > +                          || misaligned_operand (operands[1], mode));
>> >> > >    bool evex_reg_p = (size == 64
>> >> > >                      || EXT_REX_SSE_REG_P (operands[0])
>> >> > >                      || EXT_REX_SSE_REG_P (operands[1]));
>> >> > > @@ -5380,17 +5381,17 @@ ix86_get_ssemov (rtx *operands, unsigned size,
>> >> > >         {
>> >> > >         case opcode_int:
>> >> > >           if (scalar_mode == E_HFmode)
>> >> > > -           opcode = (misaligned_p
>> >> > > +           opcode = (need_unaligned_p
>> >> > >                       ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
>> >> > >                       : "vmovdqa64");
>> >> > >           else
>> >> > > -           opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
>> >> > > +           opcode = need_unaligned_p ? "vmovdqu32" : "vmovdqa32";
>> >> > >           break;
>> >> > >         case opcode_float:
>> >> > > -         opcode = misaligned_p ? "vmovups" : "vmovaps";
>> >> > > +         opcode = need_unaligned_p ? "vmovups" : "vmovaps";
>> >> > >           break;
>> >> > >         case opcode_double:
>> >> > > -         opcode = misaligned_p ? "vmovupd" : "vmovapd";
>> >> > > +         opcode = need_unaligned_p ? "vmovupd" : "vmovapd";
>> >> > >           break;
>> >> > >         }
>> >> > >      }
>> >> > > @@ -5399,21 +5400,21 @@ ix86_get_ssemov (rtx *operands, unsigned size,
>> >> > >        switch (scalar_mode)
>> >> > >         {
>> >> > >         case E_HFmode:
>> >> > > -         opcode = (misaligned_p
>> >> > > +         opcode = (need_unaligned_p
>> >> > >                     ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
>> >> > >                     : "vmovdqa64");
>> >> > >           break;
>> >> > >         case E_SFmode:
>> >> > > -         opcode = misaligned_p ? "%vmovups" : "%vmovaps";
>> >> > > +         opcode = need_unaligned_p ? "%vmovups" : "%vmovaps";
>> >> > >           break;
>> >> > >         case E_DFmode:
>> >> > > -         opcode = misaligned_p ? "%vmovupd" : "%vmovapd";
>> >> > > +         opcode = need_unaligned_p ? "%vmovupd" : "%vmovapd";
>> >> > >           break;
>> >> > >         case E_TFmode:
>> >> > >           if (evex_reg_p)
>> >> > > -           opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
>> >> > > +           opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
>> >> > >           else
>> >> > > -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
>> >> > > +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
>> >> > >           break;
>> >> > >         default:
>> >> > >           gcc_unreachable ();
>> >> > > @@ -5425,13 +5426,13 @@ ix86_get_ssemov (rtx *operands, unsigned size,
>> >> > >         {
>> >> > >         case E_QImode:
>> >> > >           if (evex_reg_p)
>> >> > > -           opcode = (misaligned_p
>> >> > > +           opcode = (need_unaligned_p
>> >> > >                       ? (TARGET_AVX512BW
>> >> > >                          ? "vmovdqu8"
>> >> > >                          : "vmovdqu64")
>> >> > >                       : "vmovdqa64");
>> >> > >           else
>> >> > > -           opcode = (misaligned_p
>> >> > > +           opcode = (need_unaligned_p
>> >> > >                       ? (TARGET_AVX512BW
>> >> > >                          ? "vmovdqu8"
>> >> > >                          : "%vmovdqu")
>> >> > > @@ -5439,13 +5440,13 @@ ix86_get_ssemov (rtx *operands, unsigned size,
>> >> > >           break;
>> >> > >         case E_HImode:
>> >> > >           if (evex_reg_p)
>> >> > > -           opcode = (misaligned_p
>> >> > > +           opcode = (need_unaligned_p
>> >> > >                       ? (TARGET_AVX512BW
>> >> > >                          ? "vmovdqu16"
>> >> > >                          : "vmovdqu64")
>> >> > >                       : "vmovdqa64");
>> >> > >           else
>> >> > > -           opcode = (misaligned_p
>> >> > > +           opcode = (need_unaligned_p
>> >> > >                       ? (TARGET_AVX512BW
>> >> > >                          ? "vmovdqu16"
>> >> > >                          : "%vmovdqu")
>> >> > > @@ -5453,20 +5454,20 @@ ix86_get_ssemov (rtx *operands, unsigned size,
>> >> > >           break;
>> >> > >         case E_SImode:
>> >> > >           if (evex_reg_p)
>> >> > > -           opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
>> >> > > +           opcode = need_unaligned_p ? "vmovdqu32" : "vmovdqa32";
>> >> > >           else
>> >> > > -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
>> >> > > +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
>> >> > >           break;
>> >> > >         case E_DImode:
>> >> > >         case E_TImode:
>> >> > >         case E_OImode:
>> >> > >           if (evex_reg_p)
>> >> > > -           opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
>> >> > > +           opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
>> >> > >           else
>> >> > > -           opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
>> >> > > +           opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
>> >> > >           break;
>> >> > >         case E_XImode:
>> >> > > -         opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
>> >> > > +         opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
>> >> > >           break;
>> >> > >         default:
>> >> > >           gcc_unreachable ();
>> >> > > diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
>> >> > > index ad366974b5b..2162d10925a 100644
>> >> > > --- a/gcc/config/i386/i386.opt
>> >> > > +++ b/gcc/config/i386/i386.opt
>> >> > > @@ -1170,3 +1170,7 @@ Support MWAIT and MONITOR built-in functions and code generation.
>> >> > >  mavx512fp16
>> >> > >  Target Mask(ISA2_AVX512FP16) Var(ix86_isa_flags2) Save
>> >> > >  Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and AVX512FP16 built-in functions and code generation.
>> >> > > +
>> >> > > +muse-unaligned-vector-move
>> >> > > +Target Mask(USE_UNALIGNED_VECTOR_MOVE) Save
>> >> > > +Emit unaligned vector move instructions.
>> >> > > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
>> >> > > index fbf056bf9e6..dc99597f195 100644
>> >> > > --- a/gcc/config/i386/sse.md
>> >> > > +++ b/gcc/config/i386/sse.md
>> >> > > @@ -17059,24 +17059,28 @@
>> >> > >    switch (<MODE>mode)
>> >> > >      {
>> >> > >      case E_V8DFmode:
>> >> > > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
>> >> > > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> >> > > +         || misaligned_operand (operands[2], <ssequartermode>mode))
>> >> > >         return "vmovupd\t{%2, %x0|%x0, %2}";
>> >> > >        else
>> >> > >         return "vmovapd\t{%2, %x0|%x0, %2}";
>> >> > >      case E_V16SFmode:
>> >> > > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
>> >> > > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> >> > > +         || misaligned_operand (operands[2], <ssequartermode>mode))
>> >> > >         return "vmovups\t{%2, %x0|%x0, %2}";
>> >> > >        else
>> >> > >         return "vmovaps\t{%2, %x0|%x0, %2}";
>> >> > >      case E_V8DImode:
>> >> > > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
>> >> > > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> >> > > +         || misaligned_operand (operands[2], <ssequartermode>mode))
>> >> > >         return which_alternative == 2 ? "vmovdqu64\t{%2, %x0|%x0, %2}"
>> >> > >                                       : "vmovdqu\t{%2, %x0|%x0, %2}";
>> >> > >        else
>> >> > >         return which_alternative == 2 ? "vmovdqa64\t{%2, %x0|%x0, %2}"
>> >> > >                                       : "vmovdqa\t{%2, %x0|%x0, %2}";
>> >> > >      case E_V16SImode:
>> >> > > -      if (misaligned_operand (operands[2], <ssequartermode>mode))
>> >> > > +      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> >> > > +         || misaligned_operand (operands[2], <ssequartermode>mode))
>> >> > >         return which_alternative == 2 ? "vmovdqu32\t{%2, %x0|%x0, %2}"
>> >> > >                                       : "vmovdqu\t{%2, %x0|%x0, %2}";
>> >> > >        else
>> >> > > @@ -25238,27 +25242,32 @@
>> >> > >        switch (get_attr_mode (insn))
>> >> > >         {
>> >> > >         case MODE_V16SF:
>> >> > > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> >> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> >> > > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> >> > >             return "vmovups\t{%1, %t0|%t0, %1}";
>> >> > >           else
>> >> > >             return "vmovaps\t{%1, %t0|%t0, %1}";
>> >> > >         case MODE_V8DF:
>> >> > > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> >> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> >> > > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> >> > >             return "vmovupd\t{%1, %t0|%t0, %1}";
>> >> > >           else
>> >> > >             return "vmovapd\t{%1, %t0|%t0, %1}";
>> >> > >         case MODE_V8SF:
>> >> > > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> >> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> >> > > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> >> > >             return "vmovups\t{%1, %x0|%x0, %1}";
>> >> > >           else
>> >> > >             return "vmovaps\t{%1, %x0|%x0, %1}";
>> >> > >         case MODE_V4DF:
>> >> > > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> >> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> >> > > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> >> > >             return "vmovupd\t{%1, %x0|%x0, %1}";
>> >> > >           else
>> >> > >             return "vmovapd\t{%1, %x0|%x0, %1}";
>> >> > >         case MODE_XI:
>> >> > > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> >> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> >> > > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> >> > >             {
>> >> > >               if (which_alternative == 2)
>> >> > >                 return "vmovdqu\t{%1, %t0|%t0, %1}";
>> >> > > @@ -25277,7 +25286,8 @@
>> >> > >                 return "vmovdqa32\t{%1, %t0|%t0, %1}";
>> >> > >             }
>> >> > >         case MODE_OI:
>> >> > > -         if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> >> > > +         if (TARGET_USE_UNALIGNED_VECTOR_MOVE
>> >> > > +             || misaligned_operand (operands[1], <ssehalfvecmode>mode))
>> >> > >             {
>> >> > >               if (which_alternative == 2)
>> >> > >                 return "vmovdqu\t{%1, %x0|%x0, %1}";
>> >> > > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
>> >> > > index 0cc8a8edd05..13777d62437 100644
>> >> > > --- a/gcc/doc/invoke.texi
>> >> > > +++ b/gcc/doc/invoke.texi
>> >> > > @@ -1418,6 +1418,7 @@ See RS/6000 and PowerPC Options.
>> >> > >  -mstack-protector-guard-offset=@var{offset} @gol
>> >> > >  -mstack-protector-guard-symbol=@var{symbol} @gol
>> >> > >  -mgeneral-regs-only  -mcall-ms2sysv-xlogues @gol
>> >> > > +-muse-unaligned-vector-move @gol
>> >> > >  -mindirect-branch=@var{choice}  -mfunction-return=@var{choice} @gol
>> >> > >  -mindirect-branch-register -mneeded}
>> >> > >
>> >> > > @@ -31808,6 +31809,12 @@ resulting in fairly lengthy prologues and epilogues.  Using
>> >> > >  use stubs in the static portion of libgcc to perform these saves and restores,
>> >> > >  thus reducing function size at the cost of a few extra instructions.
>> >> > >
>> >> > > +@item -muse-unaligned-vector-move
>> >> > > +@opindex muse-unaligned-vector-move
>> >> > > +@opindex mno-use-unaligned-vector-move
>> >> > > +Use @option{-muse-unaligned-vector-move} to emits unaligned vector move
>> >> > > +instructions like vmovdqu, vmovups, vmovupd.
>> >> > > +
>> >> > >  @item -mtls-dialect=@var{type}
>> >> > >  @opindex mtls-dialect
>> >> > >  Generate code to access thread-local storage using the @samp{gnu} or
>> >> > > diff --git a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
>> >> > > new file mode 100644
>> >> > > index 00000000000..d21eee562ac
>> >> > > --- /dev/null
>> >> > > +++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
>> >> > > @@ -0,0 +1,102 @@
>> >> > > +/* { dg-do compile } */
>> >> > > +/* { dg-options "-O3 -mavx2 -muse-unaligned-vector-move" } */
>> >> > > +
>> >> > > +#define N 1024
>> >> > > +
>> >> > > +char **cp;
>> >> > > +char **ep;
>> >> > > +char **fp;
>> >> > > +
>> >> > > +void
>> >> > > +test_char ()
>> >> > > +{
>> >> > > +  int i;
>> >> > > +  char **ap = __builtin_assume_aligned (ep, 32);
>> >> > > +  char **zp;
>> >> > > +  for (i = 128; i > 0; i--)
>> >> > > +  {
>> >> > > +    *ap++ = *cp++;
>> >> > > +    *zp++ = *fp++;
>> >> > > +  }
>> >> > > +}
>> >> > > +
>> >> > > +float f1[N], f2[N], f3[N];
>> >> > > +
>> >> > > +void
>> >> > > +test_float (void)
>> >> > > +{
>> >> > > +  for (int i = 0; i < N; i++)
>> >> > > +  {
>> >> > > +    f3[i] = f1[i] * f2[i];
>> >> > > +  }
>> >> > > +}
>> >> > > +
>> >> > > +double d1[N], d2[N], d3[N];
>> >> > > +
>> >> > > +void
>> >> > > +test_double_load (void)
>> >> > > +{
>> >> > > +  for (int i = 0; i < N; i++)
>> >> > > +  {
>> >> > > +    d3[i] = d1[i] * d2[i];
>> >> > > +
>> >> > > +  }
>> >> > > +}
>> >> > > +
>> >> > > +unsigned char uc1[N], uc2[N], uc3[N];
>> >> > > +void
>> >> > > +test_unchar ()
>> >> > > +{
>> >> > > +   for (int i=0;i<N;i++) {
>> >> > > +     uc3[i] = uc1[i] * uc2[i];
>> >> > > +   }
>> >> > > +}
>> >> > > +
>> >> > > +short st1[N], st2[N], st3[N];
>> >> > > +void
>> >> > > +test_short ()
>> >> > > +{
>> >> > > +   for (int i=0;i<N;i++) {
>> >> > > +     st3[i] = st1[i] * st2[i];
>> >> > > +   }
>> >> > > +}
>> >> > > +
>> >> > > +int n1[N], n2[N], n3[N];
>> >> > > +void
>> >> > > +test_int ()
>> >> > > +{
>> >> > > +   for (int i=0;i<N;i++) {
>> >> > > +     n3[i] = n1[i] * n2[i];
>> >> > > +   }
>> >> > > +}
>> >> > > +
>> >> > > +long l1[N], l2[N], l3[N];
>> >> > > +
>> >> > > +void
>> >> > > +test_long ()
>> >> > > +{
>> >> > > +  for (int i=0; i<N; i++)
>> >> > > +  {
>> >> > > +    l3[i] = l1[i] *l2[i];
>> >> > > +  }
>> >> > > +}
>> >> > > +
>> >> > > +long long ll1[N], ll2[N], ll3[N];
>> >> > > +
>> >> > > +void
>> >> > > +test_long_long()
>> >> > > +{
>> >> > > +  for (int i=0;i<N;i++)
>> >> > > +  {
>> >> > > +    ll3[i] = ll1[i]*ll2[i];
>> >> > > +  }
>> >> > > +}
>> >> > > +
>> >> > > +/* { dg-final { scan-assembler-not "vmovdqa" } } */
>> >> > > +/* { dg-final { scan-assembler-not "vmovaps" } } */
>> >> > > +/* { dg-final { scan-assembler-not "vmovapd" } } */
>> >> > > +/* { dg-final { scan-assembler-times "vmovdqu" 19 { target lp64 } } } */
>> >> > > +/* { dg-final { scan-assembler-times "vmovdqu" 46 { target x32 } } } */
>> >> > > +/* { dg-final { scan-assembler-times "vmovdqu" 47 { target ia32 } } } */
>> >> > > +/* { dg-final { scan-assembler-times "vmovups" 2 } } */
>> >> > > +/* { dg-final { scan-assembler-times "vmovupd" 2 } } */
>> >> > > diff --git a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
>> >> > > new file mode 100644
>> >> > > index 00000000000..65c81105ebd
>> >> > > --- /dev/null
>> >> > > +++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
>> >> > > @@ -0,0 +1,107 @@
>> >> > > +/* { dg-do compile } */
>> >> > > +/* { dg-options "-O3 -mavx2 -muse-unaligned-vector-move" } */
>> >> > > +
>> >> > > +#include <immintrin.h>
>> >> > > +__m128 value128;
>> >> > > +char src128[16];
>> >> > > +
>> >> > > +__m256 value256;
>> >> > > +float src256[8];
>> >> > > +
>> >> > > +void add128(__m128* pointer) {
>> >> > > +    value128 = _mm_add_ps(value128, *pointer);
>> >> > > +}
>> >> > > +
>> >> > > +void add256(__m256* pointer) {
>> >> > > +    value256 = _mm256_add_ps(value256, *pointer);
>> >> > > +}
>> >> > > +
>> >> > > +__m128d value128d;
>> >> > > +__m128d aux128d;
>> >> > > +float src128f[4];
>> >> > > +float res128f[4];
>> >> > > +double src128d[2];
>> >> > > +double res128d[2];
>> >> > > +
>> >> > > +void add128d(__m128d* pointer, __m128d aux, __m128d* res128d) {
>> >> > > +    value128d = _mm_add_pd(value128d, *pointer);
>> >> > > +    __m128d s1 = _mm_add_pd(aux, *pointer);
>> >> > > +    *res128d = _mm_add_pd(s1, value128d);
>> >> > > +}
>> >> > > +
>> >> > > +__m256d value256d;
>> >> > > +__m256d aux256d;
>> >> > > +float src256f[8];
>> >> > > +float res256f[8];
>> >> > > +double src256d[4];
>> >> > > +double res256d[4];
>> >> > > +
>> >> > > +void add256d(__m256d* pointer, __m256d aux, __m256d* res) {
>> >> > > +    value256d = _mm256_add_pd(value256d, *pointer);
>> >> > > +    __m256d s1 = _mm256_add_pd(aux, *pointer);
>> >> > > +    *res = _mm256_add_pd(s1, value256d);
>> >> > > +}
>> >> > > +
>> >> > > +__m256i value256i;
>> >> > > +__m256i aux256i;
>> >> > > +char src256c[32];
>> >> > > +char res256c[32];
>> >> > > +short src256s[16];
>> >> > > +short res256s[16];
>> >> > > +int src256i[8];
>> >> > > +int res256i[8];
>> >> > > +long long src256l[4];
>> >> > > +long long res256l[4];
>> >> > > +
>> >> > > +void add256i(__m256i* pointer, __m256i aux, __m256i* res) {
>> >> > > +    value256i = _mm256_add_epi32(value256i, *pointer);
>> >> > > +    __m256i s1 = _mm256_add_epi32(aux, *pointer);
>> >> > > +    *res = _mm256_add_epi32(s1, value256i);
>> >> > > +}
>> >> > > +
>> >> > > +void foo1() {
>> >> > > +    add128((__m128*)src128);
>> >> > > +}
>> >> > > +
>> >> > > +void foo2() {
>> >> > > +    add256((__m256*)src256);
>> >> > > +}
>> >> > > +
>> >> > > +void foo3() {
>> >> > > +    add128d((__m128d*)src128d, aux128d, (__m128d*)res128d);
>> >> > > +}
>> >> > > +
>> >> > > +void foo4() {
>> >> > > +    add128d((__m128d*)src128f, aux128d, (__m128d*)res128f);
>> >> > > +}
>> >> > > +
>> >> > > +void foo5() {
>> >> > > +    add256d((__m256d*)src256f, aux256d, (__m256d*)res256f);
>> >> > > +}
>> >> > > +
>> >> > > +void foo6() {
>> >> > > +    add256d((__m256d*)src256d, aux256d, (__m256d*)res256d);
>> >> > > +}
>> >> > > +
>> >> > > +void foo7() {
>> >> > > +    add256i((__m256i*)src256c, aux256i, (__m256i*)res256c);
>> >> > > +}
>> >> > > +
>> >> > > +void foo8() {
>> >> > > +    add256i((__m256i*)src256s, aux256i, (__m256i*)res256s);
>> >> > > +}
>> >> > > +
>> >> > > +void foo9() {
>> >> > > +    add256i((__m256i*)src256i, aux256i, (__m256i*)res256i);
>> >> > > +}
>> >> > > +
>> >> > > +void foo11() {
>> >> > > +    add256i((__m256i*)src256l, aux256i, (__m256i*)res256l);
>> >> > > +}
>> >> > > +
>> >> > > +/* { dg-final { scan-assembler-not "vmovaps" } } */
>> >> > > +/* { dg-final { scan-assembler-not "vmovapd" } } */
>> >> > > +/* { dg-final { scan-assembler-not "vmovdqa" } } */
>> >> > > +/* { dg-final { scan-assembler "vmovups" } } */
>> >> > > +/* { dg-final { scan-assembler "vmovupd" } } */
>> >> > > +/* { dg-final { scan-assembler "vmovdqu" } } */
>> >> > > diff --git a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
>> >> > > new file mode 100644
>> >> > > index 00000000000..59924304bae
>> >> > > --- /dev/null
>> >> > > +++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
>> >> > > @@ -0,0 +1,11 @@
>> >> > > +/* { dg-do compile } */
>> >> > > +/* { dg-options "-O3 -mavx2 -mno-use-unaligned-vector-move" } */
>> >> > > +
>> >> > > +#include "avx2-vector-unaligned-load-store-2.c"
>> >> > > +
>> >> > > +/* { dg-final { scan-assembler-not "vmovups" { target { ! ia32 } } } } */
>> >> > > +/* { dg-final { scan-assembler-not "vmovupd" { target { ! ia32 } } } } */
>> >> > > +/* { dg-final { scan-assembler-not "vmovdqu" } } */
>> >> > > +/* { dg-final { scan-assembler "vmovaps" } } */
>> >> > > +/* { dg-final { scan-assembler "vmovapd" } } */
>> >> > > +/* { dg-final { scan-assembler "vmovdqa" } } */
>> >> > > diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
>> >> > > new file mode 100644
>> >> > > index 00000000000..3759fd9f2f4
>> >> > > --- /dev/null
>> >> > > +++ b/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
>> >> > > @@ -0,0 +1,13 @@
>> >> > > +/* { dg-do compile } */
>> >> > > +/* { dg-options "-O3 -mavx512vl -muse-unaligned-vector-move" } */
>> >> > > +
>> >> > > +#include "avx2-vector-unaligned-load-store-1.c"
>> >> > > +
>> >> > > +/* { dg-final { scan-assembler-not "vmovdqa32" } } */
>> >> > > +/* { dg-final { scan-assembler-not "vmovdqa64" } } */
>> >> > > +/* { dg-final { scan-assembler-not "vmovaps" } } */
>> >> > > +/* { dg-final { scan-assembler-not "vmovapd" } } */
>> >> > > +/* { dg-final { scan-assembler "vmovdqu32" } } */
>> >> > > +/* { dg-final { scan-assembler "vmovdqu64" } } */
>> >> > > +/* { dg-final { scan-assembler "vmovups" } } */
>> >> > > +/* { dg-final { scan-assembler "vmovupd" } } */
>> >> > > --
>> >> > > 2.18.1
>> >> > >
  
H.J. Lu Oct. 20, 2021, 1:19 p.m. UTC | #8
On Wed, Oct 20, 2021 at 4:18 AM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Wed, Oct 20, 2021 at 12:40 PM Xu Dianhong <dianhong7@gmail.com> wrote:
> >
> > Many thanks for your explanation. I got the meaning of operands.
> > The "addpd b(%rip), %xmm0" instruction needs "b(%rip)" aligned otherwise it will rise a "Real-Address Mode Exceptions".
> > I haven't considered this situation  "b(%rip)" has an address dependence of "a(%rip)" before. I think this situation could be resolved on the assembler side except for this dummy code like "movapd 0x200b37(%rip),%xmm1, ... addpd  0x200b37(%rip),%xmm0 ".
>
> Of course the compiler will only emit instructions which have the
> constraint of aligned memory
> when the memory is known to be aligned.  That's why I wonder why you
> would need such
> option.  "Real-Address Mode Exceptions" may point to the issue, but I
> wonder what's different
> in real mode vs. protected mode - even with segmentation the alignment
> of objects should
> prevail unless you play linker"tricks" that make global objects have
> different alignment - but
> then it's better to adjust the respective hooks to not falsely claim
> such alignment.  Consider
> for example
>
>    if ((uintptr_t)&a & 0x7)
>      foo();
>   else
>      bar();
>
> GCC will optimize the branch statically to always call foo if 'a'
> appears to be aligned,
> even if you later try to "override" this with an option.  Alignment is
> not only about
> moves, it's also about knowledge about low bits in addresses and about
> alias analysis where alignment constrains how two objects can overlap.
>
> So - do not lie to the compiler!  A late "workaround" avoiding aligned
> SSE moves isn't a proper fix.
>

The motivations are

1.  AVX non-load/store ops work on unaligned memory.   Unaligned
load/store on aligned memory is as fast as aligned load/store on Intel
AVX machines.   The new switch makes load/store consistent with
other AVX ops.
2. We don't properly align the stack for AVX on Windows.  This can
be used as a workaround for -mavx on Windows.

We can change TARGET_USE_UNALIGNED_VECTOR_MOVE
to require AVX.
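
To make point 1 concrete, a minimal sketch (same pattern as the new
avx2-vector-unaligned-load-store-1.c test; file name and exact output are
placeholders, assuming this patch is applied):

#define N 1024
float f1[N], f2[N], f3[N];

void
test_float (void)
{
  for (int i = 0; i < N; i++)
    f3[i] = f1[i] * f2[i];
}

/* gcc -O3 -mavx2 -S t.c
     the vectorized accesses to the arrays, whose alignment GCC knows,
     are typically emitted with the aligned forms (vmovaps).
   gcc -O3 -mavx2 -muse-unaligned-vector-move -S t.c
     the same accesses use vmovups only; nothing else in the generated
     code changes, since the option merely selects the unaligned mnemonic.  */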
  
Richard Biener Oct. 20, 2021, 4:58 p.m. UTC | #9
On October 20, 2021 3:19:28 PM GMT+02:00, "H.J. Lu" <hjl.tools@gmail.com> wrote:
>On Wed, Oct 20, 2021 at 4:18 AM Richard Biener
><richard.guenther@gmail.com> wrote:
>>
>> On Wed, Oct 20, 2021 at 12:40 PM Xu Dianhong <dianhong7@gmail.com> wrote:
>> >
>> > Many thanks for your explanation. I got the meaning of operands.
>> > The "addpd b(%rip), %xmm0" instruction needs "b(%rip)" aligned otherwise it will rise a "Real-Address Mode Exceptions".
>> > I haven't considered this situation  "b(%rip)" has an address dependence of "a(%rip)" before. I think this situation could be resolved on the assembler side except for this dummy code like "movapd 0x200b37(%rip),%xmm1, ... addpd  0x200b37(%rip),%xmm0 ".
>>
>> Of course the compiler will only emit instructions which have the
>> constraint of aligned memory
>> when the memory is known to be aligned.  That's why I wonder why you
>> would need such
>> option.  "Real-Address Mode Exceptions" may point to the issue, but I
>> wonder what's different
>> in real mode vs. protected mode - even with segmentation the alignment
>> of objects should
>> prevail unless you play linker"tricks" that make global objects have
>> different alignment - but
>> then it's better to adjust the respective hooks to not falsely claim
>> such alignment.  Consider
>> for example
>>
>>    if ((uintptr_t)&a & 0x7)
>>      foo();
>>   else
>>      bar();
>>
>> GCC will optimize the branch statically to always call foo if 'a'
>> appears to be aligned,
>> even if you later try to "override" this with an option.  Alignment is
>> not only about
>> moves, it's also about knowledge about low bits in addresses and about
>> alias analysis where alignment constrains how two objects can overlap.
>>
>> So - do not lie to the compiler!  A late "workaround" avoiding aligned
>> SSE moves isn't a proper fix.
>>
>
>The motivations are
>
>1.  AVX non-load/store ops work on unaligned memory.   Unaligned
>load/store on aligned memory is as fast as aligned load/store on Intel
>AVX machines.   The new switch makes load/store consistent with
>other AVX ops.
>2. We don't properly align the stack for AVX on Windows.  This can
>be used as a workaround for -mavx on Windows.

But this, with lying that the stack is aligned, causes all of the above mentioned issues and thus needs to be fixed by either properly aligning the stack or not lying to the compiler that we do.

>
>We can change TARGET_USE_UNALIGNED_VECTOR_MOVE
>to require AVX.

But such workaround does not make any sense since it does not fix the fundamental underlying problem. 

Richard. 

>
  
H.J. Lu Oct. 20, 2021, 6:34 p.m. UTC | #10
On Wed, Oct 20, 2021 at 9:58 AM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On October 20, 2021 3:19:28 PM GMT+02:00, "H.J. Lu" <hjl.tools@gmail.com> wrote:
> >On Wed, Oct 20, 2021 at 4:18 AM Richard Biener
> ><richard.guenther@gmail.com> wrote:
> >>
> >> On Wed, Oct 20, 2021 at 12:40 PM Xu Dianhong <dianhong7@gmail.com> wrote:
> >> >
> >> > Many thanks for your explanation. I got the meaning of operands.
> >> > The "addpd b(%rip), %xmm0" instruction needs "b(%rip)" aligned otherwise it will rise a "Real-Address Mode Exceptions".
> >> > I haven't considered this situation  "b(%rip)" has an address dependence of "a(%rip)" before. I think this situation could be resolved on the assembler side except for this dummy code like "movapd 0x200b37(%rip),%xmm1, ... addpd  0x200b37(%rip),%xmm0 ".
> >>
> >> Of course the compiler will only emit instructions which have the
> >> constraint of aligned memory
> >> when the memory is known to be aligned.  That's why I wonder why you
> >> would need such
> >> option.  "Real-Address Mode Exceptions" may point to the issue, but I
> >> wonder what's different
> >> in real mode vs. protected mode - even with segmentation the alignment
> >> of objects should
> >> prevail unless you play linker"tricks" that make global objects have
> >> different alignment - but
> >> then it's better to adjust the respective hooks to not falsely claim
> >> such alignment.  Consider
> >> for example
> >>
> >>    if ((uintptr_t)&a & 0x7)
> >>      foo();
> >>   else
> >>      bar();
> >>
> >> GCC will optimize the branch statically to always call foo if 'a'
> >> appears to be aligned,
> >> even if you later try to "override" this with an option.  Alignment is
> >> not only about
> >> moves, it's also about knowledge about low bits in addresses and about
> >> alias analysis where alignment constrains how two objects can overlap.
> >>
> >> So - do not lie to the compiler!  A late "workaround" avoiding aligned
> >> SSE moves isn't a proper fix.
> >>
> >
> >The motivations are
> >
> >1.  AVX non-load/store ops work on unaligned memory.   Unaligned
> >load/store on aligned memory is as fast as aligned load/store on Intel
> >AVX machines.   The new switch makes load/store consistent with
> >other AVX ops.
> >2. We don't properly align the stack for AVX on Windows.  This can
> >be used as a workaround for -mavx on Windows.
>
> But this, with lying that the stack is aligned, causes all of the above mentioned issues and thus needs to be fixed by either properly aligning the stack or not lying to the compiler that we do.
>
> >
> >We can change TARGET_USE_UNALIGNED_VECTOR_MOVE
> >to require AVX.
>
> But such workaround does not make any sense since it does not fix the fundamental underlying problem.
>

There is a long standing desire to remove alignment checking (#AC(0)).
For integer operations, alignment checking is disabled in hardware.
For AVX ops, alignment checking is disabled in hardware for non-load/store
instructions.  But we can't disable alignment checking in hardware for
aligned load/store instructions.  -muse-unaligned-vector-move implements
disabling alignment checking for all AVX ops.
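
In intrinsics terms, the distinction is roughly the following (a sketch with
hypothetical names; compile with -mavx):

#include <immintrin.h>

char buf[64] __attribute__ ((aligned (32)));

__m256
sum (void)
{
  float *p = (float *) (buf + 4);              /* definitely not 32-byte aligned */
  __m256 x = _mm256_loadu_ps (p);              /* vmovups: no alignment check */
  x = _mm256_add_ps (x, _mm256_loadu_ps (p));  /* the unaligned load can be folded
                                                  into the memory operand of vaddps;
                                                  AVX non-load/store ops do not
                                                  check alignment either */
  /* _mm256_load_ps (p) would emit vmovaps here and fault at run time,
     because the aligned load/store forms (vmovaps/vmovapd/vmovdqa) are
     the ones that still check alignment.  */
  return x;
}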
  
Richard Biener Oct. 21, 2021, 7:15 a.m. UTC | #11
On Wed, Oct 20, 2021 at 8:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Oct 20, 2021 at 9:58 AM Richard Biener
> <richard.guenther@gmail.com> wrote:
> >
> > On October 20, 2021 3:19:28 PM GMT+02:00, "H.J. Lu" <hjl.tools@gmail.com> wrote:
> > >On Wed, Oct 20, 2021 at 4:18 AM Richard Biener
> > ><richard.guenther@gmail.com> wrote:
> > >>
> > >> On Wed, Oct 20, 2021 at 12:40 PM Xu Dianhong <dianhong7@gmail.com> wrote:
> > >> >
> > >> > Many thanks for your explanation. I got the meaning of operands.
> > >> > The "addpd b(%rip), %xmm0" instruction needs "b(%rip)" aligned otherwise it will rise a "Real-Address Mode Exceptions".
> > >> > I haven't considered this situation  "b(%rip)" has an address dependence of "a(%rip)" before. I think this situation could be resolved on the assembler side except for this dummy code like "movapd 0x200b37(%rip),%xmm1, ... addpd  0x200b37(%rip),%xmm0 ".
> > >>
> > >> Of course the compiler will only emit instructions which have the
> > >> constraint of aligned memory
> > >> when the memory is known to be aligned.  That's why I wonder why you
> > >> would need such
> > >> option.  "Real-Address Mode Exceptions" may point to the issue, but I
> > >> wonder what's different
> > >> in real mode vs. protected mode - even with segmentation the alignment
> > >> of objects should
> > >> prevail unless you play linker"tricks" that make global objects have
> > >> different alignment - but
> > >> then it's better to adjust the respective hooks to not falsely claim
> > >> such alignment.  Consider
> > >> for example
> > >>
> > >>    if ((uintptr_t)&a & 0x7)
> > >>      foo();
> > >>   else
> > >>      bar();
> > >>
> > >> GCC will optimize the branch statically to always call foo if 'a'
> > >> appears to be aligned,
> > >> even if you later try to "override" this with an option.  Alignment is
> > >> not only about
> > >> moves, it's also about knowledge about low bits in addresses and about
> > >> alias analysis where alignment constrains how two objects can overlap.
> > >>
> > >> So - do not lie to the compiler!  A late "workaround" avoiding aligned
> > >> SSE moves isn't a proper fix.
> > >>
> > >
> > >The motivations are
> > >
> > >1.  AVX non-load/store ops work on unaligned memory.   Unaligned
> > >load/store on aligned memory is as fast as aligned load/store on Intel
> > >AVX machines.   The new switch makes load/store consistent with
> > >other AVX ops.
> > >2. We don't properly align the stack for AVX on Windows.  This can
> > >be used as a workaround for -mavx on Windows.
> >
> > But this, with lying that the stack is aligned, causes all of the above mentioned issues and thus needs to be fixed by either properly aligning the stack or not lying to the compiler that we do.
> >
> > >
> > >We can change TARGET_USE_UNALIGNED_VECTOR_MOVE
> > >to require AVX.
> >
> > But such workaround does not make any sense since it does not fix the fundamental underlying problem.
> >
>
> There is a long standing desire to remove alignment checking (#AC(0)).
> For integer operations, alignment checking is disabled in hardware.
> For AVX ops, alignment checking is disabled in hardware for non-load/store
> instructions.  But we can't disable alignment checking in hardware for
> aligned load/store instructions.  -muse-unaligned-vector-move implements
> disabling alignment checking for all AVX ops.

No, it does not - it just emits unaligned moves.  The compiler still assumes
aligned memory.  So whatever reason you have for disabling alignment
checking for memory that is known to be aligned, I don't see it.
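
As a concrete sketch of that point (hypothetical code, in the spirit of the
example further up in the thread):

#include <stdint.h>

float a[8] __attribute__ ((aligned (32)));

int
aligned_p (void)
{
  /* GCC folds this test to 1 based on the declared alignment, with or
     without -muse-unaligned-vector-move: the option only picks vmovups
     over vmovaps when a move is emitted, it does not change what the
     compiler assumes about addresses, aliasing or low address bits.  */
  return ((uintptr_t) &a & 0x1f) == 0;
}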

If you want to "fix" broken user code then this doesn't do it.

If you want to avoid the penalty for runtime stack alignment then you simply
have to change the ABI(?) to not require vector types to have big alignment.

Richard.

>
> --
> H.J.
  
H.J. Lu Oct. 21, 2021, 12:10 p.m. UTC | #12
On Thu, Oct 21, 2021 at 12:15 AM Richard Biener
<richard.guenther@gmail.com> wrote:
>
> On Wed, Oct 20, 2021 at 8:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Oct 20, 2021 at 9:58 AM Richard Biener
> > <richard.guenther@gmail.com> wrote:
> > >
> > > On October 20, 2021 3:19:28 PM GMT+02:00, "H.J. Lu" <hjl.tools@gmail.com> wrote:
> > > >On Wed, Oct 20, 2021 at 4:18 AM Richard Biener
> > > ><richard.guenther@gmail.com> wrote:
> > > >>
> > > >> On Wed, Oct 20, 2021 at 12:40 PM Xu Dianhong <dianhong7@gmail.com> wrote:
> > > >> >
> > > >> > Many thanks for your explanation. I got the meaning of operands.
> > > >> > The "addpd b(%rip), %xmm0" instruction needs "b(%rip)" aligned otherwise it will rise a "Real-Address Mode Exceptions".
> > > >> > I haven't considered this situation  "b(%rip)" has an address dependence of "a(%rip)" before. I think this situation could be resolved on the assembler side except for this dummy code like "movapd 0x200b37(%rip),%xmm1, ... addpd  0x200b37(%rip),%xmm0 ".
> > > >>
> > > >> Of course the compiler will only emit instructions which have the
> > > >> constraint of aligned memory
> > > >> when the memory is known to be aligned.  That's why I wonder why you
> > > >> would need such
> > > >> option.  "Real-Address Mode Exceptions" may point to the issue, but I
> > > >> wonder what's different
> > > >> in real mode vs. protected mode - even with segmentation the alignment
> > > >> of objects should
> > > >> prevail unless you play linker"tricks" that make global objects have
> > > >> different alignment - but
> > > >> then it's better to adjust the respective hooks to not falsely claim
> > > >> such alignment.  Consider
> > > >> for example
> > > >>
> > > >>    if ((uintptr_t)&a & 0x7)
> > > >>      foo();
> > > >>   else
> > > >>      bar();
> > > >>
> > > >> GCC will optimize the branch statically to always call foo if 'a'
> > > >> appears to be aligned,
> > > >> even if you later try to "override" this with an option.  Alignment is
> > > >> not only about
> > > >> moves, it's also about knowledge about low bits in addresses and about
> > > >> alias analysis where alignment constrains how two objects can overlap.
> > > >>
> > > >> So - do not lie to the compiler!  A late "workaround" avoiding aligned
> > > >> SSE moves isn't a proper fix.
> > > >>
> > > >
> > > >The motivations are
> > > >
> > > >1.  AVX non-load/store ops work on unaligned memory.   Unaligned
> > > >load/store on aligned memory is as fast as aligned load/store on Intel
> > > >AVX machines.   The new switch makes load/store consistent with
> > > >other AVX ops.
> > > >2. We don't properly align the stack for AVX on Windows.  This can
> > > >be used as a workaround for -mavx on Windows.
> > >
> > > But this, with lying that the stack is aligned, causes all of the above mentioned issues and thus needs to be fixed by either properly aligning the stack or not lying to the compiler that we do.
> > >
> > > >
> > > >We can change TARGET_USE_UNALIGNED_VECTOR_MOVE
> > > >to require AVX.
> > >
> > > But such workaround does not make any sense since it does not fix the fundamental underlying problem.
> > >
> >
> > There is a long standing desire to remove alignment checking (#AC(0)).
> > For integer operations, alignment checking is disabled in hardware.
> > For AVX ops, alignment checking is disabled in hardware for non-load/store
> > instructions.  But we can't disable alignment checking in hardware for
> > aligned load/store instructions.  -muse-unaligned-vector-move implements
> > disabling alignment checking for all AVX ops.
>
> No, it does not - it just emits unaligned moves.  The compiler still assumes
> aligned memory.  So whatever reason you have for disabling alignment
> checking for memory that is known to be aligned, I don't see it.
>
> If you want to "fix" broken user code then this doesn't do it.
>
> If you want to avoid the penalty for runtime stack alignment then you simply
> have to change the ABI(?) to not require vector types to have big alignment.
>
>

Let's drop it.  We will find another way.

Thanks.
  

Patch

diff --git a/gcc/config/i386/i386-options.c b/gcc/config/i386/i386-options.c
index c9523b26f49..eacbd0f5451 100644
--- a/gcc/config/i386/i386-options.c
+++ b/gcc/config/i386/i386-options.c
@@ -397,7 +397,8 @@  ix86_target_string (HOST_WIDE_INT isa, HOST_WIDE_INT isa2,
     { "-mstv",				MASK_STV },
     { "-mavx256-split-unaligned-load",	MASK_AVX256_SPLIT_UNALIGNED_LOAD },
     { "-mavx256-split-unaligned-store",	MASK_AVX256_SPLIT_UNALIGNED_STORE },
-    { "-mcall-ms2sysv-xlogues",		MASK_CALL_MS2SYSV_XLOGUES }
+    { "-mcall-ms2sysv-xlogues",		MASK_CALL_MS2SYSV_XLOGUES },
+    { "-muse-unaligned-vector-move",	MASK_USE_UNALIGNED_VECTOR_MOVE }
   };
 
   /* Additional flag options.  */
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index f111411e599..7581e854021 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -5323,8 +5323,9 @@  ix86_get_ssemov (rtx *operands, unsigned size,
 		 enum attr_mode insn_mode, machine_mode mode)
 {
   char buf[128];
-  bool misaligned_p = (misaligned_operand (operands[0], mode)
-		       || misaligned_operand (operands[1], mode));
+  bool need_unaligned_p = (TARGET_USE_UNALIGNED_VECTOR_MOVE
+			   || misaligned_operand (operands[0], mode)
+			   || misaligned_operand (operands[1], mode));
   bool evex_reg_p = (size == 64
 		     || EXT_REX_SSE_REG_P (operands[0])
 		     || EXT_REX_SSE_REG_P (operands[1]));
@@ -5380,17 +5381,17 @@  ix86_get_ssemov (rtx *operands, unsigned size,
 	{
 	case opcode_int:
 	  if (scalar_mode == E_HFmode)
-	    opcode = (misaligned_p
+	    opcode = (need_unaligned_p
 		      ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
 		      : "vmovdqa64");
 	  else
-	    opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
+	    opcode = need_unaligned_p ? "vmovdqu32" : "vmovdqa32";
 	  break;
 	case opcode_float:
-	  opcode = misaligned_p ? "vmovups" : "vmovaps";
+	  opcode = need_unaligned_p ? "vmovups" : "vmovaps";
 	  break;
 	case opcode_double:
-	  opcode = misaligned_p ? "vmovupd" : "vmovapd";
+	  opcode = need_unaligned_p ? "vmovupd" : "vmovapd";
 	  break;
 	}
     }
@@ -5399,21 +5400,21 @@  ix86_get_ssemov (rtx *operands, unsigned size,
       switch (scalar_mode)
 	{
 	case E_HFmode:
-	  opcode = (misaligned_p
+	  opcode = (need_unaligned_p
 		    ? (TARGET_AVX512BW ? "vmovdqu16" : "vmovdqu64")
 		    : "vmovdqa64");
 	  break;
 	case E_SFmode:
-	  opcode = misaligned_p ? "%vmovups" : "%vmovaps";
+	  opcode = need_unaligned_p ? "%vmovups" : "%vmovaps";
 	  break;
 	case E_DFmode:
-	  opcode = misaligned_p ? "%vmovupd" : "%vmovapd";
+	  opcode = need_unaligned_p ? "%vmovupd" : "%vmovapd";
 	  break;
 	case E_TFmode:
 	  if (evex_reg_p)
-	    opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
+	    opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
 	  else
-	    opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
+	    opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
 	  break;
 	default:
 	  gcc_unreachable ();
@@ -5425,13 +5426,13 @@  ix86_get_ssemov (rtx *operands, unsigned size,
 	{
 	case E_QImode:
 	  if (evex_reg_p)
-	    opcode = (misaligned_p
+	    opcode = (need_unaligned_p
 		      ? (TARGET_AVX512BW
 			 ? "vmovdqu8"
 			 : "vmovdqu64")
 		      : "vmovdqa64");
 	  else
-	    opcode = (misaligned_p
+	    opcode = (need_unaligned_p
 		      ? (TARGET_AVX512BW
 			 ? "vmovdqu8"
 			 : "%vmovdqu")
@@ -5439,13 +5440,13 @@  ix86_get_ssemov (rtx *operands, unsigned size,
 	  break;
 	case E_HImode:
 	  if (evex_reg_p)
-	    opcode = (misaligned_p
+	    opcode = (need_unaligned_p
 		      ? (TARGET_AVX512BW
 			 ? "vmovdqu16"
 			 : "vmovdqu64")
 		      : "vmovdqa64");
 	  else
-	    opcode = (misaligned_p
+	    opcode = (need_unaligned_p
 		      ? (TARGET_AVX512BW
 			 ? "vmovdqu16"
 			 : "%vmovdqu")
@@ -5453,20 +5454,20 @@  ix86_get_ssemov (rtx *operands, unsigned size,
 	  break;
 	case E_SImode:
 	  if (evex_reg_p)
-	    opcode = misaligned_p ? "vmovdqu32" : "vmovdqa32";
+	    opcode = need_unaligned_p ? "vmovdqu32" : "vmovdqa32";
 	  else
-	    opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
+	    opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
 	  break;
 	case E_DImode:
 	case E_TImode:
 	case E_OImode:
 	  if (evex_reg_p)
-	    opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
+	    opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
 	  else
-	    opcode = misaligned_p ? "%vmovdqu" : "%vmovdqa";
+	    opcode = need_unaligned_p ? "%vmovdqu" : "%vmovdqa";
 	  break;
 	case E_XImode:
-	  opcode = misaligned_p ? "vmovdqu64" : "vmovdqa64";
+	  opcode = need_unaligned_p ? "vmovdqu64" : "vmovdqa64";
 	  break;
 	default:
 	  gcc_unreachable ();
diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt
index ad366974b5b..2162d10925a 100644
--- a/gcc/config/i386/i386.opt
+++ b/gcc/config/i386/i386.opt
@@ -1170,3 +1170,7 @@  Support MWAIT and MONITOR built-in functions and code generation.
 mavx512fp16
 Target Mask(ISA2_AVX512FP16) Var(ix86_isa_flags2) Save
 Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, AVX512F and AVX512FP16 built-in functions and code generation.
+
+muse-unaligned-vector-move
+Target Mask(USE_UNALIGNED_VECTOR_MOVE) Save
+Emit unaligned vector move instructions.
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index fbf056bf9e6..dc99597f195 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -17059,24 +17059,28 @@ 
   switch (<MODE>mode)
     {
     case E_V8DFmode:
-      if (misaligned_operand (operands[2], <ssequartermode>mode))
+      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
+	  || misaligned_operand (operands[2], <ssequartermode>mode))
 	return "vmovupd\t{%2, %x0|%x0, %2}";
       else
 	return "vmovapd\t{%2, %x0|%x0, %2}";
     case E_V16SFmode:
-      if (misaligned_operand (operands[2], <ssequartermode>mode))
+      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
+	  || misaligned_operand (operands[2], <ssequartermode>mode))
 	return "vmovups\t{%2, %x0|%x0, %2}";
       else
 	return "vmovaps\t{%2, %x0|%x0, %2}";
     case E_V8DImode:
-      if (misaligned_operand (operands[2], <ssequartermode>mode))
+      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
+	  || misaligned_operand (operands[2], <ssequartermode>mode))
 	return which_alternative == 2 ? "vmovdqu64\t{%2, %x0|%x0, %2}"
 				      : "vmovdqu\t{%2, %x0|%x0, %2}";
       else
 	return which_alternative == 2 ? "vmovdqa64\t{%2, %x0|%x0, %2}"
 				      : "vmovdqa\t{%2, %x0|%x0, %2}";
     case E_V16SImode:
-      if (misaligned_operand (operands[2], <ssequartermode>mode))
+      if (TARGET_USE_UNALIGNED_VECTOR_MOVE
+	  || misaligned_operand (operands[2], <ssequartermode>mode))
 	return which_alternative == 2 ? "vmovdqu32\t{%2, %x0|%x0, %2}"
 				      : "vmovdqu\t{%2, %x0|%x0, %2}";
       else
@@ -25238,27 +25242,32 @@ 
       switch (get_attr_mode (insn))
 	{
 	case MODE_V16SF:
-	  if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
+	  if (TARGET_USE_UNALIGNED_VECTOR_MOVE
+	      || misaligned_operand (operands[1], <ssehalfvecmode>mode))
 	    return "vmovups\t{%1, %t0|%t0, %1}";
 	  else
 	    return "vmovaps\t{%1, %t0|%t0, %1}";
 	case MODE_V8DF:
-	  if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
+	  if (TARGET_USE_UNALIGNED_VECTOR_MOVE
+	      || misaligned_operand (operands[1], <ssehalfvecmode>mode))
 	    return "vmovupd\t{%1, %t0|%t0, %1}";
 	  else
 	    return "vmovapd\t{%1, %t0|%t0, %1}";
 	case MODE_V8SF:
-	  if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
+	  if (TARGET_USE_UNALIGNED_VECTOR_MOVE
+	      || misaligned_operand (operands[1], <ssehalfvecmode>mode))
 	    return "vmovups\t{%1, %x0|%x0, %1}";
 	  else
 	    return "vmovaps\t{%1, %x0|%x0, %1}";
 	case MODE_V4DF:
-	  if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
+	  if (TARGET_USE_UNALIGNED_VECTOR_MOVE
+	      || misaligned_operand (operands[1], <ssehalfvecmode>mode))
 	    return "vmovupd\t{%1, %x0|%x0, %1}";
 	  else
 	    return "vmovapd\t{%1, %x0|%x0, %1}";
 	case MODE_XI:
-	  if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
+	  if (TARGET_USE_UNALIGNED_VECTOR_MOVE
+	      || misaligned_operand (operands[1], <ssehalfvecmode>mode))
 	    {
 	      if (which_alternative == 2)
 		return "vmovdqu\t{%1, %t0|%t0, %1}";
@@ -25277,7 +25286,8 @@ 
 		return "vmovdqa32\t{%1, %t0|%t0, %1}";
 	    }
 	case MODE_OI:
-	  if (misaligned_operand (operands[1], <ssehalfvecmode>mode))
+	  if (TARGET_USE_UNALIGNED_VECTOR_MOVE
+	      || misaligned_operand (operands[1], <ssehalfvecmode>mode))
 	    {
 	      if (which_alternative == 2)
 		return "vmovdqu\t{%1, %x0|%x0, %1}";
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 0cc8a8edd05..13777d62437 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -1418,6 +1418,7 @@  See RS/6000 and PowerPC Options.
 -mstack-protector-guard-offset=@var{offset} @gol
 -mstack-protector-guard-symbol=@var{symbol} @gol
 -mgeneral-regs-only  -mcall-ms2sysv-xlogues @gol
+-muse-unaligned-vector-move @gol
 -mindirect-branch=@var{choice}  -mfunction-return=@var{choice} @gol
 -mindirect-branch-register -mneeded}
 
@@ -31808,6 +31809,12 @@  resulting in fairly lengthy prologues and epilogues.  Using
 use stubs in the static portion of libgcc to perform these saves and restores,
 thus reducing function size at the cost of a few extra instructions.
 
+@item -muse-unaligned-vector-move
+@opindex muse-unaligned-vector-move
+@opindex mno-use-unaligned-vector-move
+Use @option{-muse-unaligned-vector-move} to emit unaligned vector move
+instructions like vmovdqu, vmovups, vmovupd.
+
 @item -mtls-dialect=@var{type}
 @opindex mtls-dialect
 Generate code to access thread-local storage using the @samp{gnu} or
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
new file mode 100644
index 00000000000..d21eee562ac
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-1.c
@@ -0,0 +1,102 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -muse-unaligned-vector-move" } */
+
+#define N 1024
+
+char **cp;
+char **ep;
+char **fp;
+
+void
+test_char ()
+{
+  int i;
+  char **ap = __builtin_assume_aligned (ep, 32);
+  char **zp;
+  for (i = 128; i > 0; i--)
+  {
+    *ap++ = *cp++;
+    *zp++ = *fp++;
+  }
+}
+
+float f1[N], f2[N], f3[N];
+
+void
+test_float (void)
+{
+  for (int i = 0; i < N; i++)
+  {
+    f3[i] = f1[i] * f2[i];
+  }
+}
+
+double d1[N], d2[N], d3[N];
+
+void
+test_double_load (void)
+{
+  for (int i = 0; i < N; i++)
+  {
+    d3[i] = d1[i] * d2[i];
+
+  }
+}
+
+unsigned char uc1[N], uc2[N], uc3[N];
+void
+test_unchar ()
+{
+   for (int i=0;i<N;i++) {
+     uc3[i] = uc1[i] * uc2[i];
+   }
+}
+
+short st1[N], st2[N], st3[N];
+void
+test_short ()
+{
+   for (int i=0;i<N;i++) {
+     st3[i] = st1[i] * st2[i];
+   }
+}
+
+int n1[N], n2[N], n3[N];
+void
+test_int ()
+{
+   for (int i=0;i<N;i++) {
+     n3[i] = n1[i] * n2[i];
+   }
+}
+
+long l1[N], l2[N], l3[N];
+
+void
+test_long ()
+{
+  for (int i=0; i<N; i++)
+  {
+    l3[i] = l1[i] *l2[i];
+  }
+}
+
+long long ll1[N], ll2[N], ll3[N];
+
+void
+test_long_long()
+{
+  for (int i=0;i<N;i++) 
+  {
+    ll3[i] = ll1[i]*ll2[i];
+  }
+}
+
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovapd" } } */
+/* { dg-final { scan-assembler-times "vmovdqu" 19 { target lp64 } } } */
+/* { dg-final { scan-assembler-times "vmovdqu" 46 { target x32 } } } */
+/* { dg-final { scan-assembler-times "vmovdqu" 47 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "vmovups" 2 } } */
+/* { dg-final { scan-assembler-times "vmovupd" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
new file mode 100644
index 00000000000..65c81105ebd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-2.c
@@ -0,0 +1,107 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -muse-unaligned-vector-move" } */
+
+#include <immintrin.h>
+__m128 value128;
+char src128[16];
+
+__m256 value256;
+float src256[8];
+
+void add128(__m128* pointer) {
+    value128 = _mm_add_ps(value128, *pointer);
+}
+
+void add256(__m256* pointer) {
+    value256 = _mm256_add_ps(value256, *pointer);
+}
+
+__m128d value128d;
+__m128d aux128d;
+float src128f[4];
+float res128f[4];
+double src128d[2];
+double res128d[2];
+
+void add128d(__m128d* pointer, __m128d aux, __m128d* res128d) {
+    value128d = _mm_add_pd(value128d, *pointer);
+    __m128d s1 = _mm_add_pd(aux, *pointer);
+    *res128d = _mm_add_pd(s1, value128d);
+}
+
+__m256d value256d;
+__m256d aux256d;
+float src256f[8];
+float res256f[8];
+double src256d[4];
+double res256d[4];
+
+void add256d(__m256d* pointer, __m256d aux, __m256d* res) {
+    value256d = _mm256_add_pd(value256d, *pointer);
+    __m256d s1 = _mm256_add_pd(aux, *pointer);
+    *res = _mm256_add_pd(s1, value256d);
+}
+
+__m256i value256i;
+__m256i aux256i;
+char src256c[32];
+char res256c[32];
+short src256s[16];
+short res256s[16];
+int src256i[8];
+int res256i[8];
+long long src256l[4];
+long long res256l[4];
+
+void add256i(__m256i* pointer, __m256i aux, __m256i* res) {
+    value256i = _mm256_add_epi32(value256i, *pointer);
+    __m256i s1 = _mm256_add_epi32(aux, *pointer);
+    *res = _mm256_add_epi32(s1, value256i);
+}
+
+void foo1() {
+    add128((__m128*)src128);
+}
+
+void foo2() {
+    add256((__m256*)src256);
+}
+
+void foo3() {
+    add128d((__m128d*)src128d, aux128d, (__m128d*)res128d);
+}
+
+void foo4() {
+    add128d((__m128d*)src128f, aux128d, (__m128d*)res128f);
+}
+
+void foo5() {
+    add256d((__m256d*)src256f, aux256d, (__m256d*)res256f);
+}
+
+void foo6() {
+    add256d((__m256d*)src256d, aux256d, (__m256d*)res256d);
+}
+
+void foo7() {
+    add256i((__m256i*)src256c, aux256i, (__m256i*)res256c);
+}
+
+void foo8() {
+    add256i((__m256i*)src256s, aux256i, (__m256i*)res256s);
+}
+
+void foo9() {
+    add256i((__m256i*)src256i, aux256i, (__m256i*)res256i);
+}
+
+void foo11() {
+    add256i((__m256i*)src256l, aux256i, (__m256i*)res256l);
+}
+
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovapd" } } */
+/* { dg-final { scan-assembler-not "vmovdqa" } } */
+/* { dg-final { scan-assembler "vmovups" } } */
+/* { dg-final { scan-assembler "vmovupd" } } */
+/* { dg-final { scan-assembler "vmovdqu" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
new file mode 100644
index 00000000000..59924304bae
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-vector-unaligned-load-store-3.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx2 -mno-use-unaligned-vector-move" } */
+
+#include "avx2-vector-unaligned-load-store-2.c"
+
+/* { dg-final { scan-assembler-not "vmovups" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not "vmovupd" { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-not "vmovdqu" } } */
+/* { dg-final { scan-assembler "vmovaps" } } */
+/* { dg-final { scan-assembler "vmovapd" } } */
+/* { dg-final { scan-assembler "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
new file mode 100644
index 00000000000..3759fd9f2f4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vector-unaligned-load-store-1.c
@@ -0,0 +1,13 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3 -mavx512vl -muse-unaligned-vector-move" } */
+
+#include "avx2-vector-unaligned-load-store-1.c"
+
+/* { dg-final { scan-assembler-not "vmovdqa32" } } */
+/* { dg-final { scan-assembler-not "vmovdqa64" } } */
+/* { dg-final { scan-assembler-not "vmovaps" } } */
+/* { dg-final { scan-assembler-not "vmovapd" } } */
+/* { dg-final { scan-assembler "vmovdqu32" } } */
+/* { dg-final { scan-assembler "vmovdqu64" } } */
+/* { dg-final { scan-assembler "vmovups" } } */
+/* { dg-final { scan-assembler "vmovupd" } } */