[v2,3/3] x86/string: Add version of memmove with page unrolled large impl

Message ID 20250114210341.599037-3-goldstein.w.n@gmail.com (mailing list archive)
State New
Delegated to: Florian Weimer
Series [v2,1/3] x86/string: Factor out large memmove implementation to separate file

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Test passed
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Test passed

Commit Message

Noah Goldstein Jan. 14, 2025, 9:03 p.m. UTC
  The page unrolled version of the large memmove implementation has been
shown to be the best performing on Intel hardware from SnB (Sandy Bridge)
through ICX (Ice Lake Server).
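
Since the new code path is keyed off the Prefer_Page_Unrolled_Large_Copy
preferred-feature bit (wired into glibc.cpu.hwcaps below via
CHECK_GLIBC_IFUNC_PREFERRED_BOTH), the default selection can be overridden
at run time.  A usage sketch, assuming a tunables-enabled build; the
benchmark binary name is only illustrative:

  # Force the page unrolled large-copy path:
  GLIBC_TUNABLES=glibc.cpu.hwcaps=Prefer_Page_Unrolled_Large_Copy ./bench-memmove-large
  # Disable it on CPUs where it is preferred by default:
  GLIBC_TUNABLES=glibc.cpu.hwcaps=-Prefer_Page_Unrolled_Large_Copy ./bench-memmove-large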
---
 sysdeps/x86/cpu-features.c                    |  17 +++
 sysdeps/x86/cpu-tunables.c                    |   6 +
 ...cpu-features-preferred_feature_index_1.def |   1 +
 sysdeps/x86/tst-hwcap-tunables.c              |   4 +-
 sysdeps/x86_64/multiarch/Makefile             |   3 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 120 ++++++++++++++++++
 sysdeps/x86_64/multiarch/ifunc-memmove.h      |  75 +++++++----
 ...ove-avx-unaligned-erms-page-unrolled-rtm.S |   5 +
 ...memmove-avx-unaligned-erms-page-unrolled.S |   5 +
 .../memmove-avx-unaligned-erms-rtm.S          |   2 +
 ...emmove-evex-unaligned-erms-page-unrolled.S |   5 +
 11 files changed, 218 insertions(+), 25 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-page-unrolled-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-page-unrolled.S
 create mode 100644 sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms-page-unrolled.S
  

Patch

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 27abaca8b7..c0ecbbb812 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -877,6 +877,12 @@  init_cpu_features (struct cpu_features *cpu_features)
 	    case INTEL_BIGCORE_HASWELL:
 	    case INTEL_BIGCORE_BROADWELL:
 	      cpu_features->cachesize_non_temporal_divisor = 8;
+	      /* Benchmarks indicate page unrolled large implementation
+             performs better than standard copy loop on HSW (and
+             presumably SnB).  */
+	      cpu_features
+		  ->preferred[index_arch_Prefer_Page_Unrolled_Large_Copy]
+		  |= bit_arch_Prefer_Page_Unrolled_Large_Copy;
 	      goto default_tuning;
 
 	      /* Newer Bigcore microarch (larger non-temporal store
@@ -890,6 +896,11 @@  init_cpu_features (struct cpu_features *cpu_features)
 		     non-temporal on all Skylake servers. */
 	      cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
 		  |= bit_arch_Avoid_Non_Temporal_Memset;
+	      /* Benchmarks indicate page unrolled large implementation
+             performs better than standard copy loop on Skylake/SKX.  */
+	      cpu_features
+		  ->preferred[index_arch_Prefer_Page_Unrolled_Large_Copy]
+		  |= bit_arch_Prefer_Page_Unrolled_Large_Copy;
 	      /* fallthrough */
 	    case INTEL_BIGCORE_COMETLAKE:
 	    case INTEL_BIGCORE_SKYLAKE:
@@ -897,6 +908,12 @@  init_cpu_features (struct cpu_features *cpu_features)
 	    case INTEL_BIGCORE_ICELAKE:
 	    case INTEL_BIGCORE_TIGERLAKE:
 	    case INTEL_BIGCORE_ROCKETLAKE:
+	      /* Benchmarks indicate page unrolled large implementation
+             performs better than standard copy loop on SKX/ICX.  */
+	      cpu_features
+		  ->preferred[index_arch_Prefer_Page_Unrolled_Large_Copy]
+		  |= bit_arch_Prefer_Page_Unrolled_Large_Copy;
+          /* fallthrough */
 	    case INTEL_BIGCORE_RAPTORLAKE:
 	    case INTEL_BIGCORE_METEORLAKE:
 	    case INTEL_BIGCORE_LUNARLAKE:
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 3423176802..d85b618311 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -257,6 +257,12 @@  TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
 		(n, cpu_features, Prefer_PMINUB_for_stringop, SSE2, 26);
 	    }
 	  break;
+	case 31:
+	  {
+	    CHECK_GLIBC_IFUNC_PREFERRED_BOTH (
+		n, cpu_features, Prefer_Page_Unrolled_Large_Copy, 31);
+	  }
+	  break;
 	}
     }
 }
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index 0f14aaf071..5943fc1423 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -35,3 +35,4 @@  BIT (Prefer_FSRM)
 BIT (Avoid_Short_Distance_REP_MOVSB)
 BIT (Avoid_Non_Temporal_Memset)
 BIT (Avoid_STOSB)
+BIT (Prefer_Page_Unrolled_Large_Copy)
\ No newline at end of file
diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c
index 3e06048dcc..985153fb38 100644
--- a/sysdeps/x86/tst-hwcap-tunables.c
+++ b/sysdeps/x86/tst-hwcap-tunables.c
@@ -61,7 +61,7 @@  static const struct test_t
     "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
     "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
     "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,"
-    "-Avoid_STOSB",
+    "-Avoid_STOSB,-Prefer_Page_Unrolled_Large_Copy",
     test_1,
     array_length (test_1)
   },
@@ -70,7 +70,7 @@  static const struct test_t
     ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
     "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
     "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,"
-    "-Avoid_STOSB,-,",
+    "-Avoid_STOSB,-Prefer_Page_Unrolled_Large_Copy,-,",
     test_1,
     array_length (test_1)
   }
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 696cb66991..381eaef455 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,11 +16,14 @@  sysdep_routines += \
   memcmpeq-evex \
   memcmpeq-sse2 \
   memmove-avx-unaligned-erms \
+  memmove-avx-unaligned-erms-page-unrolled \
+  memmove-avx-unaligned-erms-page-unrolled-rtm \
   memmove-avx-unaligned-erms-rtm \
   memmove-avx512-no-vzeroupper \
   memmove-avx512-unaligned-erms \
   memmove-erms \
   memmove-evex-unaligned-erms \
+  memmove-evex-unaligned-erms-page-unrolled \
   memmove-sse2-unaligned-erms \
   memmove-ssse3 \
   memrchr-avx2 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a8349775df..424031f0e6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -133,23 +133,43 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, __memmove_chk,
 				     CPU_FEATURE_USABLE (AVX512VL),
 				     __memmove_chk_evex_unaligned)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, __memmove_chk,
+				     CPU_FEATURE_USABLE (AVX512VL),
+				     __memmove_chk_evex_unaligned_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, __memmove_chk,
 				     CPU_FEATURE_USABLE (AVX512VL),
 				     __memmove_chk_evex_unaligned_erms)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, __memmove_chk,
+				     CPU_FEATURE_USABLE (AVX512VL),
+				     __memmove_chk_evex_unaligned_erms_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
 				     CPU_FEATURE_USABLE (AVX),
 				     __memmove_chk_avx_unaligned)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
+				     CPU_FEATURE_USABLE (AVX),
+				     __memmove_chk_avx_unaligned_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
 				     CPU_FEATURE_USABLE (AVX),
 				     __memmove_chk_avx_unaligned_erms)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
+				     CPU_FEATURE_USABLE (AVX),
+				     __memmove_chk_avx_unaligned_erms_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memmove_chk_avx_unaligned_rtm)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
+				     (CPU_FEATURE_USABLE (AVX)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __memmove_chk_avx_unaligned_page_unrolled_rtm)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memmove_chk_avx_unaligned_erms_rtm)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
+				     (CPU_FEATURE_USABLE (AVX)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __memmove_chk_avx_unaligned_erms_page_unrolled_rtm)
 	      /* By V3 we assume fast aligned copy.  */
 	      X86_IFUNC_IMPL_ADD_V2 (array, i, __memmove_chk,
 				     CPU_FEATURE_USABLE (SSSE3),
@@ -180,23 +200,43 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, memmove,
 				     CPU_FEATURE_USABLE (AVX512VL),
 				     __memmove_evex_unaligned)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, memmove,
+				     CPU_FEATURE_USABLE (AVX512VL),
+				     __memmove_evex_unaligned_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, memmove,
 				     CPU_FEATURE_USABLE (AVX512VL),
 				     __memmove_evex_unaligned_erms)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, memmove,
+				     CPU_FEATURE_USABLE (AVX512VL),
+				     __memmove_evex_unaligned_erms_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
 				     CPU_FEATURE_USABLE (AVX),
 				     __memmove_avx_unaligned)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
+				     CPU_FEATURE_USABLE (AVX),
+				     __memmove_avx_unaligned_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
 				     CPU_FEATURE_USABLE (AVX),
 				     __memmove_avx_unaligned_erms)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
+				     CPU_FEATURE_USABLE (AVX),
+				     __memmove_avx_unaligned_erms_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memmove_avx_unaligned_rtm)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
+				     (CPU_FEATURE_USABLE (AVX)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __memmove_avx_unaligned_page_unrolled_rtm)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memmove_avx_unaligned_erms_rtm)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
+				     (CPU_FEATURE_USABLE (AVX)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __memmove_avx_unaligned_erms_page_unrolled_rtm)
 	      /* By V3 we assume fast aligned copy.  */
 	      X86_IFUNC_IMPL_ADD_V2 (array, i, memmove,
 				     CPU_FEATURE_USABLE (SSSE3),
@@ -1140,23 +1180,43 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, __memcpy_chk,
 				     CPU_FEATURE_USABLE (AVX512VL),
 				     __memcpy_chk_evex_unaligned)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, __memcpy_chk,
+				     CPU_FEATURE_USABLE (AVX512VL),
+				     __memcpy_chk_evex_unaligned_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, __memcpy_chk,
 				     CPU_FEATURE_USABLE (AVX512VL),
 				     __memcpy_chk_evex_unaligned_erms)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, __memcpy_chk,
+				     CPU_FEATURE_USABLE (AVX512VL),
+				     __memcpy_chk_evex_unaligned_erms_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
 				     CPU_FEATURE_USABLE (AVX),
 				     __memcpy_chk_avx_unaligned)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
+				     CPU_FEATURE_USABLE (AVX),
+				     __memcpy_chk_avx_unaligned_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
 				     CPU_FEATURE_USABLE (AVX),
 				     __memcpy_chk_avx_unaligned_erms)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
+				     CPU_FEATURE_USABLE (AVX),
+				     __memcpy_chk_avx_unaligned_erms_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memcpy_chk_avx_unaligned_rtm)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
+				     (CPU_FEATURE_USABLE (AVX)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __memcpy_chk_avx_unaligned_page_unrolled_rtm)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memcpy_chk_avx_unaligned_erms_rtm)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
+				     (CPU_FEATURE_USABLE (AVX)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __memcpy_chk_avx_unaligned_erms_page_unrolled_rtm)
 	      /* By V3 we assume fast aligned copy.  */
 	      X86_IFUNC_IMPL_ADD_V2 (array, i, __memcpy_chk,
 				     CPU_FEATURE_USABLE (SSSE3),
@@ -1187,23 +1247,43 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, memcpy,
 				     CPU_FEATURE_USABLE (AVX512VL),
 				     __memcpy_evex_unaligned)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, memcpy,
+				     CPU_FEATURE_USABLE (AVX512VL),
+				     __memcpy_evex_unaligned_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, memcpy,
 				     CPU_FEATURE_USABLE (AVX512VL),
 				     __memcpy_evex_unaligned_erms)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, memcpy,
+				     CPU_FEATURE_USABLE (AVX512VL),
+				     __memcpy_evex_unaligned_erms_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
 				     CPU_FEATURE_USABLE (AVX),
 				     __memcpy_avx_unaligned)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
+				     CPU_FEATURE_USABLE (AVX),
+				     __memcpy_avx_unaligned_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
 				     CPU_FEATURE_USABLE (AVX),
 				     __memcpy_avx_unaligned_erms)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
+				     CPU_FEATURE_USABLE (AVX),
+				     __memcpy_avx_unaligned_erms_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memcpy_avx_unaligned_rtm)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
+				     (CPU_FEATURE_USABLE (AVX)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __memcpy_avx_unaligned_page_unrolled_rtm)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __memcpy_avx_unaligned_erms_rtm)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
+				     (CPU_FEATURE_USABLE (AVX)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __memcpy_avx_unaligned_erms_page_unrolled_rtm)
 	      /* By V3 we assume fast aligned copy.  */
 	      X86_IFUNC_IMPL_ADD_V2 (array, i, memcpy,
 				     CPU_FEATURE_USABLE (SSSE3),
@@ -1234,23 +1314,43 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, __mempcpy_chk,
 				     CPU_FEATURE_USABLE (AVX512VL),
 				     __mempcpy_chk_evex_unaligned)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, __mempcpy_chk,
+				     CPU_FEATURE_USABLE (AVX512VL),
+				     __mempcpy_chk_evex_unaligned_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, __mempcpy_chk,
 				     CPU_FEATURE_USABLE (AVX512VL),
 				     __mempcpy_chk_evex_unaligned_erms)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, __mempcpy_chk,
+				     CPU_FEATURE_USABLE (AVX512VL),
+				     __mempcpy_chk_evex_unaligned_erms_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
 				     CPU_FEATURE_USABLE (AVX),
 				     __mempcpy_chk_avx_unaligned)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
+				     CPU_FEATURE_USABLE (AVX),
+				     __mempcpy_chk_avx_unaligned_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
 				     CPU_FEATURE_USABLE (AVX),
 				     __mempcpy_chk_avx_unaligned_erms)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
+				     CPU_FEATURE_USABLE (AVX),
+				     __mempcpy_chk_avx_unaligned_erms_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __mempcpy_chk_avx_unaligned_rtm)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
+				     (CPU_FEATURE_USABLE (AVX)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __mempcpy_chk_avx_unaligned_page_unrolled_rtm)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __mempcpy_chk_avx_unaligned_erms_rtm)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
+				     (CPU_FEATURE_USABLE (AVX)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __mempcpy_chk_avx_unaligned_erms_page_unrolled_rtm)
 	      /* By V3 we assume fast aligned copy.  */
 	      X86_IFUNC_IMPL_ADD_V2 (array, i, __mempcpy_chk,
 				     CPU_FEATURE_USABLE (SSSE3),
@@ -1281,23 +1381,43 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, mempcpy,
 				     CPU_FEATURE_USABLE (AVX512VL),
 				     __mempcpy_evex_unaligned)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, mempcpy,
+				     CPU_FEATURE_USABLE (AVX512VL),
+				     __mempcpy_evex_unaligned_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, mempcpy,
 				     CPU_FEATURE_USABLE (AVX512VL),
 				     __mempcpy_evex_unaligned_erms)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, mempcpy,
+				     CPU_FEATURE_USABLE (AVX512VL),
+				     __mempcpy_evex_unaligned_erms_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
 				     CPU_FEATURE_USABLE (AVX),
 				     __mempcpy_avx_unaligned)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
+				     CPU_FEATURE_USABLE (AVX),
+				     __mempcpy_avx_unaligned_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
 				     CPU_FEATURE_USABLE (AVX),
 				     __mempcpy_avx_unaligned_erms)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
+				     CPU_FEATURE_USABLE (AVX),
+				     __mempcpy_avx_unaligned_erms_page_unrolled)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __mempcpy_avx_unaligned_rtm)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
+				     (CPU_FEATURE_USABLE (AVX)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __mempcpy_avx_unaligned_page_unrolled_rtm)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
 				     (CPU_FEATURE_USABLE (AVX)
 				      && CPU_FEATURE_USABLE (RTM)),
 				     __mempcpy_avx_unaligned_erms_rtm)
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
+				     (CPU_FEATURE_USABLE (AVX)
+				      && CPU_FEATURE_USABLE (RTM)),
+				     __mempcpy_avx_unaligned_erms_page_unrolled_rtm)
 	      /* By V3 we assume fast aligned copy.  */
 	      X86_IFUNC_IMPL_ADD_V2 (array, i, mempcpy,
 				     CPU_FEATURE_USABLE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index de0ac73a2a..6d5df8a9eb 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -28,18 +28,27 @@  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_no_vzeroupper)
   attribute_hidden;
 
-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
-  attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
-  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME)
+    OPTIMIZE (evex_unaligned_page_unrolled) attribute_hidden;
+extern __typeof (REDIRECT_NAME)
+    OPTIMIZE (evex_unaligned_erms) attribute_hidden;
+extern __typeof (REDIRECT_NAME)
+    OPTIMIZE (evex_unaligned_erms_page_unrolled) attribute_hidden;
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
-  attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm)
-  attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm)
-  attribute_hidden;
+extern __typeof (REDIRECT_NAME)
+    OPTIMIZE (avx_unaligned_page_unrolled) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) attribute_hidden;
+extern __typeof (REDIRECT_NAME)
+    OPTIMIZE (avx_unaligned_erms_page_unrolled) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME)
+    OPTIMIZE (avx_unaligned_page_unrolled_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME)
+    OPTIMIZE (avx_unaligned_erms_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME)
+    OPTIMIZE (avx_unaligned_erms_page_unrolled_rtm) attribute_hidden;
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 
@@ -71,40 +80,60 @@  IFUNC_SELECTOR (void)
       return OPTIMIZE (avx512_no_vzeroupper);
     }
 
-  if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
-				   AVX_Fast_Unaligned_Load, ))
+  if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load, ))
     {
       if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
 	{
 	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-	    return OPTIMIZE (evex_unaligned_erms);
-
+	    {
+	      if (CPU_FEATURES_ARCH_P (cpu_features,
+				       Prefer_Page_Unrolled_Large_Copy))
+		return OPTIMIZE (evex_unaligned_erms_page_unrolled);
+	      return OPTIMIZE (evex_unaligned_erms);
+	    }
+
+	  if (CPU_FEATURES_ARCH_P (cpu_features,
+				   Prefer_Page_Unrolled_Large_Copy))
+	    return OPTIMIZE (evex_unaligned_page_unrolled);
 	  return OPTIMIZE (evex_unaligned);
 	}
 
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 	{
 	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-	    return OPTIMIZE (avx_unaligned_erms_rtm);
-
+	    {
+	      if (CPU_FEATURES_ARCH_P (cpu_features,
+				       Prefer_Page_Unrolled_Large_Copy))
+		return OPTIMIZE (avx_unaligned_erms_page_unrolled_rtm);
+	      return OPTIMIZE (avx_unaligned_erms_rtm);
+	    }
+	  if (CPU_FEATURES_ARCH_P (cpu_features,
+				   Prefer_Page_Unrolled_Large_Copy))
+	    return OPTIMIZE (avx_unaligned_page_unrolled_rtm);
 	  return OPTIMIZE (avx_unaligned_rtm);
 	}
 
-      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
-				       Prefer_No_VZEROUPPER, !))
+      if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER, !))
 	{
 	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-	    return OPTIMIZE (avx_unaligned_erms);
-
+	    {
+	      if (CPU_FEATURES_ARCH_P (cpu_features,
+				       Prefer_Page_Unrolled_Large_Copy))
+		return OPTIMIZE (avx_unaligned_erms_page_unrolled);
+	      return OPTIMIZE (avx_unaligned_erms);
+	    }
+	  if (CPU_FEATURES_ARCH_P (cpu_features,
+				   Prefer_Page_Unrolled_Large_Copy))
+	    return OPTIMIZE (avx_unaligned_page_unrolled);
 	  return OPTIMIZE (avx_unaligned);
 	}
     }
 
   if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
       /* Leave this as runtime check.  The SSSE3 is optimized almost
-         exclusively for avoiding unaligned memory access during the
-         copy and by and large is not better than the sse2
-         implementation as a general purpose memmove.  */
+	 exclusively for avoiding unaligned memory access during the
+	 copy and by and large is not better than the sse2
+	 implementation as a general purpose memmove.  */
       && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
     {
       return OPTIMIZE (ssse3);
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-page-unrolled-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-page-unrolled-rtm.S
new file mode 100644
index 0000000000..683d903243
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-page-unrolled-rtm.S
@@ -0,0 +1,5 @@ 
+#ifndef MEMMOVE_SYMBOL
+# define MEMMOVE_SYMBOL(p,s)	p##_avx_##s##_page_unrolled_rtm
+#endif
+#define MEMMOVE_VEC_LARGE_IMPL	"memmove-vec-large-page-unrolled.S"
+#include "memmove-avx-unaligned-erms-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-page-unrolled.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-page-unrolled.S
new file mode 100644
index 0000000000..57b518e16f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-page-unrolled.S
@@ -0,0 +1,5 @@ 
+#ifndef MEMMOVE_SYMBOL
+# define MEMMOVE_SYMBOL(p,s)	p##_avx_##s##_page_unrolled
+#endif
+#define MEMMOVE_VEC_LARGE_IMPL	"memmove-vec-large-page-unrolled.S"
+#include "memmove-avx-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
index 20746e6713..36e864e935 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
@@ -2,7 +2,9 @@ 
 
 # include "x86-avx-rtm-vecs.h"
 
+#ifndef MEMMOVE_SYMBOL
 # define MEMMOVE_SYMBOL(p,s)	p##_avx_##s##_rtm
+#endif
 
 # include "memmove-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms-page-unrolled.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms-page-unrolled.S
new file mode 100644
index 0000000000..371b454819
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms-page-unrolled.S
@@ -0,0 +1,5 @@ 
+#ifndef MEMMOVE_SYMBOL
+# define MEMMOVE_SYMBOL(p,s)	p##_evex_##s##_page_unrolled
+#endif
+#define MEMMOVE_VEC_LARGE_IMPL	"memmove-vec-large-page-unrolled.S"
+#include "memmove-evex-unaligned-erms.S"