@@ -877,6 +877,12 @@ init_cpu_features (struct cpu_features *cpu_features)
case INTEL_BIGCORE_HASWELL:
case INTEL_BIGCORE_BROADWELL:
cpu_features->cachesize_non_temporal_divisor = 8;
+ /* Benchmarks indicate the page-unrolled large-copy implementation
+ performs better than the standard copy loop on HSW (and
+ presumably SnB). */
+ cpu_features
+ ->preferred[index_arch_Prefer_Page_Unrolled_Large_Copy]
+ |= bit_arch_Prefer_Page_Unrolled_Large_Copy;
goto default_tuning;
/* Newer Bigcore microarch (larger non-temporal store
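
For context on the pattern used in these cpu-features hunks: preferred[] is an array of bitmasks indexed by the index_arch_* constants, and the memmove/memcpy ifunc selectors later test individual bits through CPU_FEATURES_ARCH_P. A minimal, self-contained sketch of that set-and-test pattern follows; the one-element struct and the placeholder bit value are simplifications for illustration, not the real glibc definitions.

```c
/* Simplified sketch of the preferred[] bit mechanism (assumed layout;
   the real definitions live in the x86 cpu-features headers).  */
#include <stdio.h>

#define index_arch_Prefer_Page_Unrolled_Large_Copy 0
#define bit_arch_Prefer_Page_Unrolled_Large_Copy (1u << 5) /* placeholder */

struct cpu_features_sketch
{
  unsigned int preferred[1];
};

int
main (void)
{
  struct cpu_features_sketch cf = { { 0 } };
  /* What init_cpu_features does above for the selected microarchs.  */
  cf.preferred[index_arch_Prefer_Page_Unrolled_Large_Copy]
      |= bit_arch_Prefer_Page_Unrolled_Large_Copy;
  /* What the ifunc selector later checks via CPU_FEATURES_ARCH_P.  */
  if (cf.preferred[index_arch_Prefer_Page_Unrolled_Large_Copy]
      & bit_arch_Prefer_Page_Unrolled_Large_Copy)
    puts ("page-unrolled large copy preferred");
  return 0;
}
```
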
@@ -890,6 +896,11 @@ init_cpu_features (struct cpu_features *cpu_features)
non-temporal on all Skylake servers. */
cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
|= bit_arch_Avoid_Non_Temporal_Memset;
+ /* Benchmarks indicate the page-unrolled large-copy implementation
+ performs better than the standard copy loop on Skylake/SKX. */
+ cpu_features
+ ->preferred[index_arch_Prefer_Page_Unrolled_Large_Copy]
+ |= bit_arch_Prefer_Page_Unrolled_Large_Copy;
/* fallthrough */
case INTEL_BIGCORE_COMETLAKE:
case INTEL_BIGCORE_SKYLAKE:
@@ -897,6 +908,12 @@ init_cpu_features (struct cpu_features *cpu_features)
case INTEL_BIGCORE_ICELAKE:
case INTEL_BIGCORE_TIGERLAKE:
case INTEL_BIGCORE_ROCKETLAKE:
+ /* Benchmarks indicate the page-unrolled large-copy implementation
+ performs better than the standard copy loop on SKX/ICX. */
+ cpu_features
+ ->preferred[index_arch_Prefer_Page_Unrolled_Large_Copy]
+ |= bit_arch_Prefer_Page_Unrolled_Large_Copy;
+ /* fallthrough */
case INTEL_BIGCORE_RAPTORLAKE:
case INTEL_BIGCORE_METEORLAKE:
case INTEL_BIGCORE_LUNARLAKE:
@@ -257,6 +257,12 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
(n, cpu_features, Prefer_PMINUB_for_stringop, SSE2, 26);
}
break;
+ case 31:
+ {
+ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (
+ n, cpu_features, Prefer_Page_Unrolled_Large_Copy, 31);
+ }
+ break;
}
}
}
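
The new case 31 works because set_hwcaps dispatches each glibc.cpu.hwcaps token by its length before comparing names, and "Prefer_Page_Unrolled_Large_Copy" is exactly 31 characters; CHECK_GLIBC_IFUNC_PREFERRED_BOTH then lets the preference be switched either way, which is how the tst-hwcap-tunables strings below disable it with a leading '-'. A standalone sketch of the length dispatch (the switch and memcmp here only illustrate the matching, not the full macro):

```c
/* Sketch of the strlen-based dispatch used by set_hwcaps.  */
#include <stdio.h>
#include <string.h>

int
main (void)
{
  const char *token = "Prefer_Page_Unrolled_Large_Copy";
  switch (strlen (token))
    {
    case 26: /* e.g. Prefer_PMINUB_for_stringop */
      break;
    case 31:
      if (memcmp (token, "Prefer_Page_Unrolled_Large_Copy", 31) == 0)
        puts ("matched: set or clear the preference bit");
      break;
    }
  return 0;
}
```
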
@@ -35,3 +35,4 @@ BIT (Prefer_FSRM)
BIT (Avoid_Short_Distance_REP_MOVSB)
BIT (Avoid_Non_Temporal_Memset)
BIT (Avoid_STOSB)
+BIT (Prefer_Page_Unrolled_Large_Copy)
@@ -61,7 +61,7 @@ static const struct test_t
"-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
"-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS,"
"-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,"
- "-Avoid_STOSB",
+ "-Avoid_STOSB,-Prefer_Page_Unrolled_Large_Copy",
test_1,
array_length (test_1)
},
@@ -70,7 +70,7 @@ static const struct test_t
",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL,"
"-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-,"
"-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,"
- "-Avoid_STOSB,-,",
+ "-Avoid_STOSB,-Prefer_Page_Unrolled_Large_Copy,-,",
test_1,
array_length (test_1)
}
@@ -16,11 +16,14 @@ sysdep_routines += \
memcmpeq-evex \
memcmpeq-sse2 \
memmove-avx-unaligned-erms \
+ memmove-avx-unaligned-erms-page-unrolled \
+ memmove-avx-unaligned-erms-page-unrolled-rtm \
memmove-avx-unaligned-erms-rtm \
memmove-avx512-no-vzeroupper \
memmove-avx512-unaligned-erms \
memmove-erms \
memmove-evex-unaligned-erms \
+ memmove-evex-unaligned-erms-page-unrolled \
memmove-sse2-unaligned-erms \
memmove-ssse3 \
memrchr-avx2 \
@@ -133,23 +133,43 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
X86_IFUNC_IMPL_ADD_V4 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_evex_unaligned)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __memmove_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_chk_evex_unaligned_page_unrolled)
X86_IFUNC_IMPL_ADD_V4 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_evex_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __memmove_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_chk_evex_unaligned_erms_page_unrolled)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX),
__memmove_chk_avx_unaligned)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
+ CPU_FEATURE_USABLE (AVX),
+ __memmove_chk_avx_unaligned_page_unrolled)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX),
__memmove_chk_avx_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
+ CPU_FEATURE_USABLE (AVX),
+ __memmove_chk_avx_unaligned_erms_page_unrolled)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_chk_avx_unaligned_rtm)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
+ (CPU_FEATURE_USABLE (AVX)
+ && CPU_FEATURE_USABLE (RTM)),
+ __memmove_chk_avx_unaligned_page_unrolled_rtm)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_chk_avx_unaligned_erms_rtm)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memmove_chk,
+ (CPU_FEATURE_USABLE (AVX)
+ && CPU_FEATURE_USABLE (RTM)),
+ __memmove_chk_avx_unaligned_erms_page_unrolled_rtm)
/* By V3 we assume fast aligned copy. */
X86_IFUNC_IMPL_ADD_V2 (array, i, __memmove_chk,
CPU_FEATURE_USABLE (SSSE3),
@@ -180,23 +200,43 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
X86_IFUNC_IMPL_ADD_V4 (array, i, memmove,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_evex_unaligned)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, memmove,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_evex_unaligned_page_unrolled)
X86_IFUNC_IMPL_ADD_V4 (array, i, memmove,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_evex_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, memmove,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_evex_unaligned_erms_page_unrolled)
X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
CPU_FEATURE_USABLE (AVX),
__memmove_avx_unaligned)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
+ CPU_FEATURE_USABLE (AVX),
+ __memmove_avx_unaligned_page_unrolled)
X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
CPU_FEATURE_USABLE (AVX),
__memmove_avx_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
+ CPU_FEATURE_USABLE (AVX),
+ __memmove_avx_unaligned_erms_page_unrolled)
X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_avx_unaligned_rtm)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
+ (CPU_FEATURE_USABLE (AVX)
+ && CPU_FEATURE_USABLE (RTM)),
+ __memmove_avx_unaligned_page_unrolled_rtm)
X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memmove_avx_unaligned_erms_rtm)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memmove,
+ (CPU_FEATURE_USABLE (AVX)
+ && CPU_FEATURE_USABLE (RTM)),
+ __memmove_avx_unaligned_erms_page_unrolled_rtm)
/* By V3 we assume fast aligned copy. */
X86_IFUNC_IMPL_ADD_V2 (array, i, memmove,
CPU_FEATURE_USABLE (SSSE3),
@@ -1140,23 +1180,43 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
X86_IFUNC_IMPL_ADD_V4 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_evex_unaligned)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __memcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_chk_evex_unaligned_page_unrolled)
X86_IFUNC_IMPL_ADD_V4 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_evex_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __memcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_chk_evex_unaligned_erms_page_unrolled)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX),
__memcpy_chk_avx_unaligned)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
+ CPU_FEATURE_USABLE (AVX),
+ __memcpy_chk_avx_unaligned_page_unrolled)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX),
__memcpy_chk_avx_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
+ CPU_FEATURE_USABLE (AVX),
+ __memcpy_chk_avx_unaligned_erms_page_unrolled)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_chk_avx_unaligned_rtm)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
+ (CPU_FEATURE_USABLE (AVX)
+ && CPU_FEATURE_USABLE (RTM)),
+ __memcpy_chk_avx_unaligned_page_unrolled_rtm)
X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_chk_avx_unaligned_erms_rtm)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __memcpy_chk,
+ (CPU_FEATURE_USABLE (AVX)
+ && CPU_FEATURE_USABLE (RTM)),
+ __memcpy_chk_avx_unaligned_erms_page_unrolled_rtm)
/* By V3 we assume fast aligned copy. */
X86_IFUNC_IMPL_ADD_V2 (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
@@ -1187,23 +1247,43 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
X86_IFUNC_IMPL_ADD_V4 (array, i, memcpy,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_evex_unaligned)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, memcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_evex_unaligned_page_unrolled)
X86_IFUNC_IMPL_ADD_V4 (array, i, memcpy,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_evex_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, memcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_evex_unaligned_erms_page_unrolled)
X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
CPU_FEATURE_USABLE (AVX),
__memcpy_avx_unaligned)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
+ CPU_FEATURE_USABLE (AVX),
+ __memcpy_avx_unaligned_page_unrolled)
X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
CPU_FEATURE_USABLE (AVX),
__memcpy_avx_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
+ CPU_FEATURE_USABLE (AVX),
+ __memcpy_avx_unaligned_erms_page_unrolled)
X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_avx_unaligned_rtm)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
+ (CPU_FEATURE_USABLE (AVX)
+ && CPU_FEATURE_USABLE (RTM)),
+ __memcpy_avx_unaligned_page_unrolled_rtm)
X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__memcpy_avx_unaligned_erms_rtm)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, memcpy,
+ (CPU_FEATURE_USABLE (AVX)
+ && CPU_FEATURE_USABLE (RTM)),
+ __memcpy_avx_unaligned_erms_page_unrolled_rtm)
/* By V3 we assume fast aligned copy. */
X86_IFUNC_IMPL_ADD_V2 (array, i, memcpy,
CPU_FEATURE_USABLE (SSSE3),
@@ -1234,23 +1314,43 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
X86_IFUNC_IMPL_ADD_V4 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_evex_unaligned)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __mempcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_chk_evex_unaligned_page_unrolled)
X86_IFUNC_IMPL_ADD_V4 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_evex_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, __mempcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_chk_evex_unaligned_erms_page_unrolled)
X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX),
__mempcpy_chk_avx_unaligned)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
+ CPU_FEATURE_USABLE (AVX),
+ __mempcpy_chk_avx_unaligned_page_unrolled)
X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX),
__mempcpy_chk_avx_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
+ CPU_FEATURE_USABLE (AVX),
+ __mempcpy_chk_avx_unaligned_erms_page_unrolled)
X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_chk_avx_unaligned_rtm)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
+ (CPU_FEATURE_USABLE (AVX)
+ && CPU_FEATURE_USABLE (RTM)),
+ __mempcpy_chk_avx_unaligned_page_unrolled_rtm)
X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_chk_avx_unaligned_erms_rtm)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, __mempcpy_chk,
+ (CPU_FEATURE_USABLE (AVX)
+ && CPU_FEATURE_USABLE (RTM)),
+ __mempcpy_chk_avx_unaligned_erms_page_unrolled_rtm)
/* By V3 we assume fast aligned copy. */
X86_IFUNC_IMPL_ADD_V2 (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
@@ -1281,23 +1381,43 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
X86_IFUNC_IMPL_ADD_V4 (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_evex_unaligned)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, mempcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_evex_unaligned_page_unrolled)
X86_IFUNC_IMPL_ADD_V4 (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_evex_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V4 (array, i, mempcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_evex_unaligned_erms_page_unrolled)
X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX),
__mempcpy_avx_unaligned)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
+ CPU_FEATURE_USABLE (AVX),
+ __mempcpy_avx_unaligned_page_unrolled)
X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX),
__mempcpy_avx_unaligned_erms)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
+ CPU_FEATURE_USABLE (AVX),
+ __mempcpy_avx_unaligned_erms_page_unrolled)
X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_avx_unaligned_rtm)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
+ (CPU_FEATURE_USABLE (AVX)
+ && CPU_FEATURE_USABLE (RTM)),
+ __mempcpy_avx_unaligned_page_unrolled_rtm)
X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
(CPU_FEATURE_USABLE (AVX)
&& CPU_FEATURE_USABLE (RTM)),
__mempcpy_avx_unaligned_erms_rtm)
+ X86_IFUNC_IMPL_ADD_V3 (array, i, mempcpy,
+ (CPU_FEATURE_USABLE (AVX)
+ && CPU_FEATURE_USABLE (RTM)),
+ __mempcpy_avx_unaligned_erms_page_unrolled_rtm)
/* By V3 we assume fast aligned copy. */
X86_IFUNC_IMPL_ADD_V2 (array, i, mempcpy,
CPU_FEATURE_USABLE (SSSE3),
@@ -28,18 +28,27 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_no_vzeroupper)
attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
- attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
- attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME)
+ OPTIMIZE (evex_unaligned_page_unrolled) attribute_hidden;
+extern __typeof (REDIRECT_NAME)
+ OPTIMIZE (evex_unaligned_erms) attribute_hidden;
+extern __typeof (REDIRECT_NAME)
+ OPTIMIZE (evex_unaligned_erms_page_unrolled) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
- attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm)
- attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm)
- attribute_hidden;
+extern __typeof (REDIRECT_NAME)
+ OPTIMIZE (avx_unaligned_page_unrolled) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) attribute_hidden;
+extern __typeof (REDIRECT_NAME)
+ OPTIMIZE (avx_unaligned_erms_page_unrolled) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME)
+ OPTIMIZE (avx_unaligned_page_unrolled_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME)
+ OPTIMIZE (avx_unaligned_erms_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME)
+ OPTIMIZE (avx_unaligned_erms_page_unrolled_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
@@ -71,40 +80,60 @@ IFUNC_SELECTOR (void)
return OPTIMIZE (avx512_no_vzeroupper);
}
- if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
- AVX_Fast_Unaligned_Load, ))
+ if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load, ))
{
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
{
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (evex_unaligned_erms);
-
+ {
+ if (CPU_FEATURES_ARCH_P (cpu_features,
+ Prefer_Page_Unrolled_Large_Copy))
+ return OPTIMIZE (evex_unaligned_erms_page_unrolled);
+ return OPTIMIZE (evex_unaligned_erms);
+ }
+
+ if (CPU_FEATURES_ARCH_P (cpu_features,
+ Prefer_Page_Unrolled_Large_Copy))
+ return OPTIMIZE (evex_unaligned_page_unrolled);
return OPTIMIZE (evex_unaligned);
}
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
{
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (avx_unaligned_erms_rtm);
-
+ {
+ if (CPU_FEATURES_ARCH_P (cpu_features,
+ Prefer_Page_Unrolled_Large_Copy))
+ return OPTIMIZE (avx_unaligned_erms_page_unrolled_rtm);
+ return OPTIMIZE (avx_unaligned_erms_rtm);
+ }
+ if (CPU_FEATURES_ARCH_P (cpu_features,
+ Prefer_Page_Unrolled_Large_Copy))
+ return OPTIMIZE (avx_unaligned_page_unrolled_rtm);
return OPTIMIZE (avx_unaligned_rtm);
}
- if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features,
- Prefer_No_VZEROUPPER, !))
+ if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER, !))
{
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (avx_unaligned_erms);
-
+ {
+ if (CPU_FEATURES_ARCH_P (cpu_features,
+ Prefer_Page_Unrolled_Large_Copy))
+ return OPTIMIZE (avx_unaligned_erms_page_unrolled);
+ return OPTIMIZE (avx_unaligned_erms);
+ }
+ if (CPU_FEATURES_ARCH_P (cpu_features,
+ Prefer_Page_Unrolled_Large_Copy))
+ return OPTIMIZE (avx_unaligned_page_unrolled);
return OPTIMIZE (avx_unaligned);
}
}
if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
/* Leave this as runtime check. The SSSE3 is optimized almost
- exclusively for avoiding unaligned memory access during the
- copy and by and large is not better than the sse2
- implementation as a general purpose memmove. */
+ exclusively for avoiding unaligned memory access during the
+ copy and by and large is not better than the sse2
+ implementation as a general purpose memmove. */
&& !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
{
return OPTIMIZE (ssse3);
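
Because the selector above gains a Prefer_Page_Unrolled_Large_Copy check on every branch, the decision tree is easier to see condensed into plain C. The sketch below is only a host-compilable restatement of the order of checks added in this hunk; pick_memmove_variant and its int flags are illustrative stand-ins for the CPU_FEATURE*_P macros, and the AVX512F/Prefer_No_AVX512 and SSE2/SSSE3 paths are collapsed into the fallback string.

```c
/* Condensed view of the ifunc selection order after this patch.  */
#include <stdio.h>

static const char *
pick_memmove_variant (int avx_fast_unaligned_load, int avx512vl, int rtm,
                      int erms, int prefer_no_vzeroupper,
                      int prefer_page_unrolled_large_copy)
{
  if (avx_fast_unaligned_load)
    {
      if (avx512vl)
        {
          if (erms)
            return prefer_page_unrolled_large_copy
                       ? "evex_unaligned_erms_page_unrolled"
                       : "evex_unaligned_erms";
          return prefer_page_unrolled_large_copy
                     ? "evex_unaligned_page_unrolled"
                     : "evex_unaligned";
        }
      if (rtm)
        {
          if (erms)
            return prefer_page_unrolled_large_copy
                       ? "avx_unaligned_erms_page_unrolled_rtm"
                       : "avx_unaligned_erms_rtm";
          return prefer_page_unrolled_large_copy
                     ? "avx_unaligned_page_unrolled_rtm"
                     : "avx_unaligned_rtm";
        }
      if (!prefer_no_vzeroupper)
        {
          if (erms)
            return prefer_page_unrolled_large_copy
                       ? "avx_unaligned_erms_page_unrolled"
                       : "avx_unaligned_erms";
          return prefer_page_unrolled_large_copy
                     ? "avx_unaligned_page_unrolled"
                     : "avx_unaligned";
        }
    }
  return "sse2 or ssse3 fallback";
}

int
main (void)
{
  /* E.g. an SKX-like configuration: AVX512VL + ERMS + the new preference.  */
  puts (pick_memmove_variant (1, 1, 0, 1, 0, 1));
  return 0;
}
```
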
new file mode 100644
@@ -0,0 +1,5 @@
+#ifndef MEMMOVE_SYMBOL
+# define MEMMOVE_SYMBOL(p,s) p##_avx_##s##_page_unrolled_rtm
+#endif
+#define MEMMOVE_VEC_LARGE_IMPL "memmove-vec-large-page-unrolled.S"
+#include "memmove-avx-unaligned-erms-rtm.S"
new file mode 100644
@@ -0,0 +1,5 @@
+#ifndef MEMMOVE_SYMBOL
+# define MEMMOVE_SYMBOL(p,s) p##_avx_##s##_page_unrolled
+#endif
+#define MEMMOVE_VEC_LARGE_IMPL "memmove-vec-large-page-unrolled.S"
+#include "memmove-avx-unaligned-erms.S"
@@ -2,7 +2,9 @@
# include "x86-avx-rtm-vecs.h"
+#ifndef MEMMOVE_SYMBOL
# define MEMMOVE_SYMBOL(p,s) p##_avx_##s##_rtm
+#endif
# include "memmove-vec-unaligned-erms.S"
#endif
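
The #ifndef guard added here is what lets the new page-unrolled wrappers reuse the existing RTM file: they define MEMMOVE_SYMBOL (and MEMMOVE_VEC_LARGE_IMPL) first, then include memmove-avx-unaligned-erms-rtm.S, whose now-guarded default definition is skipped. A tiny host-compilable demo of that override pattern (the stringize helpers are only for printing the result):

```c
/* Demo of the #ifndef override: the wrapper defines MEMMOVE_SYMBOL before
   including the base file, so the guarded default below is skipped.  */
#include <stdio.h>

#define MEMMOVE_SYMBOL(p, s) p##_avx_##s##_page_unrolled_rtm
/* What memmove-avx-unaligned-erms-rtm.S now contains: */
#ifndef MEMMOVE_SYMBOL
# define MEMMOVE_SYMBOL(p, s) p##_avx_##s##_rtm
#endif

#define STR_(x) #x
#define STR(x) STR_ (x)

int
main (void)
{
  /* Prints "__memmove_avx_unaligned_erms_page_unrolled_rtm".  */
  puts (STR (MEMMOVE_SYMBOL (__memmove, unaligned_erms)));
  return 0;
}
```
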
new file mode 100644
@@ -0,0 +1,5 @@
+#ifndef MEMMOVE_SYMBOL
+# define MEMMOVE_SYMBOL(p,s) p##_evex_##s##_page_unrolled
+#endif
+#define MEMMOVE_VEC_LARGE_IMPL "memmove-vec-large-page-unrolled.S"
+#include "memmove-evex-unaligned-erms.S"