[x86_64] Update memcpy, mempcpy and memmove selection order for Excavator CPU [BZ #19583]
Commit Message
>It was done based on the assumption that an AVX-enabled machine has fast AVX unaligned loads. If that isn't true for AMD CPUs, we can enable it for all Intel AVX CPUs and you can set it for AMD CPUs properly.
Memcpy still needs to be fixed; otherwise the SSE2_Unaligned version is selected. Is it OK to fix it in the following way? If not, please suggest an alternative.
--Amit
Comments
On Tue, Mar 22, 2016 at 4:08 AM, Pawar, Amit <Amit.Pawar@amd.com> wrote:
>>It was done based on the assumption that an AVX-enabled machine has fast AVX unaligned loads. If that isn't true for AMD CPUs, we can enable it for all Intel AVX CPUs and you can set it for AMD CPUs properly.
>
> Memcpy still needs to be fixed; otherwise the SSE2_Unaligned version is selected. Is it OK to fix it in the following way? If not, please suggest an alternative.
So the AMD processor doesn't want Fast_Unaligned_Load. Why is it set?
>>>It was done based on the assumption that an AVX-enabled machine has fast AVX unaligned loads. If that isn't true for AMD CPUs, we can enable it for all Intel AVX CPUs and you can set it for AMD CPUs properly.
>>
>> Memcpy still needs to be fixed; otherwise the SSE2_Unaligned version is selected. Is it OK to fix it in the following way? If not, please suggest an alternative.
>
>So the AMD processor doesn't want Fast_Unaligned_Load. Why is it set?
For this function it is not better, but it is good for other routines such as strcat, strncat, stpcpy, stpncpy, strcpy and strncpy.
Thanks,
Amit Pawar
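
[For context, the same Fast_Unaligned_Load bit gates the unaligned variants of those string routines through their IFUNC selectors. A minimal, self-contained sketch of that pattern follows; the probe function and variant names are hypothetical stand-ins, not glibc's actual symbols (glibc uses HAS_ARCH_FEATURE and hidden __strcpy_* aliases).]

/* Sketch of an IFUNC selector keyed off a fast-unaligned-load probe.
   cpu_has_fast_unaligned_load() and both variants are hypothetical.  */
typedef char *strcpy_fn (char *, const char *);

extern int cpu_has_fast_unaligned_load (void);  /* hypothetical probe */
extern strcpy_fn strcpy_aligned;       /* aligned-loop implementation */
extern strcpy_fn strcpy_unaligned;     /* unaligned-load implementation */

/* Resolver: runs once at relocation time, returns the variant to use.  */
static strcpy_fn *
select_strcpy (void)
{
  return cpu_has_fast_unaligned_load () ? strcpy_unaligned : strcpy_aligned;
}

char *my_strcpy (char *, const char *)
  __attribute__ ((ifunc ("select_strcpy")));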
On Tue, Mar 22, 2016 at 7:57 AM, Pawar, Amit <Amit.Pawar@amd.com> wrote:
>>>>It was done based on the assumption that an AVX-enabled machine has fast AVX unaligned loads. If that isn't true for AMD CPUs, we can enable it for all Intel AVX CPUs and you can set it for AMD CPUs properly.
>>>
>>> Memcpy still needs to be fixed; otherwise the SSE2_Unaligned version is selected. Is it OK to fix it in the following way? If not, please suggest an alternative.
>>
>>So the AMD processor doesn't want Fast_Unaligned_Load. Why is it set?
>
> For this function it is not better, but it is good for other routines such as strcat, strncat, stpcpy, stpncpy, strcpy and strncpy.
Then we should add Fast_Unaligned_Copy and only use it in memcpy.
> Then we should add Fast_Unaligned_Copy and only use it in memcpy.
Please find attached the patch and ChangeLog files containing the fix for the memcpy IFUNC selector. Is it OK? If not, please suggest any required changes.
Thanks,
Amit Pawar
@@ -159,9 +159,17 @@ init_cpu_features (struct cpu_features *cpu_features)
if (family == 0x15)
{
/* "Excavator" */
+#if index_arch_Fast_Unaligned_Load != index_arch_Prefer_Fast_Copy_Backward
+# error index_arch_Fast_Unaligned_Load != index_arch_Prefer_Fast_Copy_Backward
+#endif
+#if index_arch_Fast_Unaligned_Load != index_arch_Fast_Copy_Backward
+# error index_arch_Fast_Unaligned_Load != index_arch_Fast_Copy_Backward
+#endif
if (model >= 0x60 && model <= 0x7f)
cpu_features->feature[index_arch_Fast_Unaligned_Load]
- |= bit_arch_Fast_Unaligned_Load;
+ |= (bit_arch_Fast_Unaligned_Load
+ | bit_arch_Fast_Copy_Backward
+ | bit_arch_Prefer_Fast_Copy_Backward);
}
}
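
[A note on the #error guards in this hunk: the code ORs three bit_arch_* masks through the single index_arch_Fast_Unaligned_Load subscript, which is only correct if all three features live in the same feature word. Because the index macros are integer constant expressions, a layout change that moved one feature to another word breaks the build instead of silently setting the wrong bit. A minimal sketch of that invariant, with simplified stand-in values rather than glibc's real layout:]

/* Simplified stand-ins: all three features share feature word 0, so
   their masks may be combined under one subscript.  Bit positions are
   illustrative, not glibc's.  */
enum { FEATURE_INDEX_1, FEATURE_INDEX_MAX };
#define index_arch_Fast_Unaligned_Load       FEATURE_INDEX_1
#define index_arch_Fast_Copy_Backward        FEATURE_INDEX_1
#define index_arch_Prefer_Fast_Copy_Backward FEATURE_INDEX_1
#define bit_arch_Fast_Unaligned_Load         (1 << 0)
#define bit_arch_Fast_Copy_Backward          (1 << 1)
#define bit_arch_Prefer_Fast_Copy_Backward   (1 << 2)

static unsigned int feature[FEATURE_INDEX_MAX];

static void
set_excavator_bits (void)
{
  /* Valid only because all three index_arch_* values are equal,
     which the #error checks assert at compile time.  */
  feature[index_arch_Fast_Unaligned_Load]
    |= (bit_arch_Fast_Unaligned_Load
        | bit_arch_Fast_Copy_Backward
        | bit_arch_Prefer_Fast_Copy_Backward);
}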
else
@@ -35,6 +35,7 @@
#define bit_arch_I686 (1 << 15)
#define bit_arch_Prefer_MAP_32BIT_EXEC (1 << 16)
#define bit_arch_Prefer_No_VZEROUPPER (1 << 17)
+#define bit_arch_Prefer_Fast_Copy_Backward (1 << 18)
/* CPUID Feature flags. */
@@ -101,6 +102,7 @@
# define index_arch_I686 FEATURE_INDEX_1*FEATURE_SIZE
# define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1*FEATURE_SIZE
# define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Prefer_Fast_Copy_Backward FEATURE_INDEX_1*FEATURE_SIZE
# if defined (_LIBC) && !IS_IN (nonlib)
@@ -259,6 +261,7 @@ extern const struct cpu_features *__get_cpu_features (void)
# define index_arch_I686 FEATURE_INDEX_1
# define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1
# define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1
+# define index_arch_Prefer_Fast_Copy_Backward FEATURE_INDEX_1
#endif /* !__ASSEMBLER__ */
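
[Each feature is thus addressed by a bit_arch_* mask paired with an index_arch_* word selector. In C code the pair is consumed roughly as below; this is an illustrative reduction of glibc's HAS_ARCH_FEATURE, not its literal definition, and it leans on struct cpu_features and the macros from the header above:]

/* Illustrative: test one arch feature via its (index, bit) pair.  */
#define CPU_HAS_ARCH_FEATURE(cpu, name) \
  (((cpu)->feature[index_arch_##name] & bit_arch_##name) != 0)

static int
prefers_backward_copy (const struct cpu_features *cpu_features)
{
  return CPU_HAS_ARCH_FEATURE (cpu_features, Prefer_Fast_Copy_Backward);
}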
@@ -40,18 +40,20 @@ ENTRY(__new_memcpy)
#endif
1: lea __memcpy_avx_unaligned(%rip), %RAX_LP
HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jnz 3f
+ HAS_ARCH_FEATURE (Prefer_Fast_Copy_Backward)
jnz 2f
lea __memcpy_sse2_unaligned(%rip), %RAX_LP
HAS_ARCH_FEATURE (Fast_Unaligned_Load)
- jnz 2f
- lea __memcpy_sse2(%rip), %RAX_LP
+ jnz 3f
+2: lea __memcpy_sse2(%rip), %RAX_LP
HAS_CPU_FEATURE (SSSE3)
- jz 2f
+ jz 3f
lea __memcpy_ssse3_back(%rip), %RAX_LP
HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jnz 2f
+ jnz 3f
lea __memcpy_ssse3(%rip), %RAX_LP
-2: ret
+3: ret
END(__new_memcpy)
# undef ENTRY
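
[Putting the new branches together, the patched dispatch order reads roughly like the following C sketch. The __memcpy_* names come from the diff; the feature macros are stand-ins for the assembly HAS_*_FEATURE tests, assumed here only to make the sketch self-contained:]

#include <stddef.h>

/* Stand-ins for the assembly feature tests; assumed for this sketch.  */
#define HAS_ARCH_FEATURE(name) cpu_has_##name
#define HAS_CPU_FEATURE(name)  cpu_has_##name
extern int cpu_has_AVX_Fast_Unaligned_Load, cpu_has_Prefer_Fast_Copy_Backward,
  cpu_has_Fast_Unaligned_Load, cpu_has_SSSE3, cpu_has_Fast_Copy_Backward;

typedef void *memcpy_fn (void *, const void *, size_t);
extern memcpy_fn __memcpy_avx_unaligned, __memcpy_sse2_unaligned,
  __memcpy_sse2, __memcpy_ssse3, __memcpy_ssse3_back;

/* Rough C equivalent of the patched __new_memcpy selection order.  */
static memcpy_fn *
select_memcpy (void)
{
  if (HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load))
    return __memcpy_avx_unaligned;
  /* Prefer_Fast_Copy_Backward (set for Excavator above) skips the
     Fast_Unaligned_Load shortcut, so those CPUs fall through to the
     SSSE3 checks instead of getting __memcpy_sse2_unaligned.  */
  if (!HAS_ARCH_FEATURE (Prefer_Fast_Copy_Backward)
      && HAS_ARCH_FEATURE (Fast_Unaligned_Load))
    return __memcpy_sse2_unaligned;
  if (!HAS_CPU_FEATURE (SSSE3))
    return __memcpy_sse2;
  if (HAS_ARCH_FEATURE (Fast_Copy_Backward))
    return __memcpy_ssse3_back;
  return __memcpy_ssse3;
}

[On Excavator (family 0x15, models 0x60-0x7f) this resolves to __memcpy_ssse3_back, since the hunk above sets both Prefer_Fast_Copy_Backward and Fast_Copy_Backward for those models.]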