PowerPC: Align power7 memcpy using VSX to quadword
Commit Message
This patch changes power7 memcpy to use VSX instructions only when
memory is aligned to quadword (16 bytes). It is to avoid unaligned kernel
traps on non-cacheable memory (for instance, memory-mapped I/O).
Checked on ppc64be and ppc32be.
--
2014-06-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc64/power7/memcpy.S: Align VSX copies to 16B
to avoid alignment traps in non-cacheable memory.
* sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
---
Comments
Ping.
On 13-06-2014 12:27, Adhemerval Zanella wrote:
> This patch changes power7 memcpy to use VSX instructions only when
> memory is aligned to quadword (16 bytes). It is to avoid unaligned kernel
> traps on non-cacheable memory (for instance, memory-mapped I/O).
>
> Checked on ppc64be and ppc32be.
>
> --
>
> 2014-06-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
>
> * sysdeps/powerpc/powerpc64/power7/memcpy.S: Align VSX copies to 16B
> to avoid alignment traps in non-cacheable memory.
> * sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
>
> ---
>
> diff --git a/sysdeps/powerpc/powerpc32/power7/memcpy.S b/sysdeps/powerpc/powerpc32/power7/memcpy.S
> index 52c2a6b..e540fea 100644
> --- a/sysdeps/powerpc/powerpc32/power7/memcpy.S
> +++ b/sysdeps/powerpc/powerpc32/power7/memcpy.S
> @@ -38,8 +38,8 @@ EALIGN (memcpy, 5, 0)
> ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
> code. */
>
> - andi. 11,3,7 /* Check alignment of DST. */
> - clrlwi 10,4,29 /* Check alignment of SRC. */
> + andi. 11,3,15 /* Check alignment of DST. */
> + clrlwi 10,4,28 /* Check alignment of SRC. */
> cmplw cr6,10,11 /* SRC and DST alignments match? */
> mr 12,4
> mr 31,5
> diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
> index bbfd381..18467f6 100644
> --- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
> +++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
> @@ -36,16 +36,11 @@ EALIGN (memcpy, 5, 0)
> ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
> code. */
>
> -#ifdef __LITTLE_ENDIAN__
> -/* In little-endian mode, power7 takes an alignment trap on any lxvd2x
> - or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy
> - loop is only used for quadword aligned copies. */
> +/* Align copies using VSX instructions to quadword. It is to avoid alignment
> + traps when memcpy is used on non-cacheable memory (for instance, memory
> + mapped I/O). */
> andi. 10,3,15
> clrldi 11,4,60
> -#else
> - andi. 10,3,7 /* Check alignment of DST. */
> - clrldi 11,4,61 /* Check alignment of SRC. */
> -#endif
> cmpld cr6,10,11 /* SRC and DST alignments match? */
>
> mr dst,3
> @@ -53,11 +48,7 @@ EALIGN (memcpy, 5, 0)
> beq L(aligned_copy)
>
> mtocrf 0x01,0
> -#ifdef __LITTLE_ENDIAN__
> clrldi 0,0,60
> -#else
> - clrldi 0,0,61
> -#endif
>
> /* Get the DST and SRC aligned to 8 bytes (16 for little-endian). */
> 1:
> @@ -79,14 +70,12 @@ EALIGN (memcpy, 5, 0)
> stw 6,0(dst)
> addi dst,dst,4
> 8:
> -#ifdef __LITTLE_ENDIAN__
> bf 28,16f
> ld 6,0(src)
> addi src,src,8
> std 6,0(dst)
> addi dst,dst,8
> 16:
> -#endif
> subf cnt,0,cnt
>
> /* Main aligned copy loop. Copies 128 bytes at a time. */
> @@ -298,9 +287,7 @@ L(copy_LE_8):
> .align 4
> L(copy_GE_32_unaligned):
> clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */
> -#ifndef __LITTLE_ENDIAN__
> andi. 10,3,15 /* Check alignment of DST (against quadwords). */
> -#endif
> srdi 9,cnt,4 /* Number of full quadwords remaining. */
>
> beq L(copy_GE_32_unaligned_cont)
>
@@ -38,8 +38,8 @@ EALIGN (memcpy, 5, 0)
ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
code. */
- andi. 11,3,7 /* Check alignment of DST. */
- clrlwi 10,4,29 /* Check alignment of SRC. */
+ andi. 11,3,15 /* Check alignment of DST. */
+ clrlwi 10,4,28 /* Check alignment of SRC. */
cmplw cr6,10,11 /* SRC and DST alignments match? */
mr 12,4
mr 31,5
@@ -36,16 +36,11 @@ EALIGN (memcpy, 5, 0)
ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
code. */
-#ifdef __LITTLE_ENDIAN__
-/* In little-endian mode, power7 takes an alignment trap on any lxvd2x
- or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy
- loop is only used for quadword aligned copies. */
+/* Align copies using VSX instructions to quadword. It is to avoid alignment
+ traps when memcpy is used on non-cacheable memory (for instance, memory
+ mapped I/O). */
andi. 10,3,15
clrldi 11,4,60
-#else
- andi. 10,3,7 /* Check alignment of DST. */
- clrldi 11,4,61 /* Check alignment of SRC. */
-#endif
cmpld cr6,10,11 /* SRC and DST alignments match? */
mr dst,3
@@ -53,11 +48,7 @@ EALIGN (memcpy, 5, 0)
beq L(aligned_copy)
mtocrf 0x01,0
-#ifdef __LITTLE_ENDIAN__
clrldi 0,0,60
-#else
- clrldi 0,0,61
-#endif
/* Get the DST and SRC aligned to 8 bytes (16 for little-endian). */
1:
@@ -79,14 +70,12 @@ EALIGN (memcpy, 5, 0)
stw 6,0(dst)
addi dst,dst,4
8:
-#ifdef __LITTLE_ENDIAN__
bf 28,16f
ld 6,0(src)
addi src,src,8
std 6,0(dst)
addi dst,dst,8
16:
-#endif
subf cnt,0,cnt
/* Main aligned copy loop. Copies 128 bytes at a time. */
@@ -298,9 +287,7 @@ L(copy_LE_8):
.align 4
L(copy_GE_32_unaligned):
clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */
-#ifndef __LITTLE_ENDIAN__
andi. 10,3,15 /* Check alignment of DST (against quadwords). */
-#endif
srdi 9,cnt,4 /* Number of full quadwords remaining. */
beq L(copy_GE_32_unaligned_cont)