PowerPC: Align power7 memcpy using VSX to quadword

Message ID 539B1868.3000307@linux.vnet.ibm.com
State Committed

Commit Message

Adhemerval Zanella Netto June 13, 2014, 3:27 p.m. UTC
This patch changes the power7 memcpy to use VSX instructions only when
memory is aligned to a quadword (16 bytes).  This avoids kernel alignment
traps on non-cacheable memory (for instance, memory-mapped I/O).
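
As an illustration of the scenario (all names, the /dev/mem device path, the
MMIO base address, and the sizes below are hypothetical, not taken from the
patch), this is the kind of caller the quadword alignment protects:

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

/* Hypothetical sketch: map a device's registers through /dev/mem (the
   0x40000000 base is made up) and memcpy a buffer into the cache-inhibited
   mapping.  On such memory an unaligned lxvd2x/stxvd2x raises an alignment
   interrupt that cannot be transparently fixed up, which is the failure
   mode the quadword-alignment check avoids.  */
int
copy_to_mmio (const void *buf, size_t len)
{
  int fd = open ("/dev/mem", O_RDWR | O_SYNC);
  if (fd < 0)
    return -1;

  volatile unsigned char *mmio
    = mmap (NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0x40000000);
  if (mmio == MAP_FAILED)
    {
      close (fd);
      return -1;
    }

  /* The destination is deliberately not quadword aligned.  */
  memcpy ((void *) (mmio + 4), buf, len);

  munmap ((void *) mmio, 4096);
  close (fd);
  return 0;
}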

Checked on ppc64be and ppc32be.
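
In terms of the checks the hunks below modify, the prologue's new alignment
test is roughly equivalent to this C sketch (the function name and types are
hypothetical; the register comments refer to the 32-bit version):

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical sketch: the aligned VSX copy path is considered only when
   SRC and DST have the same offset within a 16-byte quadword (previously
   an 8-byte doubleword), so both can be brought to a quadword boundary
   before any lxvd2x/stxvd2x is issued.  */
static bool
same_quadword_alignment (const void *dst, const void *src)
{
  uintptr_t dst_off = (uintptr_t) dst & 15;  /* andi.  11,3,15  */
  uintptr_t src_off = (uintptr_t) src & 15;  /* clrlwi 10,4,28  */
  return dst_off == src_off;                 /* cmplw  cr6,10,11 */
}

When the offsets match, the scalar prologue copies the leading bytes so that
both pointers reach a quadword boundary before the VSX loop; when they
differ, the existing unaligned path is taken.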

--

2014-06-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>

	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Align VSX copies to 16B
	to avoid alignment traps in non-cacheable memory.
	* sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.

---
  

Comments

Adhemerval Zanella Netto June 25, 2014, 1:09 a.m. UTC | #1
Ping.


Patch

diff --git a/sysdeps/powerpc/powerpc32/power7/memcpy.S b/sysdeps/powerpc/powerpc32/power7/memcpy.S
index 52c2a6b..e540fea 100644
--- a/sysdeps/powerpc/powerpc32/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power7/memcpy.S
@@ -38,8 +38,8 @@  EALIGN (memcpy, 5, 0)
 	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
 				    code.  */
 
-	andi.   11,3,7	      /* Check alignment of DST.  */
-	clrlwi  10,4,29	      /* Check alignment of SRC.  */
+	andi.   11,3,15	      /* Check alignment of DST.  */
+	clrlwi  10,4,28	      /* Check alignment of SRC.  */
 	cmplw   cr6,10,11     /* SRC and DST alignments match?  */
 	mr	12,4
 	mr	31,5
diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
index bbfd381..18467f6 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -36,16 +36,11 @@  EALIGN (memcpy, 5, 0)
 	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
 				    code.  */
 
-#ifdef __LITTLE_ENDIAN__
-/* In little-endian mode, power7 takes an alignment trap on any lxvd2x
-   or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy
-   loop is only used for quadword aligned copies.  */
+/* Align copies using VSX instructions to quadword.  This avoids alignment
+   traps when memcpy is used on non-cacheable memory (for instance,
+   memory-mapped I/O).  */
 	andi.	10,3,15
 	clrldi	11,4,60
-#else
-	andi.	10,3,7		/* Check alignment of DST.  */
-	clrldi	11,4,61		/* Check alignment of SRC.  */
-#endif
 	cmpld	cr6,10,11	/* SRC and DST alignments match?  */
 
 	mr	dst,3
@@ -53,11 +48,7 @@  EALIGN (memcpy, 5, 0)
 	beq	L(aligned_copy)
 
 	mtocrf	0x01,0
-#ifdef __LITTLE_ENDIAN__
 	clrldi	0,0,60
-#else
-	clrldi	0,0,61
-#endif
 
 /* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
 1:
@@ -79,14 +70,12 @@  EALIGN (memcpy, 5, 0)
 	stw	6,0(dst)
 	addi	dst,dst,4
 8:
-#ifdef __LITTLE_ENDIAN__
 	bf	28,16f
 	ld	6,0(src)
 	addi	src,src,8
 	std	6,0(dst)
 	addi	dst,dst,8
 16:
-#endif
 	subf	cnt,0,cnt
 
 /* Main aligned copy loop. Copies 128 bytes at a time. */
@@ -298,9 +287,7 @@  L(copy_LE_8):
 	.align	4
 L(copy_GE_32_unaligned):
 	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
-#ifndef __LITTLE_ENDIAN__
 	andi.	10,3,15	      /* Check alignment of DST (against quadwords).  */
-#endif
 	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */
 
 	beq	L(copy_GE_32_unaligned_cont)