[PATCHv3] powerpc: Replace lxvd2x/stxvd2x with lvx/stvx in P7's memcpy/memmove

Message ID 20171020192239.24706-1-tuliom@linux.vnet.ibm.com
State Committed

Commit Message

Tulio Magno Quites Machado Filho Oct. 20, 2017, 7:22 p.m. UTC
  From: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>

POWER9 DD2.1 and earlier have an issue where some cache-inhibited
vector loads trap to the kernel, causing a performance degradation.  To
handle this in memcpy and memmove, lvx/stvx are used for aligned
addresses instead of lxvd2x/stxvd2x.

Reference: https://patchwork.ozlabs.org/patch/814059/
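
As an illustration only (a minimal sketch, not lifted verbatim from the
patch; it assumes r4 = source, r11 = destination, r6 preloaded with 16,
and both pointers already 16-byte aligned, mirroring the register roles
used in memmove.S), the substitution looks like this:

	/* Before: VSX doubleword load/store pair; valid for any alignment,
	   but lxvd2x from cache-inhibited memory can trap on POWER9 DD2.1
	   and earlier.  A matching load/store pair copies bytes unchanged,
	   so no element swap is needed for a plain copy.  */
	lxvd2x	v6,0,r4
	lxvd2x	v7,r4,r6
	stxvd2x	v6,0,r11
	stxvd2x	v7,r11,r6

	/* After: VMX load/store pair.  lvx/stvx ignore the low 4 bits of
	   the effective address, so this form is only correct because the
	   code has already established 16-byte alignment.  */
	lvx	v6,0,r4
	lvx	v7,r4,r6
	stvx	v6,0,r11
	stvx	v7,r11,r6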

2017-10-20  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>

	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace
	lxvd2x/stxvd2x with lvx/stvx.
	* sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise.

Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
---
 sysdeps/powerpc/powerpc64/power7/memcpy.S  |  64 +++++++--------
 sysdeps/powerpc/powerpc64/power7/memmove.S | 128 ++++++++++++++---------------
 2 files changed, 96 insertions(+), 96 deletions(-)
  

Comments

Adhemerval Zanella Netto Oct. 25, 2017, 11:22 a.m. UTC | #1
On 20/10/2017 17:22, Tulio Magno Quites Machado Filho wrote:
> From: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
>
> POWER9 DD2.1 and earlier has an issue where some cache inhibited
> vector load traps to the kernel, causing a performance degradation.  To
> handle this in memcpy and memmove, lvx/stvx is used for aligned
> addresses instead of lxvd2x/stxvd2x.
>
> Reference: https://patchwork.ozlabs.org/patch/814059/
>
> 2017-10-20  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
>
> 	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace
> 	lxvd2x/stxvd2x with lvx/stvx.
> 	* sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise.
>
> Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>

Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>



> ---
>  sysdeps/powerpc/powerpc64/power7/memcpy.S  |  64 +++++++--------
>  sysdeps/powerpc/powerpc64/power7/memmove.S | 128 ++++++++++++++---------------
>  2 files changed, 96 insertions(+), 96 deletions(-)
>
> diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
> index 1ccbc2e..a7cdf8b 100644
> --- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
> +++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
> @@ -91,63 +91,63 @@ L(aligned_copy):
>  	srdi	12,cnt,7
>  	cmpdi	12,0
>  	beq	L(aligned_tail)
> -	lxvd2x	6,0,src
> -	lxvd2x	7,src,6
> +	lvx	6,0,src
> +	lvx	7,src,6
>  	mtctr	12
>  	b	L(aligned_128loop)
>  
>  	.align  4
>  L(aligned_128head):
>  	/* for the 2nd + iteration of this loop. */
> -	lxvd2x	6,0,src
> -	lxvd2x	7,src,6
> +	lvx	6,0,src
> +	lvx	7,src,6
>  L(aligned_128loop):
> -	lxvd2x	8,src,7
> -	lxvd2x	9,src,8
> -	stxvd2x	6,0,dst
> +	lvx	8,src,7
> +	lvx	9,src,8
> +	stvx	6,0,dst
>  	addi	src,src,64
> -	stxvd2x	7,dst,6
> -	stxvd2x	8,dst,7
> -	stxvd2x	9,dst,8
> -	lxvd2x	6,0,src
> -	lxvd2x	7,src,6
> +	stvx	7,dst,6
> +	stvx	8,dst,7
> +	stvx	9,dst,8
> +	lvx	6,0,src
> +	lvx	7,src,6
>  	addi	dst,dst,64
> -	lxvd2x	8,src,7
> -	lxvd2x	9,src,8
> +	lvx	8,src,7
> +	lvx	9,src,8
>  	addi	src,src,64
> -	stxvd2x	6,0,dst
> -	stxvd2x	7,dst,6
> -	stxvd2x	8,dst,7
> -	stxvd2x	9,dst,8
> +	stvx	6,0,dst
> +	stvx	7,dst,6
> +	stvx	8,dst,7
> +	stvx	9,dst,8
>  	addi	dst,dst,64
>  	bdnz	L(aligned_128head)
>  
>  L(aligned_tail):
>  	mtocrf	0x01,cnt
>  	bf	25,32f
> -	lxvd2x	6,0,src
> -	lxvd2x	7,src,6
> -	lxvd2x	8,src,7
> -	lxvd2x	9,src,8
> +	lvx	6,0,src
> +	lvx	7,src,6
> +	lvx	8,src,7
> +	lvx	9,src,8
>  	addi	src,src,64
> -	stxvd2x	6,0,dst
> -	stxvd2x	7,dst,6
> -	stxvd2x	8,dst,7
> -	stxvd2x	9,dst,8
> +	stvx	6,0,dst
> +	stvx	7,dst,6
> +	stvx	8,dst,7
> +	stvx	9,dst,8
>  	addi	dst,dst,64
>  32:
>  	bf	26,16f
> -	lxvd2x	6,0,src
> -	lxvd2x	7,src,6
> +	lvx	6,0,src
> +	lvx	7,src,6
>  	addi	src,src,32
> -	stxvd2x	6,0,dst
> -	stxvd2x	7,dst,6
> +	stvx	6,0,dst
> +	stvx	7,dst,6
>  	addi	dst,dst,32
>  16:
>  	bf	27,8f
> -	lxvd2x	6,0,src
> +	lvx	6,0,src
>  	addi	src,src,16
> -	stxvd2x	6,0,dst
> +	stvx	6,0,dst
>  	addi	dst,dst,16
>  8:
>  	bf	28,4f
> diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S
> index 93baa69..667c6e2 100644
> --- a/sysdeps/powerpc/powerpc64/power7/memmove.S
> +++ b/sysdeps/powerpc/powerpc64/power7/memmove.S
> @@ -92,63 +92,63 @@ L(aligned_copy):
>  	srdi	12,r5,7
>  	cmpdi	12,0
>  	beq	L(aligned_tail)
> -	lxvd2x	6,0,r4
> -	lxvd2x	7,r4,6
> +	lvx	6,0,r4
> +	lvx	7,r4,6
>  	mtctr	12
>  	b	L(aligned_128loop)
>  
>  	.align  4
>  L(aligned_128head):
>  	/* for the 2nd + iteration of this loop. */
> -	lxvd2x	6,0,r4
> -	lxvd2x	7,r4,6
> +	lvx	6,0,r4
> +	lvx	7,r4,6
>  L(aligned_128loop):
> -	lxvd2x	8,r4,7
> -	lxvd2x	9,r4,8
> -	stxvd2x	6,0,r11
> +	lvx	8,r4,7
> +	lvx	9,r4,8
> +	stvx	6,0,r11
>  	addi	r4,r4,64
> -	stxvd2x	7,r11,6
> -	stxvd2x	8,r11,7
> -	stxvd2x	9,r11,8
> -	lxvd2x	6,0,r4
> -	lxvd2x	7,r4,6
> +	stvx	7,r11,6
> +	stvx	8,r11,7
> +	stvx	9,r11,8
> +	lvx	6,0,r4
> +	lvx	7,r4,6
>  	addi	r11,r11,64
> -	lxvd2x	8,r4,7
> -	lxvd2x	9,r4,8
> +	lvx	8,r4,7
> +	lvx	9,r4,8
>  	addi	r4,r4,64
> -	stxvd2x	6,0,r11
> -	stxvd2x	7,r11,6
> -	stxvd2x	8,r11,7
> -	stxvd2x	9,r11,8
> +	stvx	6,0,r11
> +	stvx	7,r11,6
> +	stvx	8,r11,7
> +	stvx	9,r11,8
>  	addi	r11,r11,64
>  	bdnz	L(aligned_128head)
>  
>  L(aligned_tail):
>  	mtocrf	0x01,r5
>  	bf	25,32f
> -	lxvd2x	6,0,r4
> -	lxvd2x	7,r4,6
> -	lxvd2x	8,r4,7
> -	lxvd2x	9,r4,8
> +	lvx	6,0,r4
> +	lvx	7,r4,6
> +	lvx	8,r4,7
> +	lvx	9,r4,8
>  	addi	r4,r4,64
> -	stxvd2x	6,0,r11
> -	stxvd2x	7,r11,6
> -	stxvd2x	8,r11,7
> -	stxvd2x	9,r11,8
> +	stvx	6,0,r11
> +	stvx	7,r11,6
> +	stvx	8,r11,7
> +	stvx	9,r11,8
>  	addi	r11,r11,64
>  32:
>  	bf	26,16f
> -	lxvd2x	6,0,r4
> -	lxvd2x	7,r4,6
> +	lvx	6,0,r4
> +	lvx	7,r4,6
>  	addi	r4,r4,32
> -	stxvd2x	6,0,r11
> -	stxvd2x	7,r11,6
> +	stvx	6,0,r11
> +	stvx	7,r11,6
>  	addi	r11,r11,32
>  16:
>  	bf	27,8f
> -	lxvd2x	6,0,r4
> +	lvx	6,0,r4
>  	addi	r4,r4,16
> -	stxvd2x	6,0,r11
> +	stvx	6,0,r11
>  	addi	r11,r11,16
>  8:
>  	bf	28,4f
> @@ -488,63 +488,63 @@ L(aligned_copy_bwd):
>  	srdi	r12,r5,7
>  	cmpdi	r12,0
>  	beq	L(aligned_tail_bwd)
> -	lxvd2x	v6,r4,r6
> -	lxvd2x	v7,r4,r7
> +	lvx	v6,r4,r6
> +	lvx	v7,r4,r7
>  	mtctr	12
>  	b	L(aligned_128loop_bwd)
>  
>  	.align  4
>  L(aligned_128head_bwd):
>  	/* for the 2nd + iteration of this loop. */
> -	lxvd2x	v6,r4,r6
> -	lxvd2x	v7,r4,r7
> +	lvx	v6,r4,r6
> +	lvx	v7,r4,r7
>  L(aligned_128loop_bwd):
> -	lxvd2x	v8,r4,r8
> -	lxvd2x	v9,r4,r9
> -	stxvd2x	v6,r11,r6
> +	lvx	v8,r4,r8
> +	lvx	v9,r4,r9
> +	stvx	v6,r11,r6
>  	subi	r4,r4,64
> -	stxvd2x	v7,r11,r7
> -	stxvd2x	v8,r11,r8
> -	stxvd2x	v9,r11,r9
> -	lxvd2x	v6,r4,r6
> -	lxvd2x	v7,r4,7
> +	stvx	v7,r11,r7
> +	stvx	v8,r11,r8
> +	stvx	v9,r11,r9
> +	lvx	v6,r4,r6
> +	lvx	v7,r4,7
>  	subi	r11,r11,64
> -	lxvd2x	v8,r4,r8
> -	lxvd2x	v9,r4,r9
> +	lvx	v8,r4,r8
> +	lvx	v9,r4,r9
>  	subi	r4,r4,64
> -	stxvd2x	v6,r11,r6
> -	stxvd2x	v7,r11,r7
> -	stxvd2x	v8,r11,r8
> -	stxvd2x	v9,r11,r9
> +	stvx	v6,r11,r6
> +	stvx	v7,r11,r7
> +	stvx	v8,r11,r8
> +	stvx	v9,r11,r9
>  	subi	r11,r11,64
>  	bdnz	L(aligned_128head_bwd)
>  
>  L(aligned_tail_bwd):
>  	mtocrf	0x01,r5
>  	bf	25,32f
> -	lxvd2x	v6,r4,r6
> -	lxvd2x	v7,r4,r7
> -	lxvd2x	v8,r4,r8
> -	lxvd2x	v9,r4,r9
> +	lvx	v6,r4,r6
> +	lvx	v7,r4,r7
> +	lvx	v8,r4,r8
> +	lvx	v9,r4,r9
>  	subi	r4,r4,64
> -	stxvd2x	v6,r11,r6
> -	stxvd2x	v7,r11,r7
> -	stxvd2x	v8,r11,r8
> -	stxvd2x	v9,r11,r9
> +	stvx	v6,r11,r6
> +	stvx	v7,r11,r7
> +	stvx	v8,r11,r8
> +	stvx	v9,r11,r9
>  	subi	r11,r11,64
>  32:
>  	bf	26,16f
> -	lxvd2x	v6,r4,r6
> -	lxvd2x	v7,r4,r7
> +	lvx	v6,r4,r6
> +	lvx	v7,r4,r7
>  	subi	r4,r4,32
> -	stxvd2x	v6,r11,r6
> -	stxvd2x	v7,r11,r7
> +	stvx	v6,r11,r6
> +	stvx	v7,r11,r7
>  	subi	r11,r11,32
>  16:
>  	bf	27,8f
> -	lxvd2x	v6,r4,r6
> +	lvx	v6,r4,r6
>  	subi	r4,r4,16
> -	stxvd2x	v6,r11,r6
> +	stvx	v6,r11,r6
>  	subi	r11,r11,16
>  8:
>  	bf	28,4f
  
Tulio Magno Quites Machado Filho Oct. 25, 2017, 3:17 p.m. UTC | #2
Adhemerval Zanella <adhemerval.zanella@linaro.org> writes:

> On 20/10/2017 17:22, Tulio Magno Quites Machado Filho wrote:
>> From: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
>>
>> POWER9 DD2.1 and earlier has an issue where some cache inhibited
>> vector load traps to the kernel, causing a performance degradation.  To
>> handle this in memcpy and memmove, lvx/stvx is used for aligned
>> addresses instead of lxvd2x/stxvd2x.
>>
>> Reference: https://patchwork.ozlabs.org/patch/814059/
>>
>> 2017-10-20  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
>>
>> 	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace
>> 	lxvd2x/stxvd2x with lvx/stvx.
>> 	* sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise.
>>
>> Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
>
> Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>

Pushed as 63da5cd4a09.

Thanks!
  
Carlos O'Donell Oct. 25, 2017, 6:43 p.m. UTC | #3
On 10/25/2017 08:17 AM, Tulio Magno Quites Machado Filho wrote:
> Adhemerval Zanella <adhemerval.zanella@linaro.org> writes:
> 
>> On 20/10/2017 17:22, Tulio Magno Quites Machado Filho wrote:
>>> From: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
>>>
>>> POWER9 DD2.1 and earlier has an issue where some cache inhibited
>>> vector load traps to the kernel, causing a performance degradation.  To
>>> handle this in memcpy and memmove, lvx/stvx is used for aligned
>>> addresses instead of lxvd2x/stxvd2x.
>>>
>>> Reference: https://patchwork.ozlabs.org/patch/814059/
>>>
>>> 2017-10-20  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
>>>
>>> 	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace
>>> 	lxvd2x/stxvd2x with lvx/stvx.
>>> 	* sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise.
>>>
>>> Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
>>
>> Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
> 
> Pushed as 63da5cd4a09.

Raji, Adhemerval, Tulio,

Thank you all for fixing this up!
  
Florian Weimer Nov. 20, 2017, 9:23 a.m. UTC | #4
On 10/20/2017 09:22 PM, Tulio Magno Quites Machado Filho wrote:
> From: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
> 
> POWER9 DD2.1 and earlier has an issue where some cache inhibited
> vector load traps to the kernel, causing a performance degradation.  To
> handle this in memcpy and memmove, lvx/stvx is used for aligned
> addresses instead of lxvd2x/stxvd2x.
> 
> Reference: https://patchwork.ozlabs.org/patch/814059/
> 
> 2017-10-20  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
> 
> 	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace
> 	lxvd2x/stxvd2x with lvx/stvx.
> 	* sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise.
> 
> Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>

Should we backport this change, to 2.26 in particular?

Thanks,
Florian
  
Tulio Magno Quites Machado Filho Nov. 21, 2017, 11:17 a.m. UTC | #5
Florian Weimer <fweimer@redhat.com> writes:

> On 10/20/2017 09:22 PM, Tulio Magno Quites Machado Filho wrote:
>> From: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
>> 
>> POWER9 DD2.1 and earlier has an issue where some cache inhibited
>> vector load traps to the kernel, causing a performance degradation.  To
>> handle this in memcpy and memmove, lvx/stvx is used for aligned
>> addresses instead of lxvd2x/stxvd2x.
>> 
>> Reference: https://patchwork.ozlabs.org/patch/814059/
>> 
>> 2017-10-20  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
>> 
>> 	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace
>> 	lxvd2x/stxvd2x with lvx/stvx.
>> 	* sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise.
>> 
>> Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
>
> Should we backport this change, to 2.26 in particular?

Yes.  That's a good idea.
  

Patch

diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
index 1ccbc2e..a7cdf8b 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -91,63 +91,63 @@  L(aligned_copy):
 	srdi	12,cnt,7
 	cmpdi	12,0
 	beq	L(aligned_tail)
-	lxvd2x	6,0,src
-	lxvd2x	7,src,6
+	lvx	6,0,src
+	lvx	7,src,6
 	mtctr	12
 	b	L(aligned_128loop)
 
 	.align  4
 L(aligned_128head):
 	/* for the 2nd + iteration of this loop. */
-	lxvd2x	6,0,src
-	lxvd2x	7,src,6
+	lvx	6,0,src
+	lvx	7,src,6
 L(aligned_128loop):
-	lxvd2x	8,src,7
-	lxvd2x	9,src,8
-	stxvd2x	6,0,dst
+	lvx	8,src,7
+	lvx	9,src,8
+	stvx	6,0,dst
 	addi	src,src,64
-	stxvd2x	7,dst,6
-	stxvd2x	8,dst,7
-	stxvd2x	9,dst,8
-	lxvd2x	6,0,src
-	lxvd2x	7,src,6
+	stvx	7,dst,6
+	stvx	8,dst,7
+	stvx	9,dst,8
+	lvx	6,0,src
+	lvx	7,src,6
 	addi	dst,dst,64
-	lxvd2x	8,src,7
-	lxvd2x	9,src,8
+	lvx	8,src,7
+	lvx	9,src,8
 	addi	src,src,64
-	stxvd2x	6,0,dst
-	stxvd2x	7,dst,6
-	stxvd2x	8,dst,7
-	stxvd2x	9,dst,8
+	stvx	6,0,dst
+	stvx	7,dst,6
+	stvx	8,dst,7
+	stvx	9,dst,8
 	addi	dst,dst,64
 	bdnz	L(aligned_128head)
 
 L(aligned_tail):
 	mtocrf	0x01,cnt
 	bf	25,32f
-	lxvd2x	6,0,src
-	lxvd2x	7,src,6
-	lxvd2x	8,src,7
-	lxvd2x	9,src,8
+	lvx	6,0,src
+	lvx	7,src,6
+	lvx	8,src,7
+	lvx	9,src,8
 	addi	src,src,64
-	stxvd2x	6,0,dst
-	stxvd2x	7,dst,6
-	stxvd2x	8,dst,7
-	stxvd2x	9,dst,8
+	stvx	6,0,dst
+	stvx	7,dst,6
+	stvx	8,dst,7
+	stvx	9,dst,8
 	addi	dst,dst,64
 32:
 	bf	26,16f
-	lxvd2x	6,0,src
-	lxvd2x	7,src,6
+	lvx	6,0,src
+	lvx	7,src,6
 	addi	src,src,32
-	stxvd2x	6,0,dst
-	stxvd2x	7,dst,6
+	stvx	6,0,dst
+	stvx	7,dst,6
 	addi	dst,dst,32
 16:
 	bf	27,8f
-	lxvd2x	6,0,src
+	lvx	6,0,src
 	addi	src,src,16
-	stxvd2x	6,0,dst
+	stvx	6,0,dst
 	addi	dst,dst,16
 8:
 	bf	28,4f
diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S
index 93baa69..667c6e2 100644
--- a/sysdeps/powerpc/powerpc64/power7/memmove.S
+++ b/sysdeps/powerpc/powerpc64/power7/memmove.S
@@ -92,63 +92,63 @@  L(aligned_copy):
 	srdi	12,r5,7
 	cmpdi	12,0
 	beq	L(aligned_tail)
-	lxvd2x	6,0,r4
-	lxvd2x	7,r4,6
+	lvx	6,0,r4
+	lvx	7,r4,6
 	mtctr	12
 	b	L(aligned_128loop)
 
 	.align  4
 L(aligned_128head):
 	/* for the 2nd + iteration of this loop. */
-	lxvd2x	6,0,r4
-	lxvd2x	7,r4,6
+	lvx	6,0,r4
+	lvx	7,r4,6
 L(aligned_128loop):
-	lxvd2x	8,r4,7
-	lxvd2x	9,r4,8
-	stxvd2x	6,0,r11
+	lvx	8,r4,7
+	lvx	9,r4,8
+	stvx	6,0,r11
 	addi	r4,r4,64
-	stxvd2x	7,r11,6
-	stxvd2x	8,r11,7
-	stxvd2x	9,r11,8
-	lxvd2x	6,0,r4
-	lxvd2x	7,r4,6
+	stvx	7,r11,6
+	stvx	8,r11,7
+	stvx	9,r11,8
+	lvx	6,0,r4
+	lvx	7,r4,6
 	addi	r11,r11,64
-	lxvd2x	8,r4,7
-	lxvd2x	9,r4,8
+	lvx	8,r4,7
+	lvx	9,r4,8
 	addi	r4,r4,64
-	stxvd2x	6,0,r11
-	stxvd2x	7,r11,6
-	stxvd2x	8,r11,7
-	stxvd2x	9,r11,8
+	stvx	6,0,r11
+	stvx	7,r11,6
+	stvx	8,r11,7
+	stvx	9,r11,8
 	addi	r11,r11,64
 	bdnz	L(aligned_128head)
 
 L(aligned_tail):
 	mtocrf	0x01,r5
 	bf	25,32f
-	lxvd2x	6,0,r4
-	lxvd2x	7,r4,6
-	lxvd2x	8,r4,7
-	lxvd2x	9,r4,8
+	lvx	6,0,r4
+	lvx	7,r4,6
+	lvx	8,r4,7
+	lvx	9,r4,8
 	addi	r4,r4,64
-	stxvd2x	6,0,r11
-	stxvd2x	7,r11,6
-	stxvd2x	8,r11,7
-	stxvd2x	9,r11,8
+	stvx	6,0,r11
+	stvx	7,r11,6
+	stvx	8,r11,7
+	stvx	9,r11,8
 	addi	r11,r11,64
 32:
 	bf	26,16f
-	lxvd2x	6,0,r4
-	lxvd2x	7,r4,6
+	lvx	6,0,r4
+	lvx	7,r4,6
 	addi	r4,r4,32
-	stxvd2x	6,0,r11
-	stxvd2x	7,r11,6
+	stvx	6,0,r11
+	stvx	7,r11,6
 	addi	r11,r11,32
 16:
 	bf	27,8f
-	lxvd2x	6,0,r4
+	lvx	6,0,r4
 	addi	r4,r4,16
-	stxvd2x	6,0,r11
+	stvx	6,0,r11
 	addi	r11,r11,16
 8:
 	bf	28,4f
@@ -488,63 +488,63 @@  L(aligned_copy_bwd):
 	srdi	r12,r5,7
 	cmpdi	r12,0
 	beq	L(aligned_tail_bwd)
-	lxvd2x	v6,r4,r6
-	lxvd2x	v7,r4,r7
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
 	mtctr	12
 	b	L(aligned_128loop_bwd)
 
 	.align  4
 L(aligned_128head_bwd):
 	/* for the 2nd + iteration of this loop. */
-	lxvd2x	v6,r4,r6
-	lxvd2x	v7,r4,r7
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
 L(aligned_128loop_bwd):
-	lxvd2x	v8,r4,r8
-	lxvd2x	v9,r4,r9
-	stxvd2x	v6,r11,r6
+	lvx	v8,r4,r8
+	lvx	v9,r4,r9
+	stvx	v6,r11,r6
 	subi	r4,r4,64
-	stxvd2x	v7,r11,r7
-	stxvd2x	v8,r11,r8
-	stxvd2x	v9,r11,r9
-	lxvd2x	v6,r4,r6
-	lxvd2x	v7,r4,7
+	stvx	v7,r11,r7
+	stvx	v8,r11,r8
+	stvx	v9,r11,r9
+	lvx	v6,r4,r6
+	lvx	v7,r4,7
 	subi	r11,r11,64
-	lxvd2x	v8,r4,r8
-	lxvd2x	v9,r4,r9
+	lvx	v8,r4,r8
+	lvx	v9,r4,r9
 	subi	r4,r4,64
-	stxvd2x	v6,r11,r6
-	stxvd2x	v7,r11,r7
-	stxvd2x	v8,r11,r8
-	stxvd2x	v9,r11,r9
+	stvx	v6,r11,r6
+	stvx	v7,r11,r7
+	stvx	v8,r11,r8
+	stvx	v9,r11,r9
 	subi	r11,r11,64
 	bdnz	L(aligned_128head_bwd)
 
 L(aligned_tail_bwd):
 	mtocrf	0x01,r5
 	bf	25,32f
-	lxvd2x	v6,r4,r6
-	lxvd2x	v7,r4,r7
-	lxvd2x	v8,r4,r8
-	lxvd2x	v9,r4,r9
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
+	lvx	v8,r4,r8
+	lvx	v9,r4,r9
 	subi	r4,r4,64
-	stxvd2x	v6,r11,r6
-	stxvd2x	v7,r11,r7
-	stxvd2x	v8,r11,r8
-	stxvd2x	v9,r11,r9
+	stvx	v6,r11,r6
+	stvx	v7,r11,r7
+	stvx	v8,r11,r8
+	stvx	v9,r11,r9
 	subi	r11,r11,64
 32:
 	bf	26,16f
-	lxvd2x	v6,r4,r6
-	lxvd2x	v7,r4,r7
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
 	subi	r4,r4,32
-	stxvd2x	v6,r11,r6
-	stxvd2x	v7,r11,r7
+	stvx	v6,r11,r6
+	stvx	v7,r11,r7
 	subi	r11,r11,32
 16:
 	bf	27,8f
-	lxvd2x	v6,r4,r6
+	lvx	v6,r4,r6
 	subi	r4,r4,16
-	stxvd2x	v6,r11,r6
+	stvx	v6,r11,r6
 	subi	r11,r11,16
 8:
 	bf	28,4f