[PATCHv3] powerpc: Replace lxvd2x/stxvd2x with lvx/stvx in P7's memcpy/memmove
Commit Message
From: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
POWER9 DD2.1 and earlier have an issue where some cache-inhibited
vector loads trap to the kernel, causing performance degradation. To
handle this in memcpy and memmove, lvx/stvx are used for aligned
addresses instead of lxvd2x/stxvd2x.
Reference: https://patchwork.ozlabs.org/patch/814059/
2017-10-20 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace
lxvd2x/stxvd2x with lvx/stvx.
* sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise.
Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
---
sysdeps/powerpc/powerpc64/power7/memcpy.S | 64 +++++++--------
sysdeps/powerpc/powerpc64/power7/memmove.S | 128 ++++++++++++++---------------
2 files changed, 96 insertions(+), 96 deletions(-)
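
The change is safe on this path because L(aligned_copy) runs only for
16-byte-aligned addresses, where lvx/stvx transfer the same 16 bytes as
lxvd2x/stxvd2x (lvx simply ignores the low four bits of the effective
address), and because every load is paired with a matching store, so the
element ordering inside the vector register cannot affect a copy.  As a
hypothetical illustration only (not part of the patch; the function names
are mine), the two instruction families can be compared from C via the
GCC AltiVec/VSX intrinsics: vec_ld/vec_st compile to lvx/stvx, while
vec_vsx_ld/vec_vsx_st compile to the VSX loads/stores being replaced.

#include <altivec.h>

/* Hypothetical sketch, not part of the patch: the same aligned 16-byte
   copy expressed with both instruction families.  */

void
copy16_vmx (unsigned char *dst, const unsigned char *src)
{
  /* Compiles to lvx/stvx: the EA's low 4 bits are ignored, so the
     access is inherently 16-byte aligned; per the commit message this
     avoids the load class that traps on POWER9 DD2.1 and earlier.  */
  vec_st (vec_ld (0, src), 0, dst);
}

void
copy16_vsx (unsigned char *dst, const unsigned char *src)
{
  /* Compiles to VSX loads/stores (lxvd2x/stxvd2x on big-endian,
     typically with an extra xxswapd on little-endian).  The in-register
     element order differs from lvx, but a load/store pair mirrors it
     back, so the copied bytes are identical.  */
  vec_vsx_st (vec_vsx_ld (0, src), 0, dst);
}

Built with e.g. gcc -O2 -mcpu=power7, both functions copy the same 16
bytes; only the generated instructions differ.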
Comments
On 20/10/2017 17:22, Tulio Magno Quites Machado Filho wrote:
> [...]
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
> ---
> sysdeps/powerpc/powerpc64/power7/memcpy.S | 64 +++++++--------
> sysdeps/powerpc/powerpc64/power7/memmove.S | 128 ++++++++++++++---------------
> 2 files changed, 96 insertions(+), 96 deletions(-)
>
> diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
> index 1ccbc2e..a7cdf8b 100644
> --- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
> +++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
> @@ -91,63 +91,63 @@ L(aligned_copy):
> srdi 12,cnt,7
> cmpdi 12,0
> beq L(aligned_tail)
> - lxvd2x 6,0,src
> - lxvd2x 7,src,6
> + lvx 6,0,src
> + lvx 7,src,6
> mtctr 12
> b L(aligned_128loop)
>
> .align 4
> L(aligned_128head):
> /* for the 2nd + iteration of this loop. */
> - lxvd2x 6,0,src
> - lxvd2x 7,src,6
> + lvx 6,0,src
> + lvx 7,src,6
> L(aligned_128loop):
> - lxvd2x 8,src,7
> - lxvd2x 9,src,8
> - stxvd2x 6,0,dst
> + lvx 8,src,7
> + lvx 9,src,8
> + stvx 6,0,dst
> addi src,src,64
> - stxvd2x 7,dst,6
> - stxvd2x 8,dst,7
> - stxvd2x 9,dst,8
> - lxvd2x 6,0,src
> - lxvd2x 7,src,6
> + stvx 7,dst,6
> + stvx 8,dst,7
> + stvx 9,dst,8
> + lvx 6,0,src
> + lvx 7,src,6
> addi dst,dst,64
> - lxvd2x 8,src,7
> - lxvd2x 9,src,8
> + lvx 8,src,7
> + lvx 9,src,8
> addi src,src,64
> - stxvd2x 6,0,dst
> - stxvd2x 7,dst,6
> - stxvd2x 8,dst,7
> - stxvd2x 9,dst,8
> + stvx 6,0,dst
> + stvx 7,dst,6
> + stvx 8,dst,7
> + stvx 9,dst,8
> addi dst,dst,64
> bdnz L(aligned_128head)
>
> L(aligned_tail):
> mtocrf 0x01,cnt
> bf 25,32f
> - lxvd2x 6,0,src
> - lxvd2x 7,src,6
> - lxvd2x 8,src,7
> - lxvd2x 9,src,8
> + lvx 6,0,src
> + lvx 7,src,6
> + lvx 8,src,7
> + lvx 9,src,8
> addi src,src,64
> - stxvd2x 6,0,dst
> - stxvd2x 7,dst,6
> - stxvd2x 8,dst,7
> - stxvd2x 9,dst,8
> + stvx 6,0,dst
> + stvx 7,dst,6
> + stvx 8,dst,7
> + stvx 9,dst,8
> addi dst,dst,64
> 32:
> bf 26,16f
> - lxvd2x 6,0,src
> - lxvd2x 7,src,6
> + lvx 6,0,src
> + lvx 7,src,6
> addi src,src,32
> - stxvd2x 6,0,dst
> - stxvd2x 7,dst,6
> + stvx 6,0,dst
> + stvx 7,dst,6
> addi dst,dst,32
> 16:
> bf 27,8f
> - lxvd2x 6,0,src
> + lvx 6,0,src
> addi src,src,16
> - stxvd2x 6,0,dst
> + stvx 6,0,dst
> addi dst,dst,16
> 8:
> bf 28,4f
> diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S
> index 93baa69..667c6e2 100644
> --- a/sysdeps/powerpc/powerpc64/power7/memmove.S
> +++ b/sysdeps/powerpc/powerpc64/power7/memmove.S
> @@ -92,63 +92,63 @@ L(aligned_copy):
> srdi 12,r5,7
> cmpdi 12,0
> beq L(aligned_tail)
> - lxvd2x 6,0,r4
> - lxvd2x 7,r4,6
> + lvx 6,0,r4
> + lvx 7,r4,6
> mtctr 12
> b L(aligned_128loop)
>
> .align 4
> L(aligned_128head):
> /* for the 2nd + iteration of this loop. */
> - lxvd2x 6,0,r4
> - lxvd2x 7,r4,6
> + lvx 6,0,r4
> + lvx 7,r4,6
> L(aligned_128loop):
> - lxvd2x 8,r4,7
> - lxvd2x 9,r4,8
> - stxvd2x 6,0,r11
> + lvx 8,r4,7
> + lvx 9,r4,8
> + stvx 6,0,r11
> addi r4,r4,64
> - stxvd2x 7,r11,6
> - stxvd2x 8,r11,7
> - stxvd2x 9,r11,8
> - lxvd2x 6,0,r4
> - lxvd2x 7,r4,6
> + stvx 7,r11,6
> + stvx 8,r11,7
> + stvx 9,r11,8
> + lvx 6,0,r4
> + lvx 7,r4,6
> addi r11,r11,64
> - lxvd2x 8,r4,7
> - lxvd2x 9,r4,8
> + lvx 8,r4,7
> + lvx 9,r4,8
> addi r4,r4,64
> - stxvd2x 6,0,r11
> - stxvd2x 7,r11,6
> - stxvd2x 8,r11,7
> - stxvd2x 9,r11,8
> + stvx 6,0,r11
> + stvx 7,r11,6
> + stvx 8,r11,7
> + stvx 9,r11,8
> addi r11,r11,64
> bdnz L(aligned_128head)
>
> L(aligned_tail):
> mtocrf 0x01,r5
> bf 25,32f
> - lxvd2x 6,0,r4
> - lxvd2x 7,r4,6
> - lxvd2x 8,r4,7
> - lxvd2x 9,r4,8
> + lvx 6,0,r4
> + lvx 7,r4,6
> + lvx 8,r4,7
> + lvx 9,r4,8
> addi r4,r4,64
> - stxvd2x 6,0,r11
> - stxvd2x 7,r11,6
> - stxvd2x 8,r11,7
> - stxvd2x 9,r11,8
> + stvx 6,0,r11
> + stvx 7,r11,6
> + stvx 8,r11,7
> + stvx 9,r11,8
> addi r11,r11,64
> 32:
> bf 26,16f
> - lxvd2x 6,0,r4
> - lxvd2x 7,r4,6
> + lvx 6,0,r4
> + lvx 7,r4,6
> addi r4,r4,32
> - stxvd2x 6,0,r11
> - stxvd2x 7,r11,6
> + stvx 6,0,r11
> + stvx 7,r11,6
> addi r11,r11,32
> 16:
> bf 27,8f
> - lxvd2x 6,0,r4
> + lvx 6,0,r4
> addi r4,r4,16
> - stxvd2x 6,0,r11
> + stvx 6,0,r11
> addi r11,r11,16
> 8:
> bf 28,4f
> @@ -488,63 +488,63 @@ L(aligned_copy_bwd):
> srdi r12,r5,7
> cmpdi r12,0
> beq L(aligned_tail_bwd)
> - lxvd2x v6,r4,r6
> - lxvd2x v7,r4,r7
> + lvx v6,r4,r6
> + lvx v7,r4,r7
> mtctr 12
> b L(aligned_128loop_bwd)
>
> .align 4
> L(aligned_128head_bwd):
> /* for the 2nd + iteration of this loop. */
> - lxvd2x v6,r4,r6
> - lxvd2x v7,r4,r7
> + lvx v6,r4,r6
> + lvx v7,r4,r7
> L(aligned_128loop_bwd):
> - lxvd2x v8,r4,r8
> - lxvd2x v9,r4,r9
> - stxvd2x v6,r11,r6
> + lvx v8,r4,r8
> + lvx v9,r4,r9
> + stvx v6,r11,r6
> subi r4,r4,64
> - stxvd2x v7,r11,r7
> - stxvd2x v8,r11,r8
> - stxvd2x v9,r11,r9
> - lxvd2x v6,r4,r6
> - lxvd2x v7,r4,7
> + stvx v7,r11,r7
> + stvx v8,r11,r8
> + stvx v9,r11,r9
> + lvx v6,r4,r6
> + lvx v7,r4,7
> subi r11,r11,64
> - lxvd2x v8,r4,r8
> - lxvd2x v9,r4,r9
> + lvx v8,r4,r8
> + lvx v9,r4,r9
> subi r4,r4,64
> - stxvd2x v6,r11,r6
> - stxvd2x v7,r11,r7
> - stxvd2x v8,r11,r8
> - stxvd2x v9,r11,r9
> + stvx v6,r11,r6
> + stvx v7,r11,r7
> + stvx v8,r11,r8
> + stvx v9,r11,r9
> subi r11,r11,64
> bdnz L(aligned_128head_bwd)
>
> L(aligned_tail_bwd):
> mtocrf 0x01,r5
> bf 25,32f
> - lxvd2x v6,r4,r6
> - lxvd2x v7,r4,r7
> - lxvd2x v8,r4,r8
> - lxvd2x v9,r4,r9
> + lvx v6,r4,r6
> + lvx v7,r4,r7
> + lvx v8,r4,r8
> + lvx v9,r4,r9
> subi r4,r4,64
> - stxvd2x v6,r11,r6
> - stxvd2x v7,r11,r7
> - stxvd2x v8,r11,r8
> - stxvd2x v9,r11,r9
> + stvx v6,r11,r6
> + stvx v7,r11,r7
> + stvx v8,r11,r8
> + stvx v9,r11,r9
> subi r11,r11,64
> 32:
> bf 26,16f
> - lxvd2x v6,r4,r6
> - lxvd2x v7,r4,r7
> + lvx v6,r4,r6
> + lvx v7,r4,r7
> subi r4,r4,32
> - stxvd2x v6,r11,r6
> - stxvd2x v7,r11,r7
> + stvx v6,r11,r6
> + stvx v7,r11,r7
> subi r11,r11,32
> 16:
> bf 27,8f
> - lxvd2x v6,r4,r6
> + lvx v6,r4,r6
> subi r4,r4,16
> - stxvd2x v6,r11,r6
> + stvx v6,r11,r6
> subi r11,r11,16
> 8:
> bf 28,4f
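
For reference, here is a hypothetical C sketch of the control flow above
(the names are mine; vec_ld/vec_st again stand in for the lvx/stvx the
patch adopts): a 128-byte unrolled main loop corresponding to
L(aligned_128loop), then power-of-two tails selected by the mtocrf/bf
bit tests, where, as I read the CR bit numbering, bf 25 gates the
64-byte tail, bf 26 the 32-byte one, and bf 27 the 16-byte one.  The
backward path in memmove, L(aligned_copy_bwd), mirrors the same
structure with descending addresses.

#include <altivec.h>
#include <stddef.h>

/* Hypothetical sketch of the aligned forward path; assumes dst and src
   are 16-byte aligned, as the real code guarantees before entering
   L(aligned_copy).  */
void
copy_aligned (unsigned char *dst, const unsigned char *src, size_t cnt)
{
  size_t off = 0;

  /* Main loop: eight 16-byte vectors (128 bytes) per iteration,
     as in L(aligned_128loop).  */
  while (cnt >= 128)
    {
      for (size_t i = 0; i < 128; i += 16)
        vec_st (vec_ld (off + i, src), off + i, dst);
      off += 128;
      cnt -= 128;
    }

  /* Tails: one 64-, 32- and/or 16-byte block, matching the
     bf 25/26/27 tests after mtocrf in L(aligned_tail).  */
  for (size_t step = 64; step >= 16; step >>= 1)
    if (cnt & step)
      {
        for (size_t i = 0; i < step; i += 16)
          vec_st (vec_ld (off + i, src), off + i, dst);
        off += step;
      }

  /* The final 0-15 bytes are handled by scalar code in the real
     routine (the "bf 28,4f" path onward); omitted here.  */
}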
Adhemerval Zanella <adhemerval.zanella@linaro.org> writes:
> On 20/10/2017 17:22, Tulio Magno Quites Machado Filho wrote:
>> [...]
>
> Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Pushed as 63da5cd4a09.
Thanks!
On 10/25/2017 08:17 AM, Tulio Magno Quites Machado Filho wrote:
> Adhemerval Zanella <adhemerval.zanella@linaro.org> writes:
>> [...]
>
> Pushed as 63da5cd4a09.
Raji, Adhemerval, Tulio,
Thank you all for fixing this up!
On 10/20/2017 09:22 PM, Tulio Magno Quites Machado Filho wrote:
> [...]
Should we backport this change, to 2.26 in particular?
Thanks,
Florian
Florian Weimer <fweimer@redhat.com> writes:
> On 10/20/2017 09:22 PM, Tulio Magno Quites Machado Filho wrote:
>> [...]
>
> Should we backport this change, to 2.26 in particular?
Yes. That's a good idea.