x86_64: Implement V1TI mode shifts/rotates by a constant

Message ID 010301d7c8f4$ff972570$fec57050$@nextmovesoftware.com
State Committed
Commit 6b8b25575570ffde37cc8997af096514b929779d
Series x86_64: Implement V1TI mode shifts/rotates by a constant

Commit Message

Roger Sayle Oct. 24, 2021, 4:34 p.m. UTC
  This patch provides RTL expanders to implement logical shifts and
rotates of 128-bit values (stored in vector integer registers) by
constant bit counts.  Previously, GCC would transfer these values
to a pair of scalar registers (TImode) via memory to perform the
operation, then transfer the result back via memory.  Instead these
operations are now expanded using (between 1 and 5) SSE2 vector
instructions.
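
For reference, the kind of source affected is a shift or rotate of a
single-element 128-bit vector, as exercised by the new test case:

typedef unsigned __int128 v1ti __attribute__ ((__vector_size__ (16)));

v1ti ashl_8(v1ti x) { return x << 8; }
v1ti rotr_1(v1ti x) { return (x >> 1) | (x << 127); }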

Logical shifts by multiples of 8 bits can be implemented using x86_64's
pslldq/psrldq instructions:
ashl_8:
        pslldq  $1, %xmm0
        ret
lshr_32:
        psrldq  $4, %xmm0
        ret
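
For readers more familiar with intrinsics than assembly, the byte-shift
case corresponds to the following SSE2 sketch (illustrative only, not
part of the patch; the helper names are made up):

#include <emmintrin.h>

/* pslldq/psrldq take a byte count, so a shift by 8*N bits is a single
   byte shift.  */
static inline __m128i ashl_8_sketch (__m128i x)
{
  return _mm_slli_si128 (x, 1);   /* x << 8 bits  == x << 1 byte  */
}

static inline __m128i lshr_32_sketch (__m128i x)
{
  return _mm_srli_si128 (x, 4);   /* x >> 32 bits == x >> 4 bytes */
}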

Logical shifts by more than 64 bits can use pslldq/psrldq $8, followed
by a psllq/psrlq for the remaining bits:
ashl_111:
        pslldq  $8, %xmm0
        psllq   $47, %xmm0
        ret
lshr_127:
        psrldq  $8, %xmm0
        psrlq   $63, %xmm0
        ret
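
The same shift expressed as an intrinsics sketch (again illustrative,
not part of the patch):

/* Assumes #include <emmintrin.h>, as in the earlier sketch.
   Shift left by 111 bits: move the low qword into the high half with
   an 8-byte shift, then shift that lane by the remaining 47 bits.  */
static inline __m128i ashl_111_sketch (__m128i x)
{
  x = _mm_slli_si128 (x, 8);        /* pslldq $8  */
  return _mm_slli_epi64 (x, 47);    /* psllq  $47 */
}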

The remaining logical shifts make use of the following idiom:
ashl_1:
        movdqa  %xmm0, %xmm1
        psllq   $1, %xmm0
        pslldq  $8, %xmm1
        psrlq   $63, %xmm1
        por     %xmm1, %xmm0
        ret
lshr_15:
        movdqa  %xmm0, %xmm1
        psrlq   $15, %xmm0
        psrldq  $8, %xmm1
        psllq   $49, %xmm1
        por     %xmm1, %xmm0
        ret
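
The idiom is needed because psllq/psrlq shift the two 64-bit lanes
independently, so the bits that should carry between the qwords have to
be recovered separately.  As an intrinsics sketch (illustrative only,
not part of the patch):

/* Assumes #include <emmintrin.h>, as in the earlier sketch.
   Shift left by 1: shift both qwords by one, then OR in the bit that
   carries from the low qword into the high qword.  */
static inline __m128i ashl_1_sketch (__m128i x)
{
  __m128i hi    = _mm_slli_epi64 (x, 1);      /* psllq  $1  */
  __m128i carry = _mm_slli_si128 (x, 8);      /* pslldq $8  */
  carry = _mm_srli_epi64 (carry, 63);         /* psrlq  $63 */
  return _mm_or_si128 (hi, carry);            /* por        */
}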

Rotates by multiples of 32 bits can use x86_64's pshufd:
rotr_32:
        pshufd  $57, %xmm0, %xmm0
        ret
rotr_64:
        pshufd  $78, %xmm0, %xmm0
        ret
rotr_96:
        pshufd  $147, %xmm0, %xmm0
        ret
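
Each two-bit field of the pshufd immediate selects a source dword, so
$57 (0x39) selects {1,2,3,0}, $78 (0x4e) selects {2,3,0,1} and $147
(0x93) selects {3,0,1,2}, i.e. rotations of the 128-bit value right by
32, 64 and 96 bits.  As a sketch (illustrative only, not part of the
patch):

/* Assumes #include <emmintrin.h>, as in the earlier sketch.  */
static inline __m128i rotr_32_sketch (__m128i x)
{
  return _mm_shuffle_epi32 (x, 0x39);         /* pshufd $57  */
}

static inline __m128i rotr_64_sketch (__m128i x)
{
  return _mm_shuffle_epi32 (x, 0x4e);         /* pshufd $78  */
}

static inline __m128i rotr_96_sketch (__m128i x)
{
  return _mm_shuffle_epi32 (x, 0x93);         /* pshufd $147 */
}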

Rotates by multiples of 8 bits (other than multiples of 32) can make
use of both pslldq and psrldq, followed by por:
rotr_8:
        movdqa  %xmm0, %xmm1
        psrldq  $1, %xmm0
        pslldq  $15, %xmm1
        por     %xmm1, %xmm0
        ret
rotr_112:
        movdqa  %xmm0, %xmm1
        psrldq  $14, %xmm0
        pslldq  $2, %xmm1
        por     %xmm1, %xmm0
        ret
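
In other words, a rotate by a whole number of bytes is just the OR of
two byte shifts in opposite directions.  As a sketch (illustrative only,
not part of the patch):

/* Assumes #include <emmintrin.h>, as in the earlier sketch.  */
static inline __m128i rotr_8_sketch (__m128i x)
{
  __m128i lo = _mm_srli_si128 (x, 1);    /* psrldq $1,  x >> 8 bits   */
  __m128i hi = _mm_slli_si128 (x, 15);   /* pslldq $15, x << 120 bits */
  return _mm_or_si128 (lo, hi);          /* por                       */
}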

And the remaining rotates use one or two pshufd, followed by a
psrld/pslld/por sequence:
rotr_1:
        movdqa  %xmm0, %xmm1
        pshufd  $57, %xmm0, %xmm0
        psrld   $1, %xmm1
        pslld   $31, %xmm0
        por     %xmm1, %xmm0
        ret
rotr_63:
        pshufd  $78, %xmm0, %xmm1
        pshufd  $57, %xmm0, %xmm0
        pslld   $1, %xmm1
        psrld   $31, %xmm0
        por     %xmm1, %xmm0
        ret
rotr_111:
        pshufd  $147, %xmm0, %xmm1
        pslld   $17, %xmm0
        psrld   $15, %xmm1
        por     %xmm1, %xmm0
        ret
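
These work per 32-bit lane: each dword is shifted by the residual bit
count, and the vacated bits are filled from the neighbouring dword,
which the pshufd brings into position (with wrap-around).  As a sketch
of rotr_1 (illustrative only, not part of the patch):

/* Assumes #include <emmintrin.h>, as in the earlier sketch.  */
static inline __m128i rotr_1_sketch (__m128i x)
{
  __m128i lo = _mm_srli_epi32 (x, 1);           /* psrld  $1  */
  __m128i hi = _mm_shuffle_epi32 (x, 0x39);     /* pshufd $57 */
  hi = _mm_slli_epi32 (hi, 31);                 /* pslld  $31 */
  return _mm_or_si128 (lo, hi);                 /* por        */
}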

The new test case, sse2-v1ti-shift.c, is a run-time check to confirm that
the results of V1TImode shifts/rotates by constants exactly match the
expected results of the equivalent TImode operations for various input test vectors.

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check with no new failures.  Ok for mainline?


2021-10-24  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
	* config/i386/i386-expand.c (ix86_expand_v1ti_shift): New helper
	function to expand V1TI mode logical shifts by integer constants.
	(ix86_expand_v1ti_rotate): New helper function to expand V1TI
	mode rotations by integer constants.
	* config/i386/i386-protos.h (ix86_expand_v1ti_shift,
	ix86_expand_v1ti_rotate): Prototype new functions here.
	* config/i386/sse.md (ashlv1ti3, lshrv1ti3, rotlv1ti3, rotrv1ti3):
	New TARGET_SSE2 expanders to implement V1TI shifts and rotations.

gcc/testsuite/ChangeLog
	* gcc.target/i386/sse2-v1ti-shift.c: New test case.


Thanks in advance,
Roger
--
  

Comments

Uros Bizjak Oct. 25, 2021, 8:02 a.m. UTC | #1
On Sun, Oct 24, 2021 at 6:34 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch provides RTL expanders to implement logical shifts and
> rotates of 128-bit values (stored in vector integer registers) by
> constant bit counts.  Previously, GCC would transfer these values
> to a pair of scalar registers (TImode) via memory to perform the
> operation, then transfer the result back via memory.  Instead these
> operations are now expanded using (between 1 and 5) SSE2 vector
> instructions.

Hm, instead of using memory (without store-to-load (STL) forwarding for
general -> XMM moves!) these should use something similar to what clang
produces (or use pextrq/pinsrq, at least with SSE4.1):

       movq    %xmm0, %rax
       pshufd  $78, %xmm0, %xmm0
       movq    %xmm0, %rcx
       shldq   $8, %rax, %rcx
       shlq    $8, %rax
       movq    %rcx, %xmm1
       movq    %rax, %xmm0
       punpcklqdq      %xmm1, %xmm0

> Logical shifts by multiples of 8 can be implemented using x86_64's
> pslldq/psrldq instruction:
> ashl_8: pslldq  $1, %xmm0
>         ret
> lshr_32:
>         psrldq  $4, %xmm0
>         ret
>
> Logical shifts by greater than 64 can use pslldq/psrldq $8, followed
> by a psllq/psrlq for the remaining bits:
> ashl_111:
>         pslldq  $8, %xmm0
>         psllq   $47, %xmm0
>         ret
> lshr_127:
>         psrldq  $8, %xmm0
>         psrlq   $63, %xmm0
>         ret
>
> The remaining logical shifts make use of the following idiom:
> ashl_1:
>         movdqa  %xmm0, %xmm1
>         psllq   $1, %xmm0
>         pslldq  $8, %xmm1
>         psrlq   $63, %xmm1
>         por     %xmm1, %xmm0
>         ret
> lshr_15:
>         movdqa  %xmm0, %xmm1
>         psrlq   $15, %xmm0
>         psrldq  $8, %xmm1
>         psllq   $49, %xmm1
>         por     %xmm1, %xmm0
>         ret
>
> Rotates by multiples of 32 can use x86_64's pshufd:
> rotr_32:
>         pshufd  $57, %xmm0, %xmm0
>         ret
> rotr_64:
>         pshufd  $78, %xmm0, %xmm0
>         ret
> rotr_96:
>         pshufd  $147, %xmm0, %xmm0
>         ret
>
> Rotates by multiples of 8 (other than multiples of 32) can make
> use of both pslldq and psrldq, followed by por:
> rotr_8:
>         movdqa  %xmm0, %xmm1
>         psrldq  $1, %xmm0
>         pslldq  $15, %xmm1
>         por     %xmm1, %xmm0
>         ret
> rotr_112:
>         movdqa  %xmm0, %xmm1
>         psrldq  $14, %xmm0
>         pslldq  $2, %xmm1
>         por     %xmm1, %xmm0
>         ret
>
> And the remaining rotates use one or two pshufd, followed by a
> psrld/pslld/por sequence:
> rotr_1:
>         movdqa  %xmm0, %xmm1
>         pshufd  $57, %xmm0, %xmm0
>         psrld   $1, %xmm1
>         pslld   $31, %xmm0
>         por     %xmm1, %xmm0
>         ret
> rotr_63:
>         pshufd  $78, %xmm0, %xmm1
>         pshufd  $57, %xmm0, %xmm0
>         pslld   $1, %xmm1
>         psrld   $31, %xmm0
>         por     %xmm1, %xmm0
>         ret
> rotr_111:
>         pshufd  $147, %xmm0, %xmm1
>         pslld   $17, %xmm0
>         psrld   $15, %xmm1
>         por     %xmm1, %xmm0
>         ret
>
> The new test case, sse2-v1ti-shift.c, is a run-time check to confirm that
> the results of V1TImode shifts/rotates by constants, exactly match the
> expected results of TImode operations, for various input test vectors.

Is the sequence of 4+ SSE instructions really faster than
pinsrq/pextrq (and two movq insns) + two operations on integer
registers?

Uros.

> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> and make -k check with no new failures.  Ok for mainline?
>
>
> 2021-10-24  Roger Sayle  <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
>         * config/i386/i386-expand.c (ix86_expand_v1ti_shift): New helper
>         function to expand V1TI mode logical shifts by integer constants.
>         (ix86_expand_v1ti_rotate): New helper function to expand V1TI
>         mode rotations by integer constants.
>         * config/i386/i386-protos.h (ix86_expand_v1ti_shift,
>         ix86_expand_v1ti_rotate): Prototype new functions here.
>         * config/i386/sse.md (ashlv1ti3, lshrv1ti3, rotlv1ti3, rotrv1ti3):
>         New TARGET_SSE2 expanders to implement V1TI shifts and rotations.
>
> gcc/testsuite/ChangeLog
>         * gcc.target/i386/sse2-v1ti-shift.c: New test case.
>
>
> Thanks in advance,
> Roger
> --
>
  
Roger Sayle Oct. 25, 2021, 2:16 p.m. UTC | #2
Hi Uros,
I believe the proposed sequences should be dramatically faster than LLVM's
implementation(s), due to the large latencies required to move values between
the vector and scalar parts on modern x86_64 microarchitectures.  All of the
SSE2 instructions used in the sequences proposed by my patch have
single-cycle latencies, so each sequence has a maximum total latency of
5 cycles, though due to multiple issue they typically require between 1
and 3 cycles, depending upon the sequence.

Moving between units is significantly slower; according to Agner Fog's tables,
the pinsrq/pextrq instructions you suggest have latencies of up to 7 cycles on
the Silvermont architecture.  Let's take the LLVM code you've provided, and
annotate it with cycle counts for a recent Intel CPU (cascadelake) and a
recent AMD CPU (zen2).

movq    %xmm0, %rax             ; 2-3 cycles
pshufd  $78, %xmm0, %xmm0       ; 1 cycle
movq    %xmm0, %rcx             ; 2-3 cycles
shldq   $8, %rax, %rcx          ; 3 cycles
shlq    $8, %rax                ; 1 cycle
movq    %rcx, %xmm1             ; 2-3 cycles
movq    %rax, %xmm0             ; 2-3 cycles
punpcklqdq      %xmm1, %xmm0    ; 1 cycle

This 8-instruction sequence has a total latency of 14 cycles on CascadeLake and
18 cycles on Zen2, but a scheduled cycle count of 9 cycles and 11 cycles respectively.

The same left shift by 8 as implemented by the proposed patch is:

pslldq  $1, %xmm0               ; 1 cycle

And for reference, the code currently generated by GCC is:

movaps  %xmm0, -24(%rsp)        ; 3 cycles
movq    -24(%rsp), %rax         ; 2 cycles
movq    -16(%rsp), %rdx         ; 2 cycles
shldq   $8, %rax, %rdx          ; 3 cycles
salq    $8, %rax                ; 1 cycle
movq    %rax, -24(%rsp)         ; 2 cycles
movq    %rdx, -16(%rsp)         ; 2 cycles
movdqa  -24(%rsp), %xmm0        ; 2 cycles


The very worst case timing of my patches is the five-instruction rotate:
pshufd  $78, %xmm0, %xmm1       ; 1 cycle
pshufd  $57, %xmm0, %xmm0       ; 1 cycle
pslld   $1, %xmm1               ; 1 cycle
psrld   $31, %xmm0              ; 1 cycle
por     %xmm1, %xmm0            ; 1 cycle

which has a total latency of 5 cycles, but can complete in 3 cycles when
suitably scheduled: the two pshufds can execute concurrently, as can the
two shifts, finally followed by the por.

Perhaps I'm missing something, but I'd expect this patch to be three or
four times faster, on recent hardware, than the code generated by LLVM.

Let me know if you'd like me to run microbenchmarks, but the documented
timings are such a dramatic improvement that I'm a little surprised you've
asked about performance.  My patch is also a code-size win with -Os
(ashl_8 is currently 39 bytes, and shrinks to 5 bytes with this patch).


Please let me know what you think.
Roger
--

-----Original Message-----
From: Uros Bizjak <ubizjak@gmail.com> 
Sent: 25 October 2021 09:02
To: Roger Sayle <roger@nextmovesoftware.com>
Cc: GCC Patches <gcc-patches@gcc.gnu.org>
Subject: Re: [PATCH] x86_64: Implement V1TI mode shifts/rotates by a constant

On Sun, Oct 24, 2021 at 6:34 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> This patch provides RTL expanders to implement logical shifts and 
> rotates of 128-bit values (stored in vector integer registers) by 
> constant bit counts.  Previously, GCC would transfer these values to a 
> pair of scalar registers (TImode) via memory to perform the operation, 
> then transfer the result back via memory.  Instead these operations 
> are now expanded using (between 1 and 5) SSE2 vector instructions.

Hm, instead of using memory (without STL forwarding for general -> XMM
moves!) these should use something similar to what clang produces (or use pextrq/pinsrq, at least with SSE4.1):

       movq    %xmm0, %rax
       pshufd  $78, %xmm0, %xmm0
       movq    %xmm0, %rcx
       shldq   $8, %rax, %rcx
       shlq    $8, %rax
       movq    %rcx, %xmm1
       movq    %rax, %xmm0
       punpcklqdq      %xmm1, %xmm0

> Logical shifts by multiples of 8 can be implemented using x86_64's 
> pslldq/psrldq instruction:
> ashl_8: pslldq  $1, %xmm0
>         ret
> lshr_32:
>         psrldq  $4, %xmm0
>         ret
>
> Logical shifts by greater than 64 can use pslldq/psrldq $8, followed 
> by a psllq/psrlq for the remaining bits:
> ashl_111:
>         pslldq  $8, %xmm0
>         psllq   $47, %xmm0
>         ret
> lshr_127:
>         psrldq  $8, %xmm0
>         psrlq   $63, %xmm0
>         ret
>
> The remaining logical shifts make use of the following idiom:
> ashl_1:
>         movdqa  %xmm0, %xmm1
>         psllq   $1, %xmm0
>         pslldq  $8, %xmm1
>         psrlq   $63, %xmm1
>         por     %xmm1, %xmm0
>         ret
> lshr_15:
>         movdqa  %xmm0, %xmm1
>         psrlq   $15, %xmm0
>         psrldq  $8, %xmm1
>         psllq   $49, %xmm1
>         por     %xmm1, %xmm0
>         ret
>
> Rotates by multiples of 32 can use x86_64's pshufd:
> rotr_32:
>         pshufd  $57, %xmm0, %xmm0
>         ret
> rotr_64:
>         pshufd  $78, %xmm0, %xmm0
>         ret
> rotr_96:
>         pshufd  $147, %xmm0, %xmm0
>         ret
>
> Rotates by multiples of 8 (other than multiples of 32) can make use of 
> both pslldq and psrldq, followed by por:
> rotr_8:
>         movdqa  %xmm0, %xmm1
>         psrldq  $1, %xmm0
>         pslldq  $15, %xmm1
>         por     %xmm1, %xmm0
>         ret
> rotr_112:
>         movdqa  %xmm0, %xmm1
>         psrldq  $14, %xmm0
>         pslldq  $2, %xmm1
>         por     %xmm1, %xmm0
>         ret
>
> And the remaining rotates use one or two pshufd, followed by a 
> psrld/pslld/por sequence:
> rotr_1:
>         movdqa  %xmm0, %xmm1
>         pshufd  $57, %xmm0, %xmm0
>         psrld   $1, %xmm1
>         pslld   $31, %xmm0
>         por     %xmm1, %xmm0
>         ret
> rotr_63:
>         pshufd  $78, %xmm0, %xmm1
>         pshufd  $57, %xmm0, %xmm0
>         pslld   $1, %xmm1
>         psrld   $31, %xmm0
>         por     %xmm1, %xmm0
>         ret
> rotr_111:
>         pshufd  $147, %xmm0, %xmm1
>         pslld   $17, %xmm0
>         psrld   $15, %xmm1
>         por     %xmm1, %xmm0
>         ret
>
> The new test case, sse2-v1ti-shift.c, is a run-time check to confirm 
> that the results of V1TImode shifts/rotates by constants, exactly 
> match the expected results of TImode operations, for various input test vectors.

Is the sequence of 4+ SSE instructions really faster than pinsrq/pextrq (and two movq insn) + two operations on integer registers?

Uros.

> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap 
> and make -k check with no new failures.  Ok for mainline?
>
>
> 2021-10-24  Roger Sayle  <roger@nextmovesoftware.com>
>
> gcc/ChangeLog
>         * config/i386/i386-expand.c (ix86_expand_v1ti_shift): New helper
>         function to expand V1TI mode logical shifts by integer constants.
>         (ix86_expand_v1ti_rotate): New helper function to expand V1TI
>         mode rotations by integer constants.
>         * config/i386/i386-protos.h (ix86_expand_v1ti_shift,
>         ix86_expand_v1ti_rotate): Prototype new functions here.
>         * config/i386/sse.md (ashlv1ti3, lshrv1ti3, rotlv1ti3, rotrv1ti3):
>         New TARGET_SSE2 expanders to implement V1TI shifts and rotations.
>
> gcc/testsuite/ChangeLog
>         * gcc.target/i386/sse2-v1ti-shift.c: New test case.
>
>
> Thanks in advance,
> Roger
> --
>
  
Uros Bizjak Oct. 25, 2021, 6:57 p.m. UTC | #3
On Mon, Oct 25, 2021 at 4:16 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
> Hi Uros,
> I believe the proposed sequences should be dramatically faster than LLVM's
> implementation(s), due to the large latencies required to move values between
> the vector and scalar parts on modern x86_64 microarchitectures.  All of the
> SSE2 instructions used in the sequences proposed by my patch have single
> cycle latencies, so have a maximum total latency of 5 cycles, though due to
> multiple issue, typically require between 1 and 3 cycles depending upon the sequence.
>
> Moving between units is significantly slower; according to Agner Fog's tables,
> the pinsrq/pextrq instructions you suggest have latencies up to 7 cycles on the
> Silvermont architecture.  Let's take the LLVM code you've provided, and
> annotate with cycle counts for a recent Intel (cascadelake) and recent AMD
> (zen2) CPUs.
>
> movq    %xmm0, %rax             ; 2-3 cycles
> pshufd  $78, %xmm0, %xmm0       ; 1 cycle
> movq    %xmm0, %rcx             ; 2-3 cycles
> shldq   $8, %rax, %rcx          ; 3 cycles
> shlq    $8, %rax                        ; 1 cycle
> movq    %rcx, %xmm1             ; 2-3 cycles
> movq    %rax, %xmm0             ; 2-3 cycles
> punpcklqdq      %xmm1, %xmm0    ; 1 cycle
>
> This 8-instruction sequence has a total latency of 14 cycles on CascadeLake and
> 18 cycles on Zen2, but a scheduled cycle count of 9 cycles and 11 cycles respectively.
>
> The same left shift by 8 as implemented by the proposed patch is:
>
> pslldq  $1, %xmm0               ; 1 cycle
>
> And for reference, the code currently generated by GCC is:
>
> movaps  %xmm0, -24(%rsp)        ; 3 cycles
> movq    -24(%rsp), %rax         ; 2 cycles
> movq    -16(%rsp), %rdx         ; 2 cycles
> shldq   $8, %rax, %rdx          ; 3 cycles
> salq    $8, %rax                        ; 1 cycle
> movq    %rax, -24(%rsp)         ; 2 cycles
> movq    %rdx, -16(%rsp)         ; 2 cycles
> movdqa  -24(%rsp), %xmm0        ; 2 cycles
>
>
> The very worst case timing of my patches is the five instruction rotate:
> pshufd  $78, %xmm0, %xmm1       ; 1 cycle
> pshufd  $57, %xmm0, %xmm0       ; 1 cycle
> pslld   $1, %xmm1               ; 1 cycle
> psrld   $31, %xmm0              ; 1 cycle
> por     %xmm1, %xmm0            ; 1 cycle
>
> which has 5 cycle total latency, but can complete in 3 cycles when suitably
> scheduled as the two pshufds can execute concurrently, as can the two shifts,
> finally followed by the por.
>
> Perhaps I'm missing something, but I'd expect this patch to be three or
> four times faster, on recent hardware, than the code generated by LLVM.
>
> Let me know if you'd like me to run microbenchmarks, but the documented
> timings are such a dramatic improvement, I'm a little surprised you've
> asked about performance.  My patch is also a code size win with -Os
> (ashl_8 is currently 39 bytes, shrinks to 5 bytes with this patch).

I was a bit worried about latencies, but as shown above in great
detail, this worry was not justified.  Yes, taking into account that
V1TI lives natively in XMM registers, we should keep it there as much
as possible, and even if the sequences look complicated at first
sight, they win in all cases.

So, the patch is OK.

Thanks,
Uros.

> Please let me know what you think.
> Roger
> --
>
> -----Original Message-----
> From: Uros Bizjak <ubizjak@gmail.com>
> Sent: 25 October 2021 09:02
> To: Roger Sayle <roger@nextmovesoftware.com>
> Cc: GCC Patches <gcc-patches@gcc.gnu.org>
> Subject: Re: [PATCH] x86_64: Implement V1TI mode shifts/rotates by a constant
>
> On Sun, Oct 24, 2021 at 6:34 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
> >
> >
> > This patch provides RTL expanders to implement logical shifts and
> > rotates of 128-bit values (stored in vector integer registers) by
> > constant bit counts.  Previously, GCC would transfer these values to a
> > pair of scalar registers (TImode) via memory to perform the operation,
> > then transfer the result back via memory.  Instead these operations
> > are now expanded using (between 1 and 5) SSE2 vector instructions.
>
> Hm, instead of using memory (without STL forwarding for general -> XMM
> moves!) these should use something similar to what clang produces (or use pextrq/pinsrq, at least with SSE4.1):
>
>        movq    %xmm0, %rax
>        pshufd  $78, %xmm0, %xmm0
>        movq    %xmm0, %rcx
>        shldq   $8, %rax, %rcx
>        shlq    $8, %rax
>        movq    %rcx, %xmm1
>        movq    %rax, %xmm0
>        punpcklqdq      %xmm1, %xmm0
>
> > Logical shifts by multiples of 8 can be implemented using x86_64's
> > pslldq/psrldq instruction:
> > ashl_8: pslldq  $1, %xmm0
> >         ret
> > lshr_32:
> >         psrldq  $4, %xmm0
> >         ret
> >
> > Logical shifts by greater than 64 can use pslldq/psrldq $8, followed
> > by a psllq/psrlq for the remaining bits:
> > ashl_111:
> >         pslldq  $8, %xmm0
> >         psllq   $47, %xmm0
> >         ret
> > lshr_127:
> >         psrldq  $8, %xmm0
> >         psrlq   $63, %xmm0
> >         ret
> >
> > The remaining logical shifts make use of the following idiom:
> > ashl_1:
> >         movdqa  %xmm0, %xmm1
> >         psllq   $1, %xmm0
> >         pslldq  $8, %xmm1
> >         psrlq   $63, %xmm1
> >         por     %xmm1, %xmm0
> >         ret
> > lshr_15:
> >         movdqa  %xmm0, %xmm1
> >         psrlq   $15, %xmm0
> >         psrldq  $8, %xmm1
> >         psllq   $49, %xmm1
> >         por     %xmm1, %xmm0
> >         ret
> >
> > Rotates by multiples of 32 can use x86_64's pshufd:
> > rotr_32:
> >         pshufd  $57, %xmm0, %xmm0
> >         ret
> > rotr_64:
> >         pshufd  $78, %xmm0, %xmm0
> >         ret
> > rotr_96:
> >         pshufd  $147, %xmm0, %xmm0
> >         ret
> >
> > Rotates by multiples of 8 (other than multiples of 32) can make use of
> > both pslldq and psrldq, followed by por:
> > rotr_8:
> >         movdqa  %xmm0, %xmm1
> >         psrldq  $1, %xmm0
> >         pslldq  $15, %xmm1
> >         por     %xmm1, %xmm0
> >         ret
> > rotr_112:
> >         movdqa  %xmm0, %xmm1
> >         psrldq  $14, %xmm0
> >         pslldq  $2, %xmm1
> >         por     %xmm1, %xmm0
> >         ret
> >
> > And the remaining rotates use one or two pshufd, followed by a
> > psrld/pslld/por sequence:
> > rotr_1:
> >         movdqa  %xmm0, %xmm1
> >         pshufd  $57, %xmm0, %xmm0
> >         psrld   $1, %xmm1
> >         pslld   $31, %xmm0
> >         por     %xmm1, %xmm0
> >         ret
> > rotr_63:
> >         pshufd  $78, %xmm0, %xmm1
> >         pshufd  $57, %xmm0, %xmm0
> >         pslld   $1, %xmm1
> >         psrld   $31, %xmm0
> >         por     %xmm1, %xmm0
> >         ret
> > rotr_111:
> >         pshufd  $147, %xmm0, %xmm1
> >         pslld   $17, %xmm0
> >         psrld   $15, %xmm1
> >         por     %xmm1, %xmm0
> >         ret
> >
> > The new test case, sse2-v1ti-shift.c, is a run-time check to confirm
> > that the results of V1TImode shifts/rotates by constants, exactly
> > match the expected results of TImode operations, for various input test vectors.
>
> Is the sequence of 4+ SSE instructions really faster than pinsrq/pextrq (and two movq insn) + two operations on integer registers?
>
> Uros.
>
> > This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
> > and make -k check with no new failures.  Ok for mainline?
> >
> >
> > 2021-10-24  Roger Sayle  <roger@nextmovesoftware.com>
> >
> > gcc/ChangeLog
> >         * config/i386/i386-expand.c (ix86_expand_v1ti_shift): New helper
> >         function to expand V1TI mode logical shifts by integer constants.
> >         (ix86_expand_v1ti_rotate): New helper function to expand V1TI
> >         mode rotations by integer constants.
> >         * config/i386/i386-protos.h (ix86_expand_v1ti_shift,
> >         ix86_expand_v1ti_rotate): Prototype new functions here.
> >         * config/i386/sse.md (ashlv1ti3, lshrv1ti3, rotlv1ti3, rotrv1ti3):
> >         New TARGET_SSE2 expanders to implement V1TI shifts and rotations.
> >
> > gcc/testsuite/ChangeLog
> >         * gcc.target/i386/sse2-v1ti-shift.c: New test case.
> >
> >
> > Thanks in advance,
> > Roger
> > --
> >
>
  

Patch

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 56dd99b..4c3800e 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -6157,6 +6157,169 @@  ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
     }
 }
 
+/* Expand V1TI mode shift (of rtx_code CODE) by constant.  */
+void ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
+{
+  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
+  rtx op1 = force_reg (V1TImode, operands[1]);
+
+  if (bits == 0)
+    {
+      emit_move_insn (operands[0], op1);
+      return;
+    }
+
+  if ((bits & 7) == 0)
+    {
+      rtx tmp = gen_reg_rtx (V1TImode);
+      if (code == ASHIFT)
+        emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
+      else
+	emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
+      emit_move_insn (operands[0], tmp);
+      return;
+    }
+
+  rtx tmp1 = gen_reg_rtx (V1TImode);
+  if (code == ASHIFT)
+    emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
+  else
+    emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
+
+  /* tmp2 is operands[1] shifted by 64, in V2DImode.  */
+  rtx tmp2 = gen_reg_rtx (V2DImode);
+  emit_move_insn (tmp2, gen_lowpart (V2DImode, tmp1));
+
+  /* tmp3 will be the V2DImode result.  */
+  rtx tmp3 = gen_reg_rtx (V2DImode);
+
+  if (bits > 64)
+    {
+      if (code == ASHIFT)
+	emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
+      else
+	emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
+    }
+  else
+    {
+      /* tmp4 is operands[1], in V2DImode.  */
+      rtx tmp4 = gen_reg_rtx (V2DImode);
+      emit_move_insn (tmp4, gen_lowpart (V2DImode, op1));
+
+      rtx tmp5 = gen_reg_rtx (V2DImode);
+      if (code == ASHIFT)
+	emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
+      else
+	emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
+
+      rtx tmp6 = gen_reg_rtx (V2DImode);
+      if (code == ASHIFT)
+	emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
+      else
+	emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
+
+      emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
+    }
+
+  /* Convert the result back to V1TImode and store in operands[0].  */
+  rtx tmp7 = gen_reg_rtx (V1TImode);
+  emit_move_insn (tmp7, gen_lowpart (V1TImode, tmp3));
+  emit_move_insn (operands[0], tmp7);
+}
+
+/* Expand V1TI mode rotate (of rtx_code CODE) by constant.  */
+void ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
+{
+  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
+  rtx op1 = force_reg (V1TImode, operands[1]);
+
+  if (bits == 0)
+    {
+      emit_move_insn (operands[0], op1);
+      return;
+    }
+
+  if (code == ROTATERT)
+    bits = 128 - bits;
+
+  if ((bits & 31) == 0)
+    {
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      rtx tmp3 = gen_reg_rtx (V1TImode);
+
+      emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
+      if (bits == 32)
+	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
+      else if (bits == 64)
+	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
+      else
+	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
+      emit_move_insn (tmp3, gen_lowpart (V1TImode, tmp2));
+      emit_move_insn (operands[0], tmp3);
+      return;
+    }
+
+  if ((bits & 7) == 0)
+    {
+      rtx tmp1 = gen_reg_rtx (V1TImode);
+      rtx tmp2 = gen_reg_rtx (V1TImode);
+      rtx tmp3 = gen_reg_rtx (V1TImode);
+
+      emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
+      emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
+      emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
+      emit_move_insn (operands[0], tmp3);
+      return;
+    }
+
+  rtx op1_v4si = gen_reg_rtx (V4SImode);
+  emit_move_insn (op1_v4si, gen_lowpart (V4SImode, op1));
+
+  rtx lobits;
+  rtx hibits;
+
+  switch (bits >> 5)
+    {
+    case 0:
+      lobits = op1_v4si;
+      hibits = gen_reg_rtx (V4SImode);
+      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
+      break;
+
+    case 1:
+      lobits = gen_reg_rtx (V4SImode);
+      hibits = gen_reg_rtx (V4SImode);
+      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
+      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
+      break;
+
+    case 2:
+      lobits = gen_reg_rtx (V4SImode);
+      hibits = gen_reg_rtx (V4SImode);
+      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
+      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
+      break;
+
+    default:
+      lobits = gen_reg_rtx (V4SImode);
+      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
+      hibits = op1_v4si;
+      break;
+    }
+
+  rtx tmp1 = gen_reg_rtx (V4SImode);
+  rtx tmp2 = gen_reg_rtx (V4SImode);
+  rtx tmp3 = gen_reg_rtx (V4SImode);
+  rtx tmp4 = gen_reg_rtx (V1TImode);
+
+  emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
+  emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
+  emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));
+  emit_move_insn (tmp4, gen_lowpart (V1TImode, tmp3));
+  emit_move_insn (operands[0], tmp4);
+}
+
 /* Return mode for the memcpy/memset loop counter.  Prefer SImode over
    DImode for constant loop counts.  */
 
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 708834a..9918a28 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -159,6 +159,8 @@  extern void ix86_split_long_move (rtx[]);
 extern void ix86_split_ashl (rtx *, rtx, machine_mode);
 extern void ix86_split_ashr (rtx *, rtx, machine_mode);
 extern void ix86_split_lshr (rtx *, rtx, machine_mode);
+extern void ix86_expand_v1ti_shift (enum rtx_code, rtx[]);
+extern void ix86_expand_v1ti_rotate (enum rtx_code, rtx[]);
 extern rtx ix86_find_base_term (rtx);
 extern bool ix86_check_movabs (rtx, int);
 extern bool ix86_check_no_addr_space (rtx);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index f37c5c0..3ff4579 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -15023,6 +15023,50 @@ 
   operands[4] = gen_lowpart (<MODE>mode, operands[3]);
 })
 
+(define_expand "ashlv1ti3"
+  [(set (match_operand:V1TI 0 "register_operand")
+	(ashift:V1TI
+	 (match_operand:V1TI 1 "register_operand")
+	 (match_operand:SI 2 "const_int_operand")))]
+  "TARGET_SSE2"
+{
+  ix86_expand_v1ti_shift (ASHIFT, operands);
+  DONE;
+})
+
+(define_expand "lshrv1ti3"
+  [(set (match_operand:V1TI 0 "register_operand")
+	(lshiftrt:V1TI
+	 (match_operand:V1TI 1 "register_operand")
+	 (match_operand:SI 2 "const_int_operand")))]
+  "TARGET_SSE2"
+{
+  ix86_expand_v1ti_shift (LSHIFTRT, operands);
+  DONE;
+})
+
+(define_expand "rotlv1ti3"
+  [(set (match_operand:V1TI 0 "register_operand")
+	(rotate:V1TI
+	 (match_operand:V1TI 1 "register_operand")
+	 (match_operand:SI 2 "const_int_operand")))]
+  "TARGET_SSE2"
+{
+  ix86_expand_v1ti_rotate (ROTATE, operands);
+  DONE;
+})
+
+(define_expand "rotrv1ti3"
+  [(set (match_operand:V1TI 0 "register_operand")
+	(rotatert:V1TI
+	 (match_operand:V1TI 1 "register_operand")
+	 (match_operand:SI 2 "const_int_operand")))]
+  "TARGET_SSE2"
+{
+  ix86_expand_v1ti_rotate (ROTATERT, operands);
+  DONE;
+})
+
 (define_insn "avx512bw_<insn><mode>3"
   [(set (match_operand:VIMAX_AVX512VL 0 "register_operand" "=v")
 	(any_lshift:VIMAX_AVX512VL
diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-shift.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-shift.c
new file mode 100644
index 0000000..dbae418
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-shift.c
@@ -0,0 +1,212 @@ 
+/* { dg-do run { target int128 } } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+
+typedef unsigned __int128 v1ti __attribute__ ((__vector_size__ (16)));
+typedef unsigned __int128 ti;
+
+ti ashl(ti x, unsigned int i) { return x << i; }
+ti lshr(ti x, unsigned int i) { return x >> i; }
+ti rotr(ti x, unsigned int i) { return (x >> i) | (x << (128-i)); }
+
+v1ti ashl_1(v1ti x) { return x << 1; }
+v1ti ashl_2(v1ti x) { return x << 2; }
+v1ti ashl_7(v1ti x) { return x << 7; }
+v1ti ashl_8(v1ti x) { return x << 8; }
+v1ti ashl_9(v1ti x) { return x << 9; }
+v1ti ashl_15(v1ti x) { return x << 15; }
+v1ti ashl_16(v1ti x) { return x << 16; }
+v1ti ashl_17(v1ti x) { return x << 17; }
+v1ti ashl_31(v1ti x) { return x << 31; }
+v1ti ashl_32(v1ti x) { return x << 32; }
+v1ti ashl_33(v1ti x) { return x << 33; }
+v1ti ashl_63(v1ti x) { return x << 63; }
+v1ti ashl_64(v1ti x) { return x << 64; }
+v1ti ashl_65(v1ti x) { return x << 65; }
+v1ti ashl_72(v1ti x) { return x << 72; }
+v1ti ashl_95(v1ti x) { return x << 95; }
+v1ti ashl_96(v1ti x) { return x << 96; }
+v1ti ashl_97(v1ti x) { return x << 97; }
+v1ti ashl_111(v1ti x) { return x << 111; }
+v1ti ashl_112(v1ti x) { return x << 112; }
+v1ti ashl_113(v1ti x) { return x << 113; }
+v1ti ashl_119(v1ti x) { return x << 119; }
+v1ti ashl_120(v1ti x) { return x << 120; }
+v1ti ashl_121(v1ti x) { return x << 121; }
+v1ti ashl_126(v1ti x) { return x << 126; }
+v1ti ashl_127(v1ti x) { return x << 127; }
+
+v1ti lshr_1(v1ti x) { return x >> 1; }
+v1ti lshr_2(v1ti x) { return x >> 2; }
+v1ti lshr_7(v1ti x) { return x >> 7; }
+v1ti lshr_8(v1ti x) { return x >> 8; }
+v1ti lshr_9(v1ti x) { return x >> 9; }
+v1ti lshr_15(v1ti x) { return x >> 15; }
+v1ti lshr_16(v1ti x) { return x >> 16; }
+v1ti lshr_17(v1ti x) { return x >> 17; }
+v1ti lshr_31(v1ti x) { return x >> 31; }
+v1ti lshr_32(v1ti x) { return x >> 32; }
+v1ti lshr_33(v1ti x) { return x >> 33; }
+v1ti lshr_63(v1ti x) { return x >> 63; }
+v1ti lshr_64(v1ti x) { return x >> 64; }
+v1ti lshr_65(v1ti x) { return x >> 65; }
+v1ti lshr_72(v1ti x) { return x >> 72; }
+v1ti lshr_95(v1ti x) { return x >> 95; }
+v1ti lshr_96(v1ti x) { return x >> 96; }
+v1ti lshr_97(v1ti x) { return x >> 97; }
+v1ti lshr_111(v1ti x) { return x >> 111; }
+v1ti lshr_112(v1ti x) { return x >> 112; }
+v1ti lshr_113(v1ti x) { return x >> 113; }
+v1ti lshr_119(v1ti x) { return x >> 119; }
+v1ti lshr_120(v1ti x) { return x >> 120; }
+v1ti lshr_121(v1ti x) { return x >> 121; }
+v1ti lshr_126(v1ti x) { return x >> 126; }
+v1ti lshr_127(v1ti x) { return x >> 127; }
+
+v1ti rotr_1(v1ti x) { return (x >> 1) | (x << 127); }
+v1ti rotr_2(v1ti x) { return (x >> 2) | (x << 126); }
+v1ti rotr_7(v1ti x) { return (x >> 7) | (x << 121); }
+v1ti rotr_8(v1ti x) { return (x >> 8) | (x << 120); }
+v1ti rotr_9(v1ti x) { return (x >> 9) | (x << 119); }
+v1ti rotr_15(v1ti x) { return (x >> 15) | (x << 113); }
+v1ti rotr_16(v1ti x) { return (x >> 16) | (x << 112); }
+v1ti rotr_17(v1ti x) { return (x >> 17) | (x << 111); }
+v1ti rotr_31(v1ti x) { return (x >> 31) | (x << 97); }
+v1ti rotr_32(v1ti x) { return (x >> 32) | (x << 96); }
+v1ti rotr_33(v1ti x) { return (x >> 33) | (x << 95); }
+v1ti rotr_63(v1ti x) { return (x >> 63) | (x << 65); }
+v1ti rotr_64(v1ti x) { return (x >> 64) | (x << 64); }
+v1ti rotr_65(v1ti x) { return (x >> 65) | (x << 63); }
+v1ti rotr_72(v1ti x) { return (x >> 72) | (x << 56); }
+v1ti rotr_95(v1ti x) { return (x >> 95) | (x << 33); }
+v1ti rotr_96(v1ti x) { return (x >> 96) | (x << 32); }
+v1ti rotr_97(v1ti x) { return (x >> 97) | (x << 31); }
+v1ti rotr_111(v1ti x) { return (x >> 111) | (x << 17); }
+v1ti rotr_112(v1ti x) { return (x >> 112) | (x << 16); }
+v1ti rotr_113(v1ti x) { return (x >> 113) | (x << 15); }
+v1ti rotr_119(v1ti x) { return (x >> 119) | (x << 9); }
+v1ti rotr_120(v1ti x) { return (x >> 120) | (x << 8); }
+v1ti rotr_121(v1ti x) { return (x >> 121) | (x << 7); }
+v1ti rotr_126(v1ti x) { return (x >> 126) | (x << 2); }
+v1ti rotr_127(v1ti x) { return (x >> 127) | (x << 1); }
+
+
+typedef v1ti (*fun)(v1ti);
+
+struct {
+  unsigned int i;
+  fun ashl;
+  fun lshr;
+  fun rotr;
+} table[26] = {
+  {   1, ashl_1,   lshr_1,   rotr_1   },
+  {   2, ashl_2,   lshr_2,   rotr_2   },
+  {   7, ashl_7,   lshr_7,   rotr_7   },
+  {   8, ashl_8,   lshr_8,   rotr_8   },
+  {   9, ashl_9,   lshr_9,   rotr_9   },
+  {  15, ashl_15,  lshr_15,  rotr_15  },
+  {  16, ashl_16,  lshr_16,  rotr_16  },
+  {  17, ashl_17,  lshr_17,  rotr_17  },
+  {  31, ashl_31,  lshr_31,  rotr_31  },
+  {  32, ashl_32,  lshr_32,  rotr_32  },
+  {  33, ashl_33,  lshr_33,  rotr_33  },
+  {  63, ashl_63,  lshr_63,  rotr_63  },
+  {  64, ashl_64,  lshr_64,  rotr_64  },
+  {  65, ashl_65,  lshr_65,  rotr_65  },
+  {  72, ashl_72,  lshr_72,  rotr_72  },
+  {  95, ashl_95,  lshr_95,  rotr_95  },
+  {  96, ashl_96,  lshr_96,  rotr_96  },
+  {  97, ashl_97,  lshr_97,  rotr_97  },
+  { 111, ashl_111, lshr_111, rotr_111 },
+  { 112, ashl_112, lshr_112, rotr_112 },
+  { 113, ashl_113, lshr_113, rotr_113 },
+  { 119, ashl_119, lshr_119, rotr_119 },
+  { 120, ashl_120, lshr_120, rotr_120 },
+  { 121, ashl_121, lshr_121, rotr_121 },
+  { 126, ashl_126, lshr_126, rotr_126 },
+  { 127, ashl_127, lshr_127, rotr_127 }
+};
+
+void test(ti x)
+{
+  unsigned int i;
+  v1ti t = (v1ti)x;
+
+  for (i=0; i<(sizeof(table)/sizeof(table[0])); i++) {
+    if ((ti)(*table[i].ashl)(t) != ashl(x,table[i].i))
+      __builtin_abort();
+    if ((ti)(*table[i].lshr)(t) != lshr(x,table[i].i))
+      __builtin_abort();
+    if ((ti)(*table[i].rotr)(t) != rotr(x,table[i].i))
+      __builtin_abort();
+  }
+}
+
+int main()
+{
+  ti x;
+
+  x = ((ti)0x0011223344556677ull)<<64 | 0x8899aabbccddeeffull;
+  test(x);
+  x = ((ti)0xffeeddccbbaa9988ull)<<64 | 0x7766554433221100ull;
+  test(x);
+  x = ((ti)0x0123456789abcdefull)<<64 | 0x0123456789abcdefull;
+  test(x);
+  x = ((ti)0xfedcba9876543210ull)<<64 | 0xfedcba9876543210ull;
+  test(x);
+  x = ((ti)0x0123456789abcdefull)<<64 | 0xfedcba9876543210ull;
+  test(x);
+  x = ((ti)0xfedcba9876543210ull)<<64 | 0x0123456789abcdefull;
+  test(x);
+  x = 0;
+  test(x);
+  x = 0xffffffffffffffffull;
+  test(x);
+  x = ((ti)0xffffffffffffffffull)<<64;
+  test(x);
+  x = ((ti)0xffffffffffffffffull)<<64 | 0xffffffffffffffffull;
+  test(x);
+  x = ((ti)0x5a5a5a5a5a5a5a5aull)<<64 | 0x5a5a5a5a5a5a5a5aull;
+  test(x);
+  x = ((ti)0xa5a5a5a5a5a5a5a5ull)<<64 | 0xa5a5a5a5a5a5a5a5ull;
+  test(x);
+  x = 0xffull;
+  test(x);
+  x = 0xff00ull;
+  test(x);
+  x = 0xff0000ull;
+  test(x);
+  x = 0xff000000ull;
+  test(x);
+  x = 0xff00000000ull;
+  test(x);
+  x = 0xff0000000000ull;
+  test(x);
+  x = 0xff000000000000ull;
+  test(x);
+  x = 0xff00000000000000ull;
+  test(x);
+  x = ((ti)0xffull)<<64;
+  test(x);
+  x = ((ti)0xff00ull)<<64;
+  test(x);
+  x = ((ti)0xff0000ull)<<64;
+  test(x);
+  x = ((ti)0xff000000ull)<<64;
+  test(x);
+  x = ((ti)0xff00000000ull)<<64;
+  test(x);
+  x = ((ti)0xff0000000000ull)<<64;
+  test(x);
+  x = ((ti)0xff000000000000ull)<<64;
+  test(x);
+  x = ((ti)0xff00000000000000ull)<<64;
+  test(x);
+  x = 0xdeadbeefcafebabeull;
+  test(x);
+  x = ((ti)0xdeadbeefcafebabeull)<<64;
+  test(x);
+
+  return 0;
+}
+