x86_64: Expand ashrv1ti (and PR target/102986)

Message ID 007b01d7cd77$3ccc8e40$b665aac0$@nextmovesoftware.com
State New
Series x86_64: Expand ashrv1ti (and PR target/102986)

Commit Message

Roger Sayle Oct. 30, 2021, 10:16 a.m. UTC
This patch was originally intended to implement 128-bit arithmetic right
shifts by constants of vector registers (V1TImode), but while working on
it I discovered my recent ICE-on-valid regression, now known as
PR target/102986.

As diagnosed by Jakub, expanders for shifts are not allowed to fail, and
so any backend that provides a shift optab needs to handle variable-amount
shifts as well as constant shifts [even though the middle-end knows how
to synthesize these for vector modes].  This constraint could be relaxed
in the middle-end, but it makes sense to fix this in my erroneous code.
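
For example, a shift by a variable amount, such as the one below (taken
from the new sse2-v1ti-shift-2.c test case included in this patch),
previously triggered the ICE:

typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));

/* ICEd before this fix: the ashlv1ti3 expander only handled constant
   shift amounts.  */
uv1ti ashl(uv1ti x, unsigned int i) { return x << i; }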

The solution is to change the constraints on the recently added shift
expanders from SImode const_int_operand to QImode general_operand,
matching the TImode expanders' constraints, and then simply check for
!CONST_INT_P at the start of the ix86_expand_v1ti_* functions: if the
amount isn't a constant, convert the operand from V1TImode to TImode,
perform the TImode operation, and convert the result back to V1TImode.
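
In outline, the non-constant fallback looks like this (a lightly
condensed excerpt from ix86_expand_v1ti_shift in the patch below; the
rotate and arithmetic right shift expanders follow the same shape):

  if (!CONST_INT_P (operands[2]))
    {
      /* Move the V1TImode value into a TImode pseudo,  */
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
      rtx tmp2 = gen_reg_rtx (TImode);
      /* perform the shift as a TImode operation,  */
      emit_insn (code == ASHIFT ? gen_ashlti3 (tmp2, tmp1, operands[2])
				: gen_lshrti3 (tmp2, tmp1, operands[2]));
      /* and move the TImode result back to V1TImode.  */
      emit_move_insn (operands[0], ix86_expand_ti_to_v1ti (tmp2));
      return;
    }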

One nice benefit of this strategy is that it allows us to implement
Uros' recent suggestion that we convert between these modes more
efficiently, avoiding memory and instead using the same idiom as LLVM,
i.e. pextrq/pinsrq where available.  The new helper functions
ix86_expand_v1ti_to_ti and ix86_expand_ti_to_v1ti are sufficient to take
care of this.  Interestingly, partial support for this is already present,
but x86_64's generic tuning prefers memory transfers to avoid penalizing
microarchitectures with significant inter-unit delays.  With these changes
we now generate both pextrq and pinsrq for -mtune=native.
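
For illustration (my annotation, not part of the patch): with these
helpers and -msse4.1, plain casts between the two modes can now stay in
registers; the instruction sequences in the comments below are the ones
I would expect, not compiler output quoted from the patch.

typedef __int128 ti;
typedef __int128 v1ti __attribute__ ((__vector_size__ (16)));

ti   to_ti(v1ti x) { return (ti)x; }    /* movq %xmm0,%rax; pextrq $1,%xmm0,%rdx */
v1ti to_v1ti(ti x) { return (v1ti)x; }  /* movq %rdi,%xmm0; pinsrq $1,%rsi,%xmm0 */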

The main body of the patch implements arithmetic right shift, in
addition to the logical right shift and left shift implemented by the
previous patch.  This expander provides no fewer than 13 different code
sequences, special-casing the different constant shifts, including
variants taking advantage of TARGET_AVX2 and TARGET_SSE4_1.  The code
is structured with the faster/shorter sequences at the start, and
the generic implementations at the end.

For the record, the implementations are:

ashr_127:       // Shift by 127, 2 operations, 10 bytes
        pshufd  $255, %xmm0, %xmm0
        psrad   $31, %xmm0
        ret

ashr_64:        // Shift by 64, 3 operations, 14 bytes
        pshufd  $255, %xmm0, %xmm1
        psrad   $31, %xmm1
        punpckhqdq      %xmm1, %xmm0
        ret

ashr_96:        // Shift by 96, 3 operations, 18 bytes
        movdqa  %xmm0, %xmm1
        psrad   $31, %xmm1
        punpckhqdq      %xmm1, %xmm0
        pshufd  $253, %xmm0, %xmm0
        ret

ashr_8:         // Shift by 8/16/24/32 on AVX2, 3 operations, 16 bytes
        vpsrad  $8, %xmm0, %xmm1
        vpsrldq $1, %xmm0, %xmm0
        vpblendd        $7, %xmm0, %xmm1, %xmm0
        ret

ashr_8:         // Shift by 8/16/24/32 on SSE4.1, 3 operations, 24 bytes
        movdqa  %xmm0, %xmm1
        psrldq  $1, %xmm0
        psrad   $8, %xmm1
        pblendw $63, %xmm0, %xmm1
        movdqa  %xmm1, %xmm0
        ret

ashr_97:        // Shifts by 97..126, 4 operations, 23 bytes
        movdqa  %xmm0, %xmm1
        psrad   $31, %xmm0
        psrad   $1, %xmm1
        punpckhqdq      %xmm0, %xmm1
        pshufd  $253, %xmm1, %xmm0
        ret

ashr_48:        // Shifts by 48/80 on SSE4.1, 4 operations, 25 bytes
        movdqa  %xmm0, %xmm1
        pshufd  $255, %xmm0, %xmm0
        psrldq  $6, %xmm1
        psrad   $31, %xmm0
        pblendw $31, %xmm1, %xmm0
        ret

ashr_8:         // Shifts by multiples of 8, 5 operations, 28 bytes
        movdqa  %xmm0, %xmm1
        pshufd  $255, %xmm0, %xmm0
        psrad   $31, %xmm0
        psrldq  $1, %xmm1
        pslldq  $15, %xmm0
        por     %xmm1, %xmm0
        ret

ashr_1:         // Shifts by 1..31 on AVX2, 6 operations, 30 bytes
        vpsrldq $8, %xmm0, %xmm2
        vpsrad  $1, %xmm0, %xmm1
        vpsllq  $63, %xmm2, %xmm2
        vpsrlq  $1, %xmm0, %xmm0
        vpor    %xmm2, %xmm0, %xmm0
        vpblendd        $7, %xmm0, %xmm1, %xmm0
        ret

ashr_1:         // Shifts by 1..14 on SSE4.1, 6 operations, 42 bytes
        movdqa  %xmm0, %xmm2
        movdqa  %xmm0, %xmm1
        psrldq  $8, %xmm2
        psrlq   $1, %xmm0
        psllq   $63, %xmm2
        psrad   $1, %xmm1
        por     %xmm2, %xmm0
        pblendw $63, %xmm0, %xmm1
        movdqa  %xmm1, %xmm0
        ret

ashr_1:         // Shift by 1, 8 operations, 46 bytes
        movdqa  %xmm0, %xmm1
        movdqa  %xmm0, %xmm2
        psrldq  $8, %xmm2
        psrlq   $63, %xmm1
        psllq   $63, %xmm2
        psrlq   $1, %xmm0
        pshufd  $191, %xmm1, %xmm1
        por     %xmm2, %xmm0
        psllq   $31, %xmm1
        por     %xmm1, %xmm0
        ret

ashr_65:        // Shifts by 65..95, 8 operations, 42 bytes
        pshufd  $255, %xmm0, %xmm1
        psrldq  $8, %xmm0
        psrad   $31, %xmm1
        psrlq   $1, %xmm0
        movdqa  %xmm1, %xmm2
        psllq   $63, %xmm1
        pslldq  $8, %xmm2
        por     %xmm2, %xmm1
        por     %xmm1, %xmm0
        ret

ashr_2:         // Shifts by 2..63, 9 operations, 47 bytes
        pshufd  $255, %xmm0, %xmm1
        movdqa  %xmm0, %xmm2
        psrad   $31, %xmm1
        psrldq  $8, %xmm2
        psllq   $62, %xmm2
        psrlq   $2, %xmm0
        pslldq  $8, %xmm1
        por     %xmm2, %xmm0
        psllq   $62, %xmm1
        por     %xmm1, %xmm0
        ret

To test these changes there are several new test cases.  sse2-v1ti-shift-2.c
is a compile test designed to catch PR target/102986 [for all shifts
and rotates by variable amounts], and sse2-v1ti-shift-3.c is an execution
test to confirm that shifts/rotates by variable amounts produce the same
results for TImode and V1TImode.  sse2-v1ti-ashiftrt-1.c is a (similar)
execution test to confirm that arithmetic right shifts by different
constants produce identical results for TImode and V1TImode.
sse2-v1ti-ashiftrt-[23].c are duplicates of this file as compilation tests
specifying -mavx2 and -msse4.1 respectively, to trigger all the paths
through the new expander.

This patch has been tested on x86_64-pc-linux-gnu with a make bootstrap
and make -k check with no new failures.  Ok for mainline?  I'm happy to
split this into two pieces, but the functionality overlaps and this patch
was nearly ready to submit to gcc-patches when 102986 appeared in bugzilla.


2021-10-30  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
	PR target/102986
	* config/i386/i386-expand.c (ix86_expand_v1ti_to_ti,
	ix86_expand_ti_to_v1ti): New helper functions.
	(ix86_expand_v1ti_shift): Check if the amount operand is an
	integer constant, and expand as a TImode shift if it isn't.
	(ix86_expand_v1ti_rotate): Check if the amount operand is an
	integer constant, and expand as a TImode rotate if it isn't.
	(ix86_expand_v1ti_ashiftrt): New function to expand arithmetic
	right shifts of V1TImode quantities.
	* config/i386/i386-protos.h (ix86_expand_v1ti_ashiftrt): Prototype.
	* config/i386/sse.md (ashlv1ti3, lshrv1ti3): Change constraints
	to QImode general_operand, and let the helper functions lower
	shifts by non-constant operands, as TImode shifts.
	(ashrv1ti3): New expander calling ix86_expand_v1ti_ashiftrt.
	(rotlv1ti3, rotrv1ti3): Change shift operand to QImode.

gcc/testsuite/ChangeLog
	PR target/102986
	* gcc.target/i386/sse2-v1ti-ashiftrt-1.c: New test case.
	* gcc.target/i386/sse2-v1ti-ashiftrt-2.c: New test case.
	* gcc.target/i386/sse2-v1ti-ashiftrt-3.c: New test case.
	* gcc.target/i386/sse2-v1ti-shift-2.c: New test case.
	* gcc.target/i386/sse2-v1ti-shift-3.c: New test case.

Sorry again for the breakage in my last patch.   I wasn't testing things
that shouldn't have been affected/changed.

Roger
--
  

Comments

Jakub Jelinek Oct. 30, 2021, 10:29 a.m. UTC | #1
On Sat, Oct 30, 2021 at 11:16:41AM +0100, Roger Sayle wrote:
> 2021-10-30  Roger Sayle  <roger@nextmovesoftware.com>
> 
> gcc/ChangeLog
> 	PR target/102986
> 	* config/i386/i386-expand.c (ix86_expand_v1ti_to_ti,
> 	ix86_expand_ti_to_v1ti): New helper functions.
> 	(ix86_expand_v1ti_shift): Check if the amount operand is an
> 	integer constant, and expand as a TImode shift if it isn't.
> 	(ix86_expand_v1ti_rotate): Check if the amount operand is an
> 	integer constant, and expand as a TImode rotate if it isn't.
> 	(ix86_expand_v1ti_ashiftrt): New function to expand arithmetic
> 	right shifts of V1TImode quantities.
> 	* config/i386/i386-protos.h (ix86_expand_v1ti_ashiftrt): Prototype.
> 	* config/i386/sse.md (ashlv1ti3, lshrv1ti3): Change constraints
> 	to QImode general_operand, and let the helper functions lower
> 	shifts by non-constant operands, as TImode shifts.
> 	(ashrv1ti3): New expander calling ix86_expand_v1ti_ashiftrt.
> 	(rotlv1ti3, rotrv1ti3): Change shift operand to QImode.
> 
> gcc/testsuite/ChangeLog
> 	PR target/102986
> 	* gcc.target/i386/sse2-v1ti-ashiftrt-1.c: New test case.
> 	* gcc.target/i386/sse2-v1ti-ashiftrt-2.c: New test case.
> 	* gcc.target/i386/sse2-v1ti-ashiftrt-3.c: New test case.
> 	* gcc.target/i386/sse2-v1ti-shift-2.c: New test case.
> 	* gcc.target/i386/sse2-v1ti-shift-3.c: New test case.
> 
> Sorry again for the breakage in my last patch.   I wasn't testing things
> that shouldn't have been affected/changed.

Not a review, will defer that to Uros, but just nits:

> +/* Expand move of V1TI mode register X to a new TI mode register.  */
> +static rtx ix86_expand_v1ti_to_ti (rtx x)

ix86_expand_v1ti_to_ti should be at the start of next line, so
static rtx
ix86_expand_v1ti_to_ti (rtx x)

Ditto for other functions and also in functions you've added by the
previous patch.
> +      emit_insn (code == ASHIFT ? gen_ashlti3(tmp2, tmp1, operands[2])
> +				: gen_lshrti3(tmp2, tmp1, operands[2]));

Space before ( twice.

> +      emit_insn (code == ROTATE ? gen_rotlti3(tmp2, tmp1, operands[2])
> +				: gen_rotrti3(tmp2, tmp1, operands[2]));

Likewise.

> +      emit_insn (gen_ashrti3(tmp2, tmp1, operands[2]));

Similarly.

Also, I wonder for all these patterns (previously and now added), shouldn't
they have && TARGET_64BIT in conditions?  I mean, we don't really support
scalar TImode for ia32, but VALID_SSE_REG_MODE includes V1TImode and while
the constant shifts can be done, I think the variable shifts can't, there
are no TImode shift patterns...

	Jakub
  

Patch

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 4c3800e..ecbbfcf 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -6157,12 +6157,49 @@  ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
     }
 }
 
+/* Expand move of V1TI mode register X to a new TI mode register.  */
+static rtx ix86_expand_v1ti_to_ti (rtx x)
+{
+  rtx result = gen_reg_rtx (TImode);
+  emit_move_insn (result, gen_lowpart (TImode, x));
+  return result;
+}
+
+/* Expand move of TI mode register X to a new V1TI mode register.  */
+static rtx ix86_expand_ti_to_v1ti (rtx x)
+{
+  rtx result = gen_reg_rtx (V1TImode);
+  if (TARGET_SSE2)
+    {
+      rtx lo = gen_lowpart (DImode, x);
+      rtx hi = gen_highpart (DImode, x);
+      rtx tmp = gen_reg_rtx (V2DImode);
+      emit_insn (gen_vec_concatv2di (tmp, lo, hi));
+      emit_move_insn (result, gen_lowpart (V1TImode, tmp));
+    }
+  else
+    emit_move_insn (result, gen_lowpart (V1TImode, x));
+  return result;
+}
+
 /* Expand V1TI mode shift (of rtx_code CODE) by constant.  */
 void ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
 {
-  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
   rtx op1 = force_reg (V1TImode, operands[1]);
 
+  if (!CONST_INT_P (operands[2]))
+    {
+      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
+      rtx tmp2 = gen_reg_rtx (TImode);
+      emit_insn (code == ASHIFT ? gen_ashlti3(tmp2, tmp1, operands[2])
+				: gen_lshrti3(tmp2, tmp1, operands[2]));
+      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
+      emit_move_insn (operands[0], tmp3);
+      return;
+    }
+
+  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
+
   if (bits == 0)
     {
       emit_move_insn (operands[0], op1);
@@ -6230,9 +6267,21 @@  void ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
 /* Expand V1TI mode rotate (of rtx_code CODE) by constant.  */
 void ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
 {
-  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
   rtx op1 = force_reg (V1TImode, operands[1]);
 
+  if (!CONST_INT_P (operands[2]))
+    {
+      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
+      rtx tmp2 = gen_reg_rtx (TImode);
+      emit_insn (code == ROTATE ? gen_rotlti3(tmp2, tmp1, operands[2])
+				: gen_rotrti3(tmp2, tmp1, operands[2]));
+      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
+      emit_move_insn (operands[0], tmp3);
+      return;
+    }
+
+  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
+
   if (bits == 0)
     {
       emit_move_insn (operands[0], op1);
@@ -6320,6 +6369,468 @@  void ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
   emit_move_insn (operands[0], tmp4);
 }
 
+/* Expand V1TI mode ashiftrt by constant.  */
+void ix86_expand_v1ti_ashiftrt (rtx operands[])
+{
+  rtx op1 = force_reg (V1TImode, operands[1]);
+
+  if (!CONST_INT_P (operands[2]))
+    {
+      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
+      rtx tmp2 = gen_reg_rtx (TImode);
+      emit_insn (gen_ashrti3(tmp2, tmp1, operands[2]));
+      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
+      emit_move_insn (operands[0], tmp3);
+      return;
+    }
+
+  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
+
+  if (bits == 0)
+    {
+      emit_move_insn (operands[0], op1);
+      return;
+    }
+
+  if (bits == 127)
+    {
+      /* Two operations.  */
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
+      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
+
+      rtx tmp3 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
+
+      rtx tmp4 = gen_reg_rtx (V1TImode);
+      emit_move_insn (tmp4, gen_lowpart (V1TImode, tmp3));
+      emit_move_insn (operands[0], tmp4);
+      return;
+    }
+
+  if (bits == 64)
+    {
+      /* Three operations.  */
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
+      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
+
+      rtx tmp3 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
+
+      rtx tmp4 = gen_reg_rtx (V2DImode);
+      rtx tmp5 = gen_reg_rtx (V2DImode);
+      rtx tmp6 = gen_reg_rtx (V2DImode);
+      emit_move_insn (tmp4, gen_lowpart (V2DImode, tmp1));
+      emit_move_insn (tmp5, gen_lowpart (V2DImode, tmp3));
+      emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
+
+      rtx tmp7 = gen_reg_rtx (V1TImode);
+      emit_move_insn (tmp7, gen_lowpart (V1TImode, tmp6));
+      emit_move_insn (operands[0], tmp7);
+      return;
+    }
+
+  if (bits == 96)
+    {
+      /* Three operations.  */
+      rtx tmp3 = gen_reg_rtx (V2DImode);
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
+      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
+
+      rtx tmp4 = gen_reg_rtx (V2DImode);
+      rtx tmp5 = gen_reg_rtx (V2DImode);
+      emit_move_insn (tmp3, gen_lowpart (V2DImode, tmp1));
+      emit_move_insn (tmp4, gen_lowpart (V2DImode, tmp2));
+      emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));
+
+      rtx tmp6 = gen_reg_rtx (V4SImode);
+      rtx tmp7 = gen_reg_rtx (V4SImode);
+      emit_move_insn (tmp6, gen_lowpart (V4SImode, tmp5));
+      emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));
+
+      rtx tmp8 = gen_reg_rtx (V1TImode);
+      emit_move_insn (tmp8, gen_lowpart (V1TImode, tmp7));
+      emit_move_insn (operands[0], tmp8);
+      return;
+    }
+
+  if (TARGET_AVX2 || TARGET_SSE4_1)
+    {
+      /* Three operations.  */
+      if (bits == 32)
+	{
+	  rtx tmp1 = gen_reg_rtx (V4SImode);
+	  rtx tmp2 = gen_reg_rtx (V4SImode);
+	  emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
+	  emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
+
+	  rtx tmp3 = gen_reg_rtx (V1TImode);
+	  emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));
+
+	  if (TARGET_AVX2)
+	    {
+	      rtx tmp4 = gen_reg_rtx (V4SImode);
+	      rtx tmp5 = gen_reg_rtx (V4SImode);
+	      emit_move_insn (tmp4, gen_lowpart (V4SImode, tmp3));
+	      emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
+					       GEN_INT (7)));
+
+	      rtx tmp6 = gen_reg_rtx (V1TImode);
+	      emit_move_insn (tmp6, gen_lowpart (V1TImode, tmp5));
+	      emit_move_insn (operands[0], tmp6);
+	    }
+	  else
+	    {
+	      rtx tmp4 = gen_reg_rtx (V8HImode);
+	      rtx tmp5 = gen_reg_rtx (V8HImode);
+	      rtx tmp6 = gen_reg_rtx (V8HImode);
+	      emit_move_insn (tmp4, gen_lowpart (V8HImode, tmp2));
+	      emit_move_insn (tmp5, gen_lowpart (V8HImode, tmp3));
+	      emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
+					     GEN_INT (0x3f)));
+
+	      rtx tmp7 = gen_reg_rtx (V1TImode);
+	      emit_move_insn (tmp7, gen_lowpart (V1TImode, tmp6));
+	      emit_move_insn (operands[0], tmp7);
+	    }
+	  return;
+	}
+
+      /* Three operations.  */
+      if (bits == 8 || bits == 16 || bits == 24)
+	{
+	  rtx tmp1 = gen_reg_rtx (V4SImode);
+	  rtx tmp2 = gen_reg_rtx (V4SImode);
+	  emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
+	  emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
+
+	  rtx tmp3 = gen_reg_rtx (V1TImode);
+	  emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));
+
+	  if (TARGET_AVX2)
+	    {
+	      rtx tmp4 = gen_reg_rtx (V4SImode);
+	      rtx tmp5 = gen_reg_rtx (V4SImode);
+	      emit_move_insn (tmp4, gen_lowpart (V4SImode, tmp3));
+	      emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
+					       GEN_INT (7)));
+
+	      rtx tmp6 = gen_reg_rtx (V1TImode);
+	      emit_move_insn (tmp6, gen_lowpart (V1TImode, tmp5));
+	      emit_move_insn (operands[0], tmp6);
+	    }
+	  else
+	    {
+	      rtx tmp4 = gen_reg_rtx (V8HImode);
+	      rtx tmp5 = gen_reg_rtx (V8HImode);
+	      rtx tmp6 = gen_reg_rtx (V8HImode);
+	      emit_move_insn (tmp4, gen_lowpart (V8HImode, tmp2));
+	      emit_move_insn (tmp5, gen_lowpart (V8HImode, tmp3));
+	      emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
+					     GEN_INT (0x3f)));
+
+	      rtx tmp7 = gen_reg_rtx (V1TImode);
+	      emit_move_insn (tmp7, gen_lowpart (V1TImode, tmp6));
+	      emit_move_insn (operands[0], tmp7);
+	    }
+	  return;
+	}
+    }
+
+  if (bits > 96)
+    {
+      /* Four operations.  */
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
+      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
+
+      rtx tmp3 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));
+
+      rtx tmp4 = gen_reg_rtx (V2DImode);
+      rtx tmp5 = gen_reg_rtx (V2DImode);
+      rtx tmp6 = gen_reg_rtx (V2DImode);
+      emit_move_insn (tmp4, gen_lowpart (V2DImode, tmp2));
+      emit_move_insn (tmp5, gen_lowpart (V2DImode, tmp3));
+      emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
+
+      rtx tmp7 = gen_reg_rtx (V4SImode);
+      rtx tmp8 = gen_reg_rtx (V4SImode);
+      emit_move_insn (tmp7, gen_lowpart (V4SImode, tmp6));
+      emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));
+
+      rtx tmp9 = gen_reg_rtx (V1TImode);
+      emit_move_insn (tmp9, gen_lowpart (V1TImode, tmp8));
+      emit_move_insn (operands[0], tmp9);
+      return;
+    }
+
+  if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
+    {
+      /* Four operations.  */
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
+      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
+
+      rtx tmp3 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
+
+      rtx tmp4 = gen_reg_rtx (V1TImode);
+      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
+
+      rtx tmp5 = gen_reg_rtx (V8HImode);
+      rtx tmp6 = gen_reg_rtx (V8HImode);
+      rtx tmp7 = gen_reg_rtx (V8HImode);
+      emit_move_insn (tmp5, gen_lowpart (V8HImode, tmp3));
+      emit_move_insn (tmp6, gen_lowpart (V8HImode, tmp4));
+      emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
+				     GEN_INT (bits == 48 ? 0x1f : 0x07)));
+
+      rtx tmp8 = gen_reg_rtx (V1TImode);
+      emit_move_insn (tmp8, gen_lowpart (V1TImode, tmp7));
+      emit_move_insn (operands[0], tmp8);
+      return;
+    }
+
+  if ((bits & 7) == 0)
+    {
+      /* Five operations.  */
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
+      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
+
+      rtx tmp3 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
+
+      rtx tmp4 = gen_reg_rtx (V1TImode);
+      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
+
+      rtx tmp5 = gen_reg_rtx (V1TImode);
+      rtx tmp6 = gen_reg_rtx (V1TImode);
+      emit_move_insn (tmp5, gen_lowpart (V1TImode, tmp3));
+      emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));
+
+      rtx tmp7 = gen_reg_rtx (V2DImode);
+      rtx tmp8 = gen_reg_rtx (V2DImode);
+      rtx tmp9 = gen_reg_rtx (V2DImode);
+      emit_move_insn (tmp7, gen_lowpart (V2DImode, tmp4));
+      emit_move_insn (tmp8, gen_lowpart (V2DImode, tmp6));
+      emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));
+
+      rtx tmp10 = gen_reg_rtx (V1TImode);
+      emit_move_insn (tmp10, gen_lowpart (V1TImode, tmp9));
+      emit_move_insn (operands[0], tmp10);
+      return;
+    }
+
+  if (TARGET_AVX2 && bits < 32)
+    {
+      /* Six operations.  */
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
+      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
+
+      rtx tmp3 = gen_reg_rtx (V1TImode);
+      emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
+
+      rtx tmp4 = gen_reg_rtx (V2DImode);
+      rtx tmp5 = gen_reg_rtx (V2DImode);
+      emit_move_insn (tmp4, gen_lowpart (V2DImode, op1));
+      emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
+
+      rtx tmp6 = gen_reg_rtx (V2DImode);
+      rtx tmp7 = gen_reg_rtx (V2DImode);
+      emit_move_insn (tmp6, gen_lowpart (V2DImode, tmp3));
+      emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
+
+      rtx tmp8 = gen_reg_rtx (V2DImode);
+      emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
+
+      rtx tmp9 = gen_reg_rtx (V4SImode);
+      rtx tmp10 = gen_reg_rtx (V4SImode);
+      emit_move_insn (tmp9, gen_lowpart (V4SImode, tmp8));
+      emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));
+
+      rtx tmp11 = gen_reg_rtx (V1TImode);
+      emit_move_insn (tmp11, gen_lowpart (V1TImode, tmp10));
+      emit_move_insn (operands[0], tmp11);
+      return;
+    }
+
+  if (TARGET_SSE4_1 && bits < 15)
+    {
+      /* Six operations.  */
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
+      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
+
+      rtx tmp3 = gen_reg_rtx (V1TImode);
+      emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
+
+      rtx tmp4 = gen_reg_rtx (V2DImode);
+      rtx tmp5 = gen_reg_rtx (V2DImode);
+      emit_move_insn (tmp4, gen_lowpart (V2DImode, op1));
+      emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
+
+      rtx tmp6 = gen_reg_rtx (V2DImode);
+      rtx tmp7 = gen_reg_rtx (V2DImode);
+      emit_move_insn (tmp6, gen_lowpart (V2DImode, tmp3));
+      emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
+
+      rtx tmp8 = gen_reg_rtx (V2DImode);
+      emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
+
+      rtx tmp9 = gen_reg_rtx (V8HImode);
+      rtx tmp10 = gen_reg_rtx (V8HImode);
+      rtx tmp11 = gen_reg_rtx (V8HImode);
+      emit_move_insn (tmp9, gen_lowpart (V8HImode, tmp2));
+      emit_move_insn (tmp10, gen_lowpart (V8HImode, tmp8));
+      emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));
+
+      rtx tmp12 = gen_reg_rtx (V1TImode);
+      emit_move_insn (tmp12, gen_lowpart (V1TImode, tmp11));
+      emit_move_insn (operands[0], tmp12);
+      return;
+    }
+
+  if (bits == 1)
+    {
+      /* Eight operations.  */
+      rtx tmp1 = gen_reg_rtx (V1TImode);
+      emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
+
+      rtx tmp2 = gen_reg_rtx (V2DImode);
+      rtx tmp3 = gen_reg_rtx (V2DImode);
+      emit_move_insn (tmp2, gen_lowpart (V2DImode, op1));
+      emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));
+
+      rtx tmp4 = gen_reg_rtx (V2DImode);
+      rtx tmp5 = gen_reg_rtx (V2DImode);
+      emit_move_insn (tmp4, gen_lowpart (V2DImode, tmp1));
+      emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));
+
+      rtx tmp6 = gen_reg_rtx (V2DImode);
+      emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));
+
+      rtx tmp7 = gen_reg_rtx (V2DImode);
+      emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));
+
+      rtx tmp8 = gen_reg_rtx (V4SImode);
+      rtx tmp9 = gen_reg_rtx (V4SImode);
+      emit_move_insn (tmp8, gen_lowpart (V4SImode, tmp7));
+      emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));
+
+      rtx tmp10 = gen_reg_rtx (V2DImode);
+      rtx tmp11 = gen_reg_rtx (V2DImode);
+      emit_move_insn (tmp10, gen_lowpart (V2DImode, tmp9));
+      emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));
+
+      rtx tmp12 = gen_reg_rtx (V2DImode);
+      emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));
+
+      rtx tmp13 = gen_reg_rtx (V1TImode);
+      emit_move_insn (tmp13, gen_lowpart (V1TImode, tmp12));
+      emit_move_insn (operands[0], tmp13);
+      return;
+    }
+
+  if (bits > 64)
+    {
+      /* Eight operations.  */
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
+      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
+
+      rtx tmp3 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
+
+      rtx tmp4 = gen_reg_rtx (V1TImode);
+      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
+
+      rtx tmp5 = gen_reg_rtx (V2DImode);
+      rtx tmp6 = gen_reg_rtx (V2DImode);
+      emit_move_insn (tmp5, gen_lowpart (V2DImode, tmp4));
+      emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));
+
+      rtx tmp7 = gen_reg_rtx (V1TImode);
+      rtx tmp8 = gen_reg_rtx (V1TImode);
+      emit_move_insn (tmp7, gen_lowpart (V1TImode, tmp3));
+      emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));
+ 
+      rtx tmp9 = gen_reg_rtx (V2DImode);
+      rtx tmp10 = gen_reg_rtx (V2DImode);
+      emit_move_insn (tmp9, gen_lowpart (V2DImode, tmp3));
+      emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));
+
+      rtx tmp11 = gen_reg_rtx (V2DImode);
+      rtx tmp12 = gen_reg_rtx (V2DImode);
+      emit_move_insn (tmp11, gen_lowpart (V2DImode, tmp8));
+      emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));
+
+      rtx tmp13 = gen_reg_rtx (V2DImode);
+      emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));
+
+      rtx tmp14 = gen_reg_rtx (V1TImode);
+      emit_move_insn (tmp14, gen_lowpart (V1TImode, tmp13));
+      emit_move_insn (operands[0], tmp14);
+    }
+  else
+    {
+      /* Nine operations.  */
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      emit_move_insn (tmp1, gen_lowpart (V4SImode, op1));
+      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
+
+      rtx tmp3 = gen_reg_rtx (V4SImode);
+      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
+
+      rtx tmp4 = gen_reg_rtx (V1TImode);
+      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
+
+      rtx tmp5 = gen_reg_rtx (V2DImode);
+      rtx tmp6 = gen_reg_rtx (V2DImode);
+      emit_move_insn (tmp5, gen_lowpart (V2DImode, op1));
+      emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));
+
+      rtx tmp7 = gen_reg_rtx (V2DImode);
+      rtx tmp8 = gen_reg_rtx (V2DImode);
+      emit_move_insn (tmp7, gen_lowpart (V2DImode, tmp4));
+      emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));
+
+      rtx tmp9 = gen_reg_rtx (V2DImode);
+      emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));
+
+      rtx tmp10 = gen_reg_rtx (V1TImode);
+      rtx tmp11 = gen_reg_rtx (V1TImode);
+      emit_move_insn (tmp10, gen_lowpart (V1TImode, tmp3));
+      emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));
+        
+      rtx tmp12 = gen_reg_rtx (V2DImode);
+      rtx tmp13 = gen_reg_rtx (V2DImode);
+      emit_move_insn (tmp12, gen_lowpart (V2DImode, tmp11));
+      emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));
+
+      rtx tmp14 = gen_reg_rtx (V2DImode);
+      emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));
+
+      rtx tmp15 = gen_reg_rtx (V1TImode);
+      emit_move_insn (tmp15, gen_lowpart (V1TImode, tmp14));
+      emit_move_insn (operands[0], tmp15);
+    }
+}
+
 /* Return mode for the memcpy/memset loop counter.  Prefer SImode over
    DImode for constant loop counts.  */
 
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 9918a28..bd52450 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -161,6 +161,7 @@  extern void ix86_split_ashr (rtx *, rtx, machine_mode);
 extern void ix86_split_lshr (rtx *, rtx, machine_mode);
 extern void ix86_expand_v1ti_shift (enum rtx_code, rtx[]);
 extern void ix86_expand_v1ti_rotate (enum rtx_code, rtx[]);
+extern void ix86_expand_v1ti_ashiftrt (rtx[]);
 extern rtx ix86_find_base_term (rtx);
 extern bool ix86_check_movabs (rtx, int);
 extern bool ix86_check_no_addr_space (rtx);
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index bdc6067..5c4df8b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -15079,7 +15079,7 @@ 
   [(set (match_operand:V1TI 0 "register_operand")
 	(ashift:V1TI
 	 (match_operand:V1TI 1 "register_operand")
-	 (match_operand:SI 2 "const_int_operand")))]
+	 (match_operand:QI 2 "general_operand")))]
   "TARGET_SSE2"
 {
   ix86_expand_v1ti_shift (ASHIFT, operands);
@@ -15090,18 +15090,29 @@ 
   [(set (match_operand:V1TI 0 "register_operand")
 	(lshiftrt:V1TI
 	 (match_operand:V1TI 1 "register_operand")
-	 (match_operand:SI 2 "const_int_operand")))]
+	 (match_operand:QI 2 "general_operand")))]
   "TARGET_SSE2"
 {
   ix86_expand_v1ti_shift (LSHIFTRT, operands);
   DONE;
 })
 
+(define_expand "ashrv1ti3"
+  [(set (match_operand:V1TI 0 "register_operand")
+	(ashiftrt:V1TI
+	 (match_operand:V1TI 1 "register_operand")
+	 (match_operand:QI 2 "general_operand")))]
+  "TARGET_SSE2"
+{
+  ix86_expand_v1ti_ashiftrt (operands);
+  DONE;
+})
+
 (define_expand "rotlv1ti3"
   [(set (match_operand:V1TI 0 "register_operand")
 	(rotate:V1TI
 	 (match_operand:V1TI 1 "register_operand")
-	 (match_operand:SI 2 "const_int_operand")))]
+	 (match_operand:QI 2 "const_int_operand")))]
   "TARGET_SSE2"
 {
   ix86_expand_v1ti_rotate (ROTATE, operands);
@@ -15112,7 +15123,7 @@ 
   [(set (match_operand:V1TI 0 "register_operand")
 	(rotatert:V1TI
 	 (match_operand:V1TI 1 "register_operand")
-	 (match_operand:SI 2 "const_int_operand")))]
+	 (match_operand:QI 2 "const_int_operand")))]
   "TARGET_SSE2"
 {
   ix86_expand_v1ti_rotate (ROTATERT, operands);
diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-ashiftrt-1.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-ashiftrt-1.c
new file mode 100644
index 0000000..05869bf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-ashiftrt-1.c
@@ -0,0 +1,167 @@ 
+/* { dg-do run { target int128 } } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+
+typedef __int128 v1ti __attribute__ ((__vector_size__ (16)));
+typedef __int128 ti;
+
+ti ashr(ti x, unsigned int i) { return x >> i; }
+
+v1ti ashr_1(v1ti x) { return x >> 1; }
+v1ti ashr_2(v1ti x) { return x >> 2; }
+v1ti ashr_7(v1ti x) { return x >> 7; }
+v1ti ashr_8(v1ti x) { return x >> 8; }
+v1ti ashr_9(v1ti x) { return x >> 9; }
+v1ti ashr_15(v1ti x) { return x >> 15; }
+v1ti ashr_16(v1ti x) { return x >> 16; }
+v1ti ashr_17(v1ti x) { return x >> 17; }
+v1ti ashr_23(v1ti x) { return x >> 23; }
+v1ti ashr_24(v1ti x) { return x >> 24; }
+v1ti ashr_25(v1ti x) { return x >> 25; }
+v1ti ashr_31(v1ti x) { return x >> 31; }
+v1ti ashr_32(v1ti x) { return x >> 32; }
+v1ti ashr_33(v1ti x) { return x >> 33; }
+v1ti ashr_47(v1ti x) { return x >> 47; }
+v1ti ashr_48(v1ti x) { return x >> 48; }
+v1ti ashr_49(v1ti x) { return x >> 49; }
+v1ti ashr_63(v1ti x) { return x >> 63; }
+v1ti ashr_64(v1ti x) { return x >> 64; }
+v1ti ashr_65(v1ti x) { return x >> 65; }
+v1ti ashr_72(v1ti x) { return x >> 72; }
+v1ti ashr_79(v1ti x) { return x >> 79; }
+v1ti ashr_80(v1ti x) { return x >> 80; }
+v1ti ashr_81(v1ti x) { return x >> 81; }
+v1ti ashr_95(v1ti x) { return x >> 95; }
+v1ti ashr_96(v1ti x) { return x >> 96; }
+v1ti ashr_97(v1ti x) { return x >> 97; }
+v1ti ashr_111(v1ti x) { return x >> 111; }
+v1ti ashr_112(v1ti x) { return x >> 112; }
+v1ti ashr_113(v1ti x) { return x >> 113; }
+v1ti ashr_119(v1ti x) { return x >> 119; }
+v1ti ashr_120(v1ti x) { return x >> 120; }
+v1ti ashr_121(v1ti x) { return x >> 121; }
+v1ti ashr_126(v1ti x) { return x >> 126; }
+v1ti ashr_127(v1ti x) { return x >> 127; }
+
+typedef v1ti (*fun)(v1ti);
+
+struct {
+  unsigned int i;
+  fun ashr;
+} table[35] = {
+  {   1, ashr_1   },
+  {   2, ashr_2   },
+  {   7, ashr_7   },
+  {   8, ashr_8   },
+  {   9, ashr_9   },
+  {  15, ashr_15  },
+  {  16, ashr_16  },
+  {  17, ashr_17  },
+  {  23, ashr_23  },
+  {  24, ashr_24  },
+  {  25, ashr_25  },
+  {  31, ashr_31  },
+  {  32, ashr_32  },
+  {  33, ashr_33  },
+  {  47, ashr_47  },
+  {  48, ashr_48  },
+  {  49, ashr_49  },
+  {  63, ashr_63  },
+  {  64, ashr_64  },
+  {  65, ashr_65  },
+  {  72, ashr_72  },
+  {  79, ashr_79  },
+  {  80, ashr_80  },
+  {  81, ashr_81  },
+  {  95, ashr_95  },
+  {  96, ashr_96  },
+  {  97, ashr_97  },
+  { 111, ashr_111 },
+  { 112, ashr_112 },
+  { 113, ashr_113 },
+  { 119, ashr_119 },
+  { 120, ashr_120 },
+  { 121, ashr_121 },
+  { 126, ashr_126 },
+  { 127, ashr_127 }
+};
+
+void test(ti x)
+{
+  unsigned int i;
+  v1ti t = (v1ti)x;
+
+  for (i=0; i<(sizeof(table)/sizeof(table[0])); i++) {
+    if ((ti)(*table[i].ashr)(t) != ashr(x,table[i].i))
+      __builtin_abort();
+  }
+}
+
+int main()
+{
+  ti x;
+
+  x = ((ti)0x0011223344556677ull)<<64 | 0x8899aabbccddeeffull;
+  test(x);
+  x = ((ti)0xffeeddccbbaa9988ull)<<64 | 0x7766554433221100ull;
+  test(x);
+  x = ((ti)0x0123456789abcdefull)<<64 | 0x0123456789abcdefull;
+  test(x);
+  x = ((ti)0xfedcba9876543210ull)<<64 | 0xfedcba9876543210ull;
+  test(x);
+  x = ((ti)0x0123456789abcdefull)<<64 | 0xfedcba9876543210ull;
+  test(x);
+  x = ((ti)0xfedcba9876543210ull)<<64 | 0x0123456789abcdefull;
+  test(x);
+  x = 0;
+  test(x);
+  x = 0xffffffffffffffffull;
+  test(x);
+  x = ((ti)0xffffffffffffffffull)<<64;
+  test(x);
+  x = ((ti)0xffffffffffffffffull)<<64 | 0xffffffffffffffffull;
+  test(x);
+  x = ((ti)0x5a5a5a5a5a5a5a5aull)<<64 | 0x5a5a5a5a5a5a5a5aull;
+  test(x);
+  x = ((ti)0xa5a5a5a5a5a5a5a5ull)<<64 | 0xa5a5a5a5a5a5a5a5ull;
+  test(x);
+  x = 0xffull;
+  test(x);
+  x = 0xff00ull;
+  test(x);
+  x = 0xff0000ull;
+  test(x);
+  x = 0xff000000ull;
+  test(x);
+  x = 0xff00000000ull;
+  test(x);
+  x = 0xff0000000000ull;
+  test(x);
+  x = 0xff000000000000ull;
+  test(x);
+  x = 0xff00000000000000ull;
+  test(x);
+  x = ((ti)0xffull)<<64;
+  test(x);
+  x = ((ti)0xff00ull)<<64;
+  test(x);
+  x = ((ti)0xff0000ull)<<64;
+  test(x);
+  x = ((ti)0xff000000ull)<<64;
+  test(x);
+  x = ((ti)0xff00000000ull)<<64;
+  test(x);
+  x = ((ti)0xff0000000000ull)<<64;
+  test(x);
+  x = ((ti)0xff000000000000ull)<<64;
+  test(x);
+  x = ((ti)0xff00000000000000ull)<<64;
+  test(x);
+  x = 0xdeadbeefcafebabeull;
+  test(x);
+  x = ((ti)0xdeadbeefcafebabeull)<<64;
+  test(x);
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-ashiftrt-2.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-ashiftrt-2.c
new file mode 100644
index 0000000..b3d0aa3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-ashiftrt-2.c
@@ -0,0 +1,166 @@ 
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse2 -mavx2 " } */
+
+typedef __int128 v1ti __attribute__ ((__vector_size__ (16)));
+typedef __int128 ti;
+
+ti ashr(ti x, unsigned int i) { return x >> i; }
+
+v1ti ashr_1(v1ti x) { return x >> 1; }
+v1ti ashr_2(v1ti x) { return x >> 2; }
+v1ti ashr_7(v1ti x) { return x >> 7; }
+v1ti ashr_8(v1ti x) { return x >> 8; }
+v1ti ashr_9(v1ti x) { return x >> 9; }
+v1ti ashr_15(v1ti x) { return x >> 15; }
+v1ti ashr_16(v1ti x) { return x >> 16; }
+v1ti ashr_17(v1ti x) { return x >> 17; }
+v1ti ashr_23(v1ti x) { return x >> 23; }
+v1ti ashr_24(v1ti x) { return x >> 24; }
+v1ti ashr_25(v1ti x) { return x >> 25; }
+v1ti ashr_31(v1ti x) { return x >> 31; }
+v1ti ashr_32(v1ti x) { return x >> 32; }
+v1ti ashr_33(v1ti x) { return x >> 33; }
+v1ti ashr_47(v1ti x) { return x >> 47; }
+v1ti ashr_48(v1ti x) { return x >> 48; }
+v1ti ashr_49(v1ti x) { return x >> 49; }
+v1ti ashr_63(v1ti x) { return x >> 63; }
+v1ti ashr_64(v1ti x) { return x >> 64; }
+v1ti ashr_65(v1ti x) { return x >> 65; }
+v1ti ashr_72(v1ti x) { return x >> 72; }
+v1ti ashr_79(v1ti x) { return x >> 79; }
+v1ti ashr_80(v1ti x) { return x >> 80; }
+v1ti ashr_81(v1ti x) { return x >> 81; }
+v1ti ashr_95(v1ti x) { return x >> 95; }
+v1ti ashr_96(v1ti x) { return x >> 96; }
+v1ti ashr_97(v1ti x) { return x >> 97; }
+v1ti ashr_111(v1ti x) { return x >> 111; }
+v1ti ashr_112(v1ti x) { return x >> 112; }
+v1ti ashr_113(v1ti x) { return x >> 113; }
+v1ti ashr_119(v1ti x) { return x >> 119; }
+v1ti ashr_120(v1ti x) { return x >> 120; }
+v1ti ashr_121(v1ti x) { return x >> 121; }
+v1ti ashr_126(v1ti x) { return x >> 126; }
+v1ti ashr_127(v1ti x) { return x >> 127; }
+
+typedef v1ti (*fun)(v1ti);
+
+struct {
+  unsigned int i;
+  fun ashr;
+} table[35] = {
+  {   1, ashr_1   },
+  {   2, ashr_2   },
+  {   7, ashr_7   },
+  {   8, ashr_8   },
+  {   9, ashr_9   },
+  {  15, ashr_15  },
+  {  16, ashr_16  },
+  {  17, ashr_17  },
+  {  23, ashr_23  },
+  {  24, ashr_24  },
+  {  25, ashr_25  },
+  {  31, ashr_31  },
+  {  32, ashr_32  },
+  {  33, ashr_33  },
+  {  47, ashr_47  },
+  {  48, ashr_48  },
+  {  49, ashr_49  },
+  {  63, ashr_63  },
+  {  64, ashr_64  },
+  {  65, ashr_65  },
+  {  72, ashr_72  },
+  {  79, ashr_79  },
+  {  80, ashr_80  },
+  {  81, ashr_81  },
+  {  95, ashr_95  },
+  {  96, ashr_96  },
+  {  97, ashr_97  },
+  { 111, ashr_111 },
+  { 112, ashr_112 },
+  { 113, ashr_113 },
+  { 119, ashr_119 },
+  { 120, ashr_120 },
+  { 121, ashr_121 },
+  { 126, ashr_126 },
+  { 127, ashr_127 }
+};
+
+void test(ti x)
+{
+  unsigned int i;
+  v1ti t = (v1ti)x;
+
+  for (i=0; i<(sizeof(table)/sizeof(table[0])); i++) {
+    if ((ti)(*table[i].ashr)(t) != ashr(x,table[i].i))
+      __builtin_abort();
+  }
+}
+
+int main()
+{
+  ti x;
+
+  x = ((ti)0x0011223344556677ull)<<64 | 0x8899aabbccddeeffull;
+  test(x);
+  x = ((ti)0xffeeddccbbaa9988ull)<<64 | 0x7766554433221100ull;
+  test(x);
+  x = ((ti)0x0123456789abcdefull)<<64 | 0x0123456789abcdefull;
+  test(x);
+  x = ((ti)0xfedcba9876543210ull)<<64 | 0xfedcba9876543210ull;
+  test(x);
+  x = ((ti)0x0123456789abcdefull)<<64 | 0xfedcba9876543210ull;
+  test(x);
+  x = ((ti)0xfedcba9876543210ull)<<64 | 0x0123456789abcdefull;
+  test(x);
+  x = 0;
+  test(x);
+  x = 0xffffffffffffffffull;
+  test(x);
+  x = ((ti)0xffffffffffffffffull)<<64;
+  test(x);
+  x = ((ti)0xffffffffffffffffull)<<64 | 0xffffffffffffffffull;
+  test(x);
+  x = ((ti)0x5a5a5a5a5a5a5a5aull)<<64 | 0x5a5a5a5a5a5a5a5aull;
+  test(x);
+  x = ((ti)0xa5a5a5a5a5a5a5a5ull)<<64 | 0xa5a5a5a5a5a5a5a5ull;
+  test(x);
+  x = 0xffull;
+  test(x);
+  x = 0xff00ull;
+  test(x);
+  x = 0xff0000ull;
+  test(x);
+  x = 0xff000000ull;
+  test(x);
+  x = 0xff00000000ull;
+  test(x);
+  x = 0xff0000000000ull;
+  test(x);
+  x = 0xff000000000000ull;
+  test(x);
+  x = 0xff00000000000000ull;
+  test(x);
+  x = ((ti)0xffull)<<64;
+  test(x);
+  x = ((ti)0xff00ull)<<64;
+  test(x);
+  x = ((ti)0xff0000ull)<<64;
+  test(x);
+  x = ((ti)0xff000000ull)<<64;
+  test(x);
+  x = ((ti)0xff00000000ull)<<64;
+  test(x);
+  x = ((ti)0xff0000000000ull)<<64;
+  test(x);
+  x = ((ti)0xff000000000000ull)<<64;
+  test(x);
+  x = ((ti)0xff00000000000000ull)<<64;
+  test(x);
+  x = 0xdeadbeefcafebabeull;
+  test(x);
+  x = ((ti)0xdeadbeefcafebabeull)<<64;
+  test(x);
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-ashiftrt-3.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-ashiftrt-3.c
new file mode 100644
index 0000000..61d4f4c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-ashiftrt-3.c
@@ -0,0 +1,166 @@ 
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse2 -msse4.1" } */
+
+typedef __int128 v1ti __attribute__ ((__vector_size__ (16)));
+typedef __int128 ti;
+
+ti ashr(ti x, unsigned int i) { return x >> i; }
+
+v1ti ashr_1(v1ti x) { return x >> 1; }
+v1ti ashr_2(v1ti x) { return x >> 2; }
+v1ti ashr_7(v1ti x) { return x >> 7; }
+v1ti ashr_8(v1ti x) { return x >> 8; }
+v1ti ashr_9(v1ti x) { return x >> 9; }
+v1ti ashr_15(v1ti x) { return x >> 15; }
+v1ti ashr_16(v1ti x) { return x >> 16; }
+v1ti ashr_17(v1ti x) { return x >> 17; }
+v1ti ashr_23(v1ti x) { return x >> 23; }
+v1ti ashr_24(v1ti x) { return x >> 24; }
+v1ti ashr_25(v1ti x) { return x >> 25; }
+v1ti ashr_31(v1ti x) { return x >> 31; }
+v1ti ashr_32(v1ti x) { return x >> 32; }
+v1ti ashr_33(v1ti x) { return x >> 33; }
+v1ti ashr_47(v1ti x) { return x >> 47; }
+v1ti ashr_48(v1ti x) { return x >> 48; }
+v1ti ashr_49(v1ti x) { return x >> 49; }
+v1ti ashr_63(v1ti x) { return x >> 63; }
+v1ti ashr_64(v1ti x) { return x >> 64; }
+v1ti ashr_65(v1ti x) { return x >> 65; }
+v1ti ashr_72(v1ti x) { return x >> 72; }
+v1ti ashr_79(v1ti x) { return x >> 79; }
+v1ti ashr_80(v1ti x) { return x >> 80; }
+v1ti ashr_81(v1ti x) { return x >> 81; }
+v1ti ashr_95(v1ti x) { return x >> 95; }
+v1ti ashr_96(v1ti x) { return x >> 96; }
+v1ti ashr_97(v1ti x) { return x >> 97; }
+v1ti ashr_111(v1ti x) { return x >> 111; }
+v1ti ashr_112(v1ti x) { return x >> 112; }
+v1ti ashr_113(v1ti x) { return x >> 113; }
+v1ti ashr_119(v1ti x) { return x >> 119; }
+v1ti ashr_120(v1ti x) { return x >> 120; }
+v1ti ashr_121(v1ti x) { return x >> 121; }
+v1ti ashr_126(v1ti x) { return x >> 126; }
+v1ti ashr_127(v1ti x) { return x >> 127; }
+
+typedef v1ti (*fun)(v1ti);
+
+struct {
+  unsigned int i;
+  fun ashr;
+} table[35] = {
+  {   1, ashr_1   },
+  {   2, ashr_2   },
+  {   7, ashr_7   },
+  {   8, ashr_8   },
+  {   9, ashr_9   },
+  {  15, ashr_15  },
+  {  16, ashr_16  },
+  {  17, ashr_17  },
+  {  23, ashr_23  },
+  {  24, ashr_24  },
+  {  25, ashr_25  },
+  {  31, ashr_31  },
+  {  32, ashr_32  },
+  {  33, ashr_33  },
+  {  47, ashr_47  },
+  {  48, ashr_48  },
+  {  49, ashr_49  },
+  {  63, ashr_63  },
+  {  64, ashr_64  },
+  {  65, ashr_65  },
+  {  72, ashr_72  },
+  {  79, ashr_79  },
+  {  80, ashr_80  },
+  {  81, ashr_81  },
+  {  95, ashr_95  },
+  {  96, ashr_96  },
+  {  97, ashr_97  },
+  { 111, ashr_111 },
+  { 112, ashr_112 },
+  { 113, ashr_113 },
+  { 119, ashr_119 },
+  { 120, ashr_120 },
+  { 121, ashr_121 },
+  { 126, ashr_126 },
+  { 127, ashr_127 }
+};
+
+void test(ti x)
+{
+  unsigned int i;
+  v1ti t = (v1ti)x;
+
+  for (i=0; i<(sizeof(table)/sizeof(table[0])); i++) {
+    if ((ti)(*table[i].ashr)(t) != ashr(x,table[i].i))
+      __builtin_abort();
+  }
+}
+
+int main()
+{
+  ti x;
+
+  x = ((ti)0x0011223344556677ull)<<64 | 0x8899aabbccddeeffull;
+  test(x);
+  x = ((ti)0xffeeddccbbaa9988ull)<<64 | 0x7766554433221100ull;
+  test(x);
+  x = ((ti)0x0123456789abcdefull)<<64 | 0x0123456789abcdefull;
+  test(x);
+  x = ((ti)0xfedcba9876543210ull)<<64 | 0xfedcba9876543210ull;
+  test(x);
+  x = ((ti)0x0123456789abcdefull)<<64 | 0xfedcba9876543210ull;
+  test(x);
+  x = ((ti)0xfedcba9876543210ull)<<64 | 0x0123456789abcdefull;
+  test(x);
+  x = 0;
+  test(x);
+  x = 0xffffffffffffffffull;
+  test(x);
+  x = ((ti)0xffffffffffffffffull)<<64;
+  test(x);
+  x = ((ti)0xffffffffffffffffull)<<64 | 0xffffffffffffffffull;
+  test(x);
+  x = ((ti)0x5a5a5a5a5a5a5a5aull)<<64 | 0x5a5a5a5a5a5a5a5aull;
+  test(x);
+  x = ((ti)0xa5a5a5a5a5a5a5a5ull)<<64 | 0xa5a5a5a5a5a5a5a5ull;
+  test(x);
+  x = 0xffull;
+  test(x);
+  x = 0xff00ull;
+  test(x);
+  x = 0xff0000ull;
+  test(x);
+  x = 0xff000000ull;
+  test(x);
+  x = 0xff00000000ull;
+  test(x);
+  x = 0xff0000000000ull;
+  test(x);
+  x = 0xff000000000000ull;
+  test(x);
+  x = 0xff00000000000000ull;
+  test(x);
+  x = ((ti)0xffull)<<64;
+  test(x);
+  x = ((ti)0xff00ull)<<64;
+  test(x);
+  x = ((ti)0xff0000ull)<<64;
+  test(x);
+  x = ((ti)0xff000000ull)<<64;
+  test(x);
+  x = ((ti)0xff00000000ull)<<64;
+  test(x);
+  x = ((ti)0xff0000000000ull)<<64;
+  test(x);
+  x = ((ti)0xff000000000000ull)<<64;
+  test(x);
+  x = ((ti)0xff00000000000000ull)<<64;
+  test(x);
+  x = 0xdeadbeefcafebabeull;
+  test(x);
+  x = ((ti)0xdeadbeefcafebabeull)<<64;
+  test(x);
+
+  return 0;
+}
+
diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-shift-2.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-shift-2.c
new file mode 100644
index 0000000..18da2ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-shift-2.c
@@ -0,0 +1,13 @@ 
+/* PR target/102986 */
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
+typedef __int128 sv1ti __attribute__ ((__vector_size__ (16)));
+
+uv1ti ashl(uv1ti x, unsigned int i) { return x << i; }
+uv1ti lshr(uv1ti x, unsigned int i) { return x >> i; }
+sv1ti ashr(sv1ti x, unsigned int i) { return x >> i; }
+uv1ti rotr(uv1ti x, unsigned int i) { return (x >> i) | (x << (128-i)); }
+uv1ti rotl(uv1ti x, unsigned int i) { return (x << i) | (x >> (128-i)); }
+
diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-shift-3.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-shift-3.c
new file mode 100644
index 0000000..8d5c122
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-shift-3.c
@@ -0,0 +1,113 @@ 
+/* PR target/102986 */
+/* { dg-do run { target int128 } } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+
+typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
+typedef __int128 sv1ti __attribute__ ((__vector_size__ (16)));
+typedef __int128 v1ti __attribute__ ((__vector_size__ (16)));
+
+typedef unsigned __int128 uti;
+typedef __int128 sti;
+typedef __int128 ti;
+
+uv1ti ashl_v1ti(uv1ti x, unsigned int i) { return x << i; }
+uv1ti lshr_v1ti(uv1ti x, unsigned int i) { return x >> i; }
+sv1ti ashr_v1ti(sv1ti x, unsigned int i) { return x >> i; }
+uv1ti rotr_v1ti(uv1ti x, unsigned int i) { return (x >> i) | (x << (128-i)); }
+uv1ti rotl_v1ti(uv1ti x, unsigned int i) { return (x << i) | (x >> (128-i)); }
+
+uti ashl_ti(uti x, unsigned int i) { return x << i; }
+uti lshr_ti(uti x, unsigned int i) { return x >> i; }
+sti ashr_ti(sti x, unsigned int i) { return x >> i; }
+uti rotr_ti(uti x, unsigned int i) { return (x >> i) | (x << (128-i)); }
+uti rotl_ti(uti x, unsigned int i) { return (x << i) | (x >> (128-i)); }
+
+void test(ti x)
+{
+  unsigned int i;
+  uv1ti ut = (uv1ti)x;
+  sv1ti st = (sv1ti)x;
+
+  for (i=0; i<128; i++) {
+    if ((ti)ashl_v1ti(ut,i) != (ti)ashl_ti(x,i))
+      __builtin_abort();
+    if ((ti)lshr_v1ti(ut,i) != (ti)lshr_ti(x,i))
+      __builtin_abort();
+    if ((ti)ashr_v1ti(st,i) != (ti)ashr_ti(x,i))
+      __builtin_abort();
+    if ((ti)rotr_v1ti(ut,i) != (ti)rotr_ti(x,i))
+      __builtin_abort();
+    if ((ti)rotl_v1ti(ut,i) != (ti)rotl_ti(x,i))
+      __builtin_abort();
+  }
+}
+
+int main()
+{
+  ti x;
+
+  x = ((ti)0x0011223344556677ull)<<64 | 0x8899aabbccddeeffull;
+  test(x);
+  x = ((ti)0xffeeddccbbaa9988ull)<<64 | 0x7766554433221100ull;
+  test(x);
+  x = ((ti)0x0123456789abcdefull)<<64 | 0x0123456789abcdefull;
+  test(x);
+  x = ((ti)0xfedcba9876543210ull)<<64 | 0xfedcba9876543210ull;
+  test(x);
+  x = ((ti)0x0123456789abcdefull)<<64 | 0xfedcba9876543210ull;
+  test(x);
+  x = ((ti)0xfedcba9876543210ull)<<64 | 0x0123456789abcdefull;
+  test(x);
+  x = 0;
+  test(x);
+  x = 0xffffffffffffffffull;
+  test(x);
+  x = ((ti)0xffffffffffffffffull)<<64;
+  test(x);
+  x = ((ti)0xffffffffffffffffull)<<64 | 0xffffffffffffffffull;
+  test(x);
+  x = ((ti)0x5a5a5a5a5a5a5a5aull)<<64 | 0x5a5a5a5a5a5a5a5aull;
+  test(x);
+  x = ((ti)0xa5a5a5a5a5a5a5a5ull)<<64 | 0xa5a5a5a5a5a5a5a5ull;
+  test(x);
+  x = 0xffull;
+  test(x);
+  x = 0xff00ull;
+  test(x);
+  x = 0xff0000ull;
+  test(x);
+  x = 0xff000000ull;
+  test(x);
+  x = 0xff00000000ull;
+  test(x);
+  x = 0xff0000000000ull;
+  test(x);
+  x = 0xff000000000000ull;
+  test(x);
+  x = 0xff00000000000000ull;
+  test(x);
+  x = ((ti)0xffull)<<64;
+  test(x);
+  x = ((ti)0xff00ull)<<64;
+  test(x);
+  x = ((ti)0xff0000ull)<<64;
+  test(x);
+  x = ((ti)0xff000000ull)<<64;
+  test(x);
+  x = ((ti)0xff00000000ull)<<64;
+  test(x);
+  x = ((ti)0xff0000000000ull)<<64;
+  test(x);
+  x = ((ti)0xff000000000000ull)<<64;
+  test(x);
+  x = ((ti)0xff00000000000000ull)<<64;
+  test(x);
+  x = 0xdeadbeefcafebabeull;
+  test(x);
+  x = ((ti)0xdeadbeefcafebabeull)<<64;
+  test(x);
+
+  return 0;
+}
+